In [68]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import sklearn as skl
import tensorflow as tf

---

The goal of the analysis is to identify patterns, trends, and factors contributing to crime in LA. Depending on the specific question we want to answer, our analysis might focus on predicting the occurrence of crimes, identifying crime hotspots, or understanding the factors that influence crime rates.

## Split the Data into Training and Testing Sets

### Step 1: Read the `Crime_Data_from_2020_to_Present_20240611.csv.csv` data from the `static\data\` folder into a Pandas DataFrame.

In [69]:
# Read the CSV file from the Resources folder into a Pandas DataFrame

crime_df = pd.read_csv(
    Path("static\data\Crime_Data_from_2020_to_Present_20240611.csv")            

)

# Review the DataFrame
crime_df.head()
# Review the DataFrame


Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,190326475,3/1/2020 0:00,3/1/2020 0:00,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,AA,Adult Arrest,510.0,998.0,,,1900 S LONGWOOD AV,,34.0375,-118.3506
1,200106753,2/9/2020 0:00,2/8/2020 0:00,1800,1,Central,182,1,330,BURGLARY FROM VEHICLE,...,IC,Invest Cont,330.0,998.0,,,1000 S FLOWER ST,,34.0444,-118.2628
2,200320258,11/11/2020 0:00,11/4/2020 0:00,1700,3,Southwest,356,1,480,BIKE - STOLEN,...,IC,Invest Cont,480.0,,,,1400 W 37TH ST,,34.021,-118.3002
3,200907217,5/10/2023 0:00,3/10/2020 0:00,2037,9,Van Nuys,964,1,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),...,IC,Invest Cont,343.0,,,,14000 RIVERSIDE DR,,34.1576,-118.4387
4,220614831,8/18/2022 0:00,8/17/2020 0:00,1200,6,Hollywood,666,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,1900 TRANSIENT,,34.0944,-118.3277


In [70]:
#Generate our categorical variable list
crime_cat = crime_df.dtypes[crime_df.dtypes == "object"].index.tolist()

In [71]:
#Check the number of unique columns in each column
crime_df[crime_cat].nunique()

Date Rptd         1609
DATE OCC          1609
AREA NAME           21
Crm Cd Desc        139
Mocodes         306996
Vict Sex             5
Vict Descent        20
Premis Desc        306
Weapon Desc         79
Status               6
Status Desc          6
LOCATION         65757
Cross Street     10181
dtype: int64

In [72]:

# Read the CSV file from the Resources folder into a Pandas DataFrame

crimebin_df = pd.read_csv(
    Path("static\data\crime_bins.csv")            

)

# Review the DataFrame
crimebin_df.head()
# Review the DataFrame

Unnamed: 0,Crm Cd Desc,CRIMEBIN
0,ARSON,VANDALISM
1,ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER,VIOLENT
2,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",VIOLENT
3,ATTEMPTED ROBBERY,ROBBERY
4,BATTERY - SIMPLE ASSAULT,VIOLENT


In [73]:
merged_df = pd.merge(crime_df, crimebin_df, on='Crm Cd Desc', how='inner')
merged_df

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON,CRIMEBIN
0,190326475,3/1/2020 0:00,3/1/2020 0:00,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,Adult Arrest,510.0,998.0,,,1900 S LONGWOOD AV,,34.0375,-118.3506,VEHICLE
1,221008844,5/6/2022 0:00,11/1/2020 0:00,130,10,West Valley,1029,1,510,VEHICLE - STOLEN,...,Invest Cont,510.0,,,,VALJEAN ST,VANOWEN AV,34.1939,-118.4859,VEHICLE
2,200412582,9/9/2020 0:00,9/9/2020 0:00,630,4,Hollenbeck,413,1,510,VEHICLE - STOLEN,...,Invest Cont,510.0,,,,200 E AVENUE 28,,34.0820,-118.2130,VEHICLE
3,201810154,4/26/2020 0:00,4/22/2020 0:00,1900,18,Southeast,1802,1,510,VEHICLE - STOLEN,...,Invest Cont,510.0,,,,90TH,WALL,33.9547,-118.2717,VEHICLE
4,231510293,5/27/2023 0:00,11/24/2020 0:00,200,15,N Hollywood,1504,1,510,VEHICLE - STOLEN,...,Invest Cont,510.0,,,,7500 LAUREL CANYON BL,,34.2071,-118.3965,VEHICLE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
947678,231008207,4/20/2023 0:00,11/1/2022 0:00,1900,10,West Valley,1077,1,445,DISHONEST EMPLOYEE ATTEMPTED THEFT,...,Adult Arrest,445.0,,,,17300 VENTURA BL,,34.1608,-118.5098,theft
947679,231507120,3/16/2023 0:00,3/12/2023 0:00,2015,15,N Hollywood,1533,1,445,DISHONEST EMPLOYEE ATTEMPTED THEFT,...,Invest Cont,445.0,,,,6000 LAUREL CANYON BL,,34.1794,-118.3965,theft
947680,241708301,4/13/2024 0:00,10/30/2023 0:00,2130,17,Devonshire,1782,1,445,DISHONEST EMPLOYEE ATTEMPTED THEFT,...,Invest Cont,445.0,,,,8800 CORBIN AV,,34.2302,-118.5623,theft
947681,241707918,4/2/2024 0:00,1/11/2024 0:00,1959,17,Devonshire,1782,1,445,DISHONEST EMPLOYEE ATTEMPTED THEFT,...,Invest Cont,445.0,,,,8800 CORBIN AV,,34.2302,-118.5623,theft


# Binning. tried with this binning and got 19% 

In [7]:
# Binning. tried with this binning and got 19% 
# from sklearn.preprocessing import KBinsDiscretizer
# encoder = LabelEncoder()
# merged_df['crimecode'] = encoder.fit_transform(merged_df['Crm Cd Desc'])
# kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
# merged_df['crime_bins'] = kbins.fit_transform(merged_df[['crimecode']])
# merged.head()

In [74]:
# crime_df.to_csv("static\data\Crimebinned.csv")

### Step 2: Create the labels set (`y`)  from the “crime” column, and then create the features (`X`) DataFrame from the remaining columns.

Features are input variable that we will use to predict the target variable.  For Crime analysis, potential features could be 
1. Temporal Features: Date, time,day of the week,month or season and year of crime
2. Spatial Features: Location coordinates, District, Zipcode ( we need to add county and zipcode)
3. Crime Characteristic :  Type of crime (Crm cd column),Description of the crime (Crm cd Desc),Crime Severity ( we do not have this)
4. socio economic and Demographic features :  like population Density,Median household income, unemployment rate, education level, age distribution, ethnicity distribution  - We do not have this information.

Target is dependent Variable  : This is what we want to predict or understand.  potential targets.
1. Crime Occurence (whether a crime will occur at a given time and location (binary Classification))
2. Crime Type ( the type of crime that will occur (multi-class classification))
3. Crime Severity ( the severity level of Crime (regression or classification))




In [75]:
#Encode the categorical Variables.
label_encoder = LabelEncoder()
#crime_df['AREA NAME'] = label_encoder.fit_transform(crime_df['AREA NAME'])
merged_df['LOCATION'] = label_encoder.fit_transform(merged_df['LOCATION'])


In [76]:
# Separate the data into labels and features
y = merged_df['CRIMEBIN']
print(y.value_counts())

CRIMEBIN
theft        273056
VIOLENT      235848
BURGLARY     120716
VEHICLE      109134
VANDALISM    100081
ROBBERY       36798
SEX           23669
fraud         22293
OTHER         15602
CHILD          8924
Homicide       1562
Name: count, dtype: int64


In [77]:
merged_df.dtypes

DR_NO               int64
Date Rptd          object
DATE OCC           object
TIME OCC            int64
AREA                int64
AREA NAME          object
Rpt Dist No         int64
Part 1-2            int64
Crm Cd              int64
Crm Cd Desc        object
Mocodes            object
Vict Age            int64
Vict Sex           object
Vict Descent       object
Premis Cd         float64
Premis Desc        object
Weapon Used Cd    float64
Weapon Desc        object
Status             object
Status Desc        object
Crm Cd 1          float64
Crm Cd 2          float64
Crm Cd 3          float64
Crm Cd 4          float64
LOCATION            int32
Cross Street       object
LAT               float64
LON               float64
CRIMEBIN           object
dtype: object

In [78]:


# Separate the X variable, the features
#X = crime_df.drop(columns='')

#X['TIME OCC'] = pd.to_datetime(X['TIME OCC'],format='%H:%M').dt.time
#X['TIME OCC'] = X['TIME OCC'].apply(lambda x: f'{str(x)[:-2]}:{str(x)[-2:]}')
# Function to clean and convert time
def clean_time(time_str):

    try:
        # Convert to string and strip any non-numeric characters
        time_str = str(time_str).zfill(4)
           # Check if the time string is in HHMM format
        if len(time_str) == 4:
            return pd.to_datetime(time_str, format='%H%M').time()
        else:
            return None  # Handle or log the invalid format
    except ValueError:
        print(time_str)
        return None
        
merged_df['TIMEOCC'] = merged_df['TIME OCC'].apply(clean_time)
merged_df['HOUR OCC'] = merged_df['TIMEOCC'].apply(lambda x:x.hour if x is not None else None)

#X['TIME OCC'] = pd.to_datetime(X['TIME OCC'], format='%H:%M').dt.hour 

#X.loc[:,'Date Rptd'] = pd.to_datetime(X['Date Rptd'])
#X.loc[:,'DATE OCC'] = pd.to_datetime(X['DATE OCC'])



In [79]:
X = merged_df[['HOUR OCC','AREA','LOCATION','LAT','LON']]

In [80]:
print(y.value_counts())

CRIMEBIN
theft        273056
VIOLENT      235848
BURGLARY     120716
VEHICLE      109134
VANDALISM    100081
ROBBERY       36798
SEX           23669
fraud         22293
OTHER         15602
CHILD          8924
Homicide       1562
Name: count, dtype: int64


In [81]:
# Review the y variable Series
y[:5]

0    VEHICLE
1    VEHICLE
2    VEHICLE
3    VEHICLE
4    VEHICLE
Name: CRIMEBIN, dtype: object

In [82]:
print(y.value_counts())

CRIMEBIN
theft        273056
VIOLENT      235848
BURGLARY     120716
VEHICLE      109134
VANDALISM    100081
ROBBERY       36798
SEX           23669
fraud         22293
OTHER         15602
CHILD          8924
Homicide       1562
Name: count, dtype: int64


In [83]:
# Review the X variable DataFrame
X[:5]

Unnamed: 0,HOUR OCC,AREA,LOCATION,LAT,LON
0,21,7,21444,34.0375,-118.3506
1,1,10,64769,34.1939,-118.4859
2,6,4,22715,34.082,-118.213
3,19,18,55360,33.9547,-118.2717
4,2,15,49706,34.2071,-118.3965


In [84]:
print(y.value_counts())

CRIMEBIN
theft        273056
VIOLENT      235848
BURGLARY     120716
VEHICLE      109134
VANDALISM    100081
ROBBERY       36798
SEX           23669
fraud         22293
OTHER         15602
CHILD          8924
Homicide       1562
Name: count, dtype: int64


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [85]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [86]:
print(y.value_counts())

CRIMEBIN
theft        273056
VIOLENT      235848
BURGLARY     120716
VEHICLE      109134
VANDALISM    100081
ROBBERY       36798
SEX           23669
fraud         22293
OTHER         15602
CHILD          8924
Homicide       1562
Name: count, dtype: int64


---

In [87]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Print the shapes to verify
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(758146, 5) (189537, 5) (758146,) (189537,)


In [88]:
print(y.value_counts())

CRIMEBIN
theft        273056
VIOLENT      235848
BURGLARY     120716
VEHICLE      109134
VANDALISM    100081
ROBBERY       36798
SEX           23669
fraud         22293
OTHER         15602
CHILD          8924
Homicide       1562
Name: count, dtype: int64


## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [89]:
#impute missing values 
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)
#Standardize the features 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [94]:

# Instantiate the Logistic Regression model
logistic_regression_model = LogisticRegression(random_state=42,max_iter=1000, solver='saga',n_jobs=-1)

# Fit the model using training data
lr_model =logistic_regression_model.fit(X_train, y_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [95]:
# Make a prediction using the testing data
predictions = lr_model.predict(X_test)

In [41]:
print(X_test)

[[ 0.42894584  0.04484097  0.60273844  0.07631285 -0.02021205]
 [-1.25707362  1.68401612  1.65951004  0.12279212 -0.09212577]
 [-0.18415215  1.68401612  0.66606518  0.11750765 -0.09238563]
 ...
 [ 1.50186731  0.20875848  0.970129   -0.0082386  -0.03753648]
 [-1.71689711 -1.10258164  1.54951033  0.0155415  -0.02255085]
 [-0.49070114 -0.11907655 -0.61547923  0.13282059 -0.08365412]]


In [96]:
print(y_test)

370428          SEX
337786      VIOLENT
741938        theft
791739    VANDALISM
105786     BURGLARY
            ...    
53442       VEHICLE
143807     BURGLARY
57967       VEHICLE
355556      VIOLENT
497135     BURGLARY
Name: CRIMEBIN, Length: 189537, dtype: object


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [97]:
# Generate a confusion matrix for the model
confusion_matrix(y_test, predictions)

array([[    0,     0,     0,     0,     0,     0,     1,     0,  6033,
            0, 18161],
       [    0,     0,     0,     0,     0,     0,     0,     0,   466,
            0,  1337],
       [    0,     0,     0,     0,     0,     0,     0,     0,   102,
            0,   221],
       [    0,     0,     0,     0,     0,     0,     2,     0,  1068,
            0,  2045],
       [    0,     0,     0,     0,     0,     0,     0,     0,  3156,
            0,  4249],
       [    0,     0,     0,     0,     0,     0,     2,     0,  1201,
            0,  3550],
       [    0,     0,     0,     0,     0,     0,    12,     0,  5632,
            0, 14430],
       [    0,     0,     0,     0,     0,     0,     0,     0,  7527,
            0, 14461],
       [    0,     0,     0,     0,     0,     0,     8,     0, 16058,
            0, 31018],
       [    0,     0,     0,     0,     0,     0,     2,     0,  1018,
            0,  3358],
       [    0,     0,     0,     0,     0,     0,     3,    

In [98]:
# Print the classification report for the model
print(classification_report(y_test, predictions))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    BURGLARY       0.00      0.00      0.00     24195
       CHILD       0.00      0.00      0.00      1803
    Homicide       0.00      0.00      0.00       323
       OTHER       0.00      0.00      0.00      3115
     ROBBERY       0.00      0.00      0.00      7405
         SEX       0.00      0.00      0.00      4753
   VANDALISM       0.40      0.00      0.00     20074
     VEHICLE       0.00      0.00      0.00     21988
     VIOLENT       0.29      0.34      0.31     47084
       fraud       0.00      0.00      0.00      4378
       theft       0.31      0.77      0.44     54419

    accuracy                           0.30    189537
   macro avg       0.09      0.10      0.07    189537
weighted avg       0.20      0.30      0.20    189537



  _warn_prf(average, modifier, msg_start, len(result))
