In [1]:
import pandas as pd
import numpy as np
import config

import sqlalchemy
from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

pd.options.mode.chained_assignment = None

In [2]:
engine = create_engine(f'postgresql://postgres:{config.password}@localhost:5432/crime_db')

In [3]:
crime_df = pd.read_sql("chicago", con=engine)
crime_df.head()

Unnamed: 0,index,ID,Date,Primary_Type,Description,Location_Description,Arrest,District,Year,Latitude,Longitude,Historical_Wards,Zip_Codes,Police_Districts
0,0,11824091,2019-09-10 23:55:00,WEAPONS VIOLATION,UNLAWFUL USE HANDGUN,RESIDENCE PORCH/HALLWAY,False,7,2019,41.775402,-87.653178,17,21559,17
1,1,11824149,2019-09-10 23:50:00,OTHER OFFENSE,VEHICLE TITLE/REG OFFENSE,STREET,True,6,2019,41.750582,-87.647984,17,21554,20
2,2,11824121,2019-09-10 23:50:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,15,2019,41.880829,-87.752634,11,22216,25
3,3,11824152,2019-09-10 23:47:00,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,6,2019,41.751657,-87.650131,17,21554,20
4,5,11824113,2019-09-10 23:42:00,ASSAULT,SIMPLE,SIDEWALK,False,15,2019,41.89988,-87.748366,4,4299,25


In [4]:
crime_types = ['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'ASSAULT', 
    'MOTOR VEHICLE THEFT', 'ROBBERY', 'WEAPONS VIOLATION', 'HOMICIDE', 'ARSON']

filtered_crime_df = crime_df[crime_df.Primary_Type.isin(crime_types)]
filtered_crime_df.head()

Unnamed: 0,index,ID,Date,Primary_Type,Description,Location_Description,Arrest,District,Year,Latitude,Longitude,Historical_Wards,Zip_Codes,Police_Districts
0,0,11824091,2019-09-10 23:55:00,WEAPONS VIOLATION,UNLAWFUL USE HANDGUN,RESIDENCE PORCH/HALLWAY,False,7,2019,41.775402,-87.653178,17,21559,17
2,2,11824121,2019-09-10 23:50:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,15,2019,41.880829,-87.752634,11,22216,25
3,3,11824152,2019-09-10 23:47:00,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,6,2019,41.751657,-87.650131,17,21554,20
4,5,11824113,2019-09-10 23:42:00,ASSAULT,SIMPLE,SIDEWALK,False,15,2019,41.89988,-87.748366,4,4299,25
7,8,11824122,2019-09-10 23:33:00,BATTERY,DOMESTIC BATTERY SIMPLE,ALLEY,True,25,2019,41.919319,-87.758462,4,22615,6


In [5]:
filtered_crime_df['Hour'] = filtered_crime_df['Date'].dt.hour
filtered_crime_df['Month'] = filtered_crime_df['Date'].dt.month
filtered_crime_df['Day'] = filtered_crime_df['Date'].dt.day
filtered_crime_df['Day_of_Week'] = filtered_crime_df['Date'].dt.weekday_name
filtered_crime_df['Day_of_Year'] = filtered_crime_df['Date'].dt.dayofyear

filtered_crime_df.head()

Unnamed: 0,index,ID,Date,Primary_Type,Description,Location_Description,Arrest,District,Year,Latitude,Longitude,Historical_Wards,Zip_Codes,Police_Districts,Hour,Month,Day,Day_of_Week,Day_of_Year
0,0,11824091,2019-09-10 23:55:00,WEAPONS VIOLATION,UNLAWFUL USE HANDGUN,RESIDENCE PORCH/HALLWAY,False,7,2019,41.775402,-87.653178,17,21559,17,23,9,10,Tuesday,253
2,2,11824121,2019-09-10 23:50:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,15,2019,41.880829,-87.752634,11,22216,25,23,9,10,Tuesday,253
3,3,11824152,2019-09-10 23:47:00,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,6,2019,41.751657,-87.650131,17,21554,20,23,9,10,Tuesday,253
4,5,11824113,2019-09-10 23:42:00,ASSAULT,SIMPLE,SIDEWALK,False,15,2019,41.89988,-87.748366,4,4299,25,23,9,10,Tuesday,253
7,8,11824122,2019-09-10 23:33:00,BATTERY,DOMESTIC BATTERY SIMPLE,ALLEY,True,25,2019,41.919319,-87.758462,4,22615,6,23,9,10,Tuesday,253


In [6]:
# inflation_df = pd.read_csv("assets/data/Consumer_Price_Index_All_Urban_Consumers.csv")
# inflation_df.head()

# unemployment_df = pd.read_csv("assets/data/Illinois_Unemployment_Rate_2009-2019.csv")
# unemployment_df.head()

In [7]:
# merged_df = pd.merge(filtered_crime_df, inflation_df, on=(['Year', 'Month']), how='inner').fillna('Unknown')
# second_merged_df = pd.merge(merged_df, unemployment_df, on=(['Year', 'Month']), how='inner').fillna('Unknown')
# second_merged_df.head()

In [8]:
# bins = [0, 6, 12, 18, 23]
# labels = ['Night','Morning','Afternoon','Evening']
# model_df['Hour_Binned'] = pd.cut(model_df['Hour'], bins=bins, labels=labels)

group_names = ['Night','Morning','Afternoon','Evening']
filtered_crime_df["Hour_Bins"] = pd.cut(filtered_crime_df.Hour, bins=4, labels=group_names)

In [9]:
filtered_crime_df.round({'Latitude': 2, 'Longitude': 2})

Unnamed: 0,index,ID,Date,Primary_Type,Description,Location_Description,Arrest,District,Year,Latitude,Longitude,Historical_Wards,Zip_Codes,Police_Districts,Hour,Month,Day,Day_of_Week,Day_of_Year,Hour_Bins
0,0,11824091,2019-09-10 23:55:00,WEAPONS VIOLATION,UNLAWFUL USE HANDGUN,RESIDENCE PORCH/HALLWAY,False,007,2019,41.78,-87.65,17,21559,17,23,9,10,Tuesday,253,Evening
2,2,11824121,2019-09-10 23:50:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,015,2019,41.88,-87.75,11,22216,25,23,9,10,Tuesday,253,Evening
3,3,11824152,2019-09-10 23:47:00,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,006,2019,41.75,-87.65,17,21554,20,23,9,10,Tuesday,253,Evening
4,5,11824113,2019-09-10 23:42:00,ASSAULT,SIMPLE,SIDEWALK,False,015,2019,41.90,-87.75,4,4299,25,23,9,10,Tuesday,253,Evening
7,8,11824122,2019-09-10 23:33:00,BATTERY,DOMESTIC BATTERY SIMPLE,ALLEY,True,025,2019,41.92,-87.76,4,22615,6,23,9,10,Tuesday,253,Evening
8,9,11824162,2019-09-10 23:30:00,ROBBERY,STRONGARM - NO WEAPON,SIDEWALK,False,015,2019,41.89,-87.77,52,22216,25,23,9,10,Tuesday,253,Evening
9,10,11824148,2019-09-10 23:30:00,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,025,2019,41.91,-87.72,10,22535,6,23,9,10,Tuesday,253,Evening
10,11,11824756,2019-09-10 23:30:00,CRIMINAL DAMAGE,TO PROPERTY,RESIDENTIAL YARD (FRONT/BACK),False,006,2019,41.75,-87.66,18,21554,20,23,9,10,Tuesday,253,Evening
11,12,11824127,2019-09-10 23:30:00,THEFT,OVER $500,STREET,False,012,2019,41.88,-87.66,41,14917,15,23,9,10,Tuesday,253,Evening
14,15,11824100,2019-09-10 23:10:00,NARCOTICS,POSS: CANNABIS MORE THAN 30GMS,STREET,True,018,2019,41.90,-87.63,51,14926,14,23,9,10,Tuesday,253,Evening


In [10]:
#List of relevant columns for model
col_list = [ 'Year','Month','Day_of_Week','Hour_Bins','Primary_Type','Latitude',
            'Longitude','Location_Description', 'Arrest','Zip_Codes', 'Police_Districts']

#Dataframe created from list of relevant columns
model_df = filtered_crime_df[col_list]
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2199080 entries, 0 to 2868956
Data columns (total 11 columns):
Year                    int64
Month                   int64
Day_of_Week             object
Hour_Bins               category
Primary_Type            object
Latitude                float64
Longitude               float64
Location_Description    object
Arrest                  bool
Zip_Codes               object
Police_Districts        object
dtypes: bool(1), category(1), float64(2), int64(2), object(5)
memory usage: 172.0+ MB


In [11]:
null_columns = model_df.columns[model_df.isnull().any()]
model_df[null_columns].isnull().sum()

Location_Description    707
dtype: int64

In [12]:
model_df["Location_Description"].fillna("Unknown", inplace = True)

In [13]:
# model_df = model_df[:25000]

In [14]:
X = model_df.drop(['Primary_Type'], axis=1)
y = model_df['Primary_Type']

print(X.shape)
print(y.shape)

(2199080, 10)
(2199080,)


In [15]:
#encode string variables for input into the classification model:
encoded_X = pd.get_dummies(X, columns=['Location_Description', 'Zip_Codes', 'Hour_Bins', 'Day_of_Week','Police_Districts'])
print(encoded_X.shape)
# encoded_y = pd.get_dummies(y, columns=['Primary_Type'])

(2199080, 286)


In [16]:
# create scaler
scaler = MinMaxScaler()
normalized_X = scaler.fit_transform(encoded_X)

  return self.partial_fit(X, y)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(normalized_X, y, test_size = 0.25, random_state = 21)

In [18]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")



Training Data Score: 0.49186387034578094
Testing Data Score: 0.49194754169925603


In [19]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
620197,BATTERY,ASSAULT
177652,NARCOTICS,WEAPONS VIOLATION
936012,BATTERY,BATTERY
2341060,CRIMINAL DAMAGE,CRIMINAL DAMAGE
1319042,THEFT,ROBBERY
2220580,THEFT,ROBBERY
1351423,CRIMINAL DAMAGE,THEFT
1364771,BATTERY,BATTERY
1913384,THEFT,THEFT
2654588,BATTERY,CRIMINAL DAMAGE


In [21]:
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

0.4752096331193044
[[    34     22    313    232      1     23     36     15    372      5]
 [    17   3119  18702   4200     15   1260   4309   1312  13337    255]
 [    39   6049  68538  11473     34   2766   8990   4233  27311    574]
 [    31   2396  20694  17784     32   4868   1888   1154  27947    195]
 [     2     22    272    140    257     96    195     31    217     18]
 [     8    715   3210   6009     23   6521   1467    670  12874     89]
 [     6    816   5694    313     13    184  53333    135    866    828]
 [     7   1024   9519   2335     16   1313   1381   3960   8623     73]
 [    48   3876  24754  14981     20   6330   2083   2706 107137    153]
 [     1    312   2643    376      8    145   5018    148    608    573]]
                     precision    recall  f1-score   support

              ARSON       0.18      0.03      0.05      1053
            ASSAULT       0.17      0.07      0.10     46526
            BATTERY       0.44      0.53      0.48    130007
    C

In [22]:
grad_class = GradientBoostingClassifier(learning_rate=0.1,n_estimators = 10, random_state = 42)
grad_class.fit(X_train, y_train)
y_pred_grad = grad_class.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred)) 
print(classification_report(y_test,y_pred_grad)) 

0.4752096331193044
[[    34     22    313    232      1     23     36     15    372      5]
 [    17   3119  18702   4200     15   1260   4309   1312  13337    255]
 [    39   6049  68538  11473     34   2766   8990   4233  27311    574]
 [    31   2396  20694  17784     32   4868   1888   1154  27947    195]
 [     2     22    272    140    257     96    195     31    217     18]
 [     8    715   3210   6009     23   6521   1467    670  12874     89]
 [     6    816   5694    313     13    184  53333    135    866    828]
 [     7   1024   9519   2335     16   1313   1381   3960   8623     73]
 [    48   3876  24754  14981     20   6330   2083   2706 107137    153]
 [     1    312   2643    376      8    145   5018    148    608    573]]


  'precision', 'predicted', average, warn_for)


                     precision    recall  f1-score   support

              ARSON       0.00      0.00      0.00      1053
            ASSAULT       0.00      0.00      0.00     46526
            BATTERY       0.46      0.53      0.49    130007
    CRIMINAL DAMAGE       0.58      0.04      0.08     76989
           HOMICIDE       1.00      0.22      0.37      1250
MOTOR VEHICLE THEFT       0.00      0.00      0.00     31586
          NARCOTICS       0.57      0.93      0.71     62188
            ROBBERY       0.40      0.01      0.02     28251
              THEFT       0.44      0.80      0.57    162088
  WEAPONS VIOLATION       0.00      0.00      0.00      9832

          micro avg       0.47      0.47      0.47    549770
          macro avg       0.35      0.25      0.22    549770
       weighted avg       0.41      0.47      0.38    549770



In [None]:
from tpot import TPOTClassifier
pipeline_optimizer = TPOTClassifier()

pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2, warm_start=True)

pipeline_optimizer.fit(X_train, y_train)

print(pipeline_optimizer.score(X_test, y_test))

In [None]:
pipeline_optimizer.export('tpot_exported_pipeline.py')