In [77]:
# Import Dependencies
import pandas as pd
import numpy as np
import os
from sklearn.metrics import confusion_matrix

In [78]:
# Make a reference to the crimes.csv file path
csv_path = "Resources/Crimes_2018.csv"

# Import the crimes.csv file as a DataFrame
crimes_df = pd.read_csv(csv_path, encoding="utf-8")
crimes_df.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats
0,11871253,JC485146,03/18/2018 09:00:00 AM,020XX W ADDISON ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,MEDICAL/DENTAL OFFICE,False,False,...,,,,,,,,,,
1,11870881,JC484692,10/21/2018 12:00:00 AM,043XX W CORTEZ ST,1562,SEX OFFENSE,AGG CRIMINAL SEXUAL ABUSE,RESIDENCE,False,False,...,,,,,,,,,,
2,11871304,JC485127,07/10/2018 06:50:00 AM,082XX S WOODLAWN AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,,,,,,,,,,
3,11871288,JC484913,10/29/2018 08:00:00 AM,108XX S WABASH AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,OTHER,False,False,...,,,,,,,,,,
4,11871319,JC484113,10/18/2018 10:05:00 PM,079XX S ESSEX AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,False,False,...,,,,,,,,,,


In [79]:
# Drop null values
crimes_df=crimes_df.dropna()

In [80]:
# Check how many arrests there are
crimes_df.Arrest.value_counts()

False    209155
True      52462
Name: Arrest, dtype: int64

In [81]:
# Remove unnecessary columns
crimes_2018 = crimes_df.drop(["Date", "ID", "Case Number", "Block", "IUCR", "Description", "X Coordinate", "Y Coordinate", "Year", "Updated On",\
                             "Latitude", "Longitude", "Location","Boundaries - ZIP Codes", "Zip Codes","Historical Wards 2003-2015"\
                            ,"Census Tracts","Police Beats", "Police Districts","Community Areas", "Wards", "Location Description", "FBI Code"], axis=1)
crimes_2018.head()

Unnamed: 0,Primary Type,Arrest,Domestic,Beat,District,Ward,Community Area
6,MOTOR VEHICLE THEFT,False,False,823,8,17.0,66.0
7,SEX OFFENSE,True,False,2522,25,36.0,19.0
8,INTIMIDATION,False,False,1713,17,33.0,14.0
9,ROBBERY,False,False,412,4,8.0,48.0
10,SEX OFFENSE,True,True,2515,25,36.0,19.0


In [82]:
# Get crime counts
crimes_2018["Primary Type"].value_counts()

THEFT                                63678
BATTERY                              49498
CRIMINAL DAMAGE                      27602
ASSAULT                              20241
DECEPTIVE PRACTICE                   17132
OTHER OFFENSE                        16874
NARCOTICS                            12778
BURGLARY                             11638
MOTOR VEHICLE THEFT                   9902
ROBBERY                               9624
CRIMINAL TRESPASS                     6842
WEAPONS VIOLATION                     5434
OFFENSE INVOLVING CHILDREN            2137
CRIM SEXUAL ASSAULT                   1553
PUBLIC PEACE VIOLATION                1359
INTERFERENCE WITH PUBLIC OFFICER      1305
SEX OFFENSE                           1048
PROSTITUTION                           716
HOMICIDE                               589
ARSON                                  371
LIQUOR LAW VIOLATION                   263
GAMBLING                               201
STALKING                               199
KIDNAPPING 

In [83]:
# Use Pandas to get_dummies to convert categorical data
crimes_2018_update = pd.get_dummies(crimes_2018)
crimes_2018_update.head()

Unnamed: 0,Arrest,Domestic,Beat,District,Ward,Community Area,Primary Type_ARSON,Primary Type_ASSAULT,Primary Type_BATTERY,Primary Type_BURGLARY,...,Primary Type_OTHER NARCOTIC VIOLATION,Primary Type_OTHER OFFENSE,Primary Type_PROSTITUTION,Primary Type_PUBLIC INDECENCY,Primary Type_PUBLIC PEACE VIOLATION,Primary Type_ROBBERY,Primary Type_SEX OFFENSE,Primary Type_STALKING,Primary Type_THEFT,Primary Type_WEAPONS VIOLATION
6,False,False,823,8,17.0,66.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,True,False,2522,25,36.0,19.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,False,False,1713,17,33.0,14.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,False,False,412,4,8.0,48.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10,True,True,2515,25,36.0,19.0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [84]:
# List columns
list(crimes_2018_update.columns)

['Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'Primary Type_ARSON',
 'Primary Type_ASSAULT',
 'Primary Type_BATTERY',
 'Primary Type_BURGLARY',
 'Primary Type_CONCEALED CARRY LICENSE VIOLATION',
 'Primary Type_CRIM SEXUAL ASSAULT',
 'Primary Type_CRIMINAL DAMAGE',
 'Primary Type_CRIMINAL TRESPASS',
 'Primary Type_DECEPTIVE PRACTICE',
 'Primary Type_GAMBLING',
 'Primary Type_HOMICIDE',
 'Primary Type_HUMAN TRAFFICKING',
 'Primary Type_INTERFERENCE WITH PUBLIC OFFICER',
 'Primary Type_INTIMIDATION',
 'Primary Type_KIDNAPPING',
 'Primary Type_LIQUOR LAW VIOLATION',
 'Primary Type_MOTOR VEHICLE THEFT',
 'Primary Type_NARCOTICS',
 'Primary Type_NON-CRIMINAL',
 'Primary Type_NON-CRIMINAL (SUBJECT SPECIFIED)',
 'Primary Type_OBSCENITY',
 'Primary Type_OFFENSE INVOLVING CHILDREN',
 'Primary Type_OTHER NARCOTIC VIOLATION',
 'Primary Type_OTHER OFFENSE',
 'Primary Type_PROSTITUTION',
 'Primary Type_PUBLIC INDECENCY',
 'Primary Type_PUBLIC PEACE VIOLATION',
 'Primary 

In [85]:
# Remove unnecessary column (not in new data set and only 1)
crimes_2018_updated = crimes_2018_update.drop(["Primary Type_NON-CRIMINAL (SUBJECT SPECIFIED)"], axis=1)
list(crimes_2018_updated.columns)

['Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'Primary Type_ARSON',
 'Primary Type_ASSAULT',
 'Primary Type_BATTERY',
 'Primary Type_BURGLARY',
 'Primary Type_CONCEALED CARRY LICENSE VIOLATION',
 'Primary Type_CRIM SEXUAL ASSAULT',
 'Primary Type_CRIMINAL DAMAGE',
 'Primary Type_CRIMINAL TRESPASS',
 'Primary Type_DECEPTIVE PRACTICE',
 'Primary Type_GAMBLING',
 'Primary Type_HOMICIDE',
 'Primary Type_HUMAN TRAFFICKING',
 'Primary Type_INTERFERENCE WITH PUBLIC OFFICER',
 'Primary Type_INTIMIDATION',
 'Primary Type_KIDNAPPING',
 'Primary Type_LIQUOR LAW VIOLATION',
 'Primary Type_MOTOR VEHICLE THEFT',
 'Primary Type_NARCOTICS',
 'Primary Type_NON-CRIMINAL',
 'Primary Type_OBSCENITY',
 'Primary Type_OFFENSE INVOLVING CHILDREN',
 'Primary Type_OTHER NARCOTIC VIOLATION',
 'Primary Type_OTHER OFFENSE',
 'Primary Type_PROSTITUTION',
 'Primary Type_PUBLIC INDECENCY',
 'Primary Type_PUBLIC PEACE VIOLATION',
 'Primary Type_ROBBERY',
 'Primary Type_SEX OFFENSE',
 'Prim

In [86]:
crimes_2018_updated.Arrest.value_counts()

False    209155
True      52462
Name: Arrest, dtype: int64

In [87]:
# Push the remade DataFrame to a new CSV file
crimes_2018_updated.to_csv("Output/data_2018_updated.csv",
                  encoding="utf-8", index=False, header=True)

### We have the data saved into a file.

In [88]:
# Assign X (data) and y (target)
X = crimes_2018_updated.drop("Arrest", axis=1)
y = crimes_2018_updated["Arrest"]
print(X.shape, y.shape)

(261617, 36) (261617,)


In [89]:
# Split our data into training and testing

from sklearn.model_selection import train_test_split
#strafity helps to make the data more random

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [90]:
# Check to ensure we have the the desired 75% train, 25% test split of the data

print("{0:0.2f}% in training set".format((len(X_train)/len(crimes_2018_updated.index)) * 100))
print("{0:0.2f}% in test set".format((len(X_test)/len(crimes_2018_updated.index)) * 100))

75.00% in training set
25.00% in test set


In [91]:
# Create a logisitc regression model 

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [92]:
# Fit (train) model with training data 
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [93]:
# Validate the model using the test data

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8698703443214483
Testing Data Score: 0.8704686186071401


In [94]:
# Make predictions
predictions = classifier.predict(X_test)
print(f"First 50 Predictions:   {predictions[:50]}")
print(f"First 50 Actual labels: {y_test[:50].tolist()}")

First 50 Predictions:   [False False False False False False  True False False False False False
 False False False  True  True False False False False False False  True
 False False False False False False False False False  True False False
 False False False False  True False False False False False False False
 False False]
First 50 Actual labels: [True, False, False, False, False, False, True, False, False, False, False, False, True, False, False, True, True, False, False, False, False, False, False, True, False, False, False, False, False, False, True, False, False, True, False, False, False, True, False, False, True, False, False, False, False, True, False, False, False, False]


In [95]:
# Confusion model
from sklearn import metrics

predictions = classifier.predict(X_test)

print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, predictions)))
print(metrics.confusion_matrix(y_test, predictions) )
print("")
print("Classification Report")
print(metrics.classification_report(y_test, predictions))


Accuracy: 0.8705
[[50984  1305]
 [ 7167  5949]]

Classification Report
              precision    recall  f1-score   support

       False       0.88      0.98      0.92     52289
        True       0.82      0.45      0.58     13116

   micro avg       0.87      0.87      0.87     65405
   macro avg       0.85      0.71      0.75     65405
weighted avg       0.87      0.87      0.86     65405



In [96]:
# predictions versus actual results
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,False,True
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,True,True
7,False,False
8,False,False
9,False,False


In [97]:
# Confusion matrix
confusion_matrix(y_test, predictions)

array([[50984,  1305],
       [ 7167,  5949]], dtype=int64)

In [98]:
tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
print(f"tn: {tn:3d}   fp: {fp:3d}")
print(f"fn: {fn:3d}   tp: {tp:3d}")

tn: 50984   fp: 1305
fn: 7167   tp: 5949


In [99]:
### Save the trained model
from sklearn.externals import joblib

joblib.dump(classifier, "./chicago-crime-trained-model-Frances.pkl")

['./chicago-crime-trained-model-Frances.pkl']