In [137]:
# Import Dependencies
import pandas as pd
import numpy as np
import os
from sklearn.metrics import confusion_matrix

In [138]:
# Make a reference to the crimes.csv file path
csv_path = "Resources/Crimes_2019.csv"

# Import the crimes.csv file as a DataFrame
crimes_2019 = pd.read_csv(csv_path, encoding="utf-8")
crimes_2019.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats
0,11865111,JC477762,9/21/2019 0:00,036XX W 62ND ST,1750,OFFENSE INVOLVING CHILDREN,CHILD ABUSE,RESIDENTIAL YARD (FRONT/BACK),False,True,...,-87.713894,"(41.7806197, -87.713893847)",23.0,21867.0,63.0,339.0,6.0,58.0,13.0,276.0
1,11865784,JC478477,9/30/2019 12:00,021XX W ARTHUR AVE,5002,OTHER OFFENSE,OTHER VEHICLE OFFENSE,RESIDENCE,False,True,...,-87.683325,"(41.999652591, -87.683324684)",42.0,22528.0,20.0,320.0,27.0,12.0,11.0,46.0
2,11862475,JC474623,10/16/2019 9:59,017XX W 79TH ST,143A,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,STREET,True,False,...,-87.667398,"(41.750304023, -87.667398352)",18.0,21554.0,70.0,574.0,31.0,59.0,20.0,231.0
3,11862816,JC475002,10/16/2019 14:35,071XX S RIDGELAND AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,True,True,...,-87.58241,"(41.764807775, -87.582410292)",32.0,22538.0,39.0,452.0,37.0,24.0,18.0,217.0
4,11868795,JC482189,10/13/2019 21:40,027XX N CLARK ST,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,DEPARTMENT STORE,False,False,...,,,,,,,,,,


In [139]:
# Remove null values
crimes_2019_complete=crimes_2019.dropna()

In [140]:
# Reduce file removing unnecessary columns (including arrest column)
reduced_2019 = crimes_2019_complete.drop(["Date", "ID", "Case Number", "Block", "IUCR", "Description", "X Coordinate", "Y Coordinate", "Year", "Updated On",\
                             "Latitude", "Longitude", "Location","Boundaries - ZIP Codes", "Zip Codes","Historical Wards 2003-2015"\
                            ,"Census Tracts","Police Beats", "Police Districts","Community Areas", "Wards", "FBI Code","Location Description"], axis=1)
reduced_2019.head()

Unnamed: 0,Primary Type,Arrest,Domestic,Beat,District,Ward,Community Area
0,OFFENSE INVOLVING CHILDREN,False,True,823,8,23.0,65.0
1,OTHER OFFENSE,False,True,2412,24,50.0,2.0
2,WEAPONS VIOLATION,True,False,611,6,17.0,71.0
3,BATTERY,True,True,324,3,7.0,43.0
5,ASSAULT,False,False,331,3,5.0,43.0


In [141]:
# Use Pandas to get_dummies to convert categorical data
data_2019 = pd.get_dummies(reduced_2019)
data_2019.head()

Unnamed: 0,Arrest,Domestic,Beat,District,Ward,Community Area,Primary Type_ARSON,Primary Type_ASSAULT,Primary Type_BATTERY,Primary Type_BURGLARY,...,Primary Type_OTHER NARCOTIC VIOLATION,Primary Type_OTHER OFFENSE,Primary Type_PROSTITUTION,Primary Type_PUBLIC INDECENCY,Primary Type_PUBLIC PEACE VIOLATION,Primary Type_ROBBERY,Primary Type_SEX OFFENSE,Primary Type_STALKING,Primary Type_THEFT,Primary Type_WEAPONS VIOLATION
0,False,True,823,8,23.0,65.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,True,2412,24,50.0,2.0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,True,False,611,6,17.0,71.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,True,True,324,3,7.0,43.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,False,False,331,3,5.0,43.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [142]:
# Reduce file removing unnecessary columns (including arrest column)
data_2019_no_arrests = data_2019.drop(["Arrest"], axis=1)
data_2019_no_arrests.head()

Unnamed: 0,Domestic,Beat,District,Ward,Community Area,Primary Type_ARSON,Primary Type_ASSAULT,Primary Type_BATTERY,Primary Type_BURGLARY,Primary Type_CONCEALED CARRY LICENSE VIOLATION,...,Primary Type_OTHER NARCOTIC VIOLATION,Primary Type_OTHER OFFENSE,Primary Type_PROSTITUTION,Primary Type_PUBLIC INDECENCY,Primary Type_PUBLIC PEACE VIOLATION,Primary Type_ROBBERY,Primary Type_SEX OFFENSE,Primary Type_STALKING,Primary Type_THEFT,Primary Type_WEAPONS VIOLATION
0,True,823,8,23.0,65.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,True,2412,24,50.0,2.0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,False,611,6,17.0,71.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,True,324,3,7.0,43.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,False,331,3,5.0,43.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
# Push the remade DataFrame to a new CSV file
data_2019_no_arrests.to_csv("Output/data_2019.csv",
                  encoding="utf-8", index=False, header=True)

### We have the data saved into a file.

In [144]:
# Load the model
from sklearn.externals import joblib
model = joblib.load("./chicago-crime-trained-model-Frances.pkl")

In [145]:
# make predictions
model.predict(data_2019_no_arrests)


array([False, False,  True, ..., False, False, False])

In [146]:
# add arrest prediction to df
data_2019["Arrest_Prediction"]=model.predict(data_2019_no_arrests)
data_2019.head

<bound method NDFrame.head of         Arrest  Domestic  Beat  District  Ward  Community Area  \
0        False      True   823         8  23.0            65.0   
1        False      True  2412        24  50.0             2.0   
2         True     False   611         6  17.0            71.0   
3         True      True   324         3   7.0            43.0   
5        False     False   331         3   5.0            43.0   
6        False     False   922         9  15.0            58.0   
7        False      True   433         4  10.0            55.0   
8         True     False   523         5   9.0            53.0   
9        False     False  2525        25  35.0            22.0   
10       False      True  1522        15  29.0            25.0   
11       False      True   726         7  17.0            67.0   
14       False     False   932         9  16.0            61.0   
15       False     False  2432        24  40.0             1.0   
17       False     False   731         7   6.0

In [147]:
# add column to determine if prediction is accurate
data_2019["Correct_Prediction?"] = np.where(data_2019.Arrest_Prediction == data_2019.Arrest, 'True', 'False')
data_2019.head()

Unnamed: 0,Arrest,Domestic,Beat,District,Ward,Community Area,Primary Type_ARSON,Primary Type_ASSAULT,Primary Type_BATTERY,Primary Type_BURGLARY,...,Primary Type_PROSTITUTION,Primary Type_PUBLIC INDECENCY,Primary Type_PUBLIC PEACE VIOLATION,Primary Type_ROBBERY,Primary Type_SEX OFFENSE,Primary Type_STALKING,Primary Type_THEFT,Primary Type_WEAPONS VIOLATION,Arrest_Prediction,Correct_Prediction?
0,False,True,823,8,23.0,65.0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
1,False,True,2412,24,50.0,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0,False,True
2,True,False,611,6,17.0,71.0,0,0,0,0,...,0,0,0,0,0,0,0,1,True,True
3,True,True,324,3,7.0,43.0,0,0,1,0,...,0,0,0,0,0,0,0,0,False,False
5,False,False,331,3,5.0,43.0,0,1,0,0,...,0,0,0,0,0,0,0,0,False,True


In [148]:
# provide prediction result counts
data_2019["Correct_Prediction?"].value_counts()

True     176662
False     27124
Name: Correct_Prediction?, dtype: int64

In [149]:
data_2019["Arrest"].count()

203786

In [150]:
# calculate model accuracy
print("{0:0.2f}% Accuracy achieved by the model".format(176662/len(data_2019.index) * 100))

86.69% Accuracy achieved by the model
