In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
from geopy.distance import vincenty
from datetime import datetime, timedelta
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

df = pd.read_csv('Crimes_-_2001_to_present.csv')

In [43]:
def dummy(domestic):   
    try: 
        if domestic == False:
            return 0
        else: 
            return 1
    except:
        pass

In [44]:
df['domestic_dummied'] = df['Domestic'].apply(dummy)
df['Arrest_Dummied'] = df['Arrest'].apply(dummy)

In [45]:
df['domestic_dummied'].value_counts()

0    20996
1     3164
Name: domestic_dummied, dtype: int64

In [46]:
df['Arrest_Dummied'].value_counts()

0    20199
1     3961
Name: Arrest_Dummied, dtype: int64

In [47]:
df.drop('FBI Code', axis=1, inplace = True)
df.drop('Updated On', axis=1, inplace = True)
df.drop('Primary Type', axis=1, inplace = True)
df.drop('IUCR', axis=1, inplace= True)
df.drop('X Coordinate', axis=1, inplace= True)
df.drop('Y Coordinate', axis=1, inplace= True)
df.drop('Location', axis=1, inplace= True)
df.drop('Block', axis=1, inplace= True)

In [48]:
df.dropna(subset=['Latitude', 'Longitude'], how='all', inplace=True)

In [49]:
df['Location Description'].fillna('RESIDENCE', inplace=True)

In [50]:
df['District'].dropna(how='all', inplace=True)

In [51]:
df.shape

(22750, 16)

In [52]:
df['District'].replace(' ', np.nan, inplace=True)

In [53]:
df.dropna(subset=['District'], inplace=True)

In [54]:
police_stations = pd.read_csv('Police_Stations.csv')
police_stations.head()

Unnamed: 0,DISTRICT,DISTRICT NAME,ADDRESS,CITY,STATE,ZIP,WEBSITE,PHONE,FAX,TTY,X COORDINATE,Y COORDINATE,LATITUDE,LONGITUDE,LOCATION
0,1,Central,1718 S State St,Chicago,IL,60616,http://home.chicagopolice.org/community/distri...,312-745-4290,312-745-3694,312-745-3693,1176569.052,1891771.704,41.858373,-87.627356,"(41.8583725929, -87.627356171)"
1,2,Wentworth,5101 S Wentworth Ave,Chicago,IL,60609,http://home.chicagopolice.org/community/distri...,312-747-8366,312-747-5396,312-747-6656,1175864.837,1871153.753,41.801811,-87.63056,"(41.8018110912, -87.6305601801)"
2,3,Grand Crossing,7040 S Cottage Grove Ave,Chicago,IL,60637,http://home.chicagopolice.org/community/distri...,312-747-8201,312-747-5479,312-747-9168,1182739.183,1858317.732,41.766431,-87.605748,"(41.7664308925, -87.6057478606)"
3,4,South Chicago,2255 E 103rd St,Chicago,IL,60617,http://home.chicagopolice.org/community/distri...,312-747-7581,312-747-5276,312-747-9169,1193131.299,1837090.265,41.707933,-87.568349,"(41.7079332906, -87.5683491228)"
4,5,Calumet,727 E 111th St,Chicago,IL,60628,http://home.chicagopolice.org/community/distri...,312-747-8210,312-747-5935,312-747-9170,1183305.427,1831462.313,41.692723,-87.604506,"(41.6927233639, -87.6045058667)"


In [55]:
def distance_from_police_station(lon, lat):
    min_dist = None
    for coord in zip(police_stations['LONGITUDE'].values, police_stations['LATITUDE'].values):
        if min_dist is None:
            min_dist = vincenty((lon, lat), coord, ellipsoid='WGS-84').miles
        else:
            dist =  vincenty((lon, lat), coord, ellipsoid='WGS-84').miles
            if dist < min_dist:
                min_dist = dist
    return min_dist

In [56]:
df['nearest_police_station'] = df.apply(lambda x: distance_from_police_station(x['Longitude'], x['Latitude']), axis=1)

In [57]:
chicago_campuses = pd.read_csv('ChicagoCampusCentralAddress.csv')

In [58]:
chicago_campuses

Unnamed: 0,University/College,Campus,Approx. Student Population,Residence Halls/Student Housing,Max Walking Distance From Central Address,Most Central Address,Latitude,Longitude
0,,,,,,,,
1,Depaul University,Lincoln Park (Main),"23, 539",Y,0.2,"2325 N Seminary, Chicago IL 60614",41.924739,-87.655636
2,Loyola University,Roger's Park (Main),"16, 437",Y,0.4,"1032 W Sheridan Rd, Chicago, IL 60660",41.998508,-87.657758
3,Chicago State University,Main,"4, 767",Y,0.2,"9501 S King Dr, Chicago, IL 60628",41.719584,-87.610879
4,Illinois Institute of Technology,Main,7792,Y,0.2,"3241 S. Wabash, Chicago IL 60616",41.835591,-87.624957
5,Northeastern University,Main,9891,Y,0.0,"3659 W Bryn Mawr Ave, Chicago, IL 60625",41.979751,-87.719812
6,Columbia College,Main,"8, 961",Y,0.3,"731 S Plymouth Ct, Chicago, IL 60605",41.8727,-87.628353
7,University of Chicago,Hyde Park (Main),"15, 391",Y,0.7,"1101 E 58th St # 105, Chicago, IL 60637",41.788972,-87.599263
8,University of Illinois at Chicago,East,"29, 048",Y,0.4,"421 S Halsted , Chicago IL 60607",41.868686,-87.647561
9,University of Illinois at Chicago,West,29049,Y,0.1,"809. S Damen Chicago, IL 60612",41.871063,-87.675879


In [59]:
chicago_campuses.dropna(how='all', inplace=True)

In [60]:
def distance_from_campus(lon, lat):
    min_dist = None
    for coord in zip(chicago_campuses['Longitude'].values, chicago_campuses['Latitude'].values):
        if min_dist is None:
            min_dist = vincenty((lon, lat), coord, ellipsoid='WGS-84').miles
        else:
            dist =  vincenty((lon, lat), coord, ellipsoid='WGS-84').miles
            if dist < min_dist:
                min_dist = dist
    return min_dist

In [61]:
df['nearest_campus'] = df.apply(lambda x: distance_from_campus(x['Longitude'], x['Latitude']), axis=1)

In [62]:
df.loc[df['nearest_campus'].idxmax()]

ID                                       6169464
Case Number                             HP247247
Date                      03/26/2008 04:00:00 AM
Description                  AGGRAVATED: HANDGUN
Location Description                      STREET
Arrest                                     False
Domestic                                   False
Beat                                        1434
District                                      14
Ward                                          32
Community Area                                24
Year                                        2008
Latitude                                 36.6194
Longitude                               -91.6866
domestic_dummied                               0
Arrest_Dummied                                 0
nearest_police_station                   12144.3
nearest_campus                           12146.5
Name: 4049, dtype: object

In [63]:
df.drop([4049], inplace=True)

In [64]:
df.loc[df['nearest_campus'].idxmax()]

ID                                                          10650589
Case Number                                                 HZ400550
Date                                          08/20/2016 05:00:00 PM
Description                                           NON-AGGRAVATED
Location Description      AIRPORT TERMINAL UPPER LEVEL - SECURE AREA
Arrest                                                         False
Domestic                                                       False
Beat                                                            1651
District                                                          16
Ward                                                              41
Community Area                                                    76
Year                                                            2016
Latitude                                                      41.979
Longitude                                                   -87.9065
domestic_dummied                  

In [65]:
df.loc[df['nearest_campus'].idxmin()]

ID                                          6197005
Case Number                                HP286082
Date                         04/17/2008 03:00:00 PM
Description                          NON-AGGRAVATED
Location Description      HOSPITAL BUILDING/GROUNDS
Arrest                                        False
Domestic                                      False
Beat                                           1224
District                                         12
Ward                                              2
Community Area                                   28
Year                                           2008
Latitude                                    41.8732
Longitude                                  -87.6759
domestic_dummied                                  0
Arrest_Dummied                                    0
nearest_police_station                     0.475988
nearest_campus                           0.00611198
Name: 4101, dtype: object

In [66]:
df.groupby('Community Area')['Case Number'].nunique()

Community Area
0.0        1
1.0      362
2.0      222
3.0      373
4.0      128
5.0      108
6.0      432
7.0      256
8.0      552
9.0       15
10.0      66
11.0      39
12.0      18
13.0      61
14.0     192
15.0     191
16.0     239
17.0      68
18.0      26
19.0     380
20.0     141
21.0     191
22.0     417
23.0     593
24.0     451
25.0    1425
26.0     338
27.0     343
28.0     489
29.0     598
        ... 
48.0     136
49.0     674
50.0      91
51.0     131
52.0      91
53.0     441
54.0     122
55.0      35
56.0     135
57.0      74
58.0     209
59.0      59
60.0     110
61.0     460
62.0      59
63.0     202
64.0      52
65.0     117
66.0     560
67.0     772
68.0     743
69.0     644
70.0     187
71.0     683
72.0      46
73.0     286
74.0      40
75.0     146
76.0      39
77.0     204
Name: Case Number, dtype: int64

In [67]:
def assign(community):
    if community == 0:
        return int(56)
    else:
        return community


df['Community Area'] = df['Community Area'].apply(assign)

In [68]:
df['Community Area'].isnull().sum()

2043

In [69]:
df.dropna(subset=['Community Area'], how ='all', inplace =True)

In [70]:
df.dropna(subset=['Ward'], how='all', inplace=True)

In [71]:
df['Community Area'] = df['Community Area'].astype(int)

In [72]:
df['Ward'] = df['Ward'].astype(int)

In [73]:
df['District'] = df['District'].astype(int)

In [74]:
df['date_transformed']= pd.to_datetime(df['Date'])

In [75]:
df['time_full'] = df['date_transformed'].apply(lambda x: datetime.strftime(x, "%H:%M"))

In [76]:
df['time_hour'] = df['date_transformed'].apply(lambda x: datetime.strftime(x, "%H"))

In [77]:
df['Day_of_Month'] = df['date_transformed'].apply(lambda x: datetime.strftime(x, "%d"))

In [78]:
df['Month'] = df['date_transformed'].apply(lambda x: datetime.strftime(x, "%m"))

In [79]:
df['Day_of_Week'] = df['date_transformed'].apply(lambda x: datetime.strftime(x, "%A"))

In [80]:
def categorize(crime): 
    if 'ATTEMPT NON-AGGRAVATED' in crime:
        crime = 'ATTEMPT NON-AGGRAVATED'
    elif 'NON-AGGRAVATED' in crime:
        crime = 'NON-AGGRAVATED'
    elif 'AGGRAVATED' in crime:
         crime = 'AGGRAVATED'
    elif 'ATTEMPT AGG' in crime:
        crime = 'ATTEMPT-AGGRAVATED'
    elif 'PREDATORY' in crime:
        crime = 'PREDATORY'
    else:
         crime = crime
    return crime

In [81]:
df['Description_Transformed'] = df['Description'].apply(categorize)

In [82]:
df.groupby('Description_Transformed')['Arrest'].sum().sort_values()

Description_Transformed
ATTEMPT-AGGRAVATED         128.0
ATTEMPT NON-AGGRAVATED     170.0
PREDATORY                  519.0
AGGRAVATED                1162.0
NON-AGGRAVATED            1263.0
Name: Arrest, dtype: float64

In [83]:
description_dummies = pd.get_dummies(df['Description_Transformed'])

In [85]:
def define(location):
    if 'RESIDENCE' in location:
        location = 'Residential Property'
    elif 'CHA' in location:
        location = 'Public Housing'
    elif 'APARTMENT' in location:
        location = 'Residential Property'
    elif 'RESIDENTIAL' in location:
        location = 'Residential Property'
    elif 'BARBER' in location:
        location = 'Business'
    elif 'BUSINESS' in location:
        location = 'Business'
    elif 'RETAIL' in location:
        location = 'Business'
    elif 'GAS' in location:
        location = 'Business'
    elif 'RESTAURANT' in location:
        location = 'Business'
    elif 'BAR' in location:
        location = 'Business'
    elif 'TAVERN' in location:
        location = 'Business'
    elif 'WAREHOUSE' in location:
        location = 'Business'
    elif 'STORE' in location:
        location = 'Business'
    elif 'OFFICE' in location:
        location = 'Business'
    elif 'THEATER' in location:
        location = 'Business'
    elif 'CLUB' in location:
        location = 'Business'
    elif 'BANK' in location:
        location = 'Business'
    elif 'FACTORY' in location:
        location = 'Business'
    elif 'ANIMAL' in location:
        location = 'Business'
    elif 'CAR WASH' in location:
        location = 'Business'
    elif 'HOTEL' in location:
        location = 'Business'
    elif 'STREET' in location:
        location = 'Public'
    elif 'ALLEY' in location:
        location = 'Public'
    elif 'SIDEWALK' in location:
        location = 'Public'
    elif 'LAKEFRONT' in location:
        location = 'Public'
    elif 'PARK' in location:
        location = 'Public'
    elif 'NON-RESIDENTIAL' in location:
        location = 'Public'
    elif 'BRIDGE' in location:
        location = "Public"
    elif 'SCHOOL' in location:
        location = 'School'
    elif 'COLLEGE' in location:
        location = 'School'
    elif 'HOSPITAL' in location:
        location = 'Care Facility'
    elif 'NURSING' in location:
        location = 'Care Facility'
    elif 'DAY CARE' in location:
        location = 'Care Facility'
    elif 'CTA' in location:
        location = 'Public Transportation'
    elif 'TAXI' in location:
        location = 'Public Transportation'
    elif 'RIDE SERVICE' in location:
        location = 'Public Transportation'
    elif 'RAILROAD' in location:
        location = 'Public Transportation'
    elif 'AIRPORT' in location:
        location = 'Public Transportation'
    elif 'VEHICLE NON' in location:
        location = 'Private Vehicle'
    elif 'BOAT' in location:
        location = 'Private Vehicle'
    elif 'ABANDON' in location:
        location = 'Vacant Property'
    elif 'VACANT' in location:
        location = 'Vacant Property'
    else:
        location = 'Other'
    return location   

In [86]:
df['location_transformed'] = df['Location Description'].apply(define)

In [87]:
df.groupby('location_transformed')['Arrest'].sum().sort_values()

location_transformed
Care Facility              15.0
Public Transportation      28.0
Public Housing             47.0
Vacant Property            47.0
School                     56.0
Business                  106.0
Private Vehicle           110.0
Other                     138.0
Public                    394.0
Residential Property     2301.0
Name: Arrest, dtype: float64

In [88]:
location_dummies = pd.get_dummies(df['location_transformed'])

In [89]:
description_dummies = pd.get_dummies(df['Description_Transformed'])

In [90]:
weekday_dummies = pd.get_dummies(df['Day_of_Week'])

In [91]:
dfwd = pd.concat([df, location_dummies, description_dummies, weekday_dummies], axis=1)

In [92]:
y = dfwd['Arrest_Dummied']
x = dfwd[['Wednesday', 'Tuesday', 'Thursday', 'Sunday', 'Saturday', 'Monday', 'Friday', 'PREDATORY', 'NON-AGGRAVATED', 'ATTEMPT-AGGRAVATED', 'AGGRAVATED', 'ATTEMPT NON-AGGRAVATED', 'Vacant Property', 'Private Vehicle', 'Public Transportation', 'Care Facility', 'School', 'Public', 'Business', 'Public Housing', 'Residential Property', 'Other', 'Year', 'domestic_dummied', 'Month', 'Day_of_Month', 'time_hour', 'nearest_campus', 'nearest_police_station', 'Community Area', 'Ward', 'District', 'Beat']]

rfc = RandomForestClassifier()

In [93]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [94]:
rfc.score(x_train, y_train)

0.97754165660468484

In [95]:
rfc.score(x_test, y_test)

0.83639217580294611

In [96]:
def classifications(y_test, y_predict):
    print 'Mean accuracy score %s ' % accuracy_score(y_test, y_predict), '\n'
    print confusion_matrix(y_test, y_predict), '\n'
    print classification_report(y_test, y_predict)

In [97]:
rfc.fit(x_train, y_train)
classifications(y_test, rfc.predict(x_test))

Mean accuracy score 0.831320936972  

[[6830  146]
 [1251   55]] 

             precision    recall  f1-score   support

          0       0.85      0.98      0.91      6976
          1       0.27      0.04      0.07      1306

avg / total       0.76      0.83      0.78      8282



In [None]:
parameters = [{'n_estimators': [10, 25, 50, 100, 500], 'max_depth': [2, 3, 5, 7, 9],
                     'max_features': [0.25, 0.5, 0.75, 1.0]}]

gridsearch = GridSearchCV(rfc, param_grid=parameters, cv=7, n_jobs=-1)
gridsearch.fit(x_train,y_train)
gridsearch.best_estimator_
gridsearch.best_estimator_.fit(x_train, y_train) 
gridsearch.best_estimator_.score(x_train, y_train)
gridsearch.best_estimator_.score(x_test, y_test)