### Predictive policing
    - Load data. Clean up.
    - Exploratory plots. Select features.
    - Fit predictive model

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

### Load data, clean up, and select the columns we think are important

In [2]:
# Load datasets (paths are relative to the notebook's working directory)
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# All the cleaning steps live inside this function, so that we can apply them to both the train and test datasets without writing them twice.
def clean_dataset(df):
    """
    Clean the 'train' or 'test' dataset with one shared procedure.

    Steps:
        1. If a 'Resolution' column is present, keep only the rows whose
           resolution is 'ARREST, BOOKED' or 'ARREST, CITED'.
        2. Parse 'Dates' and derive the integer columns 'Year', 'Month', 'Hour'.
        3. Map the day names in 'DayOfWeek' to numbers (Monday=1 ... Sunday=7)
           in a new 'weekday_no' column; unknown names become NaN.
        4. Drop the columns that are no longer needed
           ('Descript', 'DayOfWeek', 'Dates', 'Address'), if present.

    The input frame is never modified; all work happens on a copy.

    args:
        df (pandas DataFrame) raw dataset; must contain 'Dates' and 'DayOfWeek'.
    returns:
        df (pandas DataFrame) a clean version of the input df.
    """
    if 'Resolution' in df.columns:
        df = df.loc[df['Resolution'].isin(['ARREST, BOOKED', 'ARREST, CITED'])]

    # Explicit copy: new columns are added below, and they must neither leak into
    # the caller's frame nor be written into a view of it (SettingWithCopyWarning).
    df = df.copy()

    # Parse once, then use the vectorised .dt accessor instead of per-row Python loops.
    dates = pd.to_datetime(df['Dates'])
    df['Year'] = dates.dt.year
    df['Month'] = dates.dt.month
    df['Hour'] = dates.dt.hour

    day_dict = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
                'Friday': 5, 'Saturday': 6, 'Sunday': 7}
    df['weekday_no'] = df['DayOfWeek'].map(day_dict)

    # 'formatted_date' / 'formatted_time' are kept in the drop list so the output
    # columns match the earlier version of this function for any input.
    return df.drop(['Descript', 'DayOfWeek', 'Dates', 'formatted_date', 'formatted_time', 'Address'],
                   axis=1, errors='ignore')

# Apply the shared cleaning to both datasets; the names are rebound to the cleaned frames.
train = clean_dataset(train)
test = clean_dataset(test)

In [3]:
# This is what we get after this step
train.head()

Unnamed: 0,Category,PdDistrict,Resolution,X,Y,Year,Month,Hour,weekday_no
0,WARRANTS,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,2015,5,23,3
1,OTHER OFFENSES,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,2015,5,23,3
2,OTHER OFFENSES,NORTHERN,"ARREST, BOOKED",-122.424363,37.800414,2015,5,23,3
18,OTHER OFFENSES,BAYVIEW,"ARREST, BOOKED",-122.386401,37.738983,2015,5,21,3
31,NON-CRIMINAL,MISSION,"ARREST, BOOKED",-122.43515,37.76176,2015,5,20,3


In [4]:
# Find the six most frequent crime categories (value_counts sorts by frequency, descending)
most_common = train['Category'].value_counts().head(6).index
most_common

Index([u'OTHER OFFENSES', u'DRUG/NARCOTIC', u'WARRANTS', u'ASSAULT',
       u'LARCENY/THEFT', u'PROSTITUTION'],
      dtype='object')

In [5]:
# Count the most common crimes by day of the week and hour of the day.
# groupby does the counting; unstack(0) then spreads the Category level
# into one column per crime, leaving (weekday_no, Hour) as the row index.
is_common = train['Category'].isin(most_common)
grouped = train.loc[is_common].groupby(['Category', 'weekday_no', 'Hour'])
# Non-null 'Y' values per group = number of incidents in that group.
counts_data = grouped['Y'].count().unstack(0)
counts_data.head()

Unnamed: 0_level_0,Category,ASSAULT,DRUG/NARCOTIC,LARCENY/THEFT,OTHER OFFENSES,PROSTITUTION,WARRANTS
weekday_no,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,207.0,213.0,35.0,542.0,38.0,206.0
1,1,144.0,139.0,24.0,364.0,30.0,166.0
1,2,128.0,92.0,24.0,351.0,23.0,127.0
1,3,70.0,64.0,12.0,229.0,25.0,87.0
1,4,46.0,60.0,11.0,173.0,14.0,96.0


In [6]:
# For example, let's build a heat-map style table showing the times of the week when something happens more often.
# Here the unstack method converts a long dataframe (like the one above) into a rectangular table:
# the innermost index level (Hour) moves into the columns, leaving one row per weekday_no.

table = counts_data.unstack()
# Keep only the columns of the single most common category.
table = table[[most_common[0]]]

import seaborn as sns
# Light-to-dark green colormap; darker cells mean higher counts.
cm = sns.light_palette("green", as_cmap=True)
s = table.style.background_gradient(cmap=cm)
s


Category,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES,OTHER OFFENSES
Hour,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
weekday_no,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
1,542,364,351,229,173,111,193,377,464,522,570,590,625,659,703,664,783,740,673,581,468,487,660,693
2,592,354,311,234,189,148,201,390,552,618,646,708,702,690,702,721,888,910,707,677,508,509,661,677
3,547,359,270,230,166,162,176,435,597,668,711,671,745,813,799,868,1000,1018,801,683,566,525,713,688
4,575,399,299,219,163,145,152,399,539,577,598,572,609,666,683,658,812,787,727,657,564,562,672,689
5,635,362,350,242,146,102,163,363,514,587,522,519,612,621,578,571,714,740,680,635,614,709,877,883
6,711,513,401,224,172,111,130,262,316,404,467,521,520,540,549,574,662,709,683,677,615,761,921,911
7,730,596,431,211,180,109,159,244,272,406,374,461,510,477,482,486,640,693,626,554,488,469,690,657


# Fit a model

In [7]:
# For example, this is sklearn's Random Forest, but almost any sklearn model takes (X, y) in this same way,
# so you can reuse X_train, y_train with your own model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Feature matrix: coordinates ('X', 'Y' — presumably longitude/latitude, TODO confirm)
# plus the time features derived in clean_dataset. The target is the crime category.
X_train = train[['X', 'Y', 'Year', 'Month', 'Hour', 'weekday_no']].values
y_train = train['Category'].values

# 100 very shallow trees (max_depth=2); random_state fixes the seed so the fit is reproducible.
clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
clf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [8]:
from sklearn.metrics import log_loss

# Relative importance of each feature, in the same order as the X_train columns:
# ['X', 'Y', 'Year', 'Month', 'Hour', 'weekday_no'].
print(clf.feature_importances_)
# NOTE(review): log_loss expects predicted class probabilities, not hard labels, which is
# why the disabled call below does not work with clf.predict(...) output. Presumably the
# intended call is log_loss(y_train, clf.predict_proba(X_train)) — TODO confirm.
# print(log_loss(y_train, clf.predict(X_train)))

[ 0.25127921  0.5033029   0.09498806  0.          0.13769173  0.0127381 ]


In [9]:
# Predict on the same rows the model was fitted on, so the counts below are
# training-set figures, not an estimate of out-of-sample performance.
train['predicted_category'] = clf.predict(X_train)

# Print number of correct predictions...
print((train['Category'] == train['predicted_category']).sum())
# ...number of mistakes...
print((train['Category'] != train['predicted_category']).sum())
# ...and a baseline: how many labels would match if the true categories were shuffled at random?
# random_state pins the shuffle so the baseline count is reproducible across runs.
train['random'] = train['Category'].sample(frac=1, random_state=0).values
print((train['Category'] == train['random']).sum())

88489
194918
45591
