In [268]:
# Import Crime data set
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from __future__ import print_function

# Load the raw police-incident records from the CSV export
df = pd.read_csv('data/Police_Incidents.csv')

# Strip punctuation out of the header names so the columns can be accessed as
# plain attributes (e.g. df.Year1ofOccurrence). One pass per unwanted character:
# trim surrounding whitespace, then delete every occurrence of that character.
# Reference: http://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
for unwanted_char in (' ', '/', '(', ')', '-'):
    df.columns = [header.strip().replace(unwanted_char, '') for header in df.columns]

#df.head()

In [269]:
# Subset the dataset to incidents whose first occurrence year is 2016
# Ref: http://stackoverflow.com/questions/18172851/deleting-dataframe-row-in-pandas-based-on-column-value
#
# .copy() makes `df` an independent frame instead of a slice of the full data,
# so the column assignments made on `df` in later cells operate on its own
# data and do not trigger pandas' SettingWithCopyWarning.
df = df[df.Year1ofOccurrence == 2016].copy()

In [270]:
# Narrowing down our focus to the attributes mentioned in the Dataset
# description by removing columns not used in analysis
unused_columns = [
    'OffenseServiceNumber', 'ServiceNumberID', 'ReportingArea', 'Beat', 'Sector', 'CouncilDistrict',
    'DPDSwornMarshallsinvolved', 'ComplainantHomeAddress', 'ComplainantApartment', 'ComplainantZipCode',
    'ComplainantCity', 'ComplainantState', 'ComplainantBusinessName', 'ComplainantBusinessAddress',
    'YearAssignment', 'Weather', 'RespondingOfficer#1BadgeNo', 'RespondingOfficer#1Name',
    'RespondingOfficer#2BadgeNo', 'RespondingOfficer#2Name', 'ReportingOfficerBadgeNo',
    'AssistingOfficerBadgeNo', 'ReviewingOfficerBadgeNo', 'ElementNumberAssigned',
    'InvestigatingUnit1', 'InvestigatingUnit2', 'SpecialReportPreRMS', 'UCRDisposition',
    'UCR1', 'UCR2PreRMS', 'FinalUCR', 'RMSCode', 'CJISCode', 'PenalCode', 'HateCrime',
    'VictimPackage', 'UpdateDate', 'ModusOperandiMO', 'OffenseCodeCC', 'VictimInjuryDescription',
    'OffenseStatus', 'ComplainantAgeatOffense', 'ComplainantAge', 'ComplainantGender',
    'ComplainantRace', 'ComplainantName', 'OffenseEnteredDateTime', 'OffenseEnteredTime',
    'OffenseEnteredDayoftheWeek', 'OffenseEnteredMonth', 'OffenseEnteredYear', 'Dateincidentcreated',
    'MapDate', 'Day2oftheYear', 'IncidentNumberwoYear', 'YearofIncident', 'IncidentNumberwYear',
    'Call911Problem', 'StreetBlock', 'StreetDirection', 'StreetName', 'IncidentAddress',
    'ApartmentNumber', 'ZipCode', 'City', 'State', 'XCoordinate', 'YCoordinate',
    'TargetAreaActionGrids', 'Community', 'Date1ofOccurrence', 'Date2ofOccurrence',
    'Year2ofOccurrence', 'Month2ofOccurence', 'Day2oftheWeek', 'Time2ofOccurrence',
    'StartingDateTime', 'EndingDateTime', 'DateofReport', 'CallDateTime', 'TypeofProperty',
    'Year1ofOccurrence', 'Day1oftheYear', 'TypeofIncident', 'UCROffenseName', 'OffenseType',
    'FamilyOffense', 'PersonInvolvementType', 'TypeofLocation', 'VictimCondition',
    'UCROffenseDescription',
]

# errors='ignore' skips any listed name that is not present in the frame,
# matching the previous "delete only if the column exists" behaviour.
df = df.drop(unused_columns, axis=1, errors='ignore')

# Let's get the specs of our subsetted data.
# info() writes its report itself and returns None, so it is called directly
# rather than wrapped in print(), which would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99341 entries, 0 to 196036
Data columns (total 12 columns):
Watch                    99341 non-null object
PenaltyClass             84593 non-null object
Division                 99321 non-null object
Month1ofOccurence        99341 non-null object
Day1oftheWeek            99341 non-null object
Time1ofOccurrence        99341 non-null object
CallReceivedDateTime     99328 non-null object
CallClearedDateTime      99310 non-null object
CallDispatchDateTime     99319 non-null object
UCROffenseDescription    99341 non-null object
GangRelatedOffense       99339 non-null object
DrugRelatedIncident      99339 non-null object
dtypes: object(12)
memory usage: 9.9+ MB
None


In [271]:
# Parse the three call-timestamp columns into pandas datetime values.
# Note: slow to run; there may be a quicker way to do this conversion.
# Reference: http://stackoverflow.com/questions/16852911/how-do-i-convert-dates-in-a-pandas-data-frame-to-a-date-data-type
for timestamp_col in ('CallReceivedDateTime', 'CallClearedDateTime', 'CallDispatchDateTime'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

In [272]:
# Compute the whole minutes elapsed between call received and call dispatched,
# and between call dispatched and call cleared.
# Reference: http://www.itgo.me/a/x791470639852744898/calculate-pandas-dataframe-time-difference-between-two-columns-in-hours-and-minu
# total_seconds() // 60 gives float minutes (NaN where a timestamp is missing).
# It replaces astype('timedelta64[m]'), which pandas 2.x no longer accepts.
df['rec_disp_time_m'] = (df['CallDispatchDateTime'] - df['CallReceivedDateTime']).dt.total_seconds() // 60
df['disp_clear_time_m'] = (df['CallClearedDateTime'] - df['CallDispatchDateTime']).dt.total_seconds() // 60

# Keep only rows where both durations are non-negative. NaN compares False,
# so rows missing any of the three timestamps are removed here as well.
# .copy() makes the result an independent frame for the assignments below.
valid_durations = (df['disp_clear_time_m'] >= 0) & (df['rec_disp_time_m'] >= 0)
df = df[valid_durations].copy()

# Create Hour of Occurrence from the first two characters of Time of Occurrence
# Ref: http://stackoverflow.com/questions/25789445/pandas-make-new-column-from-string-slice-of-another-column
df['Hour'] = df['Time1ofOccurrence'].str[:2].astype('int64')

# Retrieve first character of penalty class as only F for felony or M misdemeanor
df['Penalty'] = df['PenaltyClass'].str[:1]

# Change day of week to a 0/1 weekend indicator.
# NOTE(review): Friday is counted as part of the weekend here - confirm this is intended.
# The builtin `int` is used for the casts below because the `np.int` alias
# was removed from NumPy (1.24+); the resulting dtype is the same.
df['IsWeekend'] = df['Day1oftheWeek'].isin(['Fri', 'Sat', 'Sun']).astype(int)

# change Gang Related attribute to a 0/1 indicator ('Yes' -> 1, anything else -> 0)
df['IsGangRelated'] = (df['GangRelatedOffense'] == 'Yes').astype(int)

# change Drug Related attribute to a 0/1 indicator ('Yes' -> 1, anything else -> 0)
df['IsDrugRelated'] = (df['DrugRelatedIncident'] == 'Yes').astype(int)

# Remove the source columns that the engineered features above replace;
# errors='ignore' skips any that are already absent.
df = df.drop(['CallReceivedDateTime', 'CallClearedDateTime', 'CallDispatchDateTime', 'Time1ofOccurrence',
              'PenaltyClass', 'Day1oftheWeek', 'GangRelatedOffense', 'DrugRelatedIncident'],
             axis=1, errors='ignore')
        
#df.head()

In [273]:
# Save the reduced, feature-engineered frame (row index included) as a
# comma-separated UTF-8 file in the working directory.
df.to_csv(
    'Police_Incidents_mini.csv',
    sep=',',
    encoding='utf-8',
)

In [281]:
# Impute missing values, grouping by month of occurrence and watch
group_keys = ['Month1ofOccurence', 'Watch']
df_grouped = df.groupby(by=group_keys)

# Restrict the median imputation to numeric columns. The median is undefined
# for the string columns, and relying on transform() to silently discard them
# raises a TypeError on current pandas releases.
numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                if col not in group_keys]

# Fill each numeric gap with the median of its (month, watch) group.
# transform() returns a frame aligned to the original row index.
df_imputed = df_grouped[numeric_cols].transform(lambda grp: grp.fillna(grp.median()))

# Impute missing categorical values with the most frequent category overall.
# The fill value is a single global constant, so no grouping is involved.
# Ref: http://stackoverflow.com/questions/32617811/imputation-of-missing-values-for-categories-in-pandas
for categorical_col in ['Penalty', 'Division']:
    most_frequent = df[categorical_col].value_counts().index[0]
    df_imputed[categorical_col] = df[categorical_col].fillna(most_frequent)

# Filling the grouped variables from original data frame
# (transform() does not return the grouping columns)
df_imputed[group_keys] = df[group_keys]

In [282]:
# Report the imputed frame's columns, dtypes and non-null counts to confirm
# that no missing values remain after imputation
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99296 entries, 0 to 196036
Data columns (total 10 columns):
rec_disp_time_m      99296 non-null float64
disp_clear_time_m    99296 non-null float64
Hour                 99296 non-null int64
IsWeekend            99296 non-null int64
IsGangRelated        99296 non-null int64
IsDrugRelated        99296 non-null int64
Penalty              99296 non-null object
Division             99296 non-null object
Month1ofOccurence    99296 non-null object
Watch                99296 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 8.3+ MB


In [284]:
# TODO: add the response variable for logistic regression, based on the time
# from call received to call dispatched (df_imputed.rec_disp_time_m).
# Not implemented yet.


# Generating dummy variables from categorical variables.
# For each categorical column, build indicator columns named
# "<column>_<category>" and append them to the right of the frame.
categorical_cols = ['Penalty', 'Division', 'Watch', 'Month1ofOccurence']
for categorical_col in categorical_cols:
    tmp_df = pd.get_dummies(df_imputed[categorical_col], prefix=categorical_col)
    df_imputed = pd.concat((df_imputed, tmp_df), axis=1)  # add back into the dataframe

# Remove the original categorical columns now that they are encoded
for col in reversed(categorical_cols):
    if col in df_imputed:
        del df_imputed[col]

df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99296 entries, 0 to 196036
Data columns (total 58 columns):
rec_disp_time_m                99296 non-null float64
disp_clear_time_m              99296 non-null float64
Hour                           99296 non-null int64
IsWeekend                      99296 non-null int64
IsGangRelated                  99296 non-null int64
IsDrugRelated                  99296 non-null int64
Penalty_F                      99296 non-null uint8
Penalty_M                      99296 non-null uint8
Penalty_N                      99296 non-null uint8
Division_Central               99296 non-null uint8
Division_North Central         99296 non-null uint8
Division_NorthEast             99296 non-null uint8
Division_NorthWest             99296 non-null uint8
Division_South Central         99296 non-null uint8
Division_SouthEast             99296 non-null uint8
Division_SouthWest             99296 non-null uint8
Watch_1                        99296 non-null uint8
Wa

In [285]:
# Optional export of the encoded frame to CSV (currently disabled):
# df_imputed.to_csv('Police_Incidents_imputed.csv', sep=',', encoding='utf-8')
# Display the first five rows of the encoded frame as a sanity check
df_imputed.head()

Unnamed: 0,rec_disp_time_m,disp_clear_time_m,Hour,IsWeekend,IsGangRelated,IsDrugRelated,Penalty_F,Penalty_M,Penalty_N,Division_Central,...,Month1ofOccurence_December,Month1ofOccurence_February,Month1ofOccurence_January,Month1ofOccurence_July,Month1ofOccurence_June,Month1ofOccurence_March,Month1ofOccurence_May,Month1ofOccurence_November,Month1ofOccurence_October,Month1ofOccurence_September
0,0.0,207.0,23,1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
1,0.0,207.0,23,1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
2,0.0,207.0,23,1,0,0,0,1,0,1,...,1,0,0,0,0,0,0,0,0,0
3,0.0,439.0,12,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
5,3.0,122.0,23,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
#####  haven't started yet

from sklearn.model_selection import ShuffleSplit

# Name of the class-label column to predict.
# TODO: 'Survived' is a placeholder; the encoded frame built above has no such
# column. Replace it with the real response variable once that has been added.
target_col = 'Survived'

# Fail with an explicit message when the label column is absent, instead of
# falling through to a NameError on the undefined `y` further down.
if target_col not in df_imputed:
    raise KeyError(
        "Target column %r not found in df_imputed; create the response "
        "variable (or update target_col) before building X and y. It may "
        "also have been removed by an earlier run of this cell." % target_col
    )

# we want to predict the X and y data as follows:
y = df_imputed[target_col].values  # get the labels we want
del df_imputed[target_col]  # get rid of the class label
X = df_imputed.values  # use everything else to predict!

## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
#    have converted them into simple matrices to use with scikit learn


# to use the cross validation object in scikit learn, we need to grab an instance
#    of the object and set it up. This object will be able to split our data into
#    training and testing splits
num_cv_iterations = 3
num_instances = len(y)
# A fixed random_state makes the shuffled splits reproducible across runs
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size=0.2,
                         random_state=0)

# Summarise the matrices by shape rather than printing their contents
print(X.shape)
print(y.shape)
print(num_instances)