In [0]:
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; warnings.simplefilter('ignore')
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [0]:
incidents1 = pd.read_csv('incidentData1.csv')
incidents2 = pd.read_csv('incidentData2.csv')

In [28]:
# add column names to incidents2 file
incidents2.columns = incidents1.columns
# Stack the DataFrames on top of each other
incidents = pd.concat([incidents1, incidents2], axis=0)
extracted_df = incidents[['(empty', 'Incident Date', 'Incident Time', 'Incident Year', 'Incident Day of Week','Incident Code',
       'Incident Category']]
extracted_df.head()

Unnamed: 0,(empty,Incident Date,Incident Time,Incident Year,Incident Day of Week,Incident Code,Incident Category
0,(00000,2019/08/15,11:41,2019,Thursday,6374,Larceny Theft
1,(00000,2019/09/17,22:00,2019,Tuesday,6374,Larceny Theft
2,(94121,2019/10/04,14:25,2019,Friday,3474,Robbery
3,(94114,2019/10/03,19:30,2019,Thursday,6244,Larceny Theft
4,(94121,2019/10/04,16:53,2019,Friday,51040,Non-Criminal


In [0]:
extract_month = lambda x : x[5:7]
extract_date = lambda x : x[8:10]
extract_hour = lambda x : x[0:2]
extract_minute = lambda x : x[3:5]

In [0]:
extracted_df['month'] = extracted_df['Incident Date'].apply(extract_month)
extracted_df['date'] = extracted_df['Incident Date'].apply(extract_date)
extracted_df['hour'] = extracted_df['Incident Time'].apply(extract_hour)
extracted_df['minute'] = extracted_df['Incident Time'].apply(extract_minute)

In [0]:
# remove brace from zipcode
remove_bracket = lambda y : y[1:]
extracted_df['zipcode'] = extracted_df['(empty'].apply(remove_bracket)

In [0]:
def weighIncident(incidentType):
    if incidentType=="Non-Criminal" or incidentType=="Other" or incidentType=="Other Miscellaneous" or incidentType=="Case Closure":
        return 1
    elif incidentType=="Miscellaneous Investigation" or incidentType=="Fraud" or incidentType=="Forgery And Counterfeiting" or incidentType=="Warrant" or incidentType=="Traffic Violation Arrest" or incidentType=="Gambling" or incidentType=="Civil Sidewalks" or incidentType=="Courtesy Report":
        return 2
    elif incidentType=="Juvenile Offenses" or incidentType=="Lost Property" or incidentType=="Suspicious Occ" or incidentType=="Suspicious" or incidentType=="Vandalism" or incidentType=="Recovered Vehicle":
        return 3
    elif incidentType=="Vehicle Misplaced" or incidentType=="Vehicle Impounded":
        return 5
    elif incidentType=="Disorderly Conduct" or incidentType=="Traffic Collision" or incidentType=="Fire Report" or incidentType=="Weapons Carrying Etc":
        return 8
    elif incidentType=="Liquor Laws" or incidentType=="Drug Offense" or incidentType=="Drug Violation" or incidentType=="Embezzlement":
        return 10
    elif incidentType=="Motor Vehicle Theft" or incidentType=="Stolen Property" or incidentType=="Robbery" or incidentType=="Motor Vehicle Theft?" or incidentType=="Larceny Theft" or incidentType=="Burglary" or incidentType=="Malicious Mischief":
        return 14
    elif incidentType=="Prostitution" or incidentType=="Other Offenses":
        return 15
    elif incidentType=="Arson" or incidentType=="Offences Against The Family And Children" or incidentType=="Family Offense" or incidentType=="Missing Person" or incidentType=="Weapons Offense" or incidentType=="Weapons Offence":
        return 18
    elif incidentType=="Suicide" or incidentType=="Rape" or incidentType=="Assault" or incidentType=="Sex Offense" or incidentType=="Homicide" or incidentType=="Human Trafficking (A), Commercial Sex Acts" or incidentType=="Human Trafficking, Commercial Sex Acts":
        return 20
    else:
        return 0

In [0]:
extracted_df['weights'] = extracted_df['Incident Category'].apply(weighIncident)

In [0]:
extracted_df = extracted_df.drop(columns=['(empty','Incident Date', 'Incident Time','Incident Category'])

In [0]:
extracted_df.dropna(how='any', inplace=True)
extracted_df.dropna(inplace=True)

In [0]:
extracted_df.drop(extracted_df[extracted_df.zipcode == '00000'].index, inplace=True)
extracted_df.drop(extracted_df[extracted_df.weights == 0].index, inplace=True)

In [37]:
extracted_df

Unnamed: 0,Incident Year,Incident Day of Week,Incident Code,month,date,hour,minute,zipcode,weights
2,2019,Friday,3474,10,04,14,25,94121,14
3,2019,Thursday,6244,10,03,19,30,94114,14
4,2019,Friday,51040,10,04,16,53,94121,1
5,2019,Wednesday,6224,10,02,14,10,94133,14
6,2019,Thursday,12030,10,03,23,30,94133,18
...,...,...,...,...,...,...,...,...,...
136168,2019,Tuesday,10115,06,04,10,00,94121,2
136169,2019,Wednesday,6373,09,18,19,45,94112,14
136170,2019,Thursday,15150,09,19,08,05,94112,18
136171,2019,Saturday,6241,09,07,20,00,94127,14


In [38]:
oneHotEncodedDF = pd.get_dummies(extracted_df, prefix="", prefix_sep="", columns=['Incident Year', 'Incident Day of Week', 'Incident Code', 'month', 'date', 'hour', 'minute', 'zipcode' ])
oneHotEncodedDF

Unnamed: 0,weights,2018,2019,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,1000,1001,1002,1003,1004,1160,2004,2005,2010,2020,2102,2103,2104,2201,2202,2204,3011,3012,3013,3014,3021,3022,3023,3024,3031,3034,3041,3042,3043,3044,...,47,48,49,50,51,52,53,54,55,56,57,58,59,94102,94103,94104,94105,94107,94108,94109,94110,94111,94112,94114,94115,94116,94117,94118,94121,94122,94123,94124,94127,94129,94130,94131,94132,94133,94134,94143
2,14,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,14,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,14,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
6,18,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136168,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
136169,14,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
136170,18,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
136171,14,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [42]:
y = oneHotEncodedDF['weights']
X = oneHotEncodedDF.drop(columns=['weights'])

X_train, X_test, y_train, y_test = train_test_split(X, y )

model = LogisticRegression()
model.fit(X_train, y_train)
# coefficeints of the trained model
print('Coefficient of model :', model.coef_)

# intercept of the model
print('Intercept of model',model.intercept_)

# predict the target on the train dataset
predict_train = model.predict(X_train)
print('Target on train data',predict_train) 

# Accuray Score on train dataset
accuracy_train = accuracy_score(y_train,predict_train)
print('accuracy_score on train dataset : ', accuracy_train)

# predict the target on the test dataset
predict_test = model.predict(X_test)
print('Target on test data',predict_test) 

# Accuracy Score on test dataset
accuracy_test = accuracy_score(y_test,predict_test)
print('accuracy_score on test dataset : ', accuracy_test)   


Coefficient of model : [[-0.85498353 -0.90615476 -0.25510674 ... -0.10019438 -0.01057664
  -0.00313227]
 [-1.09302319 -1.13184019 -0.28660672 ...  0.02789588 -0.37635895
   0.03751653]
 [-1.59305772 -1.5654555  -0.4542831  ... -0.38220374  0.02953489
  -0.17475064]
 ...
 [-1.49348651 -1.51016239 -0.48707465 ... -0.06038807 -0.14249724
  -0.03948508]
 [-1.24236913 -1.23706376 -0.37905228 ... -0.25670571  0.30106484
  -0.06837626]
 [-1.15621434 -1.15879757 -0.36376249 ... -0.34093228  0.20347398
  -0.08191116]]
Intercept of model [-1.76113828 -2.22486337 -3.15851322 -4.46867551 -2.56064139 -2.48651655
 -0.70711857 -3.0036489  -2.47943289 -2.31501192]
Target on train data [14 20 14 ... 14 14 20]
accuracy_score on train dataset :  0.9990295651790342
Target on test data [14  1 14 ...  1 14 14]
accuracy_score on test dataset :  0.9983623912396202
