# NYC Crimes severity prediction

## Import libraries

In [3]:
# Import numpy, pandas, matpltlib.pyplot, sklearn modules and seaborn
import numpy as np
import pandas as pd
import joblib
import re
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
plt.style.use('ggplot')

# Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Import LightGBM Classifier
import lightgbm as lgbm
# Import XGBoost Classifier
from xgboost import XGBClassifier


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

import scikitplot as skplt

## Data selection

In [4]:
# Import the data
df = pd.read_csv('./ny_clean_train.csv')
print(df.info())
# Convert the Event time and category of the crime to a more representatble data.
df.EVENT_TIME = pd.to_datetime(df.EVENT_TIME).dt.hour
df.LAW_CAT_CD = df['LAW_CAT_CD'].replace(['FELONY','MISDEMEANOR',"VIOLATION"],[2,1,0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6451985 entries, 0 to 6451984
Data columns (total 21 columns):
 #   Column             Dtype  
---  ------             -----  
 0   EVENT_TIME         object 
 1   year               int64  
 2   month              int64  
 3   day                int64  
 4   Latitude           float64
 5   Longitude          float64
 6   CRM_ATPT_CPTD_CD   object 
 7   OFNS_DESC          object 
 8   ADDR_PCT_CD        float64
 9   LAW_CAT_CD         object 
 10  BORO_NM            object 
 11  PREM_TYP_DESC      object 
 12  IN_PARK            int64  
 13  IN_PUBLIC_HOUSING  int64  
 14  IN_STATION         int64  
 15  SUSP_AGE_GROUP     object 
 16  SUSP_RACE          object 
 17  SUSP_SEX           object 
 18  VIC_AGE_GROUP      object 
 19  VIC_RACE           object 
 20  VIC_SEX            object 
dtypes: float64(3), int64(6), object(12)
memory usage: 1.0+ GB
None


In [6]:
df.head()

Unnamed: 0,EVENT_TIME,year,month,day,Latitude,Longitude,CRM_ATPT_CPTD_CD,OFNS_DESC,ADDR_PCT_CD,LAW_CAT_CD,BORO_NM,PREM_TYP_DESC,IN_PARK,IN_PUBLIC_HOUSING,IN_STATION,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,17,2014,9,4,40.685041,-73.921777,COMPLETED,ASSAULT 3 & RELATED OFFENSES,81.0,1,BROOKLYN,STREET,0,0,0,UNKNOWN,UNKNOWN,U,25-44,WHITE,F
1,7,2016,10,12,40.636991,-74.134093,COMPLETED,GRAND LARCENY,121.0,2,STATEN ISLAND,STREET,0,0,0,UNKNOWN,BLACK,U,45-64,WHITE HISPANIC,F
2,13,2012,9,28,40.823876,-73.891863,COMPLETED,GRAND LARCENY,41.0,2,BRONX,STREET,0,0,0,UNKNOWN,WHITE HISPANIC,M,45-64,WHITE HISPANIC,F
3,15,2015,3,24,40.845707,-73.910398,COMPLETED,PETIT LARCENY,46.0,1,BRONX,STREET,0,0,0,UNKNOWN,BLACK,M,<18,WHITE HISPANIC,F
4,4,2017,5,20,40.763992,-73.828426,COMPLETED,ASSAULT 3 & RELATED OFFENSES,109.0,1,QUEENS,STREET,0,0,0,25-44,WHITE HISPANIC,M,25-44,BLACK,M


In [7]:
# Count the number of row in each category
df.LAW_CAT_CD.value_counts().sort_values(ascending=False)

1    3647183
2    1987476
0     817326
Name: LAW_CAT_CD, dtype: int64

In [None]:
# As the data is not balanced, we want create a new balanced dataset that contains 817326 row of each class
zero,one,two = [],[],[]
zero_c,one_c,two_c = 0,0,0
for i in tqdm(df.iterrows()):
    if i[1].LAW_CAT_CD == 0 and zero_c <= 817326:
        zero.append(i[1].values)
        zero_c += 1
    elif i[1].LAW_CAT_CD == 1 and one_c <= 817326:
        one.append(i[1].values)
        one_c += 1
    elif i[1].LAW_CAT_CD == 2 and two_c <= 817326:
        two.append(i[1].values)
        two_c += 1
    if zero_c == 817326 and one_c == 817326 and two_c == 817326:
        break
zero_df = pd.DataFrame(zero, columns=df.columns.values.tolist())
one_df = pd.DataFrame(one, columns=df.columns.values.tolist())
two_df = pd.DataFrame(two, columns=df.columns.values.tolist())
final_df = pd.concat([zero_df,one_df,two_df])
final.to_csv("./ny_clean_train_balanced.csv",index=False)

### Import the balanced dataset

In [2]:
# Import the data
df = pd.read_csv('./ny_clean_train_balanced.csv')

In [3]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2451980 entries, 0 to 2451979
Data columns (total 21 columns):
 #   Column             Dtype  
---  ------             -----  
 0   EVENT_TIME         int64  
 1   year               int64  
 2   month              int64  
 3   day                int64  
 4   Latitude           float64
 5   Longitude          float64
 6   CRM_ATPT_CPTD_CD   object 
 7   OFNS_DESC          object 
 8   ADDR_PCT_CD        float64
 9   LAW_CAT_CD         int64  
 10  BORO_NM            object 
 11  PREM_TYP_DESC      object 
 12  IN_PARK            int64  
 13  IN_PUBLIC_HOUSING  int64  
 14  IN_STATION         int64  
 15  SUSP_AGE_GROUP     object 
 16  SUSP_RACE          object 
 17  SUSP_SEX           object 
 18  VIC_AGE_GROUP      object 
 19  VIC_RACE           object 
 20  VIC_SEX            object 
dtypes: float64(3), int64(8), object(10)
memory usage: 392.8+ MB


Unnamed: 0,EVENT_TIME,year,month,day,Latitude,Longitude,CRM_ATPT_CPTD_CD,OFNS_DESC,ADDR_PCT_CD,LAW_CAT_CD,BORO_NM,PREM_TYP_DESC,IN_PARK,IN_PUBLIC_HOUSING,IN_STATION,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,17,2011,2,1,40.617674,-73.962811,COMPLETED,HARRASSMENT 2,70.0,0,BROOKLYN,STREET,0,0,0,UNKNOWN,UNKNOWN,U,18-24,WHITE,F
1,1,2016,9,27,40.624674,-74.027588,COMPLETED,HARRASSMENT 2,68.0,0,BROOKLYN,RESIDENCE - APT. HOUSE,0,0,0,45-64,WHITE,F,65+,WHITE,M
2,14,2015,12,14,40.836653,-73.907143,COMPLETED,HARRASSMENT 2,42.0,0,BRONX,RESIDENCE - PUBLIC HOUSING,0,1,0,UNKNOWN,BLACK HISPANIC,U,25-44,WHITE HISPANIC,F
3,14,2014,8,13,40.59446,-74.065243,COMPLETED,HARRASSMENT 2,122.0,0,STATEN ISLAND,RESIDENCE-HOUSE,0,0,0,65+,WHITE,F,25-44,WHITE,M
4,17,2016,10,26,40.774455,-73.932565,COMPLETED,HARRASSMENT 2,114.0,0,QUEENS,RESIDENCE - PUBLIC HOUSING,0,0,0,18-24,BLACK,M,45-64,BLACK,F


In [4]:
# Define a utility function to reduce memory usage by convert the values types
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
df = reduce_mem_usage(df)

Memory usage after optimization is: 222.15 MB
Decreased by 43.5%


## Data preparing

### Select only relevant columns
 

In [5]:
# Set the list of features to include in Machine Learning Model
feature_lst=['EVENT_TIME','ADDR_PCT_CD', 'month', 'day', 'Latitude',
       'Longitude', 'BORO_NM',
       'IN_PARK', 'IN_PUBLIC_HOUSING', 'IN_STATION', 'VIC_AGE_GROUP',
       'VIC_RACE', 'VIC_SEX','LAW_CAT_CD', 'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX']

df_sel=df[feature_lst].copy()
df_sel.info()
df_sel.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2451980 entries, 0 to 2451979
Data columns (total 17 columns):
 #   Column             Dtype  
---  ------             -----  
 0   EVENT_TIME         int8   
 1   ADDR_PCT_CD        float16
 2   month              int8   
 3   day                int8   
 4   Latitude           float16
 5   Longitude          float16
 6   BORO_NM            object 
 7   IN_PARK            int8   
 8   IN_PUBLIC_HOUSING  int8   
 9   IN_STATION         int8   
 10  VIC_AGE_GROUP      object 
 11  VIC_RACE           object 
 12  VIC_SEX            object 
 13  LAW_CAT_CD         int8   
 14  SUSP_AGE_GROUP     object 
 15  SUSP_RACE          object 
 16  SUSP_SEX           object 
dtypes: float16(3), int8(7), object(7)
memory usage: 161.3+ MB


Unnamed: 0,EVENT_TIME,ADDR_PCT_CD,month,day,Latitude,Longitude,BORO_NM,IN_PARK,IN_PUBLIC_HOUSING,IN_STATION,VIC_AGE_GROUP,VIC_RACE,VIC_SEX,LAW_CAT_CD,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX
0,17,70.0,2,1,40.625,-73.9375,BROOKLYN,0,0,0,18-24,WHITE,F,0,UNKNOWN,UNKNOWN,U
1,1,68.0,9,27,40.625,-74.0,BROOKLYN,0,0,0,65+,WHITE,M,0,45-64,WHITE,F
2,14,42.0,12,14,40.84375,-73.9375,BRONX,0,1,0,25-44,WHITE HISPANIC,F,0,UNKNOWN,BLACK HISPANIC,U
3,14,122.0,8,13,40.59375,-74.0625,STATEN ISLAND,0,0,0,25-44,WHITE,M,0,65+,WHITE,F
4,17,114.0,10,26,40.78125,-73.9375,QUEENS,0,0,0,45-64,BLACK,F,0,18-24,BLACK,M


In [7]:
print(df_sel.shape)
# double check that the data is balanced
df_sel.LAW_CAT_CD.value_counts().sort_values(ascending=False)

(2451980, 17)

In [None]:
# Draw a corrolation matrix of the variables
corr = df_sel.corr()

plt.figure(figsize = (10,8))
sns.heatmap(corr, cmap = "coolwarm", linewidth = 2, linecolor = "white")
plt.title("Correlation")
plt.show()

### Deal with categorical data: pd.get_dummies()

In [9]:
# Generate dummies for categorical data
df_state_dummy = pd.get_dummies(df_sel,drop_first=True)

df_state_dummy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2451980 entries, 0 to 2451979
Data columns (total 45 columns):
 #   Column                              Dtype  
---  ------                              -----  
 0   EVENT_TIME                          int8   
 1   ADDR_PCT_CD                         float16
 2   month                               int8   
 3   day                                 int8   
 4   Latitude                            float16
 5   Longitude                           float16
 6   IN_PARK                             int8   
 7   IN_PUBLIC_HOUSING                   int8   
 8   IN_STATION                          int8   
 9   LAW_CAT_CD                          int8   
 10  BORO_NM_BROOKLYN                    uint8  
 11  BORO_NM_MANHATTAN                   uint8  
 12  BORO_NM_QUEENS                      uint8  
 13  BORO_NM_STATEN ISLAND               uint8  
 14  BORO_NM_UNKNOWN                     uint8  
 15  VIC_AGE_GROUP_25-44                 uint8  
 16  

In [10]:
df_state_dummy.head()

Unnamed: 0,EVENT_TIME,ADDR_PCT_CD,month,day,Latitude,Longitude,IN_PARK,IN_PUBLIC_HOUSING,IN_STATION,LAW_CAT_CD,BORO_NM_BROOKLYN,BORO_NM_MANHATTAN,BORO_NM_QUEENS,BORO_NM_STATEN ISLAND,BORO_NM_UNKNOWN,VIC_AGE_GROUP_25-44,VIC_AGE_GROUP_45-64,VIC_AGE_GROUP_65+,VIC_AGE_GROUP_<18,VIC_AGE_GROUP_UNKNOWN,VIC_RACE_ASIAN / PACIFIC ISLANDER,VIC_RACE_BLACK,VIC_RACE_BLACK HISPANIC,VIC_RACE_OTHER,VIC_RACE_UNKNOWN,VIC_RACE_WHITE,VIC_RACE_WHITE HISPANIC,VIC_SEX_E,VIC_SEX_F,VIC_SEX_M,VIC_SEX_U,SUSP_AGE_GROUP_25-44,SUSP_AGE_GROUP_45-64,SUSP_AGE_GROUP_65+,SUSP_AGE_GROUP_<18,SUSP_AGE_GROUP_UNKNOWN,SUSP_RACE_ASIAN / PACIFIC ISLANDER,SUSP_RACE_BLACK,SUSP_RACE_BLACK HISPANIC,SUSP_RACE_OTHER,SUSP_RACE_UNKNOWN,SUSP_RACE_WHITE,SUSP_RACE_WHITE HISPANIC,SUSP_SEX_M,SUSP_SEX_U
0,17,70.0,2,1,40.625,-73.9375,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1
1,1,68.0,9,27,40.625,-74.0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,14,42.0,12,14,40.84375,-73.9375,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1
3,14,122.0,8,13,40.59375,-74.0625,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
4,17,114.0,10,26,40.78125,-73.9375,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0


### Split the data into train and test

In [11]:
# Assign the data
df=df_state_dummy

# Set the target for the prediction
target='LAW_CAT_CD'

# Create arrays for the features and the response variable

# set X and y
y = df[target]
X = df.drop(target, axis=1)

In [12]:
y.unique()
y.value_counts()

array([0, 1, 2], dtype=int8)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True, random_state=21, stratify=y)

## Utils

In [8]:
def plot_cm(y_pred,y_test,algorithm,figure_name):
    mat_RF = confusion_matrix(y_pred,y_test)
    plt.figure(figsize=(16,4))
    sns.heatmap(mat_RF, square=True, annot=True, fmt='d', cbar=False,xticklabels=[0,1,2],yticklabels=[0,1,2])
    plt.xlabel('True labels')
    plt.ylabel('predicted labels')
    plt.title(algorithm)
    plt.savefig(figure_name)

In [9]:
def plot_roc(y_test, model, figure_name):
    pl = skplt.metrics.plot_roc(y_test, model.predict_proba(X_test), figsize=(12,6))
    plt.show()
    pl.figure.savefig(figure_name)

In [10]:
def save_model(model, model_name,is_tree=False):
    joblib.dump(model.estimators_[0] if is_tree else model,f'{model_name}.joblib')
    print(f"Model size: {np.round(os.path.getsize(f'{model_name}.joblib') / 1024 / 1024, 2) } MB")

## Modeling: Random Forest

In [19]:
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100,n_jobs=-1,verbose=2)

#Train the model using the training sets
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)

# Get the accuracy score
acc_rf=accuracy_score(y_test, y_pred)

# Model Accuracy, how often is the classifier correct?
print("[Random forest algorithm] accuracy_score: {:.3f}.".format(acc_rf))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100building tree 5 of 100
building tree 6 of 100

building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   31.4s


building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.3min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    1.5s


[Random forest algorithm] accuracy_score: 0.574.


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   10.0s finished


In [20]:
# Plot confusion matrix
plot_cm(y_pred,y_test,"Random Forest","cm_random_forest.pdf")

In [None]:
# Plot ROC curve
plot_roc(y_test,clf,"roc_random_forest.pdf")

In [None]:
# Save model
save_model(clf,"random_forest",True)

In [None]:
# Test with one model


## Modeling: LightGBM

In [21]:
# Define the model hyperparameters
lgbm_params = {
          "boosting_type": "gbdt",
          "learning_rate": 0.1,
          "num_leaves": 20,
          "max_bin": 256,
          "verbosity": 1,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 20,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9}
# Train the model
lbm_clf = lgbm.LGBMClassifier(**params)
lbm_clf.fit(X_train,y_train)
# Evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

In [None]:
# Plot the confusion matrix
plot_cm(y_pred, y_test, "LightGBM", 'cm_LightGBM.pdf')

In [None]:
# Plot the ROC curve
roc_curve(y_test, lbm_clf, "roc_LightGBM.pdf")

In [None]:
# Save the model
save_model(lbm_clf, "lightgbm")

## Modeling XGBoost

In [33]:
# Before training the model, we need to remove some special characters from the columns name
regex = re.compile(r"\[|\]|<", re.IGNORECASE)
X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X_train.columns.values]

In [34]:
# Define the model hyperparameters
params = {
            'objective':'multi:softmax',
            'max_depth': 10,
            'alpha': 10,
            'learning_rate': 0.1,
            'n_estimators':100,
            'use_label_encoder':False
        }

# instantiate the classifier 
xgb_clf = XGBClassifier(**params)

# fit the classifier to the training data
xgb_clf.fit(X_train, y_train)

# test the models performace
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print('XGBBoost Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))



XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=10, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)

In [None]:
# Plot the confusion matrix
plot_cm(y_pred, y_test, "XGBBoost", 'cm_XGBBoost.pdf')

In [None]:
# Plot the ROC curve
roc_curve(y_test, lbm_clf, "roc_XGBBoost.pdf")

In [None]:
# Save the model
save_model(xgb_clf, "xgboost")