In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 300)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
df=pd.read_csv("MCI_data2014_2019_binaryclass.csv")

In [3]:
df.head()

Unnamed: 0,occurrenceyear,occurrenceday,occurrencedayofyear,occurrencehour,Hood_ID,Long,Lat,Employment rate,Unemployment rate,Unsuitable rate,population,Postsecondary,Postsecondary rate,income,density,severe,month,day,premise_type
0,2014,24,83,1,132,-79.199081,43.800281,55.5,10.5,17.3,43794,16960,0.387268,29573,4948,1,0,0,0
1,2014,27,270,16,76,-79.386383,43.662472,56.2,10.2,12.6,25797,17505,0.678567,56526,14097,0,1,1,1
2,2014,24,83,6,1,-79.612595,43.720406,58.0,9.6,17.4,33312,13425,0.403008,31771,1117,0,0,0,0
3,2014,24,83,15,47,-79.349121,43.782772,55.1,9.6,15.8,27051,15035,0.555802,37379,6441,0,0,0,2
4,2014,3,123,2,90,-79.458778,43.66449,69.9,5.6,6.7,14366,7790,0.542253,49709,5442,1,2,1,0


In [43]:
df.shape

(205321, 19)

In [45]:
X= df.drop(['severe','population', 'Postsecondary'],axis = 1).values

In [46]:
y = df['severe'].values

### Using SMOTE to balance the dataset

In [47]:
from imblearn import over_sampling
from imblearn.over_sampling import SMOTE

In [48]:
smote = SMOTE('minority')



In [49]:
X_sm,y_sm = smote.fit_sample(X, y)

In [50]:
unique, counts = np.unique(y_sm, return_counts=True)
dict(zip(unique, counts))

{0: 132109, 1: 132109}

### dataset scaling

In [51]:
from sklearn.preprocessing import StandardScaler

In [52]:
scaler = StandardScaler()


In [53]:
X = scaler.fit_transform(X_sm)


### train test dataset split

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y_sm, test_size=0.2)

### LogisticRegression

In [56]:
lr = LogisticRegression(penalty='l2', max_iter=500, C=1,random_state=101)

In [57]:
lr.fit(X_train, y_train)

LogisticRegression(C=1, max_iter=500, random_state=101)

In [58]:
y_predict_test = lr.predict(X_test)

In [59]:
print(classification_report(y_test, y_predict_test))

              precision    recall  f1-score   support

           0       0.58      0.56      0.57     26416
           1       0.57      0.59      0.58     26428

    accuracy                           0.57     52844
   macro avg       0.58      0.57      0.57     52844
weighted avg       0.58      0.57      0.57     52844



### DecisionTree

In [60]:
from sklearn import tree
dt_classifier = tree.DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)

In [61]:
print(classification_report(y_test, dt_predictions))

              precision    recall  f1-score   support

           0       0.77      0.76      0.77     26416
           1       0.77      0.77      0.77     26428

    accuracy                           0.77     52844
   macro avg       0.77      0.77      0.77     52844
weighted avg       0.77      0.77      0.77     52844



### Naive Bayes

In [62]:
from sklearn.naive_bayes import GaussianNB

In [63]:
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)
gnb_predictions = gnb_classifier.predict(X_test)

In [64]:
print(classification_report(y_test, gnb_predictions))

              precision    recall  f1-score   support

           0       0.60      0.44      0.51     26416
           1       0.56      0.71      0.62     26428

    accuracy                           0.57     52844
   macro avg       0.58      0.57      0.57     52844
weighted avg       0.58      0.57      0.57     52844



### Random Forest

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [66]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)

In [67]:
print(classification_report(y_test, rf_predictions))

              precision    recall  f1-score   support

           0       0.86      0.79      0.82     26416
           1       0.80      0.87      0.83     26428

    accuracy                           0.83     52844
   macro avg       0.83      0.83      0.83     52844
weighted avg       0.83      0.83      0.83     52844



### neural network

In [35]:
from sklearn.neural_network import MLPClassifier
snn_classifier = MLPClassifier(max_iter = 1000)
snn_classifier.fit(X_train, y_train)
snn_predictions = snn_classifier.predict(X_test)

In [36]:
print(classification_report(y_test, snn_predictions))

              precision    recall  f1-score   support

           0       0.73      0.66      0.69     26523
           1       0.69      0.75      0.72     26321

    accuracy                           0.71     52844
   macro avg       0.71      0.71      0.71     52844
weighted avg       0.71      0.71      0.71     52844



### KNeighbors

In [38]:
from sklearn.neighbors import KNeighborsClassifier

In [39]:
knn_classifier = KNeighborsClassifier(n_neighbors = 70)
knn_classifier.fit(X_train, y_train)
knn_predictions = knn_classifier.predict(X_test)

In [40]:
print(classification_report(y_test, knn_predictions))

              precision    recall  f1-score   support

           0       0.63      0.71      0.67     26523
           1       0.67      0.59      0.63     26321

    accuracy                           0.65     52844
   macro avg       0.65      0.65      0.65     52844
weighted avg       0.65      0.65      0.65     52844



### SVM

In [41]:
from sklearn.svm import SVC
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)
svm_predictions = svm_classifier.predict(X_test)

In [42]:
print(classification_report(y_test, svm_predictions))

              precision    recall  f1-score   support

           0       0.68      0.66      0.67     26523
           1       0.67      0.69      0.68     26321

    accuracy                           0.67     52844
   macro avg       0.67      0.67      0.67     52844
weighted avg       0.67      0.67      0.67     52844

