In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Widen pandas' console/HTML rendering so frames with many columns are shown
# in full instead of being wrapped or elided.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 300)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Plot look: white background with grid lines, scaled up with the "poster" context.
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the CSV (resolved relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="MCI_data2014_2019_multiclass.csv")

In [3]:
# Preview the first rows to check the column names and value ranges.
df.head()

Unnamed: 0,occurrenceyear,occurrenceday,occurrencedayofyear,occurrencehour,Hood_ID,Long,Lat,Employment rate,Unemployment rate,Unsuitable rate,population,Postsecondary,Postsecondary rate,income,density,target,month,day,premise_type
0,2014,24,83,1,132,-79.199081,43.800281,55.5,10.5,17.3,43794,16960,0.387268,29573,4948,0,0,0,0
1,2014,27,270,16,76,-79.386383,43.662472,56.2,10.2,12.6,25797,17505,0.678567,56526,14097,1,1,1,1
2,2014,24,83,6,1,-79.612595,43.720406,58.0,9.6,17.4,33312,13425,0.403008,31771,1117,1,0,0,0
3,2014,24,83,15,47,-79.349121,43.782772,55.1,9.6,15.8,27051,15035,0.555802,37379,6441,1,0,0,2
4,2014,3,123,2,90,-79.458778,43.66449,69.9,5.6,6.7,14366,7790,0.542253,49709,5442,2,2,1,0


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(205321, 19)

In [5]:
# Rows per class label, to see how imbalanced the target is.
df["target"].value_counts()

0    110609
1     43214
4     23330
2     21500
3      6668
Name: target, dtype: int64

In [6]:
# Feature matrix: every column except the label, as a NumPy array.
X = df.drop(columns=["target"]).to_numpy()

In [7]:
# Label vector aligned row-for-row with X.
y = df["target"].to_numpy()

### Using SMOTE to balance the dataset

In [8]:
from imblearn import over_sampling
from imblearn.over_sampling import SMOTE

In [9]:
# Oversampler that resamples every class. The strategy is passed by keyword
# because current imbalanced-learn releases no longer accept it positionally,
# and the sampler is seeded so the synthetic rows are the same on every run.
smote = SMOTE(sampling_strategy='all', random_state=101)



In [10]:
X_sm,y_sm = smote.fit_sample(X, y)

In [11]:
unique, counts = np.unique(y_sm, return_counts=True)
dict(zip(unique, counts))

{0: 110609, 1: 110609, 2: 110609, 3: 110609, 4: 110609}

### dataset scaling

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
scaler = StandardScaler()


In [14]:
X = scaler.fit_transform(X_sm)


### train test dataset split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y_sm, test_size=0.2)

### Ensemble

#### Bagging

In [17]:
from sklearn.ensemble import BaggingClassifier

In [18]:
# L2-regularised logistic regression used as the member model of the bagging ensemble.
base_est = LogisticRegression(
    C=1,
    penalty="l2",
    max_iter=500,
    random_state=101,
)

In [19]:
# Bag 20 copies of the base estimator. The estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator`
# (the old name is gone in scikit-learn 1.4); the first positional slot is the
# member estimator under both names.
bagging_classifier = BaggingClassifier(
    base_est,
    n_estimators=20,
    random_state=0,
)
bagging_classifier.fit(X_train, y_train)
bagging_predictions = bagging_classifier.predict(X_test)

In [20]:
# Per-class precision / recall / F1 of the bagging ensemble on the test split.
print(classification_report(y_test, bagging_predictions))

              precision    recall  f1-score   support

           0       0.30      0.15      0.20     22273
           1       0.33      0.42      0.37     22347
           2       0.27      0.23      0.25     21841
           3       0.30      0.23      0.26     22086
           4       0.41      0.66      0.51     22062

    accuracy                           0.34    110609
   macro avg       0.32      0.34      0.32    110609
weighted avg       0.32      0.34      0.32    110609



#### Stacking

In [21]:
from sklearn.ensemble import StackingClassifier
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

# Level-0 learners whose predictions feed the final estimator.
# The MLP starts from random weights, so it is seeded to make the reported
# scores reproducible on a fresh run; KNN is deterministic.
estimators = [
    ('nn', MLPClassifier(max_iter = 1000, random_state=101)),
    ('knn', KNeighborsClassifier(n_neighbors = 70))
]
# The final decision tree is seeded for the same reason (its tie-breaking
# between equally good splits is random).
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=tree.DecisionTreeClassifier(random_state=101)
)

In [22]:
# Train the stacked ensemble, then label the test rows.
# fit() hands back the estimator itself, so `model` and `clf` are one object.
clf.fit(X_train, y_train)
model = clf
y_pred = model.predict(X_test)

In [23]:
# Per-class precision / recall / F1 of the stacking ensemble on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.41      0.41      0.41     22273
           1       0.45      0.45      0.45     22347
           2       0.41      0.41      0.41     21841
           3       0.51      0.51      0.51     22086
           4       0.48      0.49      0.48     22062

    accuracy                           0.45    110609
   macro avg       0.45      0.45      0.45    110609
weighted avg       0.45      0.45      0.45    110609



### Logistic Regression

In [16]:
# Stand-alone logistic regression with the same settings as the bagging base estimator.
lr = LogisticRegression(
    penalty="l2", C=1, max_iter=500, random_state=101
)

In [17]:
# Train on the training split; as the cell's last expression, the fitted
# estimator's repr is displayed below.
lr.fit(X_train, y_train)

LogisticRegression(C=1, max_iter=500, random_state=101)

In [18]:
# Predicted class labels for the test rows.
y_predict_test = lr.predict(X_test)

In [19]:
# Per-class precision / recall / F1 of logistic regression on the test split.
print(classification_report(y_test, y_predict_test))

              precision    recall  f1-score   support

           0       0.30      0.15      0.20     22267
           1       0.33      0.42      0.37     22094
           2       0.27      0.23      0.25     22104
           3       0.31      0.24      0.27     22009
           4       0.42      0.67      0.51     22135

    accuracy                           0.34    110609
   macro avg       0.32      0.34      0.32    110609
weighted avg       0.32      0.34      0.32    110609



### Decision Tree

In [69]:
from sklearn import tree
# Unpruned decision tree. Seeded because ties between equally good splits are
# broken at random, which otherwise changes the scores from run to run.
dt_classifier = tree.DecisionTreeClassifier(random_state=101)
dt_classifier.fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)

In [70]:
# Per-class precision / recall / F1 of the decision tree on the test split.
print(classification_report(y_test, dt_predictions))

              precision    recall  f1-score   support

           0       0.69      0.68      0.68     22188
           1       0.65      0.62      0.64     21982
           2       0.66      0.68      0.67     21963
           3       0.74      0.78      0.76     22084
           4       0.71      0.70      0.70     22392

    accuracy                           0.69    110609
   macro avg       0.69      0.69      0.69    110609
weighted avg       0.69      0.69      0.69    110609



### Naive Bayes

In [74]:
from sklearn.naive_bayes import GaussianNB

In [76]:
# Gaussian naive Bayes with default settings: fit() returns the estimator,
# so construction and training are chained.
gnb_classifier = GaussianNB().fit(X_train, y_train)
gnb_predictions = gnb_classifier.predict(X_test)

In [78]:
# Per-class precision / recall / F1 of Gaussian naive Bayes on the test split.
print(classification_report(y_test, gnb_predictions))

              precision    recall  f1-score   support

           0       0.32      0.11      0.17     22188
           1       0.35      0.37      0.36     21982
           2       0.26      0.24      0.25     21963
           3       0.33      0.22      0.27     22084
           4       0.34      0.68      0.45     22392

    accuracy                           0.33    110609
   macro avg       0.32      0.32      0.30    110609
weighted avg       0.32      0.33      0.30    110609



### Random Forest

In [28]:
# Random forest with default hyper-parameters. Bootstrapping and feature
# sub-sampling are random, so the forest is seeded to keep the reported
# scores reproducible across runs.
rf_classifier = RandomForestClassifier(random_state=101)
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)

In [29]:
# Per-class precision / recall / F1 of the random forest on the test split.
print(classification_report(y_test, rf_predictions))

              precision    recall  f1-score   support

           0       0.77      0.78      0.77     22188
           1       0.78      0.80      0.79     21982
           2       0.86      0.81      0.84     21963
           3       0.89      0.92      0.90     22084
           4       0.85      0.85      0.85     22392

    accuracy                           0.83    110609
   macro avg       0.83      0.83      0.83    110609
weighted avg       0.83      0.83      0.83    110609

