# Modelling

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the pre-split train/test data from ../data. The first CSV column is
# used as the index; each target is the 'Attrition' column of its y_* file.
X_train, X_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/X_train.csv', '../data/X_test.csv')
)
y_train, y_test = (
    pd.read_csv(csv_path, index_col=0)['Attrition']
    for csv_path in ('../data/y_train.csv', '../data/y_test.csv')
)

## Model Selection

### KNN

In [3]:
# Base k-nearest-neighbours classifier with default settings; it is the
# estimator handed to the KNN grid search.
knn = KNeighborsClassifier()

In [4]:
# Hyper-parameter grid for the KNN search.
# `algorithm` must use scikit-learn's option strings ('brute', 'ball_tree',
# 'kd_tree'). The class-style names 'BallTree' / 'KDTree' are not accepted by
# KNeighborsClassifier, so every candidate using them fails to fit; because
# warnings are silenced in the import cell, those failures go unnoticed and
# only the 'brute' candidates are actually compared.
algorithm = ['brute', 'ball_tree', 'kd_tree']
n_neighbors = list(range(1, 25))  # k = 1..24
weights = ['distance', 'uniform']
param_grid = {"n_neighbors": n_neighbors, "weights": weights, "algorithm": algorithm}

In [5]:
# Exhaustive 5-fold cross-validated search over the KNN grid on all CPU cores.
# No `scoring` is passed, so the estimator's own default scorer ranks candidates.
grid = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1).fit(X_train, y_train)

In [6]:
# Show the estimator (with its hyper-parameters) that ranked best in the search.
print(grid.best_estimator_) 

KNeighborsClassifier(algorithm='brute', n_neighbors=4, weights='distance')


In [7]:
# Take the winning model straight from the grid search instead of re-typing
# its hyper-parameters by hand, so this cell cannot drift out of sync with the
# search result (the decision-tree and random-forest sections do the same).
# The search above leaves `refit` at its default, so best_estimator_ is already
# fitted on the full training set and no extra fit() call is needed.
knn = grid.best_estimator_
y_predict_knn = knn.predict(X_test)

In [8]:
# Per-class precision / recall / F1 for the KNN predictions on the test split.
report = classification_report(y_true=y_test, y_pred=y_predict_knn)
print(report)

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       369
           1       0.91      0.89      0.90       371

    accuracy                           0.90       740
   macro avg       0.90      0.90      0.90       740
weighted avg       0.90      0.90      0.90       740



### Decision Tree

In [9]:
# Base decision tree for the grid search. random_state is pinned because tree
# fitting is randomised (splitter='random', and feature sub-sampling whenever
# max_features is below the feature count); without a seed the selected model
# and its scores change on every run.
dt = DecisionTreeClassifier(random_state=42)

In [10]:
# Candidate values for the decision-tree grid search.
criterion = ['gini', 'entropy']
splitter = ['random', 'best']
max_depth = list(range(2, 20))         # depths 2..19
min_samples_split = list(range(2, 5))  # 2, 3, 4
# NOTE(review): assumes the feature matrix has at least 24 columns — confirm.
max_features = list(range(1, 25))      # 1..24

In [11]:
# Assemble the decision-tree search space from the candidate lists.
param_grid = dict(
    criterion=criterion,
    splitter=splitter,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    max_features=max_features,
)

In [None]:
# Exhaustive 5-fold cross-validated search over the decision-tree grid on all
# CPU cores; `grid` is rebound to this fitted search object.
grid = GridSearchCV(dt, param_grid, cv=5, n_jobs=-1).fit(X_train, y_train)

In [None]:
# Show the decision tree (with its hyper-parameters) that ranked best.
print(grid.best_estimator_) 

In [None]:
# Keep a handle on the best decision tree found by the search.
best_dt = grid.best_estimator_

In [None]:
# Fit the selected tree on the full training set.
# NOTE(review): the GridSearchCV call does not override `refit`, so
# best_estimator_ is presumably already fitted on X_train and this call looks
# redundant — confirm before removing.
best_dt.fit(X_train, y_train)

In [None]:
# Predict labels for the test split with the selected decision tree.
y_predict_dt = best_dt.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the decision-tree predictions on the
# test split.
report = classification_report(y_true=y_test, y_pred=y_predict_dt)
print(report)

### Random Forest

In [None]:
# Base random forest for the grid search. random_state is pinned because the
# forest is built from bootstrap samples and random feature subsets; without
# a seed the selected model and its scores change on every run.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Candidate values for the random-forest search. Only `n_estimators` is placed
# in the param_grid built in the next cell; the other lists are defined here
# but not searched in the cells shown.
criterion = ['gini', 'entropy']
max_depth = list(range(2, 20))                # depths 2..19
min_samples_split = list(range(2, 40, 2))     # even values 2..38
max_features = list(range(1, 25))             # 1..24
bootstrap = [True, False]
n_estimators = list(range(10, 100, 10))       # 10, 20, ..., 90

In [None]:
# Search space for the random forest: only the number of trees is varied.
param_grid = dict(n_estimators=n_estimators)

In [None]:
# 3-fold cross-validated search over the random-forest grid, ranked by F1 and
# run on all CPU cores.
# NOTE(review): the KNN and decision-tree searches use cv=5 and no explicit
# scoring — confirm the difference here is intentional.
grid = GridSearchCV(
    rf,
    param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
)

In [None]:
# Run the random-forest search on the training data; `grid` is rebound to the
# value returned by fit().
grid = grid.fit(X_train, y_train)

In [None]:
# Keep a handle on the best random forest found by the search.
best_rf = grid.best_estimator_

In [None]:
# Fit the selected forest on the full training set.
# NOTE(review): the GridSearchCV call does not override `refit`, so
# best_estimator_ is presumably already fitted on X_train and this call looks
# redundant — confirm before removing.
best_rf.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split and print the same per-class report that
# is printed for the KNN and decision-tree models, so the three classifiers
# can be compared on equal terms (the forest's predictions were otherwise
# computed but never evaluated).
y_predict_rf = best_rf.predict(X_test)
report = classification_report(y_test, y_predict_rf)
print(report)

### Naive Bayes