In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score


In [2]:
# Read the pre-split training and test sets from the assets folder.
train, test = map(
    pd.read_csv,
    ('./assets/data_train.csv', './assets/data_test.csv'),
)

In [3]:
# Positional split: the first 14 columns are the model inputs and the
# column at index 14 is the target.
# `.iloc[:, 0:14]` already yields a DataFrame, so no re-wrapping is needed.
X_train = train.iloc[:, 0:14]
X_test = test.iloc[:, 0:14]

# Keep the targets one-dimensional (Series). Wrapping them in a DataFrame
# produces an (n, 1) column vector, which scikit-learn estimators only accept
# after emitting a DataConversionWarning.
y_train = train.iloc[:, 14]
y_test = test.iloc[:, 14]

# Remember the feature names: X_train is later replaced by a scaled array
# that no longer carries column labels.
columns = X_train.columns

In [4]:
# Scaling data
# Random Forest Classifier
# BASELINE CLASSIFIER

# Fit the scaler on the training features only and reuse the fitted
# statistics for the test features.
# NOTE: from here on X_train / X_test are the scaled arrays returned by the
# scaler, not the original DataFrames.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# A fixed seed makes the baseline (and anything that reuses `clf`)
# reproducible from one run to the next.
clf = RandomForestClassifier(random_state=42)
# np.ravel guarantees a 1-D target whether y_train is a Series or a
# single-column DataFrame.
clf.fit(X_train, np.ravel(y_train))
y_pred = clf.predict(X_test)
print("\nAccuracy is ", accuracy_score(y_test,y_pred)*100,"\n\nPrecision Value is\n",classification_report(y_test,y_pred))


Accuracy is  85.43701246852159 

Precision Value is
               precision    recall  f1-score   support

           0       0.88      0.93      0.91     12435
           1       0.73      0.61      0.66      3846

    accuracy                           0.85     16281
   macro avg       0.81      0.77      0.79     16281
weighted avg       0.85      0.85      0.85     16281



In [None]:
# Setting up parametergrid for tuning
# Doing RandomizedSearchCV to get best parameters

# Candidate tree depths: 1, 6, 11, ..., 46.
lst = np.arange(1, 50, 5)
# Candidate forest sizes: 100, 200, ..., 1900.
# The range must start above 0: a forest with n_estimators=0 is rejected by
# RandomForestClassifier, so that candidate would only waste search iterations.
lst1 = np.arange(100, 2000, 100)

param_grid = {
    'n_estimators': lst1,
    # 'auto' is intentionally omitted: for classifiers it was only an alias of
    # 'sqrt', and it is no longer accepted by scikit-learn >= 1.3.
    'max_features': ['sqrt', 'log2'],
    'max_depth' : lst,
    'criterion' :['gini', 'entropy']
}

# random_state fixes which parameter combinations are sampled so the search
# can be reproduced.
CV_clf = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_grid,
    cv=5,
    random_state=42,
)
# np.ravel guarantees a 1-D target whether y_train is a Series or a
# single-column DataFrame.
CV_clf.fit(X_train, np.ravel(y_train))

In [None]:
# Show the parameter combination with the best mean cross-validation score
# found by the fitted randomized search.
CV_clf.best_params_

In [None]:
# https://www.featureranking.com/tutorials/machine-learning-tutorials/sk-part-3-cross-validation-and-hyperparameter-tuning/

# One row per sampled parameter combination, with its mean CV score appended
# as the `test_score` column.
search_results = CV_clf.cv_results_
results_clf = pd.DataFrame(search_results['params']).assign(
    test_score=search_results['mean_test_score']
)
results_clf.columns

In [None]:
# Mean CV score at each sampled max_depth, drawn as one line per criterion.
for criterion in ('gini', 'entropy'):
    score_by_depth = (
        results_clf.loc[results_clf['criterion'] == criterion]
        .groupby('max_depth')[['test_score']]
        .mean()
    )
    plt.plot(score_by_depth, marker='.', label=criterion)
plt.legend()
plt.ylabel('Mean CV Score')
plt.xlabel('max_depth')
plt.title('clf Performance Comparison')

In [None]:
# Mean CV score at each sampled n_estimators, drawn as one line per criterion.
for criterion in ('gini', 'entropy'):
    score_by_size = (
        results_clf.loc[results_clf['criterion'] == criterion]
        .groupby('n_estimators')[['test_score']]
        .mean()
    )
    plt.plot(score_by_size, marker='.', label=criterion)
plt.legend()
plt.ylabel('Mean CV Score')
plt.xlabel('n_estimators')
plt.title('clf Performance Comparison')

In [None]:
# Refit a forest with a fixed set of tuned hyper-parameters and evaluate it
# on the held-out test set.
# NOTE(review): the values are hard-coded; presumably copied from an earlier
# search run -- confirm they still match CV_clf.best_params_.
clf1 = RandomForestClassifier(
    max_depth=16,
    max_features='sqrt',
    n_estimators=500,
    criterion='gini',
    random_state=42,  # fixed seed so the reported metrics are reproducible
)
# np.ravel guarantees a 1-D target whether y_train is a Series or a
# single-column DataFrame.
clf1.fit(X_train, np.ravel(y_train))
y_pred = clf1.predict(X_test)
print("\nAccuracy is ", accuracy_score(y_test,y_pred)*100,"\n\nPrecision Value is\n",classification_report(y_test,y_pred))

In [None]:
# Rank the features by the importance the tuned forest assigned to them
# (rounded to three decimals) and show the ranking as a bar chart.
rounded_importance = np.round(clf1.feature_importances_, 3)
importances = (
    pd.DataFrame({'feature': columns, 'importance': rounded_importance})
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances.plot.bar()

# Final model

In [None]:
import numpy as np
from hyperopt import hp, tpe, fmin, space_eval, STATUS_OK, Trials


def accuracy_model(params):
    """Return the mean cross-validation score of a forest built from ``params``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``RandomForestClassifier``
        (one sampled point of ``param_space``).

    Returns
    -------
    float
        Mean of the per-fold scores returned by ``cross_val_score`` (called
        with its default ``cv`` and ``scoring``) on ``X_train`` / ``y_train``.
    """
    model = RandomForestClassifier(**params)
    # np.ravel guarantees a 1-D target whether y_train is a Series or a
    # single-column DataFrame.
    return cross_val_score(model, X_train, np.ravel(y_train)).mean()


# Search space sampled by hyperopt. Every hp.choice dimension is categorical:
# hyperopt tracks it by the *index* of the chosen option, not by its value.
param_space = {'max_depth': hp.choice('max_depth', range(10,100)),
    'max_features': hp.uniform('max_features', 0.1,1),
    'n_estimators': hp.choice('n_estimators', range(50,500)),
    'min_samples_leaf': hp.choice('min_samples_leaf',range(3,5)),
    'min_samples_split': hp.choice('min_samples_split',range(2,10)),
    'criterion': hp.choice('criterion', ["gini", "entropy"])}

# Best mean CV score seen so far across all evaluations of `f`.
best = 0


def f(params):
    """Objective for ``fmin``: score ``params`` and report the negated score.

    Updates the module-level ``best`` whenever a higher score is observed.
    The score is negated in the returned ``loss`` because ``fmin`` minimises.
    """
    global best
    acc = accuracy_model(params)
    if acc > best:
        best = acc
    return {'loss': -acc, 'status': STATUS_OK}


# Use a lower-case name for the instance so the imported `Trials` class is
# not shadowed by its own instance.
trials = Trials()
best_params = fmin(f, param_space, algo=tpe.suggest, max_evals=500, trials=trials)

# fmin reports each hp.choice dimension as the index of the selected option
# (e.g. max_depth index 0 means depth 10). space_eval maps those indices back
# to the actual hyper-parameter values.
best_config = space_eval(param_space, best_params)

print('New best:', best, best_config)
# Raw fmin result (hp.choice entries are option indices), kept for reference.
print(best_params)