In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, date

from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split

import preprocessing

In [3]:
#Import data
xtrain = pd.read_csv("Xtrain.csv")
ytrain = pd.read_csv("Ytrain.csv")
records = xtrain.copy()
records['ChargeOff'] = ytrain['ChargeOff']

In [4]:
processed_records = preprocessing.all_preprocess_with_label_encoding(records)

In [5]:
x = processed_records.drop(columns='ChargeOff')
y = processed_records['ChargeOff']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=0)

In [6]:
rf = RandomForestClassifier(n_estimators = 30, random_state = 42, max_depth=30)
rf.fit(x_train,y_train)
accuracy=accuracy_score(y_test, rf.predict(x_test))
accuracy

0.89752

In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 25, stop = 35, num = 4)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start = 25, stop = 35, num = 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(x_train, y_train)

print(rf_random.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.4min finished


{'n_estimators': 35, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 35, 'bootstrap': False}


In [8]:
def evaluate(model, x_test, y_test):
    accuracy=accuracy_score(y_test, model.predict(x_test))
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    return accuracy

base_model = RandomForestClassifier(n_estimators = 50, random_state = 42)
base_model.fit(x_train, y_train)
base_accuracy = evaluate(base_model, x_test, y_test)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, x_test, y_test)

print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Model Performance
Accuracy = 0.90%.
Model Performance
Accuracy = 0.90%.
Improvement of 0.17%.


In [9]:
rf.set_params(**rf_random.best_params_)

RandomForestClassifier(bootstrap=False, max_depth=35, min_samples_split=10,
                       n_estimators=35)

In [10]:
rf.fit(x_train,y_train)
accuracy=accuracy_score(y_test, rf.predict(x_test))
accuracy

0.89984

In [11]:
Xtest = pd.read_csv('Xtest.csv')
Xtest_processed = preprocessing.all_preprocess_with_label_encoding(Xtest)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [12]:
submission = pd.DataFrame()
submission['Id'] = Xtest['Id']
submission['ChargeOff'] = best_random.predict(Xtest_processed)
submission.to_csv('submission_RF.csv', index=False)