In [29]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score

In [30]:
#Call the dataset with its targete
df_numeric = pd.read_csv('data_numeric_norm.csv')
df_target=pd.read_csv('data_target.csv')

# Balance dataset

In [31]:
#Use SMOTE technique to balance dataset
Y = df_target['TARGET'].astype('int')
X_balance,Y_balance = SMOTE().fit_sample(df_numeric,Y)
X_balance = pd.DataFrame(X_balance, columns = df_numeric.columns)

In [32]:
X_balance.shape

(2000, 38)

# Split the data to train and test

In [33]:
#split the data to train and test
train_data, test_data, train_targets, test_targets = train_test_split(
    X_balance, Y_balance, test_size=0.7, random_state=112
)
train_targets = train_targets.values
test_targets = test_targets.values
print("Sizes and class distributions for train/test data")
print("Shape train_data {}".format(train_data.shape))
print("Shape test_data {}".format(test_data.shape))
print(
    "Train data number of 0s {} and 1s {}".format(
        np.sum(train_targets == 0), np.sum(train_targets == 1)
    )
)
print(
    "Test data number of 0s {} and 1s {}".format(
        np.sum(test_targets == 0), np.sum(test_targets == 1)
    )
)

Sizes and class distributions for train/test data
Shape train_data (600, 38)
Shape test_data (1400, 38)
Train data number of 0s 297 and 1s 303
Test data number of 0s 703 and 1s 697


# Running the basic Random Forest Classifier

In [34]:
#Create the random forest and evaluate classifier on our dataset(sample); Find accuracy and confusion matrix
rf = RandomForestClassifier()
rffit = rf.fit(train_data, train_targets)
y_predict = rffit.predict(test_data)

print('Accuracy Score is',accuracy_score(test_targets, y_predict))
print(pd.DataFrame(confusion_matrix(test_targets,y_predict)))

Accuracy Score is 0.5828571428571429
     0    1
0  421  282
1  302  395


# Evaluating the Performance of Random Forest

In [35]:
#Use 10 fold cross-validation 
rf_cv_score = cross_val_score(rf, X_balance, Y_balance, cv=10)

In [36]:
#Print confusion matrix and classification report
print("Confusion Matrix")
print(confusion_matrix(test_targets, y_predict))

print("Classification Report")
print(classification_report(test_targets, y_predict))

Confusion Matrix
[[421 282]
 [302 395]]
Classification Report
              precision    recall  f1-score   support

           0       0.58      0.60      0.59       703
           1       0.58      0.57      0.57       697

    accuracy                           0.58      1400
   macro avg       0.58      0.58      0.58      1400
weighted avg       0.58      0.58      0.58      1400



# Tunning hyperparameter

In [37]:
#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 1000, num = 5)]

#Number of features at every split
max_features = ['auto', 'sqrt','log2','int','float','None']

#Maximum depth
max_depth = [int(x) for x in np.linspace(10, 500, num =2)]
max_depth.append(None)

#Create random grid
random_grid = {'n_estimators': n_estimators,'max_features': max_features,'max_depth': max_depth }

#Random search of parameters
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=0, random_state=2, n_jobs = -1)

#Fit the model
rf_random.fit(train_data, train_targets)

#Print the results
print(rf_random.best_params_)



{'n_estimators': 500, 'max_features': 'log2', 'max_depth': 500}


In [38]:
#Running Random Forest with the best parameters and find confusion matrix and classification report
rf = RandomForestClassifier(n_estimators=500, max_depth=500, max_features='log2')
rf.fit(train_data,train_targets)
y_predict = rf.predict(test_data)
rf_cv_score = cross_val_score(rf, X_balance, Y_balance, cv=10)

print("Confusion Matrix")
print(confusion_matrix(test_targets, y_predict))

print("Classification Report")
print(classification_report(test_targets, y_predict))

Confusion Matrix
[[422 281]
 [293 404]]
Classification Report
              precision    recall  f1-score   support

           0       0.59      0.60      0.60       703
           1       0.59      0.58      0.58       697

    accuracy                           0.59      1400
   macro avg       0.59      0.59      0.59      1400
weighted avg       0.59      0.59      0.59      1400



In [39]:
#Find f1 score for above prediction
f1_score(test_targets, y_predict, average='macro')

0.5899322132842368

In [40]:
test_predictions = y_predict
test_predictions_class = ((test_predictions > 0.5) * 1).flatten()
test_df = pd.DataFrame(
    {
        "Predicted_score": test_predictions.flatten(),
        "Predicted_class": test_predictions_class,
        "True": test_targets,
    }
)
roc_auc = metrics.roc_auc_score(test_targets, y_predict)
print("The AUC on test set:\n")
print(roc_auc)

The AUC on test set:

0.5899557338808263


In [41]:
#Recall
from sklearn.metrics import recall_score
recall_score(test_targets, y_predict, average='macro')

0.5899557338808263

In [42]:
#Precision
from sklearn.metrics import precision_score
precision_score(test_targets, y_predict, average='macro')

0.5899954060538002