In [1]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score

In [2]:
#Call the dataset with its targete
df_numeric = pd.read_csv('data1.csv')
df_target=pd.read_csv('data_target1.csv')

# Balance dataset

In [3]:
#Use SMOTE technique to balance dataset
Y = df_target['TARGET'].astype('int')
X_balance,Y_balance = SMOTE().fit_sample(df_numeric,Y)
X_balance = pd.DataFrame(X_balance, columns = df_numeric.columns)

In [4]:
X_balance.shape

(15272, 72)

# Split the data to train and test

In [6]:
#spliting the data to train and test
train_data, test_data, train_targets, test_targets = train_test_split(
    X_balance, Y_balance, test_size=0.7, random_state=112
)
train_targets = train_targets.values
test_targets = test_targets.values
print("Sizes and class distributions for train/test data")
print("Shape train_data {}".format(train_data.shape))
print("Shape test_data {}".format(test_data.shape))
print(
    "Train data number of 0s {} and 1s {}".format(
        np.sum(train_targets == 0), np.sum(train_targets == 1)
    )
)
print(
    "Test data number of 0s {} and 1s {}".format(
        np.sum(test_targets == 0), np.sum(test_targets == 1)
    )
)

Sizes and class distributions for train/test data
Shape train_data (4581, 72)
Shape test_data (10691, 72)
Train data number of 0s 2316 and 1s 2265
Test data number of 0s 5320 and 1s 5371


# Running the basic Random Forest Classifier

In [7]:
#Create the random forest and evaluate classifier on our dataset(sample); Find accuracy and confusion matrix
rf = RandomForestClassifier()
rffit = rf.fit(train_data, train_targets)
y_predict = rffit.predict(test_data)

print('Accuracy Score is',accuracy_score(test_targets, y_predict))
print(pd.DataFrame(confusion_matrix(test_targets,y_predict)))

Accuracy Score is 0.908614722663923
      0     1
0  5077   243
1   734  4637


# Evaluating the Performance of Random Forest

In [8]:
#Use 10 fold cross-validation 
rf_cv_score = cross_val_score(rf, X_balance, Y_balance, cv=10)

In [9]:
#Print scors
print("Confusion Matrix")
print(confusion_matrix(test_targets, y_predict))

print("Classification Report")
print(classification_report(test_targets, y_predict))

Confusion Matrix
[[5077  243]
 [ 734 4637]]
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      5320
           1       0.95      0.86      0.90      5371

    accuracy                           0.91     10691
   macro avg       0.91      0.91      0.91     10691
weighted avg       0.91      0.91      0.91     10691



# Tunning hyperparameter

In [10]:
#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 1000, num = 5)]

#Number of features at every split
max_features = ['auto', 'sqrt','log2','int','float','None']

#Maximum depth
max_depth = [int(x) for x in np.linspace(10, 500, num =2)]
max_depth.append(None)

#Create random grid
random_grid = {'n_estimators': n_estimators,'max_features': max_features,'max_depth': max_depth }

#Random search of parameters
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=0, random_state=2, n_jobs = -1)

#Fit the model
rf_random.fit(train_data, train_targets)

#Print the results
print(rf_random.best_params_)



{'n_estimators': 1000, 'max_features': 'auto', 'max_depth': None}


In [11]:
#Running Random Forest with the best parameters
rf = RandomForestClassifier(n_estimators=1000, max_depth=None, max_features='auto')
rf.fit(train_data,train_targets)
y_predict = rf.predict(test_data)
rf_cv_score = cross_val_score(rf, X_balance, Y_balance, cv=10)

print("Confusion Matrix")
print(confusion_matrix(test_targets, y_predict))

print("Classification Report")
print(classification_report(test_targets, y_predict))

Confusion Matrix
[[5112  208]
 [ 698 4673]]
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      5320
           1       0.96      0.87      0.91      5371

    accuracy                           0.92     10691
   macro avg       0.92      0.92      0.92     10691
weighted avg       0.92      0.92      0.92     10691



In [12]:
#Calulate ROC AUC
test_predictions = y_predict
test_predictions_class = ((test_predictions > 0.5) * 1).flatten()
test_df = pd.DataFrame(
    {
        "Predicted_score": test_predictions.flatten(),
        "Predicted_class": test_predictions_class,
        "True": test_targets,
    }
)
roc_auc = metrics.roc_auc_score(test_targets, y_predict)
print("The AUC on test set:\n")
print(roc_auc)

The AUC on test set:

0.9154725391023639


In [13]:
#Calculate f1 score
f1_score(test_targets, y_predict, average='macro')

0.9151126911795792

In [14]:
# Recall
from sklearn.metrics import recall_score
recall_score(test_targets, y_predict, average='macro')

0.9154725391023639

In [15]:
#Precision
from sklearn.metrics import precision_score
precision_score(test_targets, y_predict, average='macro')

0.9186240439852307