## Random Forest

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import precision_recall_curve, auc, confusion_matrix
from joblib import dump
import numpy as np

#### Initialising

In [67]:
# initialising WandB
import wandb, os

os.environ["WANDB_API_KEY"] = "e84d2e19bd2cc42ec6e5d232cd0b6f0fe41f2189"
os.environ["WANDB_NOTEBOOK_NAME"] = "randomforest_models.ipynb"

'''
Syntax for using WandB:

wandb.init(project="MSciProject", name="name", notebook="your-notebook-name")
# code here
wandb.finish()
'''

'\nSyntax for using WandB:\n\nwandb.init(project="MSciProject", name="name", notebook="your-notebook-name")\n# code here\nwandb.finish()\n'

In [68]:
# Load the data
data = pd.read_csv('/Users/kirst/Downloads/uvf.csv', parse_dates=['time'])

assert len(data.columns)==21

In [69]:
# Convert "time" column to datetime format
#data['time'] = pd.to_datetime(data['time'], format='%d/%m/%Y %H:%M')

# Split the data into training and testing sets based on the date
train_data = data[data['time'].dt.year <= 2012]
test_data = data[(data['time'].dt.year >= 2014) & (data['time'].dt.year <= 2016)]

print(f"Train range: {train_data['time'].min()} -> {train_data['time'].max()}")
print(f"Test range: {test_data['time'].min()} -> {test_data['time'].max()}")

training_date_range = "2003-05-22 to 2012-12-31"
testing_date_range = "2014-01-01 to 2016-12-31"

# Drop the "time" column as it won't be used in the model
train_data = train_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# Check the size of the training and testing sets
len(train_data), len(test_data)

# Define the features (X) and the target (y)
X_train = train_data.drop(columns=['flag'])
y_train = train_data['flag']
X_test = test_data.drop(columns=['flag'])
y_test = test_data['flag']

# Balanced Data - removing NaN values and associated data
y_train = y_train.dropna()
y_test = y_test.dropna()

X_train = X_train.loc[y_train.index]
X_test = X_test.loc[y_test.index]

assert len(X_train.columns)==19

Train range: 2003-05-22 12:56:00 -> 2012-12-31 22:29:00
Test range: 2014-01-01 00:25:00 -> 2016-12-31 21:51:00


### Initial Model

In [70]:
#wandb.init(project="MSciProject")

# Initialize the Random Forest model
rf_model1 = RandomForestClassifier(n_estimators=250, random_state=42, bootstrap=False)

# Train the model
rf_model1.fit(X_train, y_train)

# Use the model to make predictions on the test data
y_pred_rf = rf_model1.predict(X_test)

# Calculate the performance metrics
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
auc_roc = roc_auc_score(y_test, y_pred_rf)

#auc_pr
precision1, recall1, _ = precision_recall_curve(y_test, y_pred_rf)
auc_pr = auc(recall1, precision1)

#specificity
threshold = 0.5
# Convert predicted probabilities to binary predictions using the threshold
y_pred_binary = (y_pred_rf > threshold).astype(int)

conf_matrix = confusion_matrix(y_test, y_pred_binary)
tn, fp, fn, tp = conf_matrix.ravel()

specificity = tn / (tn + fp)



accuracy, precision, recall, auc_roc, f1, auc_pr, specificity

#wandb.log({"model_name":"Random Forest (Balanced Data), bootstrap=false", "accuracy":accuracy, "precision":precision, "recall":recall, 
           #"auc_roc":auc_roc, "f1_score":f1, "auc_pr":auc_pr, "specificity":specificity,
           #"training date range": training_date_range, "testing date range": testing_date_range})

#wandb.finish()


(0.6694666927134206,
 0.7429268292682927,
 0.5665922619047619,
 0.6749045225607725,
 0.6428872942169691,
 0.7685513017889105,
 0.7832167832167832)

In [71]:
# Save the model ?
dump(rf_model1, '/Users/kirst/Downloads/rf_model1.joblib')

['/Users/kirst/Downloads/rf_model1.joblib']

### Optimising Hyperparameters

In [57]:
rf_classifier = RandomForestClassifier()
params = rf_classifier.get_params()

print(params)

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [11]:
#wandb.init(project="MSciProject")

# Define the hyperparameters to search over
'''
INITIAL GRID
param_grid = {'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced']}
'''
param_grid = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap' : [True, False]
}


# Initialize the grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid, cv=5, scoring='precision', n_jobs=-1)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best recall score
best_params = grid_search.best_params_
best_recall = grid_search.best_score_

best_params, best_recall

#wandb.log({"model_name":"Random Forest Grid Search", "Best Parameters": best_params, "Best Recall": best_recall})

({'class_weight': None,
  'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 200},
 0.7142679058830519)

### Optimised Model

In [15]:
wandb.init(project="MSciProject")

# Run model with best parameters
# try this against gradient boosted tree and compare?

rf_model = RandomForestClassifier(random_state=42,
                                  max_depth=20,
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  n_estimators=200,
                                  bootstrap=False)


rf_model.fit(X_train, y_train)

# Use the model to make predictions on the test data
y_pred = rf_model.predict(X_test)

# Calculate the performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

#auc_pr
precision1, recall1, _ = precision_recall_curve(y_test, y_pred)
auc_pr = auc(recall1, precision1)

#specificity
threshold = 0.5
# Convert predicted probabilities to binary predictions using the threshold
y_pred_binary = (y_pred > threshold).astype(int)

conf_matrix = confusion_matrix(y_test, y_pred_binary)
tn, fp, fn, tp = conf_matrix.ravel()

specificity = tn / (tn + fp)

accuracy, precision, recall, auc_roc, f1, auc_pr, specificity

wandb.log({"model_name":"Final Random Forest, bootstrap=False", "accuracy":accuracy, "precision":precision, 
           "recall":recall, "auc_roc":auc_roc, "f1_score":f1, "auc_pr":auc_pr, "specificity":specificity,
           "training date range": training_date_range, "testing date range": testing_date_range })

wandb.finish()

0,1
accuracy,▁
auc_pr,▁
auc_roc,▁
f1_score,▁
precision,▁
recall,▁
specificity,▁

0,1
accuracy,0.6887
auc_pr,0.78208
auc_roc,0.69214
f1_score,0.67851
model_name,"Final Random Forest,..."
precision,0.74205
recall,0.625
specificity,0.75927
testing date range,2014-01-01 to 2016-1...
training date range,2003-05-22 to 2012-1...


In [None]:
# Save the model
dump(rf_model, '/Users/kirst/Downloads/rf_model.joblib')