In [None]:
import getpass
import os
from pathlib import Path

import numpy as np
import pandas as pd
import wandb
from joblib import dump
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# initialising WandB
# The API key is a secret and must not be stored in the notebook. It is taken
# from the WANDB_API_KEY environment variable; only when that is unset is the
# user prompted for it (input is hidden and nothing is written to the file).
# NOTE(review): the key that used to be hard-coded here should be revoked in
# the WandB account settings, since it remains in the notebook's history.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass.getpass("WandB API key: ")

# Notebook file name handed to WandB through its environment variable
os.environ["WANDB_NOTEBOOK_NAME"] = "baseline_filter_train.ipynb"

# Syntax for using WandB:
#
#   wandb.init(project="MSciProject", name="name", notebook="your-notebook-name")
#   # code here
#   wandb.finish()

In [None]:
# Load the data
# The input directory can be overridden with the MSCI_DATA_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# location that was previously hard-coded.
DATA_DIR = Path(os.environ.get("MSCI_DATA_DIR", "/Users/kirst/Downloads"))

# 'time' is parsed to datetime on load so later cells can use the .dt accessor
data = pd.read_csv(DATA_DIR / "uvf.csv", parse_dates=['time'])

In [None]:
# Split the data into training and testing sets based on the date.
# Training uses every row up to and including 2012, testing uses 2014-2015;
# rows from 2013 (and after 2015) belong to neither set.
sample_year = data['time'].dt.year
train_data = data[sample_year <= 2012]
test_data = data[sample_year.between(2014, 2015)]  # inclusive on both ends

# Fail early with a clear message rather than with an obscure error in a later cell
if train_data.empty or test_data.empty:
    raise ValueError(
        f"Date split produced an empty set (train rows={len(train_data)}, test rows={len(test_data)})"
    )

train_start, train_end = train_data['time'].min(), train_data['time'].max()
test_start, test_end = test_data['time'].min(), test_data['time'].max()

print(f"Train range: {train_start} -> {train_end}")
print(f"Test range: {test_start} -> {test_end}")

# Ranges logged to WandB with every run. They are derived from the actual split
# (instead of being typed in by hand) so they cannot drift from the filter above.
date_fmt = '%Y-%m-%d'
training_date_range = f"{train_start.strftime(date_fmt)} to {train_end.strftime(date_fmt)}"
testing_date_range = f"{test_start.strftime(date_fmt)} to {test_end.strftime(date_fmt)}"

# Drop the "time" column as it won't be used in the model
train_data = train_data.drop(columns=['time'])
test_data = test_data.drop(columns=['time'])

# Check the size of the training and testing sets.
# A bare expression in the middle of a cell is not displayed, so print it.
print(f"Train rows: {len(train_data)}, test rows: {len(test_data)}")

# Define the features (X) and the target (y); 'flag' is the label column
X_train = train_data.drop(columns=['flag'])
y_train = train_data['flag']
X_test = test_data.drop(columns=['flag'])
y_test = test_data['flag']

In [None]:
# Baseline Random Forest: train on the training split, evaluate on the test
# split and log the metrics as one WandB run.
wandb.init(project="MSciProject")

# Initialize the Random Forest model (fixed seed so the run is repeatable)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Hard class predictions, used for the threshold-based metrics
y_pred_rf = rf_model.predict(X_test)
# Predicted probability of the positive class (column 1 == rf_model.classes_[1]).
# ROC AUC must be computed from scores: with hard 0/1 predictions the curve
# collapses to a single operating point and the value is not a real AUC.
y_score_rf = rf_model.predict_proba(X_test)[:, 1]

# Calculate the performance metrics
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
auc_roc = roc_auc_score(y_test, y_score_rf)
f1 = f1_score(y_test, y_pred_rf)

# A bare tuple in the middle of a cell is not displayed, so print the metrics
print(f"accuracy={accuracy:.4f}, precision={precision:.4f}, recall={recall:.4f}, "
      f"auc_roc={auc_roc:.4f}, f1={f1:.4f}")

wandb.log({"model_name":"Random Forest", "accuracy":accuracy, "precision":precision, "recall":recall,
           "auc_roc":auc_roc, "f1_score":f1, "training date range": training_date_range, "testing date range": testing_date_range})

wandb.finish()


In [None]:
# Baseline gradient boosted trees: train on the training split, evaluate on the
# test split and log the metrics as one WandB run.
wandb.init(project="MSciProject")

# Initialize Gradient boosted tree model (fixed seed so the run is repeatable)
gbt_model = GradientBoostingClassifier(random_state=42)

# Train the model
gbt_model.fit(X_train, y_train)

# Hard class predictions, used for the threshold-based metrics
y_pred_gbt = gbt_model.predict(X_test)
# Predicted probability of the positive class (column 1 == gbt_model.classes_[1]).
# ROC AUC must be computed from scores: with hard 0/1 predictions the curve
# collapses to a single operating point and the value is not a real AUC.
y_score_gbt = gbt_model.predict_proba(X_test)[:, 1]

# Calculate the performance metrics
accuracy = accuracy_score(y_test, y_pred_gbt)
precision = precision_score(y_test, y_pred_gbt)
recall = recall_score(y_test, y_pred_gbt)
auc_roc = roc_auc_score(y_test, y_score_gbt)
f1 = f1_score(y_test, y_pred_gbt)

# A bare tuple in the middle of a cell is not displayed, so print the metrics
print(f"accuracy={accuracy:.4f}, precision={precision:.4f}, recall={recall:.4f}, "
      f"auc_roc={auc_roc:.4f}, f1={f1:.4f}")

wandb.log({"model_name":"Gradient boosted tree", "accuracy":accuracy, "precision":precision,
           "recall":recall, "auc_roc":auc_roc, "f1_score":f1, "training date range": training_date_range, "testing date range": testing_date_range})

wandb.finish()

In [None]:
# Exhaustive hyperparameter search for the Random Forest, optimising recall.
wandb.init(project="MSciProject")

# Define the hyperparameters to search over
# (3 * 3 * 3 * 3 * 2 = 162 combinations, each fitted on 5 folds)
param_grid = {'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced']}

# Initialize the grid search with 5-fold cross-validation.
# n_jobs=-1 uses every available core.
# NOTE(review): the folds are not time-aware although the train/test split is
# date-based - confirm this is intended.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid, cv=5, scoring='recall', n_jobs=-1)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best (mean cross-validated) recall score
best_params = grid_search.best_params_
best_recall = grid_search.best_score_

# A bare tuple in the middle of a cell is not displayed, so print the result
print(f"Best parameters: {best_params}")
print(f"Best recall: {best_recall}")

wandb.log({"model_name":"Random Forest Grid Search", "Best Parameters": best_params, "Best Recall": best_recall})

# Close the run explicitly, as every other cell does, so the next wandb.init()
# starts from a clean state
wandb.finish()

In [None]:
# Randomised hyperparameter search for the gradient boosted trees, optimising recall.
wandb.init(project="MSciProject")

# Hyperparameters grid for the randomized search
param_dist = {
    'n_estimators': np.arange(50, 301, 50),
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
    'max_depth': np.arange(3, 15, 1),
    'min_samples_split': np.arange(2, 11, 1),
    'min_samples_leaf': np.arange(1, 11, 1),
    'subsample': [0.7, 0.8, 0.9, 1]
}

# Initialize the randomized search with 5-fold cross-validation.
# 20 parameter settings are sampled; random_state fixes which ones.
random_search = RandomizedSearchCV(GradientBoostingClassifier(random_state=42),
                                   param_distributions=param_dist, n_iter=20,
                                   scoring='recall', cv=5, n_jobs=-1, random_state=42)

# Perform the randomized search on the training data
random_search.fit(X_train, y_train)

# Get the best parameters and the best (mean cross-validated) recall score
best_params_random = random_search.best_params_
best_recall_random = random_search.best_score_

# A bare tuple in the middle of a cell is not displayed, so print the result
print(f"Best parameters: {best_params_random}")
print(f"Best recall: {best_recall_random}")

# Log the results of THIS search. The previous version logged best_params /
# best_recall, i.e. the Random Forest grid-search results from another cell.
wandb.log({"model_name":"Gradient Boosting Random Search", "Best Parameters": best_params_random, "Best Recall": best_recall_random})

# Close the run explicitly, as every other cell does, so the next wandb.init()
# starts from a clean state
wandb.finish()

In [None]:
# Final Random Forest: refit with the tuned hyperparameters, evaluate on the
# test split and log the metrics as one WandB run.
wandb.init(project="MSciProject")

# Run model with best parameters
# try this against gradient boosted tree and compare?
# NOTE(review): these values are typed in by hand; presumably they were copied
# from a grid-search run - confirm they still match its best parameters.
rf_model = RandomForestClassifier(random_state=42,
                                  class_weight="balanced",
                                  max_depth=10,
                                  min_samples_leaf=4,
                                  min_samples_split=2,
                                  n_estimators=200)


rf_model.fit(X_train, y_train)

# Hard class predictions, used for the threshold-based metrics
y_pred = rf_model.predict(X_test)
# Predicted probability of the positive class (column 1 == rf_model.classes_[1]).
# ROC AUC must be computed from scores: with hard 0/1 predictions the curve
# collapses to a single operating point and the value is not a real AUC.
y_score = rf_model.predict_proba(X_test)[:, 1]

# Calculate the performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_score)
f1 = f1_score(y_test, y_pred)

# A bare tuple in the middle of a cell is not displayed, so print the metrics
print(f"accuracy={accuracy:.4f}, precision={precision:.4f}, recall={recall:.4f}, "
      f"auc_roc={auc_roc:.4f}, f1={f1:.4f}")

wandb.log({"model_name":"Final Random Forest", "accuracy":accuracy, "precision":precision,
           "recall":recall, "auc_roc":auc_roc, "f1_score":f1, "training date range": training_date_range, "testing date range": testing_date_range })

wandb.finish()

In [None]:
# Save the model
# The output directory can be overridden with the MSCI_MODEL_DIR environment
# variable so the notebook is not tied to one machine; the default is the
# location that was previously hard-coded.
MODEL_DIR = Path(os.environ.get("MSCI_MODEL_DIR", "/Users/kirst/Downloads"))

# rf_model is whichever Random Forest was fitted last in this kernel session
dump(rf_model, str(MODEL_DIR / "rf_model.joblib"))