# Random Forest Classifier

In [9]:
import pandas as pd
import utils
import numpy as np
from tqdm import tqdm
import collections
import time
import copy
import os
import datetime
from datetime import date
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import calendar
from pathlib import Path
from email.mime.multipart import MIMEMultipart
from email.mime.base import MIMEBase
from email import encoders
from email.mime.text import MIMEText
import smtplib
# Current user's home directory, stored as a plain string.
home = str(Path.home())
# Wall-clock time (seconds since the epoch) at the moment this cell runs.
t1 = time.time()
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, make_scorer, precision_recall_curve
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import itertools
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Snowflake connector dbt_jferragut
# The engine comes from the project-local `utils` helper; the meaning of the
# TAG argument is defined there and is not visible from this notebook.
engine = utils.create_snowflake_engine(TAG = ' ')
# NOTE(review): the cells visible here pass `engine` (not `connection`) to
# pandas and never close this connection - confirm it is still needed.
connection = engine.connect()

In [4]:
# Pull every column of the encoded feature table into a DataFrame.
base_data_sql = """
SELECT *
FROM  test_hbg.dbt_jferragut.FEATURE_ENCODED"""

base_data = pd.read_sql_query(sql=base_data_sql, con=engine)

In [5]:
# Feature matrix: everything except the booking reference, the booking
# timestamp and the label itself.
X = base_data.drop(columns=['bookingreference', 'booking_timestamp', 'is_fraud'])
# Target: the fraud flag.
y = base_data['is_fraud']
# 80/20 hold-out split; stratifying on y keeps the class ratio constant in
# train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [58]:
def manual_grid_search(estimator, param_grid, X, y, set_sizes_grid, file_name="grid_search_results.txt"):
    """
    Fit ``estimator`` once per parameter combination and log a report for each.

    The data is split 80/20 (stratified on ``y``, ``random_state=42``). Every
    combination in the Cartesian product of ``param_grid`` is fitted on the
    80% part and evaluated on the 20% part with ``classification_report``.
    There is no cross-validation and no best-model selection: the reports are
    only appended to ``file_name``.

    Parameters:
    - estimator: Model exposing ``set_params``, ``fit`` and ``predict``. It is
      modified in place; after the call it carries the parameters and the fit
      of the last combination.
    - param_grid: Dictionary where keys are parameter names and values are
      lists of parameter settings to try.
    - X: Features dataset.
    - y: Target dataset (also used to stratify the split).
    - set_sizes_grid: Label written verbatim into every record of this call.
      It does not influence the split or the fit.
    - file_name: Name of the output .txt file. Records are appended, so
      results of earlier calls are kept.

    Returns:
    - None. Results are written to ``file_name`` and a confirmation is printed.
    """
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    param_names = list(param_grid)

    # Taken once per call, so every record written by this call shares the
    # same timestamp.
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")

    # Walk the Cartesian product lazily; there is no need to build the whole
    # list of combinations up front.
    for combination in itertools.product(*param_grid.values()):
        params = dict(zip(param_names, combination))

        # Configure and train the model for this combination
        estimator.set_params(**params)
        estimator.fit(X_train, y_train)

        # Evaluate on the held-out part
        y_pred = estimator.predict(X_test)
        score = classification_report(y_test, y_pred)

        # The file is re-opened in append mode for every combination so each
        # record is written out as soon as its fit finishes. The encoding is
        # explicit so the log does not depend on the platform default.
        with open(file_name, "a", encoding="utf-8") as file:
            file.write(f"{timestamp}\t{set_sizes_grid}\t{params}\n{score}\n")

    print(f"Results have been saved to {file_name}")

In [51]:
# Keep a stratified sample of the training set: test_size=0.9 is the part that
# is thrown away (the `_` outputs), so 10% of the rows remain, with the class
# ratio held constant.
X_train_sampled, _, y_train_sampled, _ = train_test_split(
    X_train,
    y_train,
    test_size=0.9,
    random_state=42,
    stratify=y_train,
)

In [132]:
# Search space for the grid search: one fit is run for every combination in
# the Cartesian product of the lists below.
param_grid = {
    'n_estimators': [200, 150, 100],  # number of trees
    'max_depth': [10],  # max depth for each tree (None -> no limit)
    'min_samples_split': [2],  # minimum samples required to split a node
    'min_samples_leaf': [12, 15, 20, 25],  # minimum samples required in a leaf (terminal node)
    # 'min_weight_fraction_leaf': [0, 0.001],
    # 'log_loss' is intentionally not listed: scikit-learn documents it as the
    # same Shannon information gain criterion as 'entropy', so it would only
    # repeat every 'entropy' fit (and releases before 1.1 reject the name).
    'criterion': ['gini', 'entropy'],
    'class_weight': [{0: 689, 1: 3360347}, {0: 1, 1: 1000}, {0: 1, 1: 10000}],
}
# Each entry is later passed as `test_size` when sub-sampling the training set.
set_sizes_grid = [0.7]  # 600k, 900k, 1.2M , 0.7, 0.6

In [6]:
# Base estimator handed to manual_grid_search, which overrides its
# hyper-parameters with set_params for every grid combination.
rf=RandomForestClassifier(random_state=42)
# Single-run variant, kept for reference only:
# manual_grid_search(rf,param_grid, X_train_sampled, y_train_sampled, file_name="grid_search_results.txt" ,set_sizes_grid=0)

In [None]:
# Run the grid search once per entry of set_sizes_grid. The entry is used as
# `test_size`, so it sizes the part of the training set that is discarded;
# the search runs on the remaining stratified sample.
for discarded_size in set_sizes_grid:
    X_train_sampled, _, y_train_sampled, _ = train_test_split(
        X_train,
        y_train,
        test_size=discarded_size,
        random_state=42,
        stratify=y_train,
    )
    manual_grid_search(
        rf,
        param_grid,
        X_train_sampled,
        y_train_sampled,
        set_sizes_grid=discarded_size,
        file_name="grid_search_results.txt",
    )

In [None]:
# Train the final model on the full training split with the chosen
# hyper-parameters.
best_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 2,
    'min_samples_leaf': 12,
    'criterion': 'gini',
    'class_weight': {0: 1, 1: 1000},
    'random_state': 42,
}
final_model = RandomForestClassifier(**best_params)
final_model.fit(X_train, y_train)

In [None]:
# Score of the positive class (column 1) for every training row.
y_scores_train = final_model.predict_proba(X_train)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_train, y_scores_train)

# List every candidate threshold that keeps recall above 0.8 with a precision
# of at least 0.01. zip stops at the shortest input, i.e. at `thresholds`.
for p, r, t in zip(precision, recall, thresholds):
    if not (r > 0.8 and p >= 0.01):
        continue
    print(f"Threshold: {t: .2f}, precision: {p: .2f}, recall: {r: .2f}")

In [None]:
# Cut-off chosen by hand from the threshold listing printed by the previous cell.
threshold = 0.4
# A row is flagged as fraud when its score is strictly above the cut-off.
y_pred_train = (y_scores_train > threshold).astype(int)
print(len(y_pred_train))
print(
    classification_report(
        y_train,
        y_pred_train,
        target_names=['No Fraud', 'Fraud'],
    )
)

In [None]:
# Evaluate on the training split using the model's own predict() instead of a
# hand-picked cut-off. This replaces the thresholded y_pred_train.
y_pred_train = final_model.predict(X_train)
print(
    classification_report(
        y_train,
        y_pred_train,
        target_names=['No Fraud', 'Fraud'],
    )
)

In [None]:
# Evaluate on the held-out test split with a hand-picked cut-off.
# NOTE(review): the training evaluation uses 0.4 while this cell uses 0.45 -
# confirm which value is the intended operating point.
threshold = 0.45
y_scores = final_model.predict_proba(X_test)[:, 1]  # positive-class score
y_pred = (y_scores > threshold).astype(int)
print(len(y_pred))
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['No Fraud', 'Fraud'],
    )
)

In [None]:
# Evaluate on the held-out test split using the model's own predict() instead
# of a hand-picked cut-off. This replaces the thresholded y_pred.
y_pred = final_model.predict(X_test)
print(
    classification_report(
        y_test,
        y_pred,
        target_names=['No Fraud', 'Fraud'],
    )
)