# Random Forest on UFC
This notebook aims to provide a model for the win/lose question with RF, tuned with `sklearn.GridSearchCV`.

In [165]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score

In [150]:
!readlink . -f

/home/zebalgebra/School/DAB/Team-7/Code


In [151]:
fp = "/home/zebalgebra/School/DAB/Team-7/Data/Cleaned/"
!eza -l (readlink $fp -f) # or replace eza by ls if no eza installed

.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [1;32m625k[0m [1;33mzebalgebra[0m [34m12 Apr 22:36[0m df_records.csv
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [33m3.4M[0m [1;33mzebalgebra[0m [34m12 Apr 22:36[0m fight_data.csv
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [33m5.3M[0m [1;33mzebalgebra[0m [34m12 Apr 22:36[0m fighter_agg.csv
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [1;32m6.5k[0m [1;33mzebalgebra[0m [34m12 Apr 22:36[0m ppv_data.csv


In [152]:
[rec_df, fight_df, agg_df] = [
    pd.read_csv(f"{fp}{x}.csv")
    for x in ["df_records", "fight_data", "fighter_agg"]
]
df_all = [rec_df, fight_df, agg_df]

In [189]:
df = pd.concat(
    [
        df_all[0],
        df_all[2].iloc[:, 2:]
    ],
    axis=1
).merge(
    right=df_all[1][["Winner", "date"]],
    right_on=["Winner", "date"],
    left_on=["fighter", "date"],
    how="outer"
)
df["is_winner"] = 1 - df["Winner"].isna()
df = df.drop(columns="Winner").iloc[:, 2:]
df = df.fillna(0)
df_x, df_y = df.iloc[:, :-1], df.iloc[:, -1]

## GridSearchCV

In [202]:
%%time

clf = RandomForestClassifier(random_state=6203)

gscv_opts = {
    "estimator": clf,
    "param_grid": {
        "n_estimators": [80],
        "max_depth": [4, 6, 8, 12, 15],
        "max_features": ['sqrt', 'log2'],
        "min_samples_split": [2, 10, 30],
        "min_samples_leaf": [1, 3, 6]
    },
    "scoring": "accuracy",
    "cv": 10,
    "verbose": 2
}

gscv = GridSearchCV(**gscv_opts)
gscv.fit(df_x, df_y)
res = gscv.cv_results_
bi = gscv.best_index_

Fitting 10 folds for each of 90 candidates, totalling 900 fits
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   1.6s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   1.6s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   1.6s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   1.6s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   1.7s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   1.5s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time=   1.5s
[CV] END max_depth=4, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=80; total time= 

In [203]:
print(f"""
best params: {res["params"][bi]}
best mean test score: {res["mean_test_score"][bi]}
""")


best params: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 80}
best mean test score: 0.5505868534777381

