In [1]:
# Perform initial import of the needed libraries

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import sklearn
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from IPython.display import display

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split

In [3]:
# Load the fighters/fights export from the Resources folder.
# The final two rows of the file are discarded by position.
# NOTE(review): the reason for dropping those two rows is not visible here — confirm.
file_path = Path('./Resources/fighters+fights_export.csv')
fnf_df = pd.read_csv(file_path).iloc[:-2]

In [4]:
# List the column labels of the loaded frame
fnf_df.columns

Index(['fight_id', 'winner', 'title_bout', 'weight_class', 'no_of_rounds',
       'r_fighter', 'r_fighter_id', 'r_fighter_stance', 'b_fighter',
       'b_fighter_id', 'b_fighter_stance', 'date', 'referee', 'referee_id',
       'b_fighter_height', 'r_fighter_height', 'b_fighter_reach',
       'r_fighter_reach', 'b_fighter_weight', 'r_fighter_weight'],
      dtype='object')

In [None]:
# Columns kept from the raw frame: the outcome label plus bout-level and
# per-corner fighter attributes.
columns = [
    'winner',
    'title_bout',
    'weight_class',
    'no_of_rounds',
    'r_fighter_stance',
    'b_fighter_stance',
    'b_fighter_height',
    'r_fighter_height',
    'b_fighter_reach',
    'r_fighter_reach',
    'b_fighter_weight',
    'r_fighter_weight',
]

# Label column name, held as a one-element list
target = ["winner"]

In [None]:
# Keep only the columns named in `columns`; copy so later edits do not touch the parent frame
fnf_df = fnf_df[columns].copy()

In [5]:
# Display the current contents of fnf_df
fnf_df

Unnamed: 0,fight_id,winner,title_bout,weight_class,no_of_rounds,r_fighter,r_fighter_id,r_fighter_stance,b_fighter,b_fighter_id,b_fighter_stance,date,referee,referee_id,b_fighter_height,r_fighter_height,b_fighter_reach,r_fighter_reach,b_fighter_weight,r_fighter_weight
0,FT1,Red,TRUE,Catch Weight,1,Royce Gracie,1,Southpaw,Gerard Gordeau,3,Orthodox,11/12/1993,Joao Alberto Barreto,r1,195.58,185.42,,,216.0,175.0
1,FT3,Red,FALSE,Open Weight,1,Royce Gracie,1,Southpaw,Ken Shamrock,4,Orthodox,11/12/1993,Joao Alberto Barreto,r1,185.42,185.42,182.88,,205.0,175.0
2,FT6,Red,FALSE,Open Weight,1,Royce Gracie,1,Southpaw,Art Jimmerson,8,Orthodox,11/12/1993,Joao Alberto Barreto,r1,185.42,185.42,,,196.0,175.0
3,FT2,Red,FALSE,Open Weight,1,Jason DeLucia,2,Southpaw,Trent Jenkins,6,,11/12/1993,Joao Alberto Barreto,r1,187.96,180.34,,,185.0,190.0
4,FT4,Red,FALSE,Open Weight,1,Gerard Gordeau,3,Orthodox,Kevin Rosier,5,Orthodox,11/12/1993,Joao Alberto Barreto,r1,193.04,195.58,,,275.0,216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5137,FT5139,Blue,FALSE,Women's Strawweight,3,Karolina Kowalkiewicz,1491,Orthodox,Alexa Grasso,1583,Orthodox,6/8/2019,Kevin MacDonald,r73,165.10,160.02,167.64,162.56,115.0,115.0
5138,FT5133,Red,TRUE,Women's Flyweight,5,Valentina Shevchenko,1492,Southpaw,Jessica Eye,1123,Orthodox,6/8/2019,Robert Madrigal,r82,167.64,165.10,167.64,167.64,125.0,125.0
5139,FT5137,Red,FALSE,Women's Strawweight,3,Tatiana Suarez,1537,,Nina Ansaroff,1344,Orthodox,6/8/2019,Robert Madrigal,r82,165.10,165.10,162.56,167.64,115.0,115.0
5140,FT5144,Red,FALSE,Women's Flyweight,3,Katlyn Chookagian,1546,Orthodox,Joanne Calderwood,1368,Orthodox,6/8/2019,Dan Miragliotta,r25,167.64,175.26,165.10,172.72,125.0,125.0


In [None]:
# Count missing values in each column
fnf_df.isna().sum()

In [None]:
# Fill missing values and assign each result back to its column.
# Calling `df[col].fillna(..., inplace=True)` operates on an intermediate
# Series: pandas 2.x warns about it and, with Copy-on-Write enabled, the
# parent frame is never modified. Plain assignment always updates fnf_df.

# Missing stances become their own explicit category.
for stance_col in ('r_fighter_stance', 'b_fighter_stance'):
    fnf_df[stance_col] = fnf_df[stance_col].fillna("unlisted")

# Missing body measurements are replaced with the column mean.
# NOTE(review): the means are taken over the full data set, i.e. before the
# train/test split further down, so test rows influence the fill values.
for measure_col in (
    'r_fighter_height',
    'b_fighter_height',
    'b_fighter_reach',
    'r_fighter_reach',
    'b_fighter_weight',
    'r_fighter_weight',
):
    fnf_df[measure_col] = fnf_df[measure_col].fillna(fnf_df[measure_col].mean())

In [None]:
# Re-count missing values per column to confirm the fills took effect
fnf_df.isna().sum()

In [None]:
# Display fnf_df after the missing-value fills
fnf_df

In [None]:
# Show the dtype of each column (object columns are the categorical candidates)
fnf_df.dtypes

In [None]:
# Collect the names of all object-dtype columns as categorical candidates.
# NOTE: the following cell replaces this list with a hand-written one.
app_cat = [col_name for col_name, col_dtype in fnf_df.dtypes.items() if col_dtype == "object"]

In [None]:
# Categorical feature columns to one-hot encode; the 'winner' label is not included
app_cat = ['title_bout', 'weight_class', 'r_fighter_stance', 'b_fighter_stance']

In [None]:
# Create a OneHotEncoder that returns a dense array.
# The keyword was renamed from `sparse` to `sparse_output` in scikit-learn 1.2
# and the old spelling was later removed, so try the current name first and
# fall back only on releases that do not know it.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(fnf_df[app_cat])

# `get_feature_names_out` replaced `get_feature_names`, which was removed in
# scikit-learn 1.2; prefer the new method when it exists.
if hasattr(enc, "get_feature_names_out"):
    encoded_names = enc.get_feature_names_out(app_cat)
else:
    encoded_names = enc.get_feature_names(app_cat)

# Build the encoded frame with readable column names. Reusing fnf_df's index
# keeps the rows aligned when the two frames are concatenated column-wise.
encode_df = pd.DataFrame(encoded_values, columns=encoded_names, index=fnf_df.index)
encode_df.head()

In [None]:
# Lift the column display limit (a global pandas option) and render the encoded frame
pd.set_option("display.max_columns", None)
display(encode_df)

In [None]:
# Display fnf_df for comparison with the encoded frame
fnf_df

In [None]:
# List the one-hot column names produced by the encoder
encode_df.columns

In [None]:
# Place the one-hot columns next to the original columns (rows are matched on index)
fnf_df_num = pd.concat([fnf_df, encode_df], axis="columns")

In [None]:
# Display the combined frame (original columns plus one-hot columns)
fnf_df_num

In [None]:
# Work on an independent deep copy so fnf_df_num stays untouched by the steps below
fnf_df_num_2 = fnf_df_num.copy(deep=True)

In [None]:
# Encode the label numerically: Red -> 1.0, Blue -> 2.0, Draw -> 3.0.
# The previous version searched the whole frame for the literal "Red " (with a
# trailing space), which does not match a plain "Red" value and would leave red
# wins as strings. Stripping whitespace first handles both spellings, and the
# mapping is restricted to the 'winner' column.
# Reading the labels from fnf_df_num keeps this cell safe to re-run.
# NOTE: any label other than Red/Blue/Draw becomes NaN.
winner_codes = {"Red": 1.0, "Blue": 2.0, "Draw": 3.0}
fnf_df_num_2 = fnf_df_num_2.assign(
    winner=fnf_df_num["winner"].str.strip().map(winner_codes)
)

fnf_df_num_2.head(40)

In [None]:
# Remove the original categorical columns now that their one-hot versions exist
fnf_df_num_2 = fnf_df_num_2.drop(
    columns=['title_bout', 'weight_class', 'r_fighter_stance', 'b_fighter_stance']
)
fnf_df_num_2

In [None]:
# Check the dtype of every remaining column
fnf_df_num_2.dtypes

In [None]:
# Preprocessing: build the feature matrix from every column except the label.
# `drop` returns a new frame, so fnf_df_num_2 itself is left unchanged.
X = fnf_df_num_2.drop(columns='winner')
X

In [None]:
# Target vector: the 'winner' column of the prepared frame
y = fnf_df_num_2.loc[:, "winner"]
y

In [None]:
# Check the balance of our target values: number of rows per distinct label
y.value_counts()

In [None]:
# Split into train and test sets (80% / 20%, fixed seed for reproducibility).
# The results are bound to X_train / X_test / y_train / y_test because every
# later cell refers to those names; the previous x_Train / x_Test / y_Train /
# y_Test spelling left them undefined on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Report the shape of each split, in the order X_train, X_test, y_train, y_test
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Create a StandardScaler instance with its default settings.
scaler = StandardScaler()

In [None]:
# Fit the StandardScaler on the training data only, so that no statistics
# from the test rows are used when scaling.
X_scaler = scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to both splits.
# NOTE: the model cells that follow are fitted and evaluated on the unscaled
# X_train / X_test, so these scaled arrays are not used by them.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split_part) for split_part in (X_train, X_test)
)

In [None]:
# Train a BalancedRandomForestClassifier (100 trees, fixed seed) on the
# unscaled training data. No separate resampling step is run in this cell.
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test features
y_pred = brf.predict(X_test)

In [None]:
# Calculate the balanced accuracy score of the test-set predictions
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix for the test set (true labels passed first, predictions second)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the test-set predictions
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# List the feature column names
X.columns

In [None]:
# Pair each importance score with its feature name and show the pairs from
# most to least important.
importance_pairs = list(zip(brf.feature_importances_, X_train.columns))
importance_pairs.sort(reverse=True)
importance_pairs