# 1. Import libraries, data, and globals

In [1]:
%matplotlib inline

# Get rid of warnings
import warnings
warnings.filterwarnings('ignore')

# General libraries
import requests
import pandas as pd
import string
import time
from bs4 import BeautifulSoup
from pandas.plotting import scatter_matrix
import os 
import string
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import datetime
from dateutil.relativedelta import relativedelta
import category_encoders as ce

# Sklearn Specific
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_validate
from sklearn import preprocessing

# sklearn algos
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier # SGD
from sklearn.neural_network import MLPClassifier # neural network (multilayer perceptron)

# Bayesian Hyperparameter optimization
from hyperopt import hp, tpe, fmin

# Import data and remove Tony Johnson
fs_df = pd.read_csv("../data/full_stats_v5.csv")
fs_df = fs_df[fs_df['fighter_1'] != "Tony Johnson"]
fs_df = fs_df[fs_df['fighter_2'] != "Tony Johnson"]

fc_df = pd.read_csv("../data/fighter_char_v3.csv")
fc_df = fc_df[fc_df['name'] != "Tony Johnson"]

  from numpy.core.umath_tests import inner1d


# 2.1 Data wrangling for full_stats

#### Add Totals

In [2]:
fs_df = fs_df.fillna(0) # Fill NaN with 0

# Calc time of fight in seconds
fs_df['duration'] = fs_df.apply(lambda x: (int(x['end_round'])-1)*60*5 + int(x['end_time'].split(":")[0])*60 + int(x['end_time'].split(":")[1]) , axis = 1)

# Convert date to date object
fs_df['date'] = fs_df.apply(lambda x: datetime.datetime.strptime(x['date'],'%Y-%m-%d').date(), axis = 1)


# Calc totals for each fighter
which_fighter = ["f1","f2"]
stat_name_ending = ["kds","sigstrikes_l","sigstrikes_a","strikes_l","strikes_a","tds_l","tds_a","subs_a","passes",
                   "revs","ss_head_l","ss_head_a","ss_body_l","ss_body_a","ss_leg_l","ss_leg_a","ss_dist_l","ss_dist_a",
                   "ss_clinch_l","ss_clinch_a","ss_ground_l","ss_ground_a"]

for f in which_fighter:
    
    for stat in stat_name_ending:
        
        fs_df[f+'_t_' + stat] = np.sum(fs_df.loc[:,(fs_df.columns.str.startswith(f) & fs_df.columns.str.endswith(stat))], axis =1)
        
        fs_df[f+'_pm_' + stat] =  fs_df[f+'_t_' + stat] / fs_df['duration'] * 60


#### Subset the Data for Key Stats and Per Minute 

In [3]:
matchers = ['fighter','winner',"_pm_","date"]
ml_cols = [s for s in fs_df.columns if any(xs in s for xs in matchers)]
ml_df = fs_df[ml_cols]

#### Make each fight have two rows, so that the label is win or lose

In [4]:
# Part 1
part_1_df = ml_df.copy()

part1_key_cols = part_1_df.columns
part1_value_cols = []

for col in part1_key_cols:
    new_col = col.replace("f1","f")
    new_col = new_col.replace("f2","o")
    new_col = new_col.replace("fighter_1","a_fighter")
    new_col = new_col.replace("fighter_2","a_opponent")
    new_col = new_col.replace("winner","a_winner")
    part1_value_cols.append(new_col)
    
p1_col_dict = dict(zip(part1_key_cols, part1_value_cols))

part_1_df = part_1_df.rename(columns = p1_col_dict)

In [5]:
# Part 2
part_2_df = ml_df.copy()

part2_key_cols = part_2_df.columns
part2_value_cols = []

for col in part2_key_cols:
    new_col = col.replace("f1","o")
    new_col = new_col.replace("f2","f")
    new_col = new_col.replace("fighter_1","a_opponent")
    new_col = new_col.replace("fighter_2","a_fighter")
    new_col = new_col.replace("winner","a_winner")
    part2_value_cols.append(new_col)
    
p2_col_dict = dict(zip(part2_key_cols, part2_value_cols))

part_2_df = part_2_df.rename(columns = p2_col_dict)

In [6]:
two_pf = pd.concat([part_1_df, part_2_df], sort = True)
two_pf = two_pf.sort_values(by ='date', ascending= False)
two_pf['result'] = two_pf.apply(lambda x: int(x['a_fighter'] == x['a_winner']), axis =1)


#### Show head and describe the wrangled dataset

In [7]:
two_pf.head(2)

Unnamed: 0,a_fighter,a_opponent,a_winner,date,f_pm_kds,f_pm_passes,f_pm_revs,f_pm_sigstrikes_a,f_pm_sigstrikes_l,f_pm_ss_body_a,...,o_pm_ss_head_a,o_pm_ss_head_l,o_pm_ss_leg_a,o_pm_ss_leg_l,o_pm_strikes_a,o_pm_strikes_l,o_pm_subs_a,o_pm_tds_a,o_pm_tds_l,result
0,"Francis Ngannou ""The Predator""",Cain Velasquez,"Francis Ngannou ""The Predator""",2019-02-17,0.0,0.0,0.0,46.153846,27.692308,2.307692,...,23.076923,13.846154,0.0,0.0,36.745562,12.248521,0.0,2.307692,0.0,1
1,"James Vick ""The Texecutioner""","Paul Felder ""The Irish Dragon""","James Vick ""The Texecutioner""",2019-02-17,0.0,0.0,0.0,11.333333,5.533333,0.933333,...,5.0,1.6,1.666667,1.4,25.328889,12.244444,0.0,0.0,0.0,1


In [8]:
two_pf.describe()

Unnamed: 0,f_pm_kds,f_pm_passes,f_pm_revs,f_pm_sigstrikes_a,f_pm_sigstrikes_l,f_pm_ss_body_a,f_pm_ss_body_l,f_pm_ss_clinch_a,f_pm_ss_clinch_l,f_pm_ss_dist_a,...,o_pm_ss_head_a,o_pm_ss_head_l,o_pm_ss_leg_a,o_pm_ss_leg_l,o_pm_strikes_a,o_pm_strikes_l,o_pm_subs_a,o_pm_tds_a,o_pm_tds_l,result
count,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0,...,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0,9866.0
mean,0.128069,0.188637,0.020098,10.079072,4.672931,1.156535,0.806863,1.141474,0.779893,7.324719,...,8.062033,3.189895,0.861356,0.678066,37.308198,18.770931,0.111921,0.379623,0.162333,0.49179
std,0.681112,0.494391,0.180453,9.27337,5.734757,1.128777,0.879737,2.135073,1.491094,6.401251,...,6.997737,3.854575,0.991489,0.804398,46.824339,31.798594,0.427196,0.57966,0.374653,0.499958
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,4.705882,1.834179,0.4,0.261059,0.133333,0.066667,2.733333,...,3.733333,1.2,0.2,0.133333,17.43065,7.481107,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,7.904192,3.333333,0.882353,0.6,0.560748,0.343348,5.766711,...,6.333333,2.186867,0.6,0.455812,27.6,12.744637,0.0,0.2,0.0,0.0
75%,0.0,0.2,0.0,12.631579,5.533333,1.590909,1.071429,1.333333,0.933333,10.133333,...,10.2,3.733333,1.2,0.963855,42.46,20.077501,0.066667,0.533333,0.2,1.0
max,17.142857,13.333333,13.333333,137.142857,112.0,21.0,18.0,51.176471,35.294118,75.0,...,94.285714,56.25,22.0,16.0,1016.326531,810.612245,13.333333,10.909091,10.909091,1.0


# 2.2 Data Wrangling for fighter_char

#### Calculate mean height and reach to fill in missing data

In [9]:
# Height
height_df = fc_df[fc_df['height'] != "--"]
height_df = height_df.reset_index()

for index, row in height_df.iterrows():
    height_list = row['height'].split(" ")
    p1 = int(re.sub("[^0-9]", "", height_list[0]))*12
    p2 = int(re.sub("[^0-9]", "", height_list[1]))
    height_df.loc[index, 'height'] = p1 + p2
    
mean_height = np.mean(height_df['height'])

# Reach
reach_df = fc_df[fc_df['reach'] != "--"]
reach_df = reach_df.reset_index()

for index, row in reach_df.iterrows():
    reach_df.loc[index, 'reach'] = int(row['reach'].replace('"',""))
    
mean_reach = np.mean(reach_df['reach'])

In [10]:
fc_df = fc_df.drop(['Unnamed: 0'], axis = 1) # drops first column

fc_df = fc_df.fillna(value = "") # fills na

# Change values to numerics and add full_name column
for index, row in fc_df.iterrows(): 
    # add full_name
    if(row['nick_name'] == ""):
        fc_df.loc[index,'full_name'] = row['name']
        
    else:
        fc_df.loc[index,'full_name'] = row['name'] + ' "' + row['nick_name'] + '"' 
    
    # convert height to int(inches)
    if(row['height'] != "--"):
        height_list = row['height'].split(" ")
        fc_df.loc[index,'height'] = int(re.sub("[^0-9]", "", height_list[0]))*12 + int(re.sub("[^0-9]", "", height_list[1]))
    else:
        fc_df.loc[index,'height'] = mean_height
    
    # convert reach to date object
    if(row['reach'] != "--"):
        fc_df.loc[index,'reach'] = int(row['reach'].replace('"',""))
    else:
        fc_df.loc[index,'reach'] = mean_reach
        
    # convert dob to date object
    if(row['DOB'] != "--"):
        date_str = row['DOB'].replace(",","").replace(" ","-")
        fc_df.loc[index,'DOB'] = datetime.datetime.strptime(date_str, '%b-%d-%Y').date()
        
        

#### Show head and describe the wrangled dataset

In [11]:
fc_df.head(3)

Unnamed: 0,name,nick_name,fight_record,height,weight,reach,stance,DOB,full_name
0,Dongi Yang,The Ox,13-3-0,70.0,185 lbs.,71.0,Southpaw,1984-12-07,"Dongi Yang ""The Ox"""
1,Goldman Butler,,1-4-0,70.3936,--,71.8466,,--,Goldman Butler
2,Ruslan Magomedov,Leopard,14-1-0,75.0,242 lbs.,78.0,Orthodox,1986-11-26,"Ruslan Magomedov ""Leopard"""


In [12]:
fc_df.describe()

Unnamed: 0,name,nick_name,fight_record,height,weight,reach,stance,DOB,full_name
count,3214,3214.0,3214,3214.0,3214,3214.0,3214,3214,3214
unique,3212,1393.0,1044,26.0,109,26.0,6,2142,3214
top,Michael McDonald,,0-1-0,72.0,170 lbs.,71.846561,Orthodox,--,"Dan Severn ""The Beast"""
freq,2,1492.0,75,370.0,506,1702.0,1873,737,1


# To do

- Implement Bayesian hyperparameter optimization

# 3. Create cumulative statistics data with specified look back window

#### Create functions to make the look back window dataframe

In [13]:
def generate_fighters_dict():
    cols = two_pf.columns.tolist()
    fighters = {}
    for index, row in two_pf.iterrows():
        fighter = row["a_fighter"]
        if fighter not in fighters:
            fighters[fighter] = [row]
        else:
            fighters[fighter].append(row)
    for fighter in fighters:
        rows_list = fighters[fighter]
        fighters[fighter] = pd.DataFrame(columns=cols, data=rows_list)
    return fighters

fighters_dict = generate_fighters_dict() # Generates a dictionary with each fighter having his own pd.DataFrame()

In [14]:
def get_fighter_info(past_df, is_fighter, lb_num, is_exact):
    if is_exact:
        latest_df = past_df.head(lb_num)
    else:
        latest_df = past_df
        
    latest_lb_cum = latest_df.mean(skipna=True, numeric_only=True).to_dict()
    new_dict = {}
    if is_fighter:
        for var in latest_lb_cum:
            if var[0] == "f":
                new_dict[var] = latest_lb_cum[var]
    else:
        for var in latest_lb_cum:
            if var[0] == "f":
                new_var = var.replace("f_", "o_")
                new_dict[new_var] = latest_lb_cum[var]
    return(new_dict)

In [15]:
def generate_cum_df(fighters_dict, lb_num, is_exact):
    
    cumulative_df = pd.DataFrame(columns = two_pf.columns)
    
    for index, row in two_pf.iterrows():
        fighter_name = row["a_fighter"]
        opponent_name = row["a_opponent"]
        fight_date = row["date"]
        fighter_df = fighters_dict[fighter_name]
        opponent_df = fighters_dict[opponent_name]
        
        past_fighter_df = fighter_df.loc[fighter_df['date'] < fight_date]
        if past_fighter_df.shape[0] < lb_num:
            continue
        past_opponent_df = opponent_df.loc[opponent_df['date'] < fight_date]
        if past_opponent_df.shape[0] < lb_num:
            continue
        fighter_info = get_fighter_info(past_fighter_df, True, lb_num, is_exact)
        opponent_info = get_fighter_info(past_opponent_df, False, lb_num, is_exact)
        all_info = dict(fighter_info, **opponent_info)
        all_info['a_fighter'] = row['a_fighter']
        all_info['a_opponent'] = row['a_opponent']
        all_info['a_winner'] = row['a_winner']
        all_info['date'] = row['date']
        all_info['result'] = row['result']
    
        # Merge fight characterestics
        fc_index = 0
        f_init = "fo"
        fighters_list = [fighter_name,opponent_name]
        
        for fighter in fighters_list:
            fc_index_r = fc_index%2
            
            # Merge in height, reach, and stance
            all_info['{0}_height'.format(f_init[fc_index_r])] = fc_df.loc[fc_df['full_name'] == fighters_list[fc_index_r]]['height'].item()
            all_info['{0}_reach'.format(f_init[fc_index_r])] = fc_df.loc[fc_df['full_name'] == fighters_list[fc_index_r]]['reach'].item()
            all_info['{0}_stance'.format(f_init[fc_index_r])] = fc_df.loc[fc_df['full_name'] == fighters_list[fc_index_r]]['stance'].item()
            
            #Merge in age of fighter
            DOB = fc_df.loc[fc_df['full_name'] == fighters_list[fc_index_r]]['DOB'].item()
            if(DOB == "--"):
                break
            age = relativedelta(fight_date, DOB).years
            all_info['{0}_age'.format(f_init[fc_index_r])] = age

            fc_index += 1
            
        
        # Convert dict to pdf
        all_info_df = pd.DataFrame(all_info, index=[0])
        
        cumulative_df = pd.concat([cumulative_df, all_info_df])
        
    return(cumulative_df)
        

#### Make cumulative data

In [None]:
exact_1_df = generate_cum_df(fighters_dict,1, True)    
exact_3_df = generate_cum_df(fighters_dict,3, True)    
exact_5_df = generate_cum_df(fighters_dict,5, True)
exact_10_df = generate_cum_df(fighters_dict,10, True)

cumu_1_df = generate_cum_df(fighters_dict,1, False)    
cumu_3_df = generate_cum_df(fighters_dict,3, False)    
cumu_5_df = generate_cum_df(fighters_dict,5, False)
cumu_10_df = generate_cum_df(fighters_dict,10, False)

In [None]:
cumu_dfs_dict = {"Exact Data: 1 Fight Lookback Window" : exact_1_df,
                 "Exact Data: 3 Fight Lookback Window" : exact_3_df,
                 "Exact Data: 5 Fight Lookback Window" : exact_5_df,
                 "Exact Data: 10 Fight Lookback Window" : exact_10_df,
                 "Cumulative Data: 1 Fight Lookback Window": cumu_1_df,
                 "Cumulative Data: 3 Fight Lookback Window": cumu_3_df,
                 "Cumulative Data: 5 Fight Lookback Window": cumu_5_df,
                "Cumulative Data: 10 Fight Lookback Window": cumu_10_df} #  create dictionary of the dfs

In [None]:
cumu_10_df.shape

#### Feature Engineering

In [None]:
fe_cumu_dfs_dict = cumu_dfs_dict.copy() # create copy of aggregate data sets

accuracy_stats = ['sigstrikes', 'strikes', 'tds', 'ss_head', 'ss_body', 'ss_leg', 'ss_dist', 'ss_clinch', 'ss_ground']
for key in fe_cumu_dfs_dict:
    
    df = fe_cumu_dfs_dict[key].copy()
    
    
    for stat in accuracy_stats:
        
        fe_index = 0
        f_init = "fo"
        
        for letter in f_init:
            
            fe_index_r = fe_index%2
            # Merge in height, reach, and stance
            df['{0}_acu_{1}'.format(f_init[fe_index_r], stat)] =  df['{0}_pm_{1}_l'.format(f_init[fe_index_r], stat)] / df['{0}_pm_{1}_a'.format(f_init[fe_index_r], stat)]

            fe_index += 1
            
    df = df.fillna(0)
    fe_cumu_dfs_dict[key] = df
    

In [21]:
data_dicts = [cumu_dfs_dict,fe_cumu_dfs_dict] # add dictionaries to list

# 4. Generate machine learning models

In [None]:
cumu_test = fe_cumu_dfs_dict['Cumulative Data: 1 Fight Lookback Window']

# 5. Train and print scores for each model

In [36]:
# Initialize a dictionary for scores, dataset
score_d = {}
score_d['dict_type'] = []
score_d['dataset'] = []
score_d['num_obs'] = []
score_d['model_name'] = []
score_d['accuracy'] = []
score_d['precision'] = []
score_d['recall'] = []
score_d['roc_auc'] = []

    
models = []

blr_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
svm_clf = svm.SVC()
knn_clf = KNeighborsClassifier()
dtree_clf = DecisionTreeClassifier()
nb_clf = GaussianNB()
perc_clf = Perceptron()
sgd_clf = SGDClassifier()
mlp_clf = MLPClassifier()

models.extend((blr_clf ,rf_clf, svm_clf, knn_clf, dtree_clf, nb_clf, perc_clf, sgd_clf,mlp_clf))


for key in fe_cumu_dfs_dict:
    df = data_dict[key].copy()

    dataset = key
    num_obs = df.shape[0]

    print(dataset + "(N = "+ str(num_obs)+")\n")
    # Split features and labels
    X = df.iloc[:,4:-1]
    X = X.fillna(0)
    y = df['result'].astype('int')

    # Wrangle 
    X['f_stance'] = X.apply(lambda x: str(x['f_stance']), axis = 1)
    X['o_stance'] = X.apply(lambda x: str(x['o_stance']), axis = 1)

    # Encode categorical data
    ce_binary = ce.BinaryEncoder(cols = ['f_stance','o_stance'])
    X = ce_binary.fit_transform(X, y)
    # Scale features with mean = 0 and sd = 1
    X = preprocessing.scale(X)

    for model in models:
        model_name = type(model).__name__
        print(" " + model_name)


        scoring_stats = {'accuracy': 'accuracy',
               'recall': 'recall',
               'precision': 'precision',
               'roc_auc': 'roc_auc'}

        # Calculate scores
        avg_scores = cross_validate(model, X, y, cv=5, scoring = scoring_stats)

        accuracy = np.mean(avg_scores['test_accuracy'])
        print("     Accuracy: " + str(accuracy))

        precision = np.mean(avg_scores['test_precision'])
        print("     Precision:" + str(precision))

        recall = np.mean(avg_scores['test_recall'])
        print("     Recall:" + str(recall))

        roc_auc = np.mean(avg_scores['test_roc_auc'])
        print("     ROC - AUC:" + str(roc_auc))
        print("\n")

        # append to dictionary
        score_d['dict_type'].append(dict_type)
        score_d['dataset'].append(dataset)
        score_d['num_obs'].append(num_obs)
        score_d['model_name'].append(model_name)
        score_d['accuracy'].append(accuracy)
        score_d['precision'].append(precision)
        score_d['recall'].append(recall)
        score_d['roc_auc'].append(roc_auc)



Exact Data: 1 Fight Lookback Window(N = 6914)

 LogisticRegression
     Accuracy: 1.0
     Precision:1.0
     Recall:1.0
     ROC - AUC:1.0


 RandomForestClassifier
     Accuracy: 0.9989871845277432
     Precision:1.0
     Recall:0.9979416083614062
     ROC - AUC:0.9999926680073739


 SVC
     Accuracy: 0.9858243477242498
     Precision:0.9919562009594867
     Recall:0.9791267167660015
     ROC - AUC:0.999376284602925


 KNeighborsClassifier
     Accuracy: 0.873300402365676
     Precision:0.8760971036039796
     Recall:0.8647862140450894
     ROC - AUC:0.9369861082534463


 DecisionTreeClassifier
     Accuracy: 1.0
     Precision:1.0
     Recall:1.0
     ROC - AUC:1.0


 GaussianNB
     Accuracy: 1.0
     Precision:1.0
     Recall:1.0
     ROC - AUC:1.0


 Perceptron
     Accuracy: 0.9971058531239805
     Precision:0.9947566963095353
     Recall:0.9994117647058823
     ROC - AUC:0.9995550600523255


 SGDClassifier
     Accuracy: 0.9975410521736949
     Precision:0.9976505050730877
   

KeyboardInterrupt: 

In [33]:
scores_df = pd.DataFrame(score_d)
scores_df

Unnamed: 0,dict_type,dataset,num_obs,model_name,accuracy,precision,recall,roc_auc
0,cumu_dfs_dict,Exact Data: 1 Fight Lookback Window,6914,LogisticRegression,0.530086,0.525787,0.468559,0.539459
1,cumu_dfs_dict,Exact Data: 1 Fight Lookback Window,6914,RandomForestClassifier,0.508678,0.500739,0.381243,0.509469
2,cumu_dfs_dict,Exact Data: 1 Fight Lookback Window,6914,SVC,0.520396,0.514446,0.448855,0.533860
3,cumu_dfs_dict,Exact Data: 1 Fight Lookback Window,6914,KNeighborsClassifier,0.505945,0.497994,0.496190,0.510326
4,cumu_dfs_dict,Exact Data: 1 Fight Lookback Window,6914,DecisionTreeClassifier,0.501309,0.493186,0.490897,0.501146
5,cumu_dfs_dict,Exact Data: 1 Fight Lookback Window,6914,GaussianNB,0.507380,0.500192,0.701056,0.527453
6,cumu_dfs_dict,Exact Data: 1 Fight Lookback Window,6914,Perceptron,0.508975,0.502590,0.500303,0.520724
7,cumu_dfs_dict,Exact Data: 1 Fight Lookback Window,6914,SGDClassifier,0.513313,0.508307,0.484408,0.520691
8,cumu_dfs_dict,Exact Data: 1 Fight Lookback Window,6914,MLPClassifier,0.527922,0.520206,0.515881,0.534516
9,cumu_dfs_dict,Exact Data: 3 Fight Lookback Window,3780,LogisticRegression,0.534916,0.528913,0.497597,0.542698


In [34]:
scores_df.to_csv("../data/scores_v2.csv")

# Preliminary Analysis
- The hyperparameters from the sklearn classifiers were set to their defaults and will be tuned. 


- Nevertheless, the effectiveness of each model appears to vary based off the length of the look back number. I will soon be transforming the printed text data above into nicer looking graphs.  


- Please scroll around in the above cell to view the accuracy, recall, precision, and ROC-AUC from using a cross validate method. 


- Now, most of the accuracy percentages are hovering near the 50% mark. As noted by previous literature, this data is inherently noisy and will likely make it very difficult to have an accuracy of over 60%. It is even more of an issue when generating the data using a look back window because the number of observations decreases substantially. 

