# 1. Import libraries, data, and globals

In [55]:
%matplotlib inline

# Regular libraries
import requests
import pandas as pd
import string
import time
from bs4 import BeautifulSoup
from pandas.plotting import scatter_matrix
import os 
import string
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Get rid of warnings
import warnings
warnings.filterwarnings('ignore')

# Sklearn Specific
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_validate

# sklearn algos
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier # SGD
from sklearn.neural_network import MLPClassifier # neural network (multilayer perceptron)

fs_df = pd.read_csv("../data/full_stats_v4.csv")
fc_df = pd.read_csv("../data/fighter_char.csv")
# Might want to drop the first column of each df

# 2. Data wrangling

#### Add Totals

In [56]:
fs_df = fs_df.fillna(0) # Fill NaN with 0

# Calc time of fight in seconds
fs_df['duration'] = fs_df.apply(lambda x: (int(x['end_round'])-1)*60*5 + int(x['end_time'].split(":")[0])*60 + int(x['end_time'].split(":")[1]) , axis = 1)

# Calc totals for each fighter
which_fighter = ["f1","f2"]
stat_name_ending = ["kds","sigstrikes_l","sigstrikes_a","strikes_l","strikes_a","tds_l","tds_a","subs_a","passes",
                   "revs","ss_head_l","ss_head_a","ss_body_l","ss_body_a","ss_leg_l","ss_leg_a","ss_dist_l","ss_dist_a",
                   "ss_clinch_l","ss_clinch_a","ss_ground_l","ss_ground_a"]

for f in which_fighter:
    
    for stat in stat_name_ending:
        
        fs_df[f+'_t_' + stat] = np.sum(fs_df.loc[:,(fs_df.columns.str.startswith(f) & fs_df.columns.str.endswith(stat))], axis =1)
        
        fs_df[f+'_pm_' + stat] =  fs_df[f+'_t_' + stat] / fs_df['duration'] * 60


KeyError: ('end_round', 'occurred at index 0')

#### Subset the Data for Key Stats and Per Minute 

In [None]:
matchers = ['fighter','winner',"_pm_","date"]
ml_cols = [s for s in fs_df.columns if any(xs in s for xs in matchers)]
ml_df = fs_df[ml_cols]

#### Make each fight have two rows, so that the label is win or lose

In [None]:
# Part 1
part_1_df = ml_df.copy()

part1_key_cols = part_1_df.columns
part1_value_cols = []

for col in part1_key_cols:
    new_col = col.replace("f1","f")
    new_col = new_col.replace("f2","o")
    new_col = new_col.replace("fighter_1","a_fighter")
    new_col = new_col.replace("fighter_2","a_opponent")
    new_col = new_col.replace("winner","a_winner")
    part1_value_cols.append(new_col)
    
p1_col_dict = dict(zip(part1_key_cols, part1_value_cols))

part_1_df = part_1_df.rename(columns = p1_col_dict)

In [57]:
# Part 2
part_2_df = ml_df.copy()

part2_key_cols = part_2_df.columns
part2_value_cols = []

for col in part2_key_cols:
    new_col = col.replace("f1","o")
    new_col = new_col.replace("f2","f")
    new_col = new_col.replace("fighter_1","a_opponent")
    new_col = new_col.replace("fighter_2","a_fighter")
    new_col = new_col.replace("winner","a_winner")
    part2_value_cols.append(new_col)
    
p2_col_dict = dict(zip(part2_key_cols, part2_value_cols))

part_2_df = part_2_df.rename(columns = p2_col_dict)

In [58]:
part_2_df

Unnamed: 0,a_opponent,a_fighter,a_winner,date,o_pm_kds,o_pm_sigstrikes_l,o_pm_sigstrikes_a,o_pm_strikes_l,o_pm_strikes_a,o_pm_tds_l,...,f_pm_ss_body_l,f_pm_ss_body_a,f_pm_ss_leg_l,f_pm_ss_leg_a,f_pm_ss_dist_l,f_pm_ss_dist_a,f_pm_ss_clinch_l,f_pm_ss_clinch_a,f_pm_ss_ground_l,f_pm_ss_ground_a
0,"Israel Adesanya ""The Last Stylebender""","Anderson Silva ""The Spider""","Israel Adesanya ""The Last Stylebender""",2019-02-09,0.000000,5.666667,11.733333,17.377778,35.982222,0.000000,...,0.600000,1.466667,1.933333,2.066667,4.200000,8.600000,0.133333,0.200000,0.000000,0.000000
1,"Lando Vannata ""Groovy""","Marcos Mariano ""Dhalsim""","Lando Vannata ""Groovy""",2019-02-09,0.000000,8.135593,16.271186,33.790290,61.885665,0.406780,...,0.000000,0.203390,0.406780,0.406780,0.406780,0.813559,0.000000,0.000000,3.661017,7.322034
2,Rani Yahya,Ricky Simon,Rani Yahya,2019-02-09,0.000000,4.533333,16.533333,14.302222,51.435556,0.000000,...,0.666667,0.933333,0.533333,0.866667,3.133333,12.400000,0.000000,0.000000,0.000000,0.000000
3,Montana De La Rosa,"Nadia Kassem ""187""",Montana De La Rosa,2019-02-09,0.000000,4.070022,6.695842,26.136108,36.327682,0.393873,...,0.131291,0.131291,0.000000,0.000000,0.919037,2.494530,0.000000,0.000000,1.838074,2.100656
4,"Jimmy Crute ""The Brute""","Sam Alvey ""Smile'N""","Jimmy Crute ""The Brute""",2019-02-09,0.710059,9.940828,22.721893,35.481951,79.072862,0.000000,...,0.710059,1.065089,1.775148,2.130178,3.195266,8.165680,0.000000,0.000000,1.775148,3.195266
5,"Devonte Smith ""King Kage""","Dong Hyun Ma ""Maestro""","Devonte Smith ""King Kage""",2019-02-09,0.515021,9.785408,22.145923,31.876071,72.140581,0.000000,...,0.257511,1.030043,1.030043,1.287554,3.862661,9.785408,0.000000,0.000000,1.030043,1.287554
6,"Shane Young ""Sugar""","Austin Arnett ""Golden Boy""","Shane Young ""Sugar""",2019-02-09,0.066667,10.600000,27.733333,32.640000,85.182222,0.000000,...,2.133333,3.066667,1.266667,1.666667,7.933333,20.466667,0.066667,0.066667,0.000000,0.000000
7,"Kai Kara-France ""Don't Blink""",Raulian Paiva,"Kai Kara-France ""Don't Blink""",2019-02-09,0.000000,4.666667,13.800000,14.444444,42.453333,0.133333,...,1.133333,1.600000,0.533333,0.666667,3.666667,10.933333,0.066667,0.133333,0.133333,0.133333
8,"Teruto Ishihara ""Yashabo""","Kyung Ho Kang ""Mr. Perfect""","Teruto Ishihara ""Yashabo""",2019-02-09,0.502092,8.535565,23.598326,28.251606,77.221337,0.000000,...,0.251046,1.255230,1.004184,1.506276,2.761506,8.786611,1.506276,2.510460,0.000000,0.502092
9,"Callan Potter ""The Rockstar""","Jalin Turner ""The Tarantula""","Callan Potter ""The Rockstar""",2019-02-09,0.000000,6.792453,9.056604,28.066928,37.422570,0.000000,...,0.000000,0.000000,2.264151,2.264151,3.396226,4.528302,0.000000,0.000000,0.000000,0.000000


In [59]:
two_pf = pd.concat([part_1_df, part_2_df], sort = True)
two_pf = two_pf.sort_values(by ='date', ascending= False)
two_pf['result'] = two_pf.apply(lambda x: int(x['a_fighter'] == x['a_winner']), axis =1)

#### Show head and describe the wrangled dataset

In [60]:
two_pf.head(3)

Unnamed: 0,a_fighter,a_opponent,a_winner,date,f_pm_kds,f_pm_passes,f_pm_revs,f_pm_sigstrikes_a,f_pm_sigstrikes_l,f_pm_ss_body_a,...,o_pm_ss_head_a,o_pm_ss_head_l,o_pm_ss_leg_a,o_pm_ss_leg_l,o_pm_strikes_a,o_pm_strikes_l,o_pm_subs_a,o_pm_tds_a,o_pm_tds_l,result
0,"Israel Adesanya ""The Last Stylebender""","Anderson Silva ""The Spider""","Israel Adesanya ""The Last Stylebender""",2019-02-09,0.0,0.0,0.0,11.733333,5.666667,0.933333,...,5.266667,1.8,2.066667,1.933333,14.92,6.537778,0.0,0.0,0.0,1
1,"Lando Vannata ""Groovy""","Marcos Mariano ""Dhalsim""","Lando Vannata ""Groovy""",2019-02-09,0.0,0.813559,0.0,16.271186,8.135593,0.20339,...,7.525424,3.661017,0.40678,0.40678,6.270612,2.158001,0.0,0.0,0.0,1
10,"Jonathan Martinez ""Dragon""","Wuliji Buren ""Beast Master""","Wuliji Buren ""Beast Master""",2019-02-09,0.0,0.266667,0.066667,5.8,3.133333,0.533333,...,4.266667,2.066667,1.6,1.066667,9.657778,5.164444,0.066667,0.6,0.266667,0


In [61]:
two_pf.describe()

Unnamed: 0,f_pm_kds,f_pm_passes,f_pm_revs,f_pm_sigstrikes_a,f_pm_sigstrikes_l,f_pm_ss_body_a,f_pm_ss_body_l,f_pm_ss_clinch_a,f_pm_ss_clinch_l,f_pm_ss_dist_a,...,o_pm_ss_head_a,o_pm_ss_head_l,o_pm_ss_leg_a,o_pm_ss_leg_l,o_pm_strikes_a,o_pm_strikes_l,o_pm_subs_a,o_pm_tds_a,o_pm_tds_l,result
count,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,...,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0
mean,0.109153,0.155623,0.015604,10.523933,4.787612,1.264215,0.86138,1.07183,0.736254,8.085448,...,8.346284,3.209708,0.913661,0.71815,37.801289,18.426116,0.079519,0.356855,0.137669,0.490987
std,0.512414,0.389497,0.117449,8.857232,5.388737,1.149576,0.889993,2.0528,1.431561,6.34867,...,6.911298,3.76126,0.995655,0.805858,42.79317,28.338528,0.301374,0.510187,0.277029,0.499952
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,5.4,2.133333,0.524017,0.322725,0.133333,0.066667,3.666667,...,4.066667,1.266667,0.230843,0.166493,19.299867,8.239358,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,8.517699,3.58209,1.0,0.666667,0.533927,0.335196,6.666667,...,6.666667,2.233215,0.659341,0.5,28.897778,13.249778,0.0,0.2,0.0,0.0
75%,0.0,0.144928,0.0,13.020113,5.666667,1.666667,1.133333,1.266667,0.866667,10.866667,...,10.5,3.8,1.266667,1.0,43.382521,20.069622,0.0,0.527705,0.2,1.0
max,9.230769,9.6,8.571429,137.142857,112.0,21.0,18.0,51.176471,35.294118,75.0,...,94.285714,56.25,22.0,16.0,1016.326531,792.0,8.571429,7.5,7.5,1.0


# To do

>- Run the Model by individual fights then use the aggregate data as the test set. 
>- Get the scraping to run properly

# 3. Create cumulative statistics data with specified look back window

#### Create functions to make the look back window dataframe

In [62]:
def generate_fighters_dict():
    cols = two_pf.columns.tolist()
    fighters = {}
    for index, row in two_pf.iterrows():
        fighter = row["a_fighter"]
        if fighter not in fighters:
            fighters[fighter] = [row]
        else:
            fighters[fighter].append(row)
    for fighter in fighters:
        rows_list = fighters[fighter]
        fighters[fighter] = pd.DataFrame(columns=cols, data=rows_list)
    return fighters

fighters_dict = generate_fighters_dict() # Generates a dictionary with each fighter having his own pd.DataFrame()

In [63]:
def get_fighter_info(past_df, is_fighter, lb_num, is_exact):
    if is_exact:
        latest_df = past_df.head(lb_num)
    else:
        latest_df = past_df
        
    latest_lb_cum = latest_df.mean(skipna=True, numeric_only=True).to_dict()
    new_dict = {}
    if is_fighter:
        for var in latest_lb_cum:
            if var[0] == "f":
                new_dict[var] = latest_lb_cum[var]
    else:
        for var in latest_lb_cum:
            if var[0] == "f":
                new_var = var.replace("f_", "o_")
                new_dict[new_var] = latest_lb_cum[var]
    return(new_dict)

In [64]:
def generate_cum_df(fighters_dict, lb_num, is_exact):
    
    cumulative_df = pd.DataFrame(columns = two_pf.columns)
    
    for index, row in two_pf.iterrows():
        fighter_name = row["a_fighter"]
        opponent_name = row["a_opponent"]
        fight_date = row["date"]
        
        fighter_df = fighters_dict[fighter_name]
        opponent_df = fighters_dict[opponent_name]
        
        past_fighter_df = fighter_df.loc[fighter_df['date'] < fight_date]
        if past_fighter_df.shape[0] < lb_num:
            continue
        past_opponent_df = opponent_df.loc[opponent_df['date'] < fight_date]
        if past_opponent_df.shape[0] < lb_num:
            continue
        fighter_info = get_fighter_info(past_fighter_df, True, lb_num, is_exact)
        opponent_info = get_fighter_info(past_opponent_df, False, lb_num, is_exact)
        all_info = dict(fighter_info, **opponent_info)
        all_info['a_fighter'] = row['a_fighter']
        all_info['a_opponent'] = row['a_opponent']
        all_info['a_winner'] = row['a_winner']
        all_info['date'] = row['date']
        all_info['result'] = row['result']
        
        all_info_df = pd.DataFrame(all_info, index=[0])
        
        cumulative_df = pd.concat([cumulative_df, all_info_df])
        
    return(cumulative_df)
        

#### Make cumulative data

In [65]:
exact_1_df = generate_cum_df(fighters_dict,1, True)    
exact_3_df = generate_cum_df(fighters_dict,3, True)    
exact_5_df = generate_cum_df(fighters_dict,5, True)
exact_10_df = generate_cum_df(fighters_dict,10, True)

cumu_1_df = generate_cum_df(fighters_dict,1, False)    
cumu_3_df = generate_cum_df(fighters_dict,3, False)    
cumu_5_df = generate_cum_df(fighters_dict,5, False)
cumu_10_df = generate_cum_df(fighters_dict,10, False)

In [66]:
cumu_dfs_dict = {"Exact Data: 1 Fight Lookback Window" : exact_1_df,
                 "Exact Data: 3 Fight Lookback Window" : exact_3_df,
                 "Exact Data: 5 Fight Lookback Window" : exact_5_df,
                 "Exact Data: 10 Fight Lookback Window" : exact_10_df,
                 "Cumulative Data: 1 Fight Lookback Window": cumu_1_df,
                 "Cumulative Data: 3 Fight Lookback Window": cumu_3_df,
                 "Cumulative Data: 5 Fight Lookback Window": cumu_5_df,
                "Cumulative Data: 10 Fight Lookback Window": cumu_10_df} #  create dictionary of the dfs

# 4. Generate machine learning models

#### Initialize classifiers

In [67]:
models = []

blr_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
svm_clf = svm.SVC()
knn_clf = KNeighborsClassifier()
dtree_clf = DecisionTreeClassifier()
nb_clf = GaussianNB()
perc_clf = Perceptron()
sgd_clf = SGDClassifier()
mlp_clf = MLPClassifier()

models.extend((blr_clf ,rf_clf, svm_clf, knn_clf, dtree_clf, nb_clf, perc_clf, sgd_clf,mlp_clf))



# 5. Train and print scores for each model

In [68]:
for key in cumu_dfs_dict:
    df = cumu_dfs_dict[key]
    print(key + "(N = "+ str(df.shape[0])+")\n")
    # Split features and labels
    X = df.iloc[:,4:-1]
    y = df['result'].astype('int')

    for model in models:
        print(" " + type(model).__name__)
        
       # model.fit(X, y)
        
        scoring_stats = {'accuracy': 'accuracy',
               'recall': 'recall',
               'precision': 'precision',
               'roc_auc': 'roc_auc'}
        
        avg_scores = cross_validate(model, X, y, cv=5, scoring = scoring_stats)
    
        print("     Accuracy: " + str(np.mean(avg_scores['test_accuracy'])))
        print("     Recall:" + str(np.mean(avg_scores['test_recall'])))
        print("     Precision:" + str(np.mean(avg_scores['test_precision'])))
        print("     ROC - AUC:" + str(np.mean(avg_scores['test_roc_auc'])))
        print("\n")

Exact Data: 1 Fight Lookback Window(N = 5306)

 LogisticRegression
     Accuracy: 0.5160181794050802
     Recall:0.4323023069399401
     Precision:0.5102700305764295
     ROC - AUC:0.5188342881382402


 RandomForestClassifier
     Accuracy: 0.5047072097755227
     Recall:0.3774424368110251
     Precision:0.4946865950127949
     ROC - AUC:0.5087574485833907


 SVC
     Accuracy: 0.525259555823743
     Recall:0.40200027945080563
     Precision:0.5219723364971899
     ROC - AUC:0.5264733589542202


 KNeighborsClassifier
     Accuracy: 0.502076360520826
     Recall:0.48485008935071805
     Precision:0.49316059292465
     ROC - AUC:0.5045385765837626


 DecisionTreeClassifier
     Accuracy: 0.511874706942191
     Recall:0.5147594149182606
     Precision:0.5034677293737226
     ROC - AUC:0.5119203519997748


 GaussianNB
     Accuracy: 0.5039629761468316
     Recall:0.3947007302490789
     Precision:0.4930952128191162
     ROC - AUC:0.5026699381894673


 Perceptron
     Accuracy: 0.5073508501

     Accuracy: 0.5417676709114596
     Recall:0.4947368421052632
     Precision:0.5527504027478571
     ROC - AUC:0.550485987696514


Cumulative Data: 5 Fight Lookback Window(N = 1498)

 LogisticRegression
     Accuracy: 0.5353400222965441
     Recall:0.5094594594594595
     Precision:0.5314476118827423
     ROC - AUC:0.5615857535820937


 RandomForestClassifier
     Accuracy: 0.51803121516165
     Recall:0.3810810810810811
     Precision:0.5176815953682958
     ROC - AUC:0.5264154687102579


 SVC
     Accuracy: 0.5520713489409141
     Recall:0.49459459459459454
     Precision:0.5523367921698632
     ROC - AUC:0.5596151309901746


 KNeighborsClassifier
     Accuracy: 0.5293667781493869
     Recall:0.5175675675675675
     Precision:0.5238508344786654
     ROC - AUC:0.540266613991126


 DecisionTreeClassifier
     Accuracy: 0.4993377926421405
     Recall:0.4783783783783783
     Precision:0.4914633703459919
     ROC - AUC:0.4991142501860522


 GaussianNB
     Accuracy: 0.5259977703455965


# Preliminary Analysis
- The hyperparameters from the sklearn classifiers were set to their defaults and will be tuned. 


- Nevertheless, the effectiveness of each model appears to vary based off the length of the look back number. I will soon be transforming the printed text data above into nicer looking graphs.  


- Please scroll around in the above cell to view the accuracy, recall, precision, and ROC-AUC from using a cross validate method. 


- Now, most of the accuracy percentages are hovering near the 50% mark. As noted by previous literature, this data is inherently noisy and will likely make it very difficult to have an accuracy of over 60%. It is even more of an issue when generating the data using a look back window because the number of observations decreases substantially. 



# (WIP) Making Cumulative Data

- Add the fighter's age, height, and reach has already been scraped and will be merged into the cumulative data by fighter names.


- Standardize/normalize fight data


- Perhaps use more data from before 2010. 