# Import Libraries, Data, and Globals

In [4]:
%matplotlib inline

# Regular libraries
import requests
import pandas as pd
import string
import time
from bs4 import BeautifulSoup
from pandas.plotting import scatter_matrix
import os 
import string
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Get rid of warnings
import warnings
warnings.filterwarnings('ignore')

# Sklearn Specific
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_validate

# sklearn algos
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier # SGD
from sklearn.neural_network import MLPClassifier # neural network (multilayer perceptron)

fs_df = pd.read_csv("../data/full_stats_v2.csv")
fc_df = pd.read_csv("../data/fighter_char.csv")

# Data Wrangling

#### Add Totals

In [5]:
fs_df = fs_df.fillna(0) # Fill NaN with 0

# Calc time of fight in seconds
fs_df['duration'] = fs_df.apply(lambda x: (int(x['end_round'])-1)*60*5 + int(x['end_time'].split(":")[0])*60 + int(x['end_time'].split(":")[1]) , axis = 1)

# Calc totals for each fighter
which_fighter = ["f1","f2"]
stat_name_ending = ["kds","sigstrikes_l","sigstrikes_a","strikes_l","strikes_a","tds_l","tds_a","subs_a","passes",
                   "revs","ss_head_l","ss_head_a","ss_body_l","ss_body_a","ss_leg_l","ss_leg_a","ss_dist_l","ss_dist_a",
                   "ss_clinch_l","ss_clinch_a","ss_ground_l","ss_ground_a"]

for f in which_fighter:
    
    for stat in stat_name_ending:
        
        fs_df[f+'_t_' + stat] = np.sum(fs_df.loc[:,(fs_df.columns.str.startswith(f) & fs_df.columns.str.endswith(stat))], axis =1)
        
        fs_df[f+'_pm_' + stat] =  fs_df[f+'_t_' + stat] / fs_df['duration'] * 60


#### Subset the Data for Key Stats and Per Minute 

In [6]:
matchers = ['fighter','winner',"_pm_","date"]
ml_cols = [s for s in fs_df.columns if any(xs in s for xs in matchers)]
ml_df = fs_df[ml_cols]

#### Make each fight have two rows, so that the label is win or lose

In [7]:
# Part 1
part_1_df = ml_df.copy()

part1_key_cols = part_1_df.columns
part1_value_cols = []

for col in part1_key_cols:
    new_col = col.replace("f1","f")
    new_col = new_col.replace("f2","o")
    new_col = new_col.replace("fighter_1","a_fighter")
    new_col = new_col.replace("fighter_2","a_opponent")
    new_col = new_col.replace("winner","a_winner")
    part1_value_cols.append(new_col)
    
p1_col_dict = dict(zip(part1_key_cols, part1_value_cols))

part_1_df = part_1_df.rename(columns = p1_col_dict)

In [8]:
# Part 2
part_2_df = ml_df.copy()

part2_key_cols = part_2_df.columns
part2_value_cols = []

for col in part2_key_cols:
    new_col = col.replace("f1","o")
    new_col = new_col.replace("f2","f")
    new_col = new_col.replace("fighter_1","a_opponent")
    new_col = new_col.replace("fighter_2","a_fighter")
    new_col = new_col.replace("winner","a_winner")
    part2_value_cols.append(new_col)
    
p2_col_dict = dict(zip(part2_key_cols, part2_value_cols))

part_2_df = part_2_df.rename(columns = p2_col_dict)

In [9]:
two_pf = pd.concat([part_1_df, part_2_df], sort = True)
two_pf = two_pf.sort_values(by ='date', ascending= False)
two_pf['result'] = two_pf.apply(lambda x: int(x['a_fighter'] == x['a_winner']), axis =1)

## Show the wrangled and win/loss adjusted dataframe

In [10]:
two_pf.head(5)

Unnamed: 0,a_fighter,a_opponent,a_winner,date,f_pm_kds,f_pm_passes,f_pm_revs,f_pm_sigstrikes_a,f_pm_sigstrikes_l,f_pm_ss_body_a,...,o_pm_ss_head_a,o_pm_ss_head_l,o_pm_ss_leg_a,o_pm_ss_leg_l,o_pm_strikes_a,o_pm_strikes_l,o_pm_subs_a,o_pm_tds_a,o_pm_tds_l,result
0,"Israel Adesanya ""The Last Stylebender""","Anderson Silva ""The Spider""","Israel Adesanya ""The Last Stylebender""",2019-02-09,0.0,0.0,0.0,11.733333,5.666667,0.933333,...,5.266667,1.8,2.066667,1.933333,14.92,6.537778,0.0,0.0,0.0,1
1,"Lando Vannata ""Groovy""","Marcos Mariano ""Dhalsim""","Lando Vannata ""Groovy""",2019-02-09,0.0,0.813559,0.0,16.271186,8.135593,0.20339,...,7.525424,3.661017,0.40678,0.40678,6.270612,2.158001,0.0,0.0,0.0,1
10,"Jonathan Martinez ""Dragon""","Wuliji Buren ""Beast Master""","Wuliji Buren ""Beast Master""",2019-02-09,0.0,0.266667,0.066667,5.8,3.133333,0.533333,...,4.266667,2.066667,1.6,1.066667,9.657778,5.164444,0.066667,0.6,0.266667,0
9,"Jalin Turner ""The Tarantula""","Callan Potter ""The Rockstar""","Callan Potter ""The Rockstar""",2019-02-09,1.132075,1.132075,0.0,19.245283,10.188679,0.0,...,18.113208,7.924528,2.264151,2.264151,37.42257,28.066928,0.0,2.264151,0.0,0
8,"Kyung Ho Kang ""Mr. Perfect""","Teruto Ishihara ""Yashabo""","Teruto Ishihara ""Yashabo""",2019-02-09,0.251046,0.251046,0.0,14.560669,6.527197,1.25523,...,20.585774,7.782427,1.506276,1.004184,77.221337,28.251606,0.0,0.0,0.0,0


In [11]:
two_pf.describe()

Unnamed: 0,f_pm_kds,f_pm_passes,f_pm_revs,f_pm_sigstrikes_a,f_pm_sigstrikes_l,f_pm_ss_body_a,f_pm_ss_body_l,f_pm_ss_clinch_a,f_pm_ss_clinch_l,f_pm_ss_dist_a,...,o_pm_ss_head_a,o_pm_ss_head_l,o_pm_ss_leg_a,o_pm_ss_leg_l,o_pm_strikes_a,o_pm_strikes_l,o_pm_subs_a,o_pm_tds_a,o_pm_tds_l,result
count,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,...,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0,7434.0
mean,0.109153,0.155623,0.015604,10.523933,4.787612,1.264215,0.86138,1.07183,0.736254,8.085448,...,8.346284,3.209708,0.913661,0.71815,37.801289,18.426116,0.079519,0.356855,0.137669,0.490987
std,0.512414,0.389497,0.117449,8.857232,5.388737,1.149576,0.889993,2.0528,1.431561,6.34867,...,6.911298,3.76126,0.995655,0.805858,42.79317,28.338528,0.301374,0.510187,0.277029,0.499952
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,5.4,2.133333,0.524017,0.322725,0.133333,0.066667,3.666667,...,4.066667,1.266667,0.230843,0.166493,19.299867,8.239358,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,8.517699,3.58209,1.0,0.666667,0.533927,0.335196,6.666667,...,6.666667,2.233215,0.659341,0.5,28.897778,13.249778,0.0,0.2,0.0,0.0
75%,0.0,0.144928,0.0,13.020113,5.666667,1.666667,1.133333,1.266667,0.866667,10.866667,...,10.5,3.8,1.266667,1.0,43.382521,20.069622,0.0,0.527705,0.2,1.0
max,9.230769,9.6,8.571429,137.142857,112.0,21.0,18.0,51.176471,35.294118,75.0,...,94.285714,56.25,22.0,16.0,1016.326531,792.0,8.571429,7.5,7.5,1.0


# Generate Machine Learning Models

In [12]:
# Split features and labels
X = two_pf.iloc[:,4:-1]
y = two_pf['result']

In [13]:
models = []

blr_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
svm_clf = svm.SVC()
knn_clf = KNeighborsClassifier()
dtree_clf = DecisionTreeClassifier()
nb_clf = GaussianNB()
perc_clf = Perceptron()
sgd_clf = SGDClassifier()
mlp_clf = MLPClassifier()

models.extend((blr_clf ,rf_clf, svm_clf, knn_clf, dtree_clf, nb_clf, perc_clf, sgd_clf,mlp_clf))



In [14]:
for model in models:
    print(model)
    scoring = {'accuracy': 'accuracy',
           'recall': 'recall',
           'precision': 'precision',
           'roc_auc': 'roc_auc'}
    
    model.fit(X, y)
    avg_scores = cross_validate(model, X, y, cv=10, scoring= scoring)
    print(avg_scores.keys())
    print("Accuracy: " + str(np.mean(avg_scores['test_accuracy'])))
    print("Recall:" + str(np.mean(avg_scores['test_recall'])))
    print("Precision:" + str(np.mean(avg_scores['test_precision'])))
    print("ROC - AUC:" + str(np.mean(avg_scores['test_roc_auc'])))
    print("\n \n")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
dict_keys(['fit_time', 'score_time', 'test_accuracy', 'train_accuracy', 'test_recall', 'train_recall', 'test_precision', 'train_precision', 'test_roc_auc', 'train_roc_auc'])
Accuracy: 0.9669092533900635
Recall:0.9709589041095891
Precision:0.9621005910979411
ROC - AUC:0.98770268401815

 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
dict_keys(['fit_time', 's

# Preliminary Analysis

- These models are using the statistics from night of the fight, which will have hindsight bias. Consequently, the accuracy, precision, and recall listed here is much higher than what should be expected with cumulative data. 

- The hyperparameters from the sklearn classifiers were set to their defaults. These hyperparameters may be tuned once the cumulative data generation is completed.

- Nevertheless, the LogisticRegression, Random Forrest, Perceptron, SGDClassifier, and MLPClassifier appear to be the most promising in terms of highest accuracy %. In fact, the MLPClassifier looks to be the most promising model so far, which has not been used in the academic literature yet. 

- Please scroll around in the above cell to view the accuracy, recall, precision, and ROC-AUC from using a cross validate method. 


# (WIP) Making Cumulative Data

- This has been the most challenging part of the data manipulation.
- I want to create a function that allows the lookback window to vary by number of fights. This will allow for a comparison of varying lookback windows and how they effect the models. 
- Once this part of the project is done, it will be extremely easy and fast to iteratively update the models. 
- The code below is scratch work for this section.

- Additionally, the fighter's age, height, and reach data has already been scraped and will be added once the cumulati

In [None]:
two_pf.columns

In [None]:
len(two_pf.columns)

In [None]:
def generate_fighters_dict():
    cols = two_pf.columns.tolist()
    fighters = {}
    for index, row in two_pf.iterrows():
        fighter = row["a_fighter"]
        if fighter not in fighters:
            fighters[fighter] = [row]
        else:
            fighters[fighter].append(row)
    for fighter in fighters:
        rows_list = fighters[fighter]
        fighters[fighter] = pd.DataFrame(columns=cols, data=rows_list)
    return fighters

fighters_dict = generate_fighters_dict()

In [None]:
def get_fighter_info(past_df, is_fighter):
    last_3 = past_df.head(3)
    last_3_cum = last_3.mean(skipna=True, numeric_only=True).to_dict()
    new_dict = {}
    if is_fighter:
        for var in last_3_cum:
            if var[0] == "f":
                new_dict[var] = last_3_cum[var]
    else:
        for var in last_3_cum:
            if var[0] == "o":
                new_dict[var] = last_3_cum[var]
    return new_dict

def generate_cum_df(fighters_dict):
    
    cumulative_df = pd.DataFrame()
    
    for index, row in two_pf.iterrows():
        fighter_name = row["a_fighter"]
        opponent_name = row["a_opponent"]
        fight_date = row["date"]
        fighter_df = fighters_dict[fighter_name]
        opponent_df = fighters_dict[opponent_name]
        past_fighter_df = fighter_df.loc[fighter_df['date'] < fight_date]
        if past_fighter_df.shape[0] < 3:
            continue
        past_opponent_df = opponent_df.loc[opponent_df['date'] < fight_date]
        if past_fighter_df.shape[0] < 3:
            continue
        fighter_info = get_fighter_info(past_fighter_df, True)
        opponent_info = get_fighter_info(past_opponent_df, False)
        all_info = dict(fighter_info)
        all_info.update(opponent_info)
        all_info['a_fighter'] = row['a_fighter']
        all_info['a_opponent'] = row['a_opponent']
        all_info['a_winner'] = row['a_winner']
        all_info['date'] = row['date']
        all_info['result'] = row['result']
        
        # Fix this shit
        all_info_df = pd.DataFrame(all_info, index=[0])
        cumulative_df = pd.concat([cumulative_df, all_info_df])
        
        if(index == 3):
            return(cumulative_df)
            break
        
final_df = generate_cum_df(fighters_dict)     

In [None]:
final_df

In [116]:
cum_df = pd.DataFrame(data=None, columns=two_pf.columns)
cum_df

Unnamed: 0,a_fighter,a_opponent,a_winner,date,f_pm_kds,f_pm_passes,f_pm_revs,f_pm_sigstrikes_a,f_pm_sigstrikes_l,f_pm_ss_body_a,...,o_pm_ss_head_a,o_pm_ss_head_l,o_pm_ss_leg_a,o_pm_ss_leg_l,o_pm_strikes_a,o_pm_strikes_l,o_pm_subs_a,o_pm_tds_a,o_pm_tds_l,result


In [123]:

for index, row in two_pf.iterrows():
    fighter_name = row['a_fighter']
    fight_date = row['date']
    fighter_df = two_pf.loc[two_pf['a_fighter'] == fighter_name] 
    past_df = fighter_df.loc[fighter_df['date'] < fight_date]
    
    cum_series = past_df.mean()
    
    cum_keys = cum_series.keys().tolist()
    cum_values = []
    cum_values = cum_series.tolist()
    
    cum_values =[fighter_name] + [row['a_opponent']] + [row['a_winner']] + [fight_date] + cum_values 
    print(len(cum_values))
    if len(cum_values) == 53:
        print("cum_keys:",len(cum_keys))
        print(cum_values)
    cum_df.append(cum_values)
    
    if index == 3:
        break

49
49
49
49
49
53
cum_keys: 49
['Raulian Paiva', 'Kai Kara-France "Don\'t Blink"', 'Kai Kara-France "Don\'t Blink"', '2019-02-09', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
49
49
49
49
