# Data Mining

In [1]:
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd
import pickle
from scipy import stats
from itertools import permutations
%matplotlib inline
%autosave 300
%load_ext autoreload
%autoreload 2
from sklearn.preprocessing import scale, Normalizer, normalize, scale, MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from pandas.tools.plotting import scatter_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import chi2
from pandas.tools.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn import linear_model
from IPython.display import display, HTML

plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['font.size'] = 16
plt.rcParams['text.usetex'] = False
plt.rcParams['axes.labelsize'] = plt.rcParams['font.size']
plt.rcParams['axes.titlesize'] = 1.5*plt.rcParams['font.size']
plt.rcParams['legend.fontsize'] = 16
plt.rcParams["figure.facecolor"] = 'white'

import matplotlib
params = {'axes.labelsize': 18,'axes.titlesize':20, 'legend.fontsize': 20, 'xtick.labelsize': 16, 'ytick.labelsize': 16}
matplotlib.rcParams.update(params)
np.set_printoptions(precision=4)

from sklearn.model_selection import KFold
num_splits = 5.0
kf = KFold(n_splits=int(num_splits))

raw_data = pd.read_csv('data/fighter_data2020-06-02.csv')
pd.options.mode.chained_assignment = None  # default='warn'

Autosaving every 300 seconds


# Preprocessing

In [2]:
df_5w = raw_data[((raw_data['w'].astype(int)+raw_data['l'].astype(float)+raw_data['d'].astype(float)) >= 5)]
df_5w.reset_index(drop=True, inplace=True)
print(df_5w.shape)
significant_df = df_5w[(df_5w['SLpM']
           + df_5w['Str. Acc.'].apply(lambda x: x.strip('%')).astype(float)
           + df_5w['SApM']
           + df_5w['Str. Def'].apply(lambda x: x.strip('%')).astype(float)
           + df_5w['TD Avg']
           + df_5w['TD Acc.'].apply(lambda x: x.strip('%')).astype(float)
           + df_5w['TD Def.'].apply(lambda x: x.strip('%')).astype(float)
           + df_5w['Sub. Avg.']) > 0
          ]
significant_df.reset_index(drop=True, inplace=True)
#Get rid of rows from fightmatrix that are all zeros
print (significant_df.shape)

(3082, 20)
(2666, 20)


In [3]:
def extract_stats_df(input_df):
    df = input_df.filter(['SLpM','Str. Acc.','SApM', 'Str. Def', 'TD Avg', 
                                     'TD Acc.', 'TD Def.', 'Sub. Avg.'], 
                                     axis=1)
    df['Str. Acc.'] = df['Str. Acc.'].apply(lambda x: x.strip('%')).astype(float)
    df['Str. Def'] = df['Str. Def'].apply(lambda x: x.strip('%')).astype(float)
    df['TD Acc.'] = df['TD Acc.'].apply(lambda x: x.strip('%')).astype(float)
    df['TD Def.'] = df['TD Def.'].apply(lambda x: x.strip('%')).astype(float)
    return df
stats_df = extract_stats_df(significant_df)
print(stats_df.shape)
stats_df.reset_index(drop=True, inplace=True)

(2666, 8)


In [4]:
xticks = ['SLpM','Str. Acc.','SApM', 'Str. Def', 'TD Avg', 'TD Acc.', 'TD Def.', 'Sub. Avg.']

In [5]:
#load the current top15 ranked fighters in all divisions
top15_all_class = np.genfromtxt('./data/top15.csv', delimiter=',', dtype='str')
significant_df = df_5w[(df_5w['SLpM']
           + df_5w['Str. Acc.'].apply(lambda x: x.strip('%')).astype(float)
           + df_5w['SApM']
           + df_5w['Str. Def'].apply(lambda x: x.strip('%')).astype(float)
           + df_5w['TD Avg']
           + df_5w['TD Acc.'].apply(lambda x: x.strip('%')).astype(float)
           + df_5w['TD Def.'].apply(lambda x: x.strip('%')).astype(float)
           + df_5w['Sub. Avg.']) > 0
          ]
significant_df.reset_index(drop=True, inplace=True)
top15_df = significant_df[(significant_df['first'] + ' ' + significant_df['last']).isin(top15_all_class)]
print(top15_df.shape)

(160, 20)


In [6]:
top15_names_df = (top15_df['first'] + ' ' + top15_df['last']).values
reach_op = lambda x: float(x.replace('"', '').strip())
# weight_op = lambda x: float(x.split()[0])
stracc_op = lambda x: float(x[:-1])
def height_op(in_str):
    in_str = str(in_str)
    if type(in_str) is int:
        return in_str
    feet, inches = in_str.split(' ')
    feet_num = int(feet[:-1])
    inches_num = int(inches[:-1])
    return feet_num * 12 + inches_num
def result_op(in_str):
    if 'win' in in_str:
        return 1
    elif 'loss' in in_str:
        return 0
    elif 'n/a' in in_str:
        return 0
def cleanup_df(indf):
    ret = pd.DataFrame()
    indf.dropna(subset=['height', 'reach'], inplace=True)
    ret['reach'] = indf['reach'].apply(reach_op)
#     ret['weight'] = indf['weight'].apply(weight_op)
    ret['height'] = indf['height'].apply(height_op)
    ret['w/l'] = indf['w']/(indf['w']+indf['l'])
    ret['SLpM'] = indf['SLpM']
    ret['Str. Acc.'] = indf['Str. Acc.'].apply(stracc_op)
    ret['SApM'] = indf['SApM'].apply(float)
    ret['Str. Def'] = indf['Str. Def'].apply(stracc_op)
    ret['TD Avg'] = indf['TD Avg'].apply(float)
    ret['TD Acc.'] = indf['TD Acc.'].apply(stracc_op)
    ret['TD Def.'] = indf['TD Def.'].apply(stracc_op)
    ret['Sub. Avg.'] = indf['Sub. Avg.'].apply(float)
    ret['result'] = indf['result'].apply(result_op)
    return ret

In [7]:
#create new vector consisting of: height, weight, reach, w/l ratio, and statistics
def generate_delta(primary, contender, mode='absolute'):
    #first cleanup input dataframes
    primary_clean = cleanup_df(primary)
    contender_clean = cleanup_df(contender)
    if mode == 'absolute':
        delta = pd.DataFrame(data = (contender_clean.values-primary_clean.values),
                             columns=contender_clean.columns)
    elif mode == 'percentage':
        delta = pd.DataFrame(100*(contender_clean.values-primary_clean.values)/primary_clean.values,
                             columns=contender_clean.columns)
    return delta

# Top15 Fighters

In [8]:
df_top15 = pd.read_csv('./data/top15_results_2020-05-31.csv', delimiter=',', header=None)
df_top15.columns = ['red', 'blue', 'result']
df_top15['red_first'] = df_top15['red'].str.split(' ', expand=True)[0]
def get_last_name(x):
    return x.split(' ')[-1]
df_top15['red_last'] = df_top15['red'].apply(get_last_name)
df_top15['blue_first'] = df_top15['blue'].str.split(' ', expand=True)[0]
df_top15['blue_last'] = df_top15['blue'].apply(get_last_name)
top15_list = df_top15['red'].unique()
print(df_top15.shape)

(2402, 7)


In [9]:
'''
To generate deltas, loop through the top fighters one by one and concatenate deltas results
'''
frames = []
for fighter in top15_list:
    try:
        df_fighter = df_top15[df_top15['red'] == fighter]
        first = df_fighter['red_first'].values[0]
        last = df_fighter['red_last'].values[0]
        fighter_stats = df_5w[(df_5w['first'] == first) & (df_5w['last'] == last)]
#         print(fighter_stats.isnull().values.any())
        fighter_stats['result'] = 'n/a'
        opponent_stats = pd.merge(df_5w, df_fighter, left_on=['first', 'last'], right_on=['blue_first', 'blue_last'], how='inner')
        fighter_deltas = generate_delta(fighter_stats, opponent_stats)
        frames.append(fighter_deltas)
    except Exception as e:
#         print e
        print (first, last)
        pass
#         display(fighter_stats)
#         display(opponent_stats)
# fighter_deltas.head()

Kai France
Mark Rosa
Chan Jung
Rafael Anjos
Antonio Junior
Junior Santos
Germaine Randamie
Yan Xiaonan
Montana Rosa


In [10]:
df_deltas_top15 = pd.concat(frames)
print(df_deltas_top15.shape)
df_deltas_top15.replace([np.inf, -np.inf], np.nan, inplace=True)
# Need to figure out why there are NAN results
df_deltas_top15.dropna(inplace=True)
print(df_deltas_top15.shape)

(2096, 12)
(2064, 12)


In [11]:
train_predictors = df_deltas_top15.iloc[:,:-1]
train_response = df_deltas_top15['result']
X = train_predictors.values
y = train_response.values
print (X.shape)
print (y.shape)

(2064, 11)
(2064,)


In [12]:
percent_misclassified = 0
for train_index, test_index in kf.split(X):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf = RandomForestClassifier(n_jobs=-1, max_features=5, n_estimators=500)
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    percent_misclassified+=100*(y_test != y_pred).sum()/float(x_test.shape[0])
#     print("mislabeled out of %d obs : %0.2f"% (x_test.shape[0], 100*(y_test != y_pred).sum()/float(x_test.shape[0])))
print("avg misclassification: %0.2f"%float(percent_misclassified/num_splits))

avg misclassification: 27.38


In [13]:
model = clf.fit(X, y)
#try a prediction
conor_stats = df_5w[(df_5w['first'] == 'Conor') & (df_5w['last'] == 'McGregor')]
nate_stats = df_5w[(df_5w['first'] == 'Khabib') & (df_5w['last'] == 'Nurmagomedov')]
conor_stats.loc[:,'result'] = 'n/a'
nate_stats.loc[:,'result'] = 'n/a'
try_delta = generate_delta(conor_stats, nate_stats).iloc[:,:-1].values
model.predict(try_delta)

array([0.])

In [14]:
df_deltas_top15.to_csv('top15_deltas_'+datetime.today().strftime('%Y-%m-%d')+'.csv', index=False)

## All bouts model

In [18]:
df_full = pd.read_csv('./data/full_results.csv', delimiter=',', header=None)
df_full.columns = ['red', 'blue', 'result']
df_full['red_first'] = df_full['red'].str.split(' ', expand=True)[0]
def get_last_name(x):
    return x.split(' ')[-1]
df_full['red_last'] = df_full['red'].apply(get_last_name)
df_full['blue_first'] = df_full['blue'].str.split(' ', expand=True)[0]
df_full['blue_last'] = df_full['blue'].apply(get_last_name)
full_list = df_full['red'].unique()
df_full.head()

Unnamed: 0,red,blue,result,red_first,red_last,blue_first,blue_last
0,Tom Aaron,Matt Ricehouse,loss,Tom,Aaron,Matt,Ricehouse
1,Tom Aaron,Eric Steenberg,win,Tom,Aaron,Eric,Steenberg
2,Danny Abbadi,Jorge Gurgel,loss,Danny,Abbadi,Jorge,Gurgel
3,Danny Abbadi,Kalib Starnes,loss,Danny,Abbadi,Kalib,Starnes
4,David Abbott,Kevin Ferguson,loss,David,Abbott,Kevin,Ferguson


In [20]:
df_deltas_full = pd.concat(frames)
df_deltas_full.replace([np.inf, -np.inf], np.nan, inplace=True)
#Need to figure out why there are NAN results
df_deltas_full.dropna(inplace=True)

In [21]:
exclude_list = ['Danny Abbadi','David Abbott','Daniel Acacio','Mike Aina','Hitomi Akano','Razak Al-Hassan','Gilbert Aldana','Kenneth Alexander','Marcio Junior','Olaf Alfonso','Amilcar Alves','Andre Amado','Karl Amoussou','Reese Andy','Shinya Aoki','Erik Apple','Ricardo Arona','Pat Audinwood','Luiz Azeredo','Bryan Baker','Tae Bang','David Baron','Lyle Beerbohm','Charles Bennett','Steve Berger','Bret Bergmark','Keith Berry','Edson Berto','Jason Black','Jesse Bongfeldt','Paul Bradley','Ebenezer Braga','Joe Brammer','Chris Brennan','Aaron Brink','Todd Brown','Steve Bruno','Courtney Buck','Murilo Bustamante','Gina Carano','Phil Cardella','Antonio Junior','Shonie Carter','Bendy Casimir','Luke Caudillo','Ansar Chalangov','Joachim Christensen','Heather Clark','Wayne Cole','Devin Cole','Kit Cope','Muhsin Corbbrey','Wesley Correira','Jeff Cox','Dan Cramer','Alberto Crane','Richard Crunkilton','Marcio Cruz','Philip Fries','Chris Rocha','Mike Torre','Rodrigo Lima','Edilberto Oliveira','Jorge Oliveira','Germaine Randamie','Shane Rosario','Jon Reyes','Thomas Denny','Cory Devela','Edwin DeWees','Alessio Chirico','Rafael Dias','Kyle Dietz','Rafael Anjos','Junior Santos','Antonio Santos','Robert Drysdale','Joe Duarte','Justin Eilers','Jon Einemo','Aleksander Emelianenko','Tom Erikson','Doug Evans','Dan Evensen','Tonya Evinger','Edward Faaloloto','Brodie Farber','Kevin Ferguson','Bibiano Fernandes','Anthony Figueroa','Paulo Filho','Mirko Cop','Jan Finney','Luiz Firmino','Xavier Foupa-Pokam','Ian Freeman','Don Frye','Tony Fryklund','Kazuyuki Fujita','Travis Galbraith','Andre Galvao','Marcos Galvao','Joey Gambino','Brian Gassaway','Tiki Ghosn','Jason Gilliam','Clint Godfrey','Allan Goes','Gary Goodridge','Chase Gormley','Damian Grabowski','Royce Gracie','Renzo Gracie','Andre Gusmao','John Halverson','Joachim Hansen','Justin Haskins','Josh Haynes','Henrique Silva','Conor Heun','Branden Hinkle','Kwan Kwak','Sam Hoger','Mark Holst','Matt Horwich','John Hosman','Abongo Humphrey','Joe Hurley','Fabiano Iha','Seichi Ikemoto','Yusuke Imamura','Masakazu Imanari','Brad Imes','Enson Inoue','Mitsuhiro Ishida','Yoislandy Izquierdo','Eugene Jackson','Jeremy Jackson','Maciej Jewtuszko','Brian Johnston','Carlton Jones','Kevin Jordan','Jeff Joslin','Chan Jung','Hiromitsu Kanehara','Kyung Kang','Tetsuji Kato','CJ Keith','Mark Kerr','Sergei Kharitonov','Dong Kim','Tsuyoshi Kohsaka','John Kolosci','Yuki Kondo','Dan Lauzon','David Lee','Kimo Leopoldo','Justin Levens','Scott Lighty','Hyun Lim','Lucio Linhares','Jorge Lopez','Ian Loveland','Waylon Lowe','Mike Lullo','Adam Lynn','Bill Mahood','Melvin Manhoef','Jose Maria','Wagner Martins','Daijiro Matsui','Gan McGee','Greg McIntyre','Dave Menne','Guy Mezger','Kristof Midoux','Pat Miletich','Curtis Millender','Micah Miller','Adam Milstead','Ikuhisa Minowa','Kazuo Misaki','Dokonjonosuke Mishima','Eiji Mitsuoka','Kazuyuki Miyata','Tatsuya Mizuno','Roxanne Modafferi','Nate Moore','Brandon Moreno','Sammy Morgan','Brad Morris','Anthony Morrison','Lee Murray','Daisuke Nakamura','Jutaro Nakao','Yui Nam','Fabio Nascimento','Bobby Nash','Pawel Nastula','Dustin Neace','Antonio Neto','Carlos Newton','Yosuke Nishijima',"TJ O'Brien",'Jorge Oliveira','Alexander Otsuka','Shungo Oyama','Alexandre Pantoja','Bryan Pardoe','Michael Patt','Julio Paulino','Joe Pearson','Rolando Perez','Ross Pointon','Chris Price','Niko Price','Benji Radach','Jordan Radev','Hector Ramirez','Luis Ramos','Kevin Randleman','Gideon Ray','Abdul Alhassan','Chad Reiner','Jason Reinhardt','Will Ribeiro','Matt Ricehouse','Pedro Rizzo','Buddy Roberts','Colin Robinson','Carlos Rocha','Ricco Rodriguez','Marcos Lima','Jake Rosholt','Murilo Rua','Gabe Ruediger','Anthony Ruiz','Ovince Preux','Kazushi Sakuraba','Hayato Sakurai','Ivan Salaverry','Sean Salmon','Diego Saraiva','Harris Sarmiento','Masaaki Satake','Lumumba Sayers','Fabiano Scherner','Samy Schiavo','Semmy Schilt','Alex Schoenauer','Andrei Semenov','Ivan Serati','Dan Severn','Frank Shamrock','Katsuyori Shibata','Akira Shoji','Assuerio Silva','Jay Silva','Douglas Andrade','Wes Sims','Lodune Sincaid','Maurice Smith','Rameau Sokoudjou','Bobby Southworth','Chris Spang','Pete Spratt','Dion Staring','Alex Stiebling','Tyler Stinson','Dan Stittgen','Denis Stojnic','Curtis Stout','Dave Strasser','Genki Sudo','Amar Suloev','Yoshiki Takahashi','Daiju Takase','Hiroyuki Takaya','Kiyoshi Tamura','Jesse Taylor','James Huna','Tra Telligman','David Terrell','James Terry','Din Thomas','Ryan Thomas','Noah Thomas','James Thompson','Hideo Tokoro','Anthony Torres','Ronys Torres','Bryan Travers','Yasuhiro Urushitani','Victor Valimaki','Andrew Valladerez','Mike Arsdale','Matt Buren','Matt Veach','Joe Vedepo','Joe Veres','Renato Verissimo','Ketlen Vieira','Steve Vigneault','Joey Villasenor','Falaniko Vitale','Jason Flue','Igor Vovchanchyn','Donny Walker','Crafton Wallace','Joe Warren','Mark Weir','Vernon White','Mike Whitehead','Justin Wilcox','Pete Williams','Eric Wisely','Travis Wiuff','Brandon Wolff','Justin Wren','Eddie Yagin','Hirotaka Yokoi','Dong Yoon','Hidehiko Yoshida','Trenell Young','Rob Yundt','Marius Zaromskis','Roman Zentsov']
intercepted_list = [x for x in full_list if x not in exclude_list]

In [22]:
df_5w.loc[:,'full name'] = df_5w['first'] + ' ' + df_5w['last']
df_5w = df_5w[~df_5w['full name'].isin(exclude_list)]

In [23]:
train_predictors = df_deltas_full.iloc[:,:-1]
train_response = df_deltas_full['result']
X = train_predictors.values
y = train_response.values
print(X.shape)
print(y.shape)

(2064, 11)
(2064,)


In [24]:
# percent_misclassified = 0
# for train_index, test_index in kf.split(X):
#     x_train, x_test = X[train_index], X[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     clf = RandomForestClassifier(n_jobs=-1, max_features=5, n_estimators=500)
#     y_pred = clf.fit(x_train, y_train).predict(x_test)
#     percent_misclassified+=100*(y_test != y_pred).sum()/float(x_test.shape[0])
# #     print("mislabeled out of %d obs : %0.2f"% (x_test.shape[0], 100*(y_test != y_pred).sum()/float(x_test.shape[0])))
# print("avg misclassification: %0.2f"%float(percent_misclassified/num_splits))

In [27]:
fighter1 = 'Conor Mcgregor'.split(' ')
fighter2 = 'Nate Diaz'.split(' ')
perm = permutations([fighter1, fighter2])
largest_proba = 0
largest_index = None
predrepo = []
for x in perm: 
    stat1 = df_5w[(df_5w['first'].str.lower() == x[0][0].lower()) &
        (df_5w['last'].str.lower() == x[0][1].lower())]
    stat2 = df_5w[(df_5w['first'].str.lower() == x[1][0].lower()) &
            (df_5w['last'].str.lower() == x[1][1].lower())]
    stat1.loc[:,'result'] = 'n/a'
    stat2.loc[:,'result'] = 'n/a'
    
    delta = generate_delta(stat1, stat2).iloc[:,:-1].values
    prediction = model.predict_proba(delta)
    predrepo+=list(prediction[0])
    
# largest_proba=np.max()
nppredrepo = np.array(predrepo)
win_idx = np.argmax(nppredrepo)

if win_idx==0:
    print(fighter2)
if win_idx==1:
    print(fighter1)
if win_idx==2:
    print(fighter1)
if win_idx==3:
    print(fighter2)
print(np.max(nppredrepo))

['Conor', 'Mcgregor']
0.5621999999999997


In [29]:
# For manual checking

fighter1 = 'Conor Mcgregor'.split(' ')
fighter2 = 'Nate Diaz'.split(' ')
stat1 = df_5w[(df_5w['first'].str.lower() == fighter1[0].lower()) &
        (df_5w['last'].str.lower() == fighter1[1].lower())]
stat2 = df_5w[(df_5w['first'].str.lower() == fighter2[0].lower()) &
        (df_5w['last'].str.lower() == fighter2[1].lower())]
stat1.loc[:,'result'] = 'n/a'
stat2.loc[:,'result'] = 'n/a'

delta = generate_delta(stat1, stat2).iloc[:,:-1].values
prediction = model.predict_proba(delta)
prediction

array([[0.5622, 0.4378]])