In [198]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model

In [227]:
df = pd.read_csv('Data/model_ready.csv', index_col = 0)

In [228]:
xgb_model = joblib.load('Models/xgb_clf')
scaler = joblib.load('Models/scaler')

In [229]:
keras_model = load_model('Models/mlp')

In [203]:
df.columns

Index(['Winner', 'R_fighter', 'Fighter', 'Height', 'Reach', 'Stance', 'Weight',
       'age', 'loss', 'win',
       ...
       'Open Stance', 'Orthodox', 'Sideways', 'Southpaw', 'Switch',
       'Open Stance.1', 'Orthodox.1', 'Sideways.1', 'Southpaw.1', 'Switch.1'],
      dtype='object', length=124)

In [204]:
#Relevant cumulative statistics of fighter's most recent fight:
fighter_columns = ['Fighter', 'Height', 'Reach', 'Stance', 'Weight',
       'age', 'loss', 'win', 'draws', 'streak', 'cum_match_time', 'avg_cum_KD',
       'avg_cum_Sub. att', 'avg_cum_Pass', 'avg_cum_Rev.',
       'avg_cum_Sig. str. Hits', 'avg_cum_Sig. str. Attempts',
       'avg_cum_Total str. Hits', 'avg_cum_Total str. Attempts',
       'avg_cum_Td Hits', 'avg_cum_Td Attempts', 'avg_cum_Head Hits',
       'avg_cum_Head Attempts', 'avg_cum_Body Hits', 'avg_cum_Body Attempts',
       'avg_cum_Leg Hits', 'avg_cum_Leg Attempts', 'avg_cum_Distance Hits',
       'avg_cum_Distance Attempts', 'avg_cum_Clinch Hits',
       'avg_cum_Clinch Attempts', 'avg_cum_Ground Hits',
       'avg_cum_Ground Attempts', 'eff_sig_str', 'eff_hits', 'eff_tds',
       'eff_head', 'eff_leg', 'eff_body', 'eff_distance', 'eff_clinch',
       'eff_ground', 'matches', 'Open Stance', 'Orthodox', 'Sideways', 'Southpaw', 'Switch']

#Relevant cumulative statistics of fighter.1's most recent fight:
fighter_1_columns = ['Fighter.1', 'Height.1', 'Reach.1', 'Stance.1',
       'Weight.1', 'age.1', 'loss.1', 'win.1', 'draws.1', 'streak.1',
       'cum_match_time.1', 'avg_cum_KD.1', 'avg_cum_Sub. att.1',
       'avg_cum_Pass.1', 'avg_cum_Rev..1', 'avg_cum_Sig. str. Hits.1',
       'avg_cum_Sig. str. Attempts.1', 'avg_cum_Total str. Hits.1',
       'avg_cum_Total str. Attempts.1', 'avg_cum_Td Hits.1',
       'avg_cum_Td Attempts.1', 'avg_cum_Head Hits.1',
       'avg_cum_Head Attempts.1', 'avg_cum_Body Hits.1',
       'avg_cum_Body Attempts.1', 'avg_cum_Leg Hits.1',
       'avg_cum_Leg Attempts.1', 'avg_cum_Distance Hits.1',
       'avg_cum_Distance Attempts.1', 'avg_cum_Clinch Hits.1',
       'avg_cum_Clinch Attempts.1', 'avg_cum_Ground Hits.1',
       'avg_cum_Ground Attempts.1', 'eff_sig_str.1', 'eff_hits.1', 'eff_tds.1',
       'eff_head.1', 'eff_leg.1', 'eff_body.1', 'eff_distance.1',
       'eff_clinch.1', 'eff_ground.1', 'matches.1', 'Open Stance.1', 'Orthodox.1', 'Sideways.1', 'Southpaw.1', 'Switch.1']

#List of weight classes
weight_classes = ['Bantamweight', 'Catch Weight', 'Featherweight', 'Flyweight',
       'Heavyweight', 'Light Heavyweight', 'Lightweight', 'Middleweight',
       'Open Weight', 'Super Heavyweight', 'Welterweight',
       "Women's Bantamweight", "Women's Featherweight", "Women's Flyweight",
       "Women's Strawweight"]

In [205]:
#Helper function that determines if fighter is Fighter or Fighter.1 in a particular match
def zero_or_one_fighter(match, fighter):
    if match['Fighter'] == fighter:
        return 0
    else:
        return 1

#Helper function that adds the correct one-hot-encoded weight_class information:
def get_weight_class(wt_class):
    weight = pd.DataFrame([0]*len(weight_classes)).T
    weight.columns = weight_classes
    
    try:
        weight[wt_class] = 1
        return weight
    except:
        return weight

#Retrieve complete statistics of fighter's most recent fight
def get_recent(fighter):
    temp = df[df['Fighter'] == fighter]
    temp_1 = df[df['Fighter.1'] == fighter]
    
    combined = pd.concat([temp, temp_1], axis = 0).sort_index()
    return combined.iloc[0, :]


    
#Returns a details of a hypothetical match between fighter and fighter_1 using cumulative statistics
#of their most recent match:
def get_match(fighter, fighter_1):
    
    recent = get_recent(fighter)
    recent_1 = get_recent(fighter_1)
    
    if zero_or_one_fighter(recent, fighter) == 0:
        recent_stats = recent[fighter_columns]
    else:
        recent_stats = recent[fighter_1_columns]
        
    if zero_or_one_fighter(recent_1, fighter_1) == 0:
        recent_stats_1 = recent_1[fighter_columns]
    else:
        recent_stats_1 = recent_1[fighter_1_columns]
        
    combined = pd.DataFrame(pd.concat([recent_stats, recent_stats_1], axis = 0)).T
    combined.columns = fighter_columns+fighter_1_columns
    
    return combined

In [207]:
drop_columns = ['Fighter', 'Height', 'Reach', 'Stance', 'Weight', 'Fighter.1', 
                'Height.1', 'Reach.1', 'Stance.1', 'Weight.1']

In [208]:
def to_model_format(match, weight):
    match['Reach_diff'] = match['Reach'] - match['Reach.1']
    match['Height_diff'] = match['Height'] - match['Height.1']
    match['Weight_diff'] = match['Weight'] - match['Weight.1']

    match = pd.concat([match, get_weight_class(weight)], axis = 1)

    
    match.drop(drop_columns, inplace = True, axis = 1)

    return match


In [209]:
test_match = get_match('Charles Oliveira', 'Gilbert Burns')
to_model_format(test_match, '')

Unnamed: 0,age,loss,win,draws,streak,cum_match_time,avg_cum_KD,avg_cum_Sub. att,avg_cum_Pass,avg_cum_Rev.,...,Lightweight,Middleweight,Open Weight,Super Heavyweight,Welterweight,Women's Bantamweight,Women's Featherweight,Women's Flyweight,Women's Strawweight,Unnamed: 21
0,30.43,8,16,1,6,162.18,0.25,1.85,2.1,0.25,...,0,0,0,0,0,0,0,0,0,1


# New Match Pipeline Scrape

In [210]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime as dt

In [211]:
url = 'http://www.ufcstats.com/event-details/5f8e00c27b7e7410'

In [212]:
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [213]:
#Removes multiple white spaces (for better implementation see gensim's preprocessing utils)
def remove_space_lines(text):
    pattern1 = re.compile(r'[\s\s+]')
    return re.sub(pattern1, ' ', text)

def get_fight_auxiliary(soup):
    '''
    Input: beautifulsoup of an event url: (ie. http://www.ufcstats.com/event-details/53278852bcd91e11)
    Outputs: pandas Series
        date, location, attendance
    '''
    
    table = []
    
    auxiliary_table = soup.find_all('li', {'class': 'b-list__box-list-item'})
    for item in auxiliary_table:
        attribute = remove_space_lines(item.text).strip()

        #If attribute is missing, replace with ''
        try:
            attribute = re.findall(r'\s\s+(.*)', attribute)[0]
        except:
            attribute = '' 
        
        table.append(attribute)
        
    table_series = pd.Series(table)
    table_series.index = ['date', 'location', 'attendance']
    
    if table_series['attendance'] != '':
        table_series['attendance'] = re.sub(',', '', table_series['attendance'])
        table_series['attendance'] = int(table_series['attendance'])
    
    table_series['date'] = dt.strptime(table_series['date'], '%B %d, %Y').strftime('%d-%m-%Y')

    return table_series

#Determine if observation is a title-bout
def find_belt(img_tag):
    try:
        image_link = img_tag['src']
        if re.match(r'.*belt.*', image_link) != None:
            return True
    except:
        return False

In [214]:
table_rows = soup.find_all('p', {'class': 'b-fight-details__table-text'})

In [215]:
name_class = []

for i in table_rows:
    name_class.append(remove_space_lines(i.text).strip())
    
name_class = [i for i in name_class if i != '']
name_class = np.array(name_class).reshape(-1, 4)

name_class = pd.DataFrame(name_class, columns = ['Fighter', 'Fighter.1', 'Drop', 'Weight_class'])
name_class.drop('Drop', inplace = True, axis = 1)
name_class

Unnamed: 0,Fighter,Fighter.1,Weight_class
0,Anthony Smith,Glover Teixeira,Light Heavyweight
1,Ben Rothwell,Ovince Saint Preux,Heavyweight
2,Alexander Hernandez,Drew Dober,Lightweight
3,Ricky Simon,Ray Borg,Bantamweight
4,Karl Roberson,Marvin Vettori,Middleweight
5,Andrei Arlovski,Philipe Lins,Heavyweight
6,Michael Johnson,Thiago Moises,Lightweight
7,Sijara Eubanks,Sarah Moras,Women's Bantamweight
8,Gabriel Benitez,Omar Morales,Lightweight
9,Hunter Azure,Brian Kelleher,Featherweight


# Predictions:

In [216]:
new_matches = pd.DataFrame()

for row in match_class.iterrows():
    entry = row[1]
    try:
        print(entry['Fighter.1'])
        print(entry['Fighter'])
        match = get_match(entry['Fighter'], entry['Fighter.1'])    
        final = to_model_format(match, entry['Weight_class'])   
        new_matches = new_matches.append(final)
    except:
        pass


Glover Teixeira
Anthony Smith
Ovince Saint Preux
Ben Rothwell
Drew Dober
Alexander Hernandez
Ray Borg
Ricky Simon
Marvin Vettori
Karl Roberson
Philipe Lins
Andrei Arlovski
Thiago Moises
Michael Johnson
Sarah Moras
Sijara Eubanks
Omar Morales
Gabriel Benitez
Brian Kelleher
Hunter Azure
Ike Villanueva
Chase Sherman


In [217]:
new_matches = new_matches.astype(float)
new_matches['title_bout'] = 0
new_matches['num_rounds'] = 3

orig_columns = ['age', 'loss', 'win', 'draws', 'streak', 'cum_match_time', 'avg_cum_KD', 'avg_cum_Sub. att', 
                'avg_cum_Pass', 'avg_cum_Rev.', 'avg_cum_Sig. str. Hits', 'avg_cum_Sig. str. Attempts', 
                'avg_cum_Total str. Hits', 'avg_cum_Total str. Attempts', 'avg_cum_Td Hits', 'avg_cum_Td Attempts', 
                'avg_cum_Head Hits', 'avg_cum_Head Attempts', 'avg_cum_Body Hits', 'avg_cum_Body Attempts', 
                'avg_cum_Leg Hits', 'avg_cum_Leg Attempts', 'avg_cum_Distance Hits', 'avg_cum_Distance Attempts', 
                'avg_cum_Clinch Hits', 'avg_cum_Clinch Attempts', 'avg_cum_Ground Hits', 'avg_cum_Ground Attempts', 
                'eff_sig_str', 'eff_hits', 'eff_tds', 'eff_head', 'eff_leg', 'eff_body', 'eff_distance', 'eff_clinch', 
                'eff_ground', 'age.1', 'loss.1', 'win.1', 'draws.1', 'streak.1', 'cum_match_time.1', 'avg_cum_KD.1', 
                'avg_cum_Sub. att.1', 'avg_cum_Pass.1', 'avg_cum_Rev..1', 'avg_cum_Sig. str. Hits.1', 
                'avg_cum_Sig. str. Attempts.1', 'avg_cum_Total str. Hits.1', 'avg_cum_Total str. Attempts.1', 
                'avg_cum_Td Hits.1', 'avg_cum_Td Attempts.1', 'avg_cum_Head Hits.1', 'avg_cum_Head Attempts.1', 
                'avg_cum_Body Hits.1', 'avg_cum_Body Attempts.1', 'avg_cum_Leg Hits.1', 'avg_cum_Leg Attempts.1', 
                'avg_cum_Distance Hits.1', 'avg_cum_Distance Attempts.1', 'avg_cum_Clinch Hits.1', 'avg_cum_Clinch Attempts.1',
                'avg_cum_Ground Hits.1', 'avg_cum_Ground Attempts.1', 'eff_sig_str.1', 'eff_hits.1', 'eff_tds.1', 'eff_head.1', 
                'eff_leg.1', 'eff_body.1', 'eff_distance.1', 'eff_clinch.1', 'eff_ground.1', 'title_bout', 'num_rounds', 
                'matches', 'matches.1', 'Reach_diff', 'Weight_diff', 'Height_diff', 'Bantamweight', 'Catch Weight', 
                'Featherweight', 'Flyweight', 'Heavyweight', 'Light Heavyweight', 'Lightweight', 'Middleweight', 'Open Weight', 
                'Super Heavyweight', 'Welterweight', "Women's Bantamweight", "Women's Featherweight", "Women's Flyweight", 
                "Women's Strawweight", 'Open Stance', 'Orthodox', 'Sideways', 'Southpaw', 'Switch', 'Open Stance.1', 
                'Orthodox.1', 'Sideways.1', 'Southpaw.1', 'Switch.1']

scale_columns = ['age','loss','win','draws','streak','cum_match_time','avg_cum_KD',
                 'avg_cum_Sub. att', 'avg_cum_Pass','avg_cum_Rev.','avg_cum_Sig. str. Hits','avg_cum_Sig. str. Attempts',
                 'avg_cum_Total str. Hits','avg_cum_Total str. Attempts','avg_cum_Td Hits','avg_cum_Td Attempts',
                 'avg_cum_Head Hits','avg_cum_Head Attempts','avg_cum_Body Hits','avg_cum_Body Attempts',
                 'avg_cum_Leg Hits','avg_cum_Leg Attempts','avg_cum_Distance Hits','avg_cum_Distance Attempts',
                 'avg_cum_Clinch Hits','avg_cum_Clinch Attempts','avg_cum_Ground Hits','avg_cum_Ground Attempts',
                 'eff_sig_str','eff_hits','eff_tds','eff_head','eff_leg','eff_body','eff_distance','eff_clinch','eff_ground',
                 'age.1','loss.1','win.1','draws.1','streak.1','cum_match_time.1',
                 'avg_cum_KD.1','avg_cum_Sub. att.1','avg_cum_Pass.1','avg_cum_Rev..1','avg_cum_Sig. str. Hits.1',
                 'avg_cum_Sig. str. Attempts.1','avg_cum_Total str. Hits.1','avg_cum_Total str. Attempts.1',
                 'avg_cum_Td Hits.1','avg_cum_Td Attempts.1','avg_cum_Head Hits.1','avg_cum_Head Attempts.1',
                 'avg_cum_Body Hits.1','avg_cum_Body Attempts.1','avg_cum_Leg Hits.1','avg_cum_Leg Attempts.1',
                 'avg_cum_Distance Hits.1','avg_cum_Distance Attempts.1','avg_cum_Clinch Hits.1','avg_cum_Clinch Attempts.1',
                 'avg_cum_Ground Hits.1','avg_cum_Ground Attempts.1','eff_sig_str.1','eff_hits.1','eff_tds.1','eff_head.1',
                 'eff_leg.1','eff_body.1','eff_distance.1','eff_clinch.1','eff_ground.1','num_rounds','matches',
                 'matches.1','Reach_diff','Weight_diff','Height_diff'
]

In [218]:
new_matches = new_matches[orig_columns]
new_matches

Unnamed: 0,age,loss,win,draws,streak,cum_match_time,avg_cum_KD,avg_cum_Sub. att,avg_cum_Pass,avg_cum_Rev.,...,Open Stance,Orthodox,Sideways,Southpaw,Switch,Open Stance.1,Orthodox.1,Sideways.1,Southpaw.1,Switch.1
0,30.87,4.0,7.0,0.0,-1.0,114.58,0.44,0.35,0.44,0.17,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,38.16,6.0,6.0,0.0,-3.0,123.08,0.24,0.32,1.3,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
0,26.82,1.0,2.0,0.0,-1.0,24.42,0.41,0.0,1.23,0.41,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
0,27.28,1.0,3.0,0.0,-1.0,45.77,0.44,0.22,1.75,0.22,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,29.12,2.0,3.0,0.0,1.0,41.78,0.24,0.48,1.68,0.72,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
0,33.42,10.0,11.0,0.0,-1.0,242.38,0.37,0.04,0.0,0.12,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
0,34.42,1.0,2.0,0.0,-1.0,45.0,0.22,0.0,1.33,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,31.19,2.0,5.0,0.0,2.0,65.7,0.3,1.07,0.3,0.15,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
0,27.55,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.67,0.67,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [230]:
scaled = scaler.transform(new_matches[scale_columns])
xgb_predictions = pd.Series(xgb_model.predict_proba(new_matches)[:, 1])
nn_predictions = keras_model.predict(scaled)

In [231]:
def get_winner(names, predictions):
    
    winner = []
    loser = []
    
    for name, predict in zip(names.iterrows(), predictions):
        if predict >= .5:
            winner.append(name[1]['Fighter.1'])
            loser.append(name[1]['Fighter'])
        else:
            winner.append(name[1]['Fighter'])
            loser.append(name[1]['Fighter.1'])
            
    display = pd.DataFrame([winner, loser]).T
    display.columns = ['Winner', 'Loser']
            
    return display
    
    

In [232]:
rf_predictions

0    0.52
1    0.40
2    0.30
3    0.42
4    0.42
5    0.58
6    0.56
7    0.67
8    0.63
dtype: float64

In [233]:
nn_predictions

array([[0.47836375],
       [0.435794  ],
       [0.4294776 ],
       [0.45306876],
       [0.447289  ],
       [0.56033874],
       [0.5356354 ],
       [0.46809778],
       [0.60952234]], dtype=float32)

In [234]:
get_winner(name_class, nn_predictions)

Unnamed: 0,Winner,Loser
0,Anthony Smith,Glover Teixeira
1,Ben Rothwell,Ovince Saint Preux
2,Alexander Hernandez,Drew Dober
3,Ricky Simon,Ray Borg
4,Karl Roberson,Marvin Vettori
5,Philipe Lins,Andrei Arlovski
6,Thiago Moises,Michael Johnson
7,Sijara Eubanks,Sarah Moras
8,Omar Morales,Gabriel Benitez


In [236]:
get_winner(name_class, xgb_predictions)

Unnamed: 0,Winner,Loser
0,Anthony Smith,Glover Teixeira
1,Ben Rothwell,Ovince Saint Preux
2,Alexander Hernandez,Drew Dober
3,Ricky Simon,Ray Borg
4,Marvin Vettori,Karl Roberson
5,Andrei Arlovski,Philipe Lins
6,Thiago Moises,Michael Johnson
7,Sijara Eubanks,Sarah Moras
8,Omar Morales,Gabriel Benitez
