In [1]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model

In [2]:
df = pd.read_csv('Data/model_ready.csv', index_col = 0)

In [3]:
xgb_model = joblib.load('xgboost_clf')
scaler = joblib.load('scaler')

In [4]:
keras_model = load_model('mlp')

In [5]:
#Relevant cumulative statistics of fighter's most recent fight:
fighter_columns = ['Fighter', 'Height', 'Reach', 'Stance', 'Weight',
       'age', 'loss', 'win', 'draws', 'streak', 'cum_match_time', 'avg_cum_KD',
       'avg_cum_Sub. att', 'avg_cum_Pass', 'avg_cum_Rev.',
       'avg_cum_Sig. str. Hits', 'avg_cum_Sig. str. Attempts',
       'avg_cum_Total str. Hits', 'avg_cum_Total str. Attempts',
       'avg_cum_Td Hits', 'avg_cum_Td Attempts', 'avg_cum_Head Hits',
       'avg_cum_Head Attempts', 'avg_cum_Body Hits', 'avg_cum_Body Attempts',
       'avg_cum_Leg Hits', 'avg_cum_Leg Attempts', 'avg_cum_Distance Hits',
       'avg_cum_Distance Attempts', 'avg_cum_Clinch Hits',
       'avg_cum_Clinch Attempts', 'avg_cum_Ground Hits',
       'avg_cum_Ground Attempts', 'eff_sig_str', 'eff_hits', 'eff_tds',
       'eff_head', 'eff_leg', 'eff_body', 'eff_distance', 'eff_clinch',
       'eff_ground', 'matches', 'Open Stance', 'Orthodox', 'Sideways', 'Southpaw', 'Switch']

#Relevant cumulative statistics of fighter.1's most recent fight:
fighter_1_columns = ['Fighter.1', 'Height.1', 'Reach.1', 'Stance.1',
       'Weight.1', 'age.1', 'loss.1', 'win.1', 'draws.1', 'streak.1',
       'cum_match_time.1', 'avg_cum_KD.1', 'avg_cum_Sub. att.1',
       'avg_cum_Pass.1', 'avg_cum_Rev..1', 'avg_cum_Sig. str. Hits.1',
       'avg_cum_Sig. str. Attempts.1', 'avg_cum_Total str. Hits.1',
       'avg_cum_Total str. Attempts.1', 'avg_cum_Td Hits.1',
       'avg_cum_Td Attempts.1', 'avg_cum_Head Hits.1',
       'avg_cum_Head Attempts.1', 'avg_cum_Body Hits.1',
       'avg_cum_Body Attempts.1', 'avg_cum_Leg Hits.1',
       'avg_cum_Leg Attempts.1', 'avg_cum_Distance Hits.1',
       'avg_cum_Distance Attempts.1', 'avg_cum_Clinch Hits.1',
       'avg_cum_Clinch Attempts.1', 'avg_cum_Ground Hits.1',
       'avg_cum_Ground Attempts.1', 'eff_sig_str.1', 'eff_hits.1', 'eff_tds.1',
       'eff_head.1', 'eff_leg.1', 'eff_body.1', 'eff_distance.1',
       'eff_clinch.1', 'eff_ground.1', 'matches.1', 'Open Stance.1', 'Orthodox.1', 'Sideways.1', 'Southpaw.1', 'Switch.1']

#List of weight classes
weight_classes = ['Bantamweight', 'Catch Weight', 'Featherweight', 'Flyweight',
       'Heavyweight', 'Light Heavyweight', 'Lightweight', 'Middleweight',
       'Open Weight', 'Super Heavyweight', 'Welterweight',
       "Women's Bantamweight", "Women's Featherweight", "Women's Flyweight",
       "Women's Strawweight"]

In [10]:
#Helper function that determines if fighter is Fighter or Fighter.1 in a particular match
def zero_or_one_fighter(match, fighter):
    if match['Fighter'] == fighter:
        return 0
    else:
        return 1

#Helper function that adds the correct one-hot-encoded weight_class information:
def get_weight_class(wt_class):
    weight = pd.DataFrame([0]*len(weight_classes)).T
    weight.columns = weight_classes
    
    try:
        weight[wt_class] = 1
        return weight
    except:
        return weight

#Retrieve complete statistics of fighter's most recent fight
def get_recent(fighter):
    temp = df[df['Fighter'] == fighter]
    temp_1 = df[df['Fighter.1'] == fighter]
    
    combined = pd.concat([temp, temp_1], axis = 0).sort_index()
    return combined.iloc[0, :]


    
#Returns a details of a hypothetical match between fighter and fighter_1 using cumulative statistics
#of their most recent match:
def get_match(fighter, fighter_1):
    
    recent = get_recent(fighter)
    recent_1 = get_recent(fighter_1)
    
    if zero_or_one_fighter(recent, fighter) == 0:
        recent_stats = recent[fighter_columns]
    else:
        recent_stats = recent[fighter_1_columns]
        
    if zero_or_one_fighter(recent_1, fighter_1) == 0:
        recent_stats_1 = recent_1[fighter_columns]
    else:
        recent_stats_1 = recent_1[fighter_1_columns]
        
    combined = pd.DataFrame(pd.concat([recent_stats, recent_stats_1], axis = 0)).T
    combined.columns = fighter_columns+fighter_1_columns
    
    return combined

In [11]:
drop_columns = ['Fighter', 'Height', 'Reach', 'Stance', 'Weight', 'Fighter.1', 
                'Height.1', 'Reach.1', 'Stance.1', 'Weight.1']

In [23]:
def to_model_format(match, weight):
    match['Reach_diff'] = match['Reach'] - match['Reach.1']
    match['Height_diff'] = match['Height'] - match['Height.1']
    match['Weight_diff'] = match['Weight'] - match['Weight.1']

    match = pd.concat([match, get_weight_class(weight)], axis = 1)

    
    match.drop(drop_columns, inplace = True, axis = 1)

    return match


In [24]:
test_match = get_match('Charles Oliveira', 'Gilbert Burns')
to_model_format(test_match, '')

Unnamed: 0,age,loss,win,draws,streak,cum_match_time,avg_cum_KD,avg_cum_Sub. att,avg_cum_Pass,avg_cum_Rev.,...,Lightweight,Middleweight,Open Weight,Super Heavyweight,Welterweight,Women's Bantamweight,Women's Featherweight,Women's Flyweight,Women's Strawweight,Unnamed: 21
0,30.43,8,16,1,6,162.18,0.25,1.85,2.1,0.25,...,0,0,0,0,0,0,0,0,0,1


# New Match Pipeline Scrape

In [18]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime as dt

In [19]:
url = 'http://www.ufcstats.com/event-details/5f8e00c27b7e7410'

In [20]:
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

In [21]:
#Removes multiple white spaces (for better implementation see gensim's preprocessing utils)
def remove_space_lines(text):
    pattern1 = re.compile(r'[\s\s+]')
    return re.sub(pattern1, ' ', text)

def get_fight_auxiliary(soup):
    '''
    Input: beautifulsoup of an event url: (ie. http://www.ufcstats.com/event-details/53278852bcd91e11)
    Outputs: pandas Series
        date, location, attendance
    '''
    
    table = []
    
    auxiliary_table = soup.find_all('li', {'class': 'b-list__box-list-item'})
    for item in auxiliary_table:
        attribute = remove_space_lines(item.text).strip()

        #If attribute is missing, replace with ''
        try:
            attribute = re.findall(r'\s\s+(.*)', attribute)[0]
        except:
            attribute = '' 
        
        table.append(attribute)
        
    table_series = pd.Series(table)
    table_series.index = ['date', 'location', 'attendance']
    
    if table_series['attendance'] != '':
        table_series['attendance'] = re.sub(',', '', table_series['attendance'])
        table_series['attendance'] = int(table_series['attendance'])
    
    table_series['date'] = dt.strptime(table_series['date'], '%B %d, %Y').strftime('%d-%m-%Y')

    return table_series

#Determine if observation is a title-bout
def find_belt(img_tag):
    try:
        image_link = img_tag['src']
        if re.match(r'.*belt.*', image_link) != None:
            return True
    except:
        return False

In [22]:
name_classes = []

for i in classes:
    name_classes.append(remove_space_lines(i.text).strip())
    
name_classes = [i for i in name_classes if i != '']
match_class = np.array(name_classes).reshape(-1, 4)

NameError: name 'classes' is not defined

In [290]:
new = pd.read_csv('new_fights.csv', index_col = 0)
new

Unnamed: 0,R_fighter,B_fighter,WEIGHT_CLASS
0,Justin Gaethje,Tony Ferguson,Lightweight
1,Henry Cejudo,Dominick Cruz,Bantamweight
2,Francis Ngannou,Jairzinho Rozenstruik,Heavyweight
3,Calvin Kattar,Jeremy Stephens,Featherweight
4,Greg Hardy,Yorgan De Castro,Heavyweight
5,Anthony Pettis,Donald Cerrone,Welterweight
6,Aleksei Oleinik,Fabricio Werdum,Heavyweight
7,Carla Esparza,Michelle Waterson,Women's Strawweight
8,Vicente Luque,Niko Price,Welterweight
9,Bryce Mitchell,Charles Rosa,Featherweight


In [328]:
new_matches = pd.DataFrame()

for row in new.iterrows():
    try:
        match = get_match(row[1].R_fighter, row[1].B_fighter)
        final = to_model_format(match, row[1].WEIGHT_CLASS)
        new_matches = new_matches.append(final)
    except:
        pass


for ar in match_class:
    try:
        match = get_match(ar[0], ar[1])    
        final = to_model_format(match, ar[3])   
        new_matches = new_matches.append(final)
    except:
        pass



In [329]:
new_matches

Unnamed: 0,age,loss,win,draws,streak,cum_match_time,avg_cum_KD,avg_cum_Sub. att,avg_cum_Pass,avg_cum_Rev.,...,Light Heavyweight,Lightweight,Middleweight,Open Weight,Super Heavyweight,Welterweight,Women's Bantamweight,Women's Featherweight,Women's Flyweight,Women's Strawweight
0,30.85,2,3,0,2,43.28,0.46,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
0,32.35,2,8,0,4,123.77,0.32,0.08,0.97,0.0,...,0,0,0,0,0,0,0,0,0,0
0,32.84,2,8,0,2,65.81,0.61,0.3,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0,31.64,1,4,0,2,48.81,0.61,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0,31.3,1,2,1,-1,25.47,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0,33.0,8,9,0,-1,195.06,0.21,0.56,0.72,0.51,...,0,0,0,0,0,1,0,0,0,0
0,42.61,4,6,0,-2,50.11,0.4,1.6,2.0,0.2,...,0,0,0,0,0,0,0,0,0,0
0,31.97,4,5,0,1,125.26,0.0,0.24,1.44,0.24,...,0,0,0,0,0,0,0,0,0,1
0,27.95,2,10,0,6,94.45,0.74,0.74,0.85,0.11,...,0,0,0,0,0,1,0,0,0,0
0,25.19,0,2,0,2,30.0,0.0,1.0,0.33,0.33,...,0,0,0,0,0,0,0,0,0,0


In [330]:
new_matches = new_matches.astype(float)
new_matches['title_bout'] = 0
new_matches['num_rounds'] = 3

orig_columns = ['age', 'loss', 'win', 'draws', 'streak', 'cum_match_time', 'avg_cum_KD', 'avg_cum_Sub. att', 
                'avg_cum_Pass', 'avg_cum_Rev.', 'avg_cum_Sig. str. Hits', 'avg_cum_Sig. str. Attempts', 
                'avg_cum_Total str. Hits', 'avg_cum_Total str. Attempts', 'avg_cum_Td Hits', 'avg_cum_Td Attempts', 
                'avg_cum_Head Hits', 'avg_cum_Head Attempts', 'avg_cum_Body Hits', 'avg_cum_Body Attempts', 
                'avg_cum_Leg Hits', 'avg_cum_Leg Attempts', 'avg_cum_Distance Hits', 'avg_cum_Distance Attempts', 
                'avg_cum_Clinch Hits', 'avg_cum_Clinch Attempts', 'avg_cum_Ground Hits', 'avg_cum_Ground Attempts', 
                'eff_sig_str', 'eff_hits', 'eff_tds', 'eff_head', 'eff_leg', 'eff_body', 'eff_distance', 'eff_clinch', 
                'eff_ground', 'age.1', 'loss.1', 'win.1', 'draws.1', 'streak.1', 'cum_match_time.1', 'avg_cum_KD.1', 
                'avg_cum_Sub. att.1', 'avg_cum_Pass.1', 'avg_cum_Rev..1', 'avg_cum_Sig. str. Hits.1', 
                'avg_cum_Sig. str. Attempts.1', 'avg_cum_Total str. Hits.1', 'avg_cum_Total str. Attempts.1', 
                'avg_cum_Td Hits.1', 'avg_cum_Td Attempts.1', 'avg_cum_Head Hits.1', 'avg_cum_Head Attempts.1', 
                'avg_cum_Body Hits.1', 'avg_cum_Body Attempts.1', 'avg_cum_Leg Hits.1', 'avg_cum_Leg Attempts.1', 
                'avg_cum_Distance Hits.1', 'avg_cum_Distance Attempts.1', 'avg_cum_Clinch Hits.1', 'avg_cum_Clinch Attempts.1',
                'avg_cum_Ground Hits.1', 'avg_cum_Ground Attempts.1', 'eff_sig_str.1', 'eff_hits.1', 'eff_tds.1', 'eff_head.1', 
                'eff_leg.1', 'eff_body.1', 'eff_distance.1', 'eff_clinch.1', 'eff_ground.1', 'title_bout', 'num_rounds', 
                'matches', 'matches.1', 'Reach_diff', 'Weight_diff', 'Height_diff', 'Bantamweight', 'Catch Weight', 
                'Featherweight', 'Flyweight', 'Heavyweight', 'Light Heavyweight', 'Lightweight', 'Middleweight', 'Open Weight', 
                'Super Heavyweight', 'Welterweight', "Women's Bantamweight", "Women's Featherweight", "Women's Flyweight", 
                "Women's Strawweight", 'Open Stance', 'Orthodox', 'Sideways', 'Southpaw', 'Switch', 'Open Stance.1', 
                'Orthodox.1', 'Sideways.1', 'Southpaw.1', 'Switch.1']

scale_columns = ['age','loss','win','draws','streak','cum_match_time','avg_cum_KD',
                 'avg_cum_Sub. att', 'avg_cum_Pass','avg_cum_Rev.','avg_cum_Sig. str. Hits','avg_cum_Sig. str. Attempts',
                 'avg_cum_Total str. Hits','avg_cum_Total str. Attempts','avg_cum_Td Hits','avg_cum_Td Attempts',
                 'avg_cum_Head Hits','avg_cum_Head Attempts','avg_cum_Body Hits','avg_cum_Body Attempts',
                 'avg_cum_Leg Hits','avg_cum_Leg Attempts','avg_cum_Distance Hits','avg_cum_Distance Attempts',
                 'avg_cum_Clinch Hits','avg_cum_Clinch Attempts','avg_cum_Ground Hits','avg_cum_Ground Attempts',
                 'eff_sig_str','eff_hits','eff_tds','eff_head','eff_leg','eff_body','eff_distance','eff_clinch','eff_ground',
                 'age.1','loss.1','win.1','draws.1','streak.1','cum_match_time.1',
                 'avg_cum_KD.1','avg_cum_Sub. att.1','avg_cum_Pass.1','avg_cum_Rev..1','avg_cum_Sig. str. Hits.1',
                 'avg_cum_Sig. str. Attempts.1','avg_cum_Total str. Hits.1','avg_cum_Total str. Attempts.1',
                 'avg_cum_Td Hits.1','avg_cum_Td Attempts.1','avg_cum_Head Hits.1','avg_cum_Head Attempts.1',
                 'avg_cum_Body Hits.1','avg_cum_Body Attempts.1','avg_cum_Leg Hits.1','avg_cum_Leg Attempts.1',
                 'avg_cum_Distance Hits.1','avg_cum_Distance Attempts.1','avg_cum_Clinch Hits.1','avg_cum_Clinch Attempts.1',
                 'avg_cum_Ground Hits.1','avg_cum_Ground Attempts.1','eff_sig_str.1','eff_hits.1','eff_tds.1','eff_head.1',
                 'eff_leg.1','eff_body.1','eff_distance.1','eff_clinch.1','eff_ground.1','num_rounds','matches',
                 'matches.1','Reach_diff','Weight_diff','Height_diff'
]

In [331]:
new_matches = new_matches[orig_columns]
new_matches

Unnamed: 0,age,loss,win,draws,streak,cum_match_time,avg_cum_KD,avg_cum_Sub. att,avg_cum_Pass,avg_cum_Rev.,...,Open Stance,Orthodox,Sideways,Southpaw,Switch,Open Stance.1,Orthodox.1,Sideways.1,Southpaw.1,Switch.1
0,30.85,2.0,3.0,0.0,2.0,43.28,0.46,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,32.35,2.0,8.0,0.0,4.0,123.77,0.32,0.08,0.97,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,32.84,2.0,8.0,0.0,2.0,65.81,0.61,0.3,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,31.64,1.0,4.0,0.0,2.0,48.81,0.61,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,31.3,1.0,2.0,1.0,-1.0,25.47,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,33.0,8.0,9.0,0.0,-1.0,195.06,0.21,0.56,0.72,0.51,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,42.61,4.0,6.0,0.0,-2.0,50.11,0.4,1.6,2.0,0.2,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,31.97,4.0,5.0,0.0,1.0,125.26,0.0,0.24,1.44,0.24,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,27.95,2.0,10.0,0.0,6.0,94.45,0.74,0.74,0.85,0.11,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
0,25.19,0.0,2.0,0.0,2.0,30.0,0.0,1.0,0.33,0.33,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [332]:
scaled = scaler.transform(new_matches[scale_columns])

In [338]:
predictions = pd.Series(xgb_model.predict(new_matches))

In [341]:
for row, predict in zip(new.iterrows(), predictions[0:new.shape[0]]):
    if predict == 0:
        print(row[1].R_fighter)
    else:
        print(row[1].B_fighter)
    

Justin Gaethje
Henry Cejudo
Francis Ngannou
Calvin Kattar
Greg Hardy
Anthony Pettis
Fabricio Werdum
Carla Esparza
Vicente Luque
Charles Rosa
Ryan Spann


In [342]:
keras_model.predict(scaled)

array([[0.684282  ],
       [0.5888426 ],
       [0.5021188 ],
       [0.21597075],
       [0.17376526],
       [0.5266418 ],
       [0.67326725],
       [0.5653739 ],
       [0.26195535],
       [0.6451544 ],
       [0.30872133],
       [0.4596965 ],
       [0.4247528 ],
       [0.4081001 ],
       [0.41540363],
       [0.4815122 ],
       [0.53111106],
       [0.49211296],
       [0.5298134 ],
       [0.62593967]], dtype=float32)