# Create Mapping between Betting Odds and Fight Outcomes

In [80]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from unidecode import unidecode
from fuzzywuzzy import fuzz
# Notebook display configuration: show up to 500 columns when rendering frames.
pd.set_option('display.max_columns', 500)
from zipfile import ZipFile
# Never truncate cell contents (the URLs shown below are long).
# `None` is the supported "no limit" value: passing -1 was deprecated in
# pandas 1.0 and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [81]:
# load all datasets

# mapping of wiki fight event urls to bestfightodds urls. unique key is the wikipedia fight event url
wiki_bestfightodds_url_id_mapping= pd.read_csv("../../data/bestfightodds_data/bestfightodds_urls.csv")
wiki_bestfightodds_url_id_mapping.rename(columns = {'url':'wiki_url'}, inplace = True)
# keep only events that actually have a bestfightodds url (neither empty nor missing)
wiki_bestfightodds_url_id_mapping = \
wiki_bestfightodds_url_id_mapping.loc[(wiki_bestfightodds_url_id_mapping['fight_odds_url'] != '') & \
                                      (wiki_bestfightodds_url_id_mapping['fight_odds_url'].notna()), \
                                      ['wiki_url','fight_odds_url', '#', 'Event', 'Date']]

# load each fight outcome for each ufc event. Each row is a fight outcome (from wikipedia)
wiki_fight_outcomes = pd.read_csv("../../data/wikipedia_data/wikipedia_all_ufc_fight_outcomes.csv")


# load all odds. Since I'm just creating a mapping I only need one row per fight
# Both the archive and every member stream are opened through context managers
# so the file handles are released as soon as the csv files have been parsed.
dfs = []
with ZipFile('../../data/bestfightodds_data/straight_bets.zip') as zip_file:
    for text_file in zip_file.infolist():
        if text_file.filename.endswith('.csv'):
            with zip_file.open(text_file.filename) as csv_handle:
                dfs.append(pd.read_csv(csv_handle))
moneyline_data = pd.concat(dfs)
# one row per (fighter1, fighter2, event url) is enough to build the mapping
moneyline_data = moneyline_data[['fighter1', 'fighter2', 'url']].drop_duplicates(['fighter1', 'fighter2', 'url'])
del dfs

In [82]:
# Restrict both the Wikipedia fight outcomes and the odds rows to cards held in 2008 or later.
min_year = 2008

# Parse the card dates and keep only the columns needed to link the two sources.
wiki_bestfightodds_url_id_mapping['Date'] = pd.to_datetime(wiki_bestfightodds_url_id_mapping['Date'])
wiki_bestfightodds_url_id_mapping = wiki_bestfightodds_url_id_mapping[["fight_odds_url", "wiki_url", "Date"]]

# Attach the odds url and card date to every Wikipedia fight, then apply the year cut-off.
wiki_fight_outcomes = wiki_fight_outcomes.merge(wiki_bestfightodds_url_id_mapping,
                                                how = 'inner', on = ['wiki_url'])
recent_outcome_rows = wiki_fight_outcomes['Date'].dt.year >= min_year
wiki_fight_outcomes = wiki_fight_outcomes.loc[recent_outcome_rows, :]
print(len(wiki_fight_outcomes))

# From here on the mapping exposes the card date under the name Card_Date.
wiki_bestfightodds_url_id_mapping = wiki_bestfightodds_url_id_mapping.rename(columns = {'Date': 'Card_Date'})

# Tag every odds row with its card date and Wikipedia url; validate="1:m" makes the
# merge fail if a bestfightodds url appears more than once in the mapping.
card_lookup = wiki_bestfightodds_url_id_mapping[['Card_Date', 'fight_odds_url', 'wiki_url']]
moneyline_data = pd.merge(card_lookup, moneyline_data,
                          left_on = 'fight_odds_url', right_on = 'url', validate = "1:m")
moneyline_data['Card_Date'] = pd.to_datetime(moneyline_data['Card_Date'])
recent_odds_rows = moneyline_data['Card_Date'].dt.year >= min_year
moneyline_data = moneyline_data.loc[recent_odds_rows, :]
# 'url' duplicates 'fight_odds_url' after the merge, so it is dropped.
del moneyline_data['url']

4707


In [83]:
# Build normalised name keys so fight outcomes and odds rows can be joined on fighter names.
def _to_match_key(name):
    """Return `name` lower-cased, with periods removed, stripped and transliterated by unidecode."""
    return unidecode(name.lower().replace('.', '').strip())


# (frame, raw name column, normalised key column)
_name_key_columns = (
    (wiki_fight_outcomes, 'Winner_Cleaned', 'winner_match'),
    (wiki_fight_outcomes, 'Loser_Cleaned', 'loser_match'),
    (moneyline_data, 'fighter1', 'fighter1_match'),
    (moneyline_data, 'fighter2', 'fighter2_match'),
)
for _frame, _raw_column, _key_column in _name_key_columns:
    _frame[_key_column] = _frame[_raw_column].apply(_to_match_key)

In [84]:
# A merge on a single fighter name needs each name to appear at most once per card,
# so show (and resolve) any name that is repeated within a card first.
fighter1_repeated = moneyline_data.duplicated(subset = ['fighter1_match', 'wiki_url'], keep = False)
display(moneyline_data.loc[fighter1_repeated, :])

# The UFC 223 odds list Al Iaquinta against two opponents; drop the Paul Felder row
# so that only the Khabib Nurmagomedov pairing remains.
is_felder_row = moneyline_data.fighter2 == 'Paul Felder'
is_ufc_223 = moneyline_data.wiki_url == 'https://en.wikipedia.org/wiki/UFC_223'
moneyline_data = moneyline_data.loc[~(is_felder_row & is_ufc_223), :]

# Same check for the second fighter column, run on the filtered data.
fighter2_repeated = moneyline_data.duplicated(subset = ['fighter2_match', 'wiki_url'], keep = False)
display(moneyline_data.loc[fighter2_repeated, :])

Unnamed: 0,Card_Date,fight_odds_url,wiki_url,fighter1,fighter2,fighter1_match,fighter2_match
986,2018-04-07,https://www.bestfightodds.com/events/ufc-223-nurmagomedov-vs-iaquinta-1444,https://en.wikipedia.org/wiki/UFC_223,Al Iaquinta,Khabib Nurmagomedov,al iaquinta,khabib nurmagomedov
990,2018-04-07,https://www.bestfightodds.com/events/ufc-223-nurmagomedov-vs-iaquinta-1444,https://en.wikipedia.org/wiki/UFC_223,Al Iaquinta,Paul Felder,al iaquinta,paul felder


Unnamed: 0,Card_Date,fight_odds_url,wiki_url,fighter1,fighter2,fighter1_match,fighter2_match


In [85]:
# merge based on both fighters' name
# Pass 1: a Wikipedia fight matches an odds row when, on the same card (wiki_url),
# BOTH normalised names agree. The odds data lists the fighters in either order,
# so the merge is run twice: once with winner == fighter1 (dfa) and once with
# winner == fighter2 (dfb). validate = '1:1' makes pandas raise if the merge keys
# are not unique on both sides.

dfa = pd.merge(wiki_fight_outcomes, moneyline_data, left_on = ['wiki_url', 'winner_match', 'loser_match'],
              right_on = ['wiki_url', 'fighter1_match', 'fighter2_match'], how = 'inner', validate = '1:1')
dfa['bestfightodds_winner'] = dfa['fighter1']
dfa['bestfightodds_loser'] = dfa['fighter2']
dfb = pd.merge(wiki_fight_outcomes, moneyline_data, left_on = ['wiki_url', 'winner_match', 'loser_match'],
              right_on = ['wiki_url', 'fighter2_match', 'fighter1_match'], how = 'inner', validate = '1:1')
dfb['bestfightodds_winner'] = dfb['fighter2']
dfb['bestfightodds_loser'] = dfb['fighter1']
allmatch1 = pd.concat([dfa, dfb], sort = False)

# Wikipedia fights that found no partner in pass 1: left-join against the matches
# and keep the rows where the odds-side column came back empty.
mismatches =  pd.merge(wiki_fight_outcomes[['wiki_url', 'winner_match', 'loser_match', 'Winner_Cleaned', 'Loser_Cleaned']],
                       allmatch1[['wiki_url', 'winner_match', 'loser_match', 'fighter1_match']], 
                       how = 'left', 
                       on = ['wiki_url', 'winner_match', 'loser_match']) 
mismatches = mismatches[mismatches.fighter1_match.isna()]
del mismatches['fighter1_match']
# Odds rows that were not used in pass 1 (same anti-join idea, from the odds side).
moneyline_data1 = pd.merge(moneyline_data, allmatch1[['fighter1_match', 'fighter2_match', 'winner_match', 'wiki_url']], 
                          how = 'left', on = ['fighter1_match', 'fighter2_match', 'wiki_url'])
moneyline_data1 = moneyline_data1[moneyline_data1.winner_match.isna()]
del moneyline_data1['winner_match'], dfa, dfb

# now try merging on just one name
# Pass 2: among the leftovers, accept a match when only ONE name agrees on the same card.
# The four merges cover winner/loser on the Wikipedia side against fighter1/fighter2 on
# the odds side; whichever odds column matched decides which fighter is the winner.
df1 = pd.merge(mismatches, moneyline_data1, left_on = ['wiki_url', 'winner_match'],
              right_on = ['wiki_url', 'fighter1_match'], how = 'inner', validate = '1:1')
df1['bestfightodds_winner'] = df1['fighter1']
df1['bestfightodds_loser'] = df1['fighter2']
df2 = pd.merge(mismatches, moneyline_data1, left_on = ['wiki_url', 'winner_match'],
              right_on = ['wiki_url', 'fighter2_match'], how = 'inner', validate = '1:1')
df2['bestfightodds_winner'] = df2['fighter2']
df2['bestfightodds_loser'] = df2['fighter1']
# Matching on the loser's name: the odds-side fighter that matched is the loser,
# so the other fighter of that odds row is the winner.
df3 = pd.merge(mismatches, moneyline_data1, left_on = ['wiki_url', 'loser_match'],
              right_on = ['wiki_url', 'fighter1_match'], how = 'inner', validate = '1:1')
df3['bestfightodds_winner'] = df3['fighter2']
df3['bestfightodds_loser'] = df3['fighter1']
df4 = pd.merge(mismatches, moneyline_data1, left_on = ['wiki_url', 'loser_match'],
              right_on = ['wiki_url', 'fighter2_match'], how = 'inner', validate = '1:1')
df4['bestfightodds_winner'] = df4['fighter1']
df4['bestfightodds_loser'] = df4['fighter2']


allmatch2 = pd.concat([df1, df2, df3, df4], sort = False)
# Collapse rows that pair the same Wikipedia fight with the same odds row more than once.
allmatch2.drop_duplicates(['wiki_url', 'fighter1_match', 'fighter2_match', 'winner_match', 'loser_match'], inplace = True)
# A Wikipedia fight that still has several candidate odds rows is ambiguous:
# keep = False discards every one of its candidates.
allmatch2 = allmatch2[~(allmatch2.duplicated(['wiki_url', 'winner_match', 'loser_match'], keep = False))]
allmatch = pd.concat([allmatch1, allmatch2], sort = False)
del df1, df2, df3, df4, allmatch1, allmatch2

# Recompute the leftovers on both sides, now against the matches from both passes.
mismatches =  pd.merge(wiki_fight_outcomes[['wiki_url', 'winner_match', 'loser_match', 'Winner_Cleaned', 'Loser_Cleaned']],
                       allmatch[['wiki_url', 'winner_match', 'loser_match', 'fighter1_match']], 
                       how = 'left', 
                       on = ['wiki_url', 'winner_match', 'loser_match']) 
mismatches = mismatches[mismatches.fighter1_match.isna()]
del mismatches['fighter1_match']
moneyline_data1 = pd.merge(moneyline_data1, allmatch[['fighter1_match', 'fighter2_match', 'winner_match', 'wiki_url']], 
                          how = 'left', on = ['fighter1_match', 'fighter2_match', 'wiki_url'])
moneyline_data1 = moneyline_data1[moneyline_data1.winner_match.isna()]
del moneyline_data1['winner_match']

# Report: number of matched fights, number of unmatched Wikipedia fights, and number of
# candidate rows. `possibilities` pairs every unmatched Wikipedia fight with every unmatched
# odds row from the same card; because it is a left join, a fight with no candidate on its
# card is kept with empty odds columns.
print(allmatch.shape[0])
print(mismatches.shape[0])
possibilities = pd.merge(mismatches, moneyline_data1, how = 'left', on = 'wiki_url')
print(possibilities.shape[0])

4697
10
13


In [86]:
# Manually resolve the fights that neither automatic pass could match.

# Drop Wikipedia fights for which the card has no unmatched odds row at all.
possibilities = possibilities[~ possibilities.fighter1_match.isna()]

# On cards with several leftovers every unmatched fight is paired with every unmatched
# odds row; discard the pairings that combine two different fights.
# Each entry is (fighter1_match of the odds row, winner_match of the Wikipedia fight).
wrong_pairings = [
    ('ian heinisch', 'saparbek safarov'),
    ('godofredo castro', 'thiago perpetuo'),
    ('leonardo mafra teixeira', 'rony jason'),
]
for odds_fighter1, wiki_winner in wrong_pairings:
    possibilities = possibilities[~((possibilities.fighter1_match == odds_fighter1) &
                                    (possibilities.winner_match == wiki_winner))]
# Work on an independent copy so the column assignments below do not write into a slice.
possibilities = possibilities.copy()

# Name of each remaining winner as spelled by bestfightodds, keyed by the normalised
# Wikipedia winner name. Looking the name up by key (instead of assigning a list by row
# position) keeps the result correct if the row order of `possibilities` ever changes.
manual_winner_names = {
    'karolline rosa cavedo': 'Karol Rosa',
    'saparbek safarov': 'Saparbeg Safarov',
    'elizeu zaleski dos santos': 'Elizeu Zaleski',
    'song yadong': 'Yadong Song',
    'cris cyborg': 'Cristiane Justino',
    'rony jason': 'Rony Mariano Bezerra',
    'thiago perpetuo': 'Thiago de Oliveira Perpetuo',
    'luiz cane': 'Luis Cane',
}
possibilities['bestfightodds_winner'] = possibilities['winner_match'].map(manual_winner_names)

# Fail loudly when new data introduces a fight that has no manual resolution yet.
unresolved = possibilities.loc[possibilities['bestfightodds_winner'].isna(), 'winner_match']
if not unresolved.empty:
    raise ValueError('No manual bestfightodds name for winner(s): ' + ', '.join(sorted(set(unresolved))))

# The manual name has to be one of the two fighters of the paired odds row; otherwise the
# loser computed below would silently default to fighter1.
winner_in_odds_row = ((possibilities['bestfightodds_winner'] == possibilities.fighter1) |
                      (possibilities['bestfightodds_winner'] == possibilities.fighter2))
if not winner_in_odds_row.all():
    bad_names = possibilities.loc[~winner_in_odds_row, 'bestfightodds_winner']
    raise ValueError('Manual winner name not found in the paired odds row: ' + ', '.join(sorted(set(bad_names))))

# The loser is whichever fighter of the odds row is not the winner.
possibilities['bestfightodds_loser'] = np.where(possibilities['bestfightodds_winner'] == possibilities.fighter1, possibilities.fighter2, possibilities.fighter1)
display(possibilities[['winner_match', 'loser_match', 'bestfightodds_winner', 'bestfightodds_loser']])
# Expected to print False: no odds row may be used for two different fights.
print(possibilities.duplicated(['fighter1', 'fighter2', 'wiki_url']).any())

Unnamed: 0,winner_match,loser_match,bestfightodds_winner,bestfightodds_loser
0,karolline rosa cavedo,lara fritzen procopio,Karol Rosa,Lara Procopio
1,saparbek safarov,nicolae negumereanu,Saparbeg Safarov,Nick Negumereanu
3,elizeu zaleski dos santos,luigi vendramini,Elizeu Zaleski,Luigi Vandramini
4,song yadong,bharat kandare,Yadong Song,Bharat Khandare
5,cris cyborg,lina lansberg,Cristiane Justino,Lina Akhtar Lansberg
7,rony jason,godofredo pepey,Rony Mariano Bezerra,Godofredo Castro
10,thiago perpetuo,leonardo mafra,Thiago de Oliveira Perpetuo,Leonardo Mafra Teixeira
11,luiz cane,rameau thierry sokoudjou,Luis Cane,Rameau Sokoudjou


False


In [90]:
# Stack the automatically matched fights on top of the manually resolved ones.
allmatch_final = pd.concat([allmatch, possibilities], sort = False)

# Print whether any odds row, and then whether any Wikipedia fight, occurs more than once.
odds_fight_key = ['fighter1', 'fighter2', 'wiki_url']
wiki_fight_key = ['Winner_Cleaned', 'Loser_Cleaned', 'wiki_url']
for key_columns in (odds_fight_key, wiki_fight_key):
    print(allmatch_final.duplicated(key_columns).any())

# Show the Wikipedia names next to the bestfightodds names for a visual check.
preview_columns = ['Winner_Cleaned', 'Loser_Cleaned', 'bestfightodds_winner',
                   'bestfightodds_loser', 'fighter1', 'fighter2']
display(allmatch_final[preview_columns])

False
False


Unnamed: 0,Winner_Cleaned,Loser_Cleaned,bestfightodds_winner,bestfightodds_loser,fighter1,fighter2
0,Charles Oliveira,Kevin Lee,Charles Oliveira,Kevin Lee,Charles Oliveira,Kevin Lee
1,Francisco Trinaldo,John Makdessi,Francisco Trinaldo,John Makdessi,Francisco Trinaldo,John Makdessi
2,Brandon Moreno,Jussier Formiga,Brandon Moreno,Jussier Formiga,Brandon Moreno,Jussier Formiga
3,Amanda Ribas,Randa Markos,Amanda Ribas,Randa Markos,Amanda Ribas,Randa Markos
4,Enrique Barzola,Rani Yahya,Enrique Barzola,Rani Yahya,Enrique Barzola,Rani Yahya
...,...,...,...,...,...,...
4,Song Yadong,Bharat Kandare,Yadong Song,Bharat Khandare,Bharat Khandare,Yadong Song
5,Cris Cyborg,Lina Länsberg,Cristiane Justino,Lina Akhtar Lansberg,Cristiane Justino,Lina Akhtar Lansberg
7,Rony Jason,Godofredo Pepey,Rony Mariano Bezerra,Godofredo Castro,Godofredo Castro,Rony Mariano Bezerra
10,Thiago Perpétuo,Leonardo Mafra,Thiago de Oliveira Perpetuo,Leonardo Mafra Teixeira,Leonardo Mafra Teixeira,Thiago de Oliveira Perpetuo


In [91]:
# Remove the url/date columns (re-attached from the url mapping below) and the
# normalised name keys that were only needed for matching.
helper_columns = [
    'fight_odds_url_y', 'fight_odds_url', 'fight_odds_url_x', 'Date',
    'Card_Date',
    'fighter1_match', 'fighter2_match', 'winner_match', 'loser_match',
]
for helper_column in helper_columns:
    del allmatch_final[helper_column]

# Attach a single fight_odds_url / Card_Date per card; validate = "1:m" makes the merge
# fail if a wiki_url appears more than once in the url mapping.
card_columns = wiki_bestfightodds_url_id_mapping[['fight_odds_url', 'wiki_url', 'Card_Date']]
allmatch_final_out = card_columns.merge(allmatch_final, on = 'wiki_url', validate = "1:m")

# Persist the mapping without the positional index.
allmatch_final_out.to_csv('../../data/bestfightodds_data/outcome_mapping_bfodds_to_wiki.csv', index = False)

In [92]:
# Preview the first five rows of the exported mapping.
allmatch_final_out.head(n = 5)

Unnamed: 0,fight_odds_url,wiki_url,Card_Date,WeightClass,Winner,Outcome,Loser,Method,Round,Time,Notes,Card,Winner_url,Loser_url,event_order,Method_Cleaned,Card_Cleaned,Winner_Cleaned,Loser_Cleaned,Champion,Interim_Champion,fighter1,fighter2,bestfightodds_winner,bestfightodds_loser
0,https://www.bestfightodds.com/events/ufc-on-espn-28-lee-vs-oliveira-1847,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Lee_vs._Oliveira,2020-03-14,Catchweight (158.5 lbs),Charles Oliveira,def,Kevin Lee,Submission (guillotine choke),3.0,0:28,,Main card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Charles_Oliveira,https://en.wikipedia.org/wiki/Kevin_Lee_(fighter),1.0,Sub,Main Event,Charles Oliveira,Kevin Lee,,,Charles Oliveira,Kevin Lee,Charles Oliveira,Kevin Lee
1,https://www.bestfightodds.com/events/ufc-on-espn-28-lee-vs-oliveira-1847,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Lee_vs._Oliveira,2020-03-14,Lightweight,Francisco Trinaldo,def,John Makdessi,"Decision (unanimous) (30–27, 30–27, 29–28)",3.0,5:00,,Main card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Francisco_Trinaldo,https://en.wikipedia.org/wiki/John_Makdessi,5.0,Unanimous Decision,Main Event,Francisco Trinaldo,John Makdessi,,,Francisco Trinaldo,John Makdessi,Francisco Trinaldo,John Makdessi
2,https://www.bestfightodds.com/events/ufc-on-espn-28-lee-vs-oliveira-1847,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Lee_vs._Oliveira,2020-03-14,Flyweight,Brandon Moreno,def,Jussier Formiga,"Decision (unanimous) (30–27, 29–28, 29–28)",3.0,5:00,,Preliminary card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Brandon_Moreno,https://en.wikipedia.org/wiki/Jussier_Formiga,6.0,Unanimous Decision,Prelim,Brandon Moreno,Jussier Formiga,,,Brandon Moreno,Jussier Formiga,Brandon Moreno,Jussier Formiga
3,https://www.bestfightodds.com/events/ufc-on-espn-28-lee-vs-oliveira-1847,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Lee_vs._Oliveira,2020-03-14,Women's Strawweight,Amanda Ribas,def,Randa Markos,"Decision (unanimous) (30–26, 30–25, 30–25)",3.0,5:00,,Preliminary card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Amanda_Ribas,https://en.wikipedia.org/wiki/Randa_Markos,7.0,Unanimous Decision,Prelim,Amanda Ribas,Randa Markos,,,Amanda Ribas,Randa Markos,Amanda Ribas,Randa Markos
4,https://www.bestfightodds.com/events/ufc-on-espn-28-lee-vs-oliveira-1847,https://en.wikipedia.org/wiki/UFC_Fight_Night:_Lee_vs._Oliveira,2020-03-14,Bantamweight,Enrique Barzola,vs,Rani Yahya,"Draw (majority) (29–28, 28–28, 28–28)",3.0,5:00,,Preliminary card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Enrique_Barzola,https://en.wikipedia.org/wiki/Rani_Yahya,9.0,,Prelim,Enrique Barzola,Rani Yahya,,,Enrique Barzola,Rani Yahya,Enrique Barzola,Rani Yahya
