# **Scraping UFC Betting Data From BestFightOdds**

The data consists of the final quoted odds on UFC fights from each betting site listed on bestfightodds.com.
All bet types are included.

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import urllib.parse
import re

### **Get BestFightOdds URL for Each UFC Event in Wikipedia**

In [12]:
events = pd.read_csv('../../data/wikipedia_data/wikipedia_all_ufc_events.csv')
events['Date'] = pd.to_datetime(events['Date'])
# one bestfightodds search url per event, built from the url-quoted Wikipedia event name
search = events['Event'].apply(lambda x: 'https://www.bestfightodds.com/search?query=' + urllib.parse.quote(x))

BESTFIGHTODDS_ROOT = 'https://www.bestfightodds.com'
# compared against the lower-cased page text, so it is stored lower-cased
NO_LINES_MESSAGE = 'no betting lines available for this event'
# without a timeout a single stalled connection would block the whole scrape
REQUEST_TIMEOUT_SECONDS = 30
ONE_DAY = pd.Timedelta(days=1)


def _search_result_candidates(soup):
    """Return ``(date, url)`` pairs read from the last 'content-list' table.

    The first cell of each row is parsed as the event date and the link in
    the second cell is taken as the event path. Rows that lack two cells,
    lack a link, or carry an unparseable date are skipped rather than
    raising, so one malformed row cannot abort the whole run.
    """
    tables = soup.find_all('table', class_='content-list')
    if not tables:
        return []

    candidates = []
    for row in tables[-1].find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2 or cells[1].a is None:
            continue
        href = cells[1].a.get('href')
        listed_date = pd.to_datetime(cells[0].text, errors='coerce')
        if href is None or pd.isna(listed_date):
            continue
        candidates.append((listed_date, BESTFIGHTODDS_ROOT + href))
    return candidates


def _match_event_url(soup, date_of_card):
    """Pick the search result whose date matches ``date_of_card``.

    An exact date match anywhere in the table wins; otherwise the first
    result within one day is used, because time zones can shift the listed
    date by a day. Returns '' when nothing matches.
    """
    if pd.isna(date_of_card):
        return ''

    candidates = _search_result_candidates(soup)
    for listed_date, url in candidates:
        if listed_date == date_of_card:
            return url
    for listed_date, url in candidates:
        if abs(listed_date - date_of_card) <= ONE_DAY:
            return url
    return ''


fightodds_url = []
for search_url, date_of_card in zip(search, events['Date']):
    page = requests.get(search_url, timeout=REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(page.text, 'lxml')

    if soup.find('table') is None or NO_LINES_MESSAGE in page.text.lower():
        # skip if no possible matches found
        fightodds_url.append('')
    elif soup.find('tr', class_='even') is not None:
        # if query redirects to exact match take query as url
        fightodds_url.append(search_url)
    else:
        fightodds_url.append(_match_event_url(soup, date_of_card))

events['fight_odds_url'] = fightodds_url

events.head()

Unnamed: 0,#,Event,Date,Venue,Location,Attendance,url,fight_odds_url
0,513,UFC Fight Night: Lee vs. Oliveira,2020-03-14,Ginásio Nilson Nelson,"Brasília, Brazil",0,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,https://www.bestfightodds.com/events/ufc-on-es...
1,512,UFC 248: Adesanya vs. Romero,2020-03-07,T-Mobile Arena,"Las Vegas, Nevada, U.S.",15077,https://en.wikipedia.org/wiki/UFC_248,https://www.bestfightodds.com/events/ufc-248-a...
2,511,UFC Fight Night: Benavidez vs. Figueiredo,2020-02-29,Chartway Arena,"Norfolk, Virginia, U.S.",7098,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,https://www.bestfightodds.com/events/ufc-on-es...
3,510,UFC Fight Night: Felder vs. Hooker,2020-02-23,Spark Arena,"Auckland, New Zealand",10025,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,https://www.bestfightodds.com/events/ufc-on-es...
4,509,UFC Fight Night: Anderson vs. Błachowicz 2,2020-02-15,Santa Ana Star Center,"Rio Rancho, New Mexico, U.S.",6449,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,https://www.bestfightodds.com/events/ufc-on-es...


In [13]:
# share of Wikipedia UFC events, per calendar year, that received a bestfightodds url
event_year = events['Date'].dt.year
has_odds_url = events['fight_odds_url'].ne('')
has_odds_url.groupby(event_year).mean()

Date
1993    0.000000
1994    0.000000
1995    0.000000
1996    0.200000
1997    0.400000
1998    0.000000
1999    0.166667
2000    0.000000
2001    0.200000
2002    0.142857
2003    0.000000
2004    0.200000
2005    0.200000
2006    0.000000
2007    0.578947
2008    1.000000
2009    1.000000
2010    0.958333
2011    0.925926
2012    0.967742
2013    1.000000
2014    1.000000
2015    0.975610
2016    1.000000
2017    1.000000
2018    0.974359
2019    0.952381
2020    1.000000
Name: fight_odds_url, dtype: float64

In [14]:
# show the events from 2008 onward that still have an empty bestfightodds url
pd.set_option('display.max_rows', 500)
url_is_blank = events['fight_odds_url'] == ''
from_2008_on = events['Date'].dt.year >= 2008
events[url_is_blank & from_2008_on]

Unnamed: 0,#,Event,Date,Venue,Location,Attendance,url,fight_odds_url
11,502,UFC Fight Night: Błachowicz vs. Jacaré,2019-11-16,Ginásio do Ibirapuera,"São Paulo, Brazil",10344,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,
45,468,UFC Fight Night: Błachowicz vs. Santos,2019-02-23,O2 arena,"Prague, Czech Republic",16583,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,
70,443,The Ultimate Fighter: Undefeated Finale,2018-07-06,Palms Casino Resort,"Las Vegas, Nevada, U.S.",2123,https://en.wikipedia.org/wiki/The_Ultimate_Fig...,
187,326,The Ultimate Fighter: American Top Team vs. Bl...,2015-07-12,MGM Grand Garden Arena,"Las Vegas, Nevada, U.S.",4844,https://en.wikipedia.org/wiki/The_Ultimate_Fig...,
317,196,UFC on Fox: Evans vs. Davis,2012-01-28,United Center,"Chicago, Illinois, U.S.",16963,https://en.wikipedia.org/wiki/UFC_on_Fox:_Evan...,
330,183,UFC Fight Night: Shields vs. Ellenberger,2011-09-17,Ernest N. Morial Convention Center,"New Orleans, Louisiana, U.S.",7112,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,
335,178,UFC Live: Kongo vs. Barry,2011-06-26,Consol Energy Center,"Pittsburgh, Pennsylvania, U.S.",7792,https://en.wikipedia.org/wiki/UFC_Live:_Kongo_...,
357,156,UFC Live: Jones vs. Matyushenko,2010-08-01,San Diego Sports Arena,"San Diego, California, U.S.",8132,https://en.wikipedia.org/wiki/UFC_Live:_Jones_...,


In [15]:
# hand-picked bestfightodds urls, keyed by the Wikipedia event name they belong to;
# applied in order to every row whose 'Event' equals the key
manual_odds_urls = {
    'UFC Fight Night: Błachowicz vs. Jacaré': 'https://www.bestfightodds.com/events/ufc-on-espn-22-blachowicz-vs-jacare-1755',
    'UFC Fight Night: Błachowicz vs. Santos': 'https://www.bestfightodds.com/events/ufc-on-espn-3-blachowicz-vs-santos-1638',
    'UFC on Fox: Evans vs. Davis': 'https://www.bestfightodds.com/events/ufc-on-fox-2-482',
    'UFC Fight Night: Shields vs. Ellenberger': 'https://www.bestfightodds.com/events/ufc-fight-night-25-battle-on-the-bayou-406',
    'UFC Live: Kongo vs. Barry': 'https://www.bestfightodds.com/events/ufc-on-versus-4-379',
    'UFC Live: Jones vs. Matyushenko': 'https://www.bestfightodds.com/events/ufc-on-versus-2-281',
    'The Ultimate Fighter: Team McGregor vs. Team Faber Finale': 'https://www.bestfightodds.com/events/ufc-the-ultimate-fighter-22-finale-edgar-vs-mendes-1011',
    'The Ultimate Fighter: Heavy Hitters Finale': 'https://www.bestfightodds.com/events/ufc-the-ultimate-fighter-28-finale-dos-anjos-vs-usman-1586',
    'The Ultimate Fighter: American Top Team vs. Blackzilians Finale': 'https://www.bestfightodds.com/events/ufc-the-ultimate-fighter-21-finale-ellenberger-vs-thompson-971',
    'The Ultimate Fighter: Undefeated Finale': 'https://www.bestfightodds.com/events/ufc-the-ultimate-fighter-27-finale-tavares-vs-adesanya-1503',
}
for event_name, odds_url in manual_odds_urls.items():
    events.loc[events['Event'] == event_name, 'fight_odds_url'] = odds_url

# fraction of events per year that now have a non-empty url
print(events['fight_odds_url'].ne('').groupby(events['Date'].dt.year).mean())

# number of events since 2008 that are still without a url (NaN or empty string)
url_missing = events.fight_odds_url.isna() | (events.fight_odds_url == '')
print(sum((events.Date >= pd.to_datetime("Jan 1, 2008")) & url_missing))

# keep only the events that have a url and save them
events = events[~url_missing]
events.to_csv("../../data/bestfightodds_data/bestfightodds_urls.csv", index = False)


Date
1993    0.000000
1994    0.000000
1995    0.000000
1996    0.200000
1997    0.400000
1998    0.000000
1999    0.166667
2000    0.000000
2001    0.200000
2002    0.142857
2003    0.000000
2004    0.200000
2005    0.200000
2006    0.000000
2007    0.578947
2008    1.000000
2009    1.000000
2010    1.000000
2011    1.000000
2012    1.000000
2013    1.000000
2014    1.000000
2015    1.000000
2016    1.000000
2017    1.000000
2018    1.000000
2019    1.000000
2020    1.000000
Name: fight_odds_url, dtype: float64
0
