# **Scraping UFC Fight Outcomes and List of Events From Wikipedia**

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import urllib.request as urllib2

### **Scrape List of All UFC Cards From Wikipedia**

In [2]:
# pull list of past ufc events from wikipedia
url = "https://en.wikipedia.org/wiki/List_of_UFC_events"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
rows = soup.find('table', id="Past_events").find_all('tr')
columns = [x.text.strip() for x in rows[0].find_all('th')]
past_events = pd.DataFrame([[x.text.strip() for x in row.find_all('td')] for row in rows[1:]], columns = columns)
urls = []
for row in rows[1:]:
    s = row.find_all('td')
    if  len(s) >= 1 and getattr(s[1], 'a', None) != None:
       urls.append('https://en.wikipedia.org' + s[1].a['href'].strip())
    else:
        urls.append(None)
past_events['url'] = urls
past_events = past_events[past_events['Attendance'] != 'Cancelled']
past_events.sort_values('#', ascending = False, inplace = True)
past_events['Date'] = pd.to_datetime(past_events['Date'])
del past_events['Ref.']
past_events.to_csv("../../data/wikipedia_data/wikipedia_all_ufc_events.csv", index = False)

In [3]:
past_events.head()

Unnamed: 0,#,Event,Date,Venue,Location,Attendance,url
3,513,UFC Fight Night: Lee vs. Oliveira,2020-03-14,Ginásio Nilson Nelson,"Brasília, Brazil",0,https://en.wikipedia.org/wiki/UFC_Fight_Night:...
4,512,UFC 248: Adesanya vs. Romero,2020-03-07,T-Mobile Arena,"Las Vegas, Nevada, U.S.",15077,https://en.wikipedia.org/wiki/UFC_248
5,511,UFC Fight Night: Benavidez vs. Figueiredo,2020-02-29,Chartway Arena,"Norfolk, Virginia, U.S.",7098,https://en.wikipedia.org/wiki/UFC_Fight_Night:...
6,510,UFC Fight Night: Felder vs. Hooker,2020-02-23,Spark Arena,"Auckland, New Zealand",10025,https://en.wikipedia.org/wiki/UFC_Fight_Night:...
7,509,UFC Fight Night: Anderson vs. Błachowicz 2,2020-02-15,Santa Ana Star Center,"Rio Rancho, New Mexico, U.S.",6449,https://en.wikipedia.org/wiki/UFC_Fight_Night:...


### **Scrape List of All UFC Fight Outcomes For Each Card from Wikipedia**

In [37]:
dfs = []
for url in past_events['url']:
    if url != None:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'lxml')
        
        if url == 'https://en.wikipedia.org/wiki/UFC_on_Fox:_Henderson_vs._Melendez':
            table = soup.find_all('table', class_ = 'toccolours')[2].tbody.find_all('tr')
        elif url == 'https://en.wikipedia.org/wiki/UFC_on_FX:_Belfort_vs._Bisping':
            table = soup.find_all('table', class_ = 'toccolours')[1].tbody.find_all('tr')
        else:
            table = soup.find('span', id = 'Results').parent.find_next_sibling().find('tbody').find_all('tr')
            
        card_type = table[0].find('th').text.strip()
        allrows = []
        for row in table[2:]:
            card_type_holder = row.find_all('th')
            if len(card_type_holder) == 1:
                card_type =  card_type_holder[0].text.strip()
                continue
            elif len(card_type_holder) > 0 and card_type_holder[0].text.strip().lower() == 'weight class':
                continue
            else:
                row_info = row.find_all('td')
                s = [x.text.strip() for x in row_info]
                if s[-1] != '':
                    ref = row_info[-1].a['href'].replace("#",'')
                    note = soup.find('li', id = ref).find('span', class_ = "reference-text").text
                    s[-1] = note
                s.append(card_type)
                try:
                    winner_link = "https://en.wikipedia.org" + row_info[1].a['href']
                except:
                    winner_link = ''
                try:
                    loser_link = "https://en.wikipedia.org" + row_info[3].a['href']
                except:
                    loser_link = ''
                s.append(winner_link)
                s.append(loser_link)
                allrows.append(s)
                

        df = pd.DataFrame(allrows, columns = ['WeightClass', 'Winner', 'Outcome', 'Loser', 'Method', 'Round', 'Time', 'Notes', 'Card', 'Winner_url', 'Loser_url'])
        df['wiki_url'] =url
        df['event_order'] = range(df.shape[0] + 1) 
        dfs.append(df)

ValueError: Length of values does not match length of index

(5553, 18)

## Clean Up The Data

In [27]:
all_fights = pd.concat(dfs)
print(all_fights.Outcome.value_counts())
all_fights.Outcome = all_fights.Outcome.str.replace('.', '')
print(all_fights.Outcome.value_counts())

sub = all_fights.Method.str.contains(pat = '(?i)sub') 
knockout = all_fights.Method.str.contains(pat = '(?i)ko') 
unam_decision = all_fights.Method.str.contains(pat = '(?i)dec') & all_fights.Method.str.contains(pat = '(?i)unanimous')
split_decision = all_fights.Method.str.contains(pat = '(?i)dec') & all_fights.Method.str.contains(pat = '(?i)split')
outcomes = pd.concat([sub, knockout, unam_decision, split_decision], axis=1)
display(all_fights.loc[outcomes.apply("sum", axis = 1) > 1, :])
sub[knockout] = False
outcomes = pd.concat([sub, knockout, unam_decision, split_decision], axis=1)
print(sum(outcomes.apply("sum", axis = 1) > 1))

all_fights.loc[sub, 'Method_Cleaned'] = "Sub"
all_fights.loc[knockout, 'Method_Cleaned'] = "KO"
all_fights.loc[unam_decision, 'Method_Cleaned'] = "Unanimous Decision"
all_fights.loc[split_decision, 'Method_Cleaned'] = "Split Decision"
print(all_fights.Method_Cleaned.value_counts(dropna= False))

def.    5359
def      101
vs.       91
vs         2
Name: Outcome, dtype: int64
def    5460
vs       93
Name: Outcome, dtype: int64


Unnamed: 0,WeightClass,Winner,Outcome,Loser,Method,Round,Time,Notes,Card,Winner_url,Loser_url,wiki_url
11,Welterweight,Alex Morono,def,Zak Ottow,TKO (Submission to elbows),1.0,3:34,,Preliminary Card (ESPN+),https://en.wikipedia.org/wiki/Alex_Morono,https://en.wikipedia.org/wiki/Zak_Ottow,https://en.wikipedia.org/wiki/UFC_Fight_Night:...
7,Bantamweight,Ricky Simon,def,Merab Dvalishvili,TKO (technical submission),3.0,5:00,,Preliminary card (Fox Sports 1),https://en.wikipedia.org/wiki/Ricky_Simon,https://en.wikipedia.org/wiki/Merab_Dvalishvili,https://en.wikipedia.org/wiki/UFC_Fight_Night:...
2,Middleweight,Joe Slick,def,Jason DeLucia,TKO (verbal submission due to knee injury),1.0,1:28,,Main Card,https://en.wikipedia.org/wiki/Joe_Slick,https://en.wikipedia.org/wiki/Jason_DeLucia,https://en.wikipedia.org/wiki/UFC_23
1,Heavyweight,Gary Goodridge,def,Andre Roberts,TKO (submission to punches),,0:42,,Main Card,https://en.wikipedia.org/wiki/Gary_Goodridge,https://en.wikipedia.org/wiki/Andre_Roberts_(M...,https://en.wikipedia.org/wiki/UFC_19


0
Unanimous Decision    1893
KO                    1801
Sub                   1191
Split Decision         515
NaN                    153
Name: Method_Cleaned, dtype: int64


In [34]:
print(all_fights.Card.value_counts(dropna= False))
main = all_fights.Card.str.contains('(?i)main')
prelim = all_fights.Card.str.contains('(?i)prelim')
print(sum(main & prelim))
all_fights.loc[main, 'Card_Cleaned'] = "Main Event"
all_fights.loc[prelim, 'Card_Cleaned'] = "Prelim"
print(all_fights.Card_Cleaned.value_counts(dropna= False))

Main Card                                   776
Preliminary card                            487
Preliminary Card (UFC Fight Pass)           443
Main Card (Fox Sports 1)                    425
Preliminary Card (Fox Sports 1)             423
                                           ... 
UFC Japan Middleweight Tournament Finals      1
Heavyweight Tournament Final                  1
Middleweight Finals                           1
Heavyweight Superfight                        1
UFC Heavyweight Championship                  1
Name: Card, Length: 115, dtype: int64
0
Prelim        2865
Main Event    2437
NaN            251
Name: Card_Cleaned, dtype: int64


In [36]:
# Standardize fighter names and determine if champion or interim champion is fighting


# clean fighter names and extract champion or interim champion status 
cleaner_winner = all_fights['Winner'].map(lambda x: x.split('(', 1))
cleaner_winner = cleaner_winner.map(lambda x: [x[0].strip(), x[1].strip().lower()] if len(x) > 1 else [x[0].strip()])
clean_winner = cleaner_winner.map(lambda x: x[0])

cleaner_loser = all_fights['Loser'].map(lambda x: x.split('(', 1))
cleaner_loser= cleaner_loser.map(lambda x: [x[0].strip(), x[1].strip().lower()] if len(x) > 1 else [x[0].strip()])
clean_loser = cleaner_loser.map(lambda x: x[0])

champ = cleaner_winner.map(lambda x: x[0] if len(x) > 1 and x[1] == 'c)' else '')
champ = np.where(champ == '', cleaner_loser.map(lambda x: x[0] if len(x) > 1 and x[1] == 'c)' else ''), champ)
interim_champ = cleaner_winner.map(lambda x: x[0] if len(x) > 1 and x[1] == 'ic)' else '')
interim_champ = np.where(interim_champ  == '', cleaner_loser.map(lambda x: x[0] if len(x) > 1 and x[1] == 'ic)' else ''), interim_champ)


all_fights['Winner_Clean'] = clean_winner
all_fights['Loser_Clean'] = clean_loser
all_fights['Champion'] = champ
all_fights['Interim_Champion'] = interim_champ
display(all_fights.head())
print(all_fights.Champion.value_counts())

Unnamed: 0,WeightClass,Winner,Outcome,Loser,Method,Round,Time,Notes,Card,Winner_url,Loser_url,wiki_url,Method_Cleaned,Card_Cleaned,Winner_Clean,Loser_Clean,Champion,Interim_Champion
0,Catchweight (158.5 lbs),Charles Oliveira,def,Kevin Lee,Submission (guillotine choke),3,0:28,,Main card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Charles_Oliveira,https://en.wikipedia.org/wiki/Kevin_Lee_(fighter),https://en.wikipedia.org/wiki/UFC_Fight_Night:...,Sub,Main Event,Charles Oliveira,Kevin Lee,,
1,Welterweight,Gilbert Burns,def,Demian Maia,TKO (punches),1,2:34,,Main card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Gilbert_Burns_(f...,https://en.wikipedia.org/wiki/Demian_Maia,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,KO,Main Event,Gilbert Burns,Demian Maia,,
2,Lightweight,Renato Moicano,def,Damir Hadžović,Submission (rear-naked choke),1,0:44,,Main card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Renato_Moicano,https://en.wikipedia.org/wiki/Damir_Had%C5%BEo...,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,Sub,Main Event,Renato Moicano,Damir Hadžović,,
3,Light Heavyweight,Nikita Krylov,def,Johnny Walker,"Decision (unanimous) (30–27, 29–28, 29–28)",3,5:00,,Main card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Nikita_Krylov,https://en.wikipedia.org/wiki/Johnny_Walker_(f...,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,Unanimous Decision,Main Event,Nikita Krylov,Johnny Walker,,
4,Lightweight,Francisco Trinaldo,def,John Makdessi,"Decision (unanimous) (30–27, 30–27, 29–28)",3,5:00,,Main card (ESPN+/ESPN),https://en.wikipedia.org/wiki/Francisco_Trinaldo,https://en.wikipedia.org/wiki/John_Makdessi,https://en.wikipedia.org/wiki/UFC_Fight_Night:...,Unanimous Decision,Main Event,Francisco Trinaldo,John Makdessi,,


                      5325
Demetrious Johnson      12
Jon Jones               11
Anderson Silva          10
Matt Hughes             10
                      ... 
Ricco Rodriguez          1
Dave Menne               1
Cody Garbrandt           1
Israel Adesanya          1
Vitor Belfort            1
Name: Champion, Length: 72, dtype: int64


In [7]:


all_fights['Outcome'] = all_fights['Outcome'].replace('def', 'def.').replace('vs','vs.')
print(all_fights['Outcome'].value_counts())
print(all_fights.head())


def.    5460
vs.       93
Name: Outcome, dtype: int64
               WeightClass              Winner Outcome           Loser  \
0  Catchweight (158.5 lbs)    Charles Oliveira    def.       Kevin Lee   
1             Welterweight       Gilbert Burns    def.     Demian Maia   
2              Lightweight      Renato Moicano    def.  Damir Hadžović   
3        Light Heavyweight       Nikita Krylov    def.   Johnny Walker   
4              Lightweight  Francisco Trinaldo    def.   John Makdessi   

                                       Method Round  Time Notes  \
0               Submission (guillotine choke)     3  0:28         
1                               TKO (punches)     1  2:34         
2               Submission (rear-naked choke)     1  0:44         
3  Decision (unanimous) (30–27, 29–28, 29–28)     3  5:00         
4  Decision (unanimous) (30–27, 30–27, 29–28)     3  5:00         

                     Card                                                url  \
0  Main card (ESPN

In [8]:
all_fights.to_csv("../../datasets/wikipedia_all_ufc_fight_outcomes.csv", index = False)