In [18]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
from html import unescape
from datetime import datetime as dt
import time


# Web Scrape

### Setup URL Structure

* url: Main page with all fights
* page: request HTML of url
* soup: BeautifulSoup object of main page

In [2]:
url = 'http://ufcstats.com/statistics/events/completed'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [4]:
#List of more urls to scrape:
detail_urls = [] #this grabs urls for each day recorded
for url in soup.find_all(class_='b-link b-link_style_black'):
    detail_urls.append(url['href'])

`detail_urls` is the list of all links to fight events on a given *main page*

In [5]:
detail_urls[1]

'http://ufcstats.com/event-details/4c12aa7ca246e7a4'

### Helper Functions

In [6]:
#Set up datatype dictionary: 

data_types = {
    'R_STR': int, 
    'B_STR': int,
    'R_TD': int, 
    'B_TD': int, 
    'R_SUB': int, 
    'R_SUB': int, 
    'R_PASS': int, 
    'B_PASS': int,
    'ROUND': int,
}

An image of a belt indicates a "title match." find_belt determines if associated match is a title match:

In [9]:
def remove_space_lines(text):
    pattern1 = re.compile(r'[\s\s+]')
    return re.sub(pattern1, ' ', text)


#Determine if observation is a title-bout
def find_belt(img_tag):
    try:
        image_link = img_tag['src']
        if re.match(r'.*belt.*', image_link) != None:
            return True
    except:
        return False
    
    
def get_fight_auxiliary(soup):
    '''
    Input: beautifulsoup of an event url: (ie. http://www.ufcstats.com/event-details/53278852bcd91e11)
    Outputs: pandas Series
        date, location, attendance
    '''
    
    table = []
    
    auxiliary_table = soup.find_all('li', {'class': 'b-list__box-list-item'})
    for item in auxiliary_table:
        attribute = remove_space_lines(item.text).strip()

        #If attribute is missing, replace with ''
        try:
            attribute = re.findall(r'\s\s+(.*)', attribute)[0]
        except:
            attribute = '' 
        
        table.append(attribute)
        
    table_series = pd.Series(table)
    table_series.index = ['date', 'location', 'attendance']
    
    if table_series['attendance'] != '':
        table_series['attendance'] = re.sub(',', '', table_series['attendance'])
        table_series['attendance'] = int(table_series['attendance'])
    
    table_series['date'] = dt.strptime(table_series['date'], '%B %d, %Y').strftime('%d-%m-%Y')

    return table_series

# Main Scraping Function:

In [10]:
def get_page_stats(url):
    
    '''
    Input: url of an event page (ie. http://www.ufcstats.com/event-details/53278852bcd91e11)
    Outputs: Summary statistics in pandas DataFrame
    '''
    
    #Change url into BeautifulSoup object:
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    #Contents of the main table in HTML
    stat_table = soup.findAll('table')[0].contents 
    
    table_data = stat_table[3] #first 2 indices are empty strings, table_data is html starting from first table row
    detail_data = table_data.find_all('p') #within table rows, there are <p> labels for table text    
    #image_data = table_data.find_all('img') #get image links to find belt for 
    
    
    #Initialize return objects:
    contents = [] #table that will hold all statistics
    title_match_index = [] #list to track which fights are title_bouts
    
    #Loop through elements of detail_data (html table) to scrape fight details:
    for index, item in enumerate(detail_data):
        
        #find image of belt == title_bout
        image = item.find('img')
        if find_belt(image):
            title_match_index.append(index) #get index of fight in which belt appears
            
        #contents is list of all text from each element of table     
        contents.append(item.text)  

    
    #Clean up elements
    contents = list(map(lambda x: remove_space_lines(x), contents))
    contents = list(map(lambda x: x.strip(), contents)) 
    
    draw_index = []
    
    #When there's a draw or NC, additional tags are created --> remove the tag to reformat correctly   
    for i in np.arange(0, len(contents)-10, 16):

        if contents[i] != 'win':
            
            #Get the index of the match that was drawn & remove that element
            draw_index.append(np.floor_divide(i, 16)) 
            contents.pop(i)
                    
    #Extract links to more detailed fight statistics
    fight_links = table_data.find_all('a', {'class': 'b-flag b-flag_style_green'})
    fight_links = [item['href'] for item in fight_links]
    
    draw_links = table_data.find_all('a', {'class': 'b-flag b-flag_style_bordered'})
    draw_links = [item['href'] for item in draw_links]
    draw_links = list(dict.fromkeys(draw_links)) #Remove duplicate links from the drawn fights
    
    for index, link in zip(draw_index, draw_links):
        fight_links.insert(index, link)
    
    #each row of data is 16 elements: reformats 1 observation per row
    formatted_contents = np.array(contents).reshape((-1, 16))
    formatted_contents = pd.DataFrame(formatted_contents)
    
    #the first row is a list of 'wins'
    #formatted_contents.drop(0, axis = 1, inplace = True)
    
    #Run a floor_divide to put the image of the belt in the correct fight

    title_match = np.floor_divide(title_match_index, 16) 

    #Initialize title_bout column with all 0's
    titles = np.zeros(16)
    if len(title_match) != 0:
        titles[title_match] = 1
    
    title_series = pd.Series(titles)
    
    formatted_contents['title_bout'] = title_series
    
    #rename columns
    formatted_contents.columns = ['Winner', 'R_fighter', 'B_fighter', 'R_STR', 'B_STR', 
                               'R_TD', 'B_TD', 'R_SUB', 'R_SUB', 'R_PASS', 'B_PASS',
                              'WEIGHT_CLASS', 'METHOD', 'DETAIL', 'ROUND', 'TIME', 'title_bout']
    
    #convert columns to appropriate data types
    formatted_contents.replace('--', 99999, inplace = True)
    formatted_contents = formatted_contents.astype(data_types)
    formatted_contents['TIME'] = formatted_contents['TIME'].apply(lambda x: dt.strptime(x, '%M:%S').time())
    formatted_contents['link'] = fight_links
    
    auxiliary_data = get_fight_auxiliary(soup) #Returns date, location, attendance of event
    
    return (formatted_contents, auxiliary_data)

# Test:

In [15]:
url = 'http://ufcstats.com/event-details/53278852bcd91e11'

In [16]:
test = get_page_stats(url)

In [17]:
test[0]

Unnamed: 0,Winner,R_fighter,B_fighter,R_STR,B_STR,R_TD,B_TD,R_SUB,R_SUB.1,R_PASS,B_PASS,WEIGHT_CLASS,METHOD,DETAIL,ROUND,TIME,title_bout,link
0,win,Charles Oliveira,Kevin Lee,43,41,0,2,2,0,0,2,Lightweight,SUB,Guillotine Choke,3,00:00:28,0.0,http://ufcstats.com/fight-details/e0b323dae5bf...
1,win,Gilbert Burns,Demian Maia,13,4,0,2,0,0,0,1,Welterweight,KO/TKO,Punch,1,00:02:34,0.0,http://ufcstats.com/fight-details/5cee1d8f1e43...
2,win,Renato Moicano,Damir Hadzovic,1,1,1,0,1,0,1,0,Lightweight,SUB,Rear Naked Choke,1,00:00:44,0.0,http://ufcstats.com/fight-details/c26a3f4c0833...
3,win,Nikita Krylov,Johnny Walker,45,37,3,0,0,0,4,2,Light Heavyweight,U-DEC,,3,00:05:00,0.0,http://ufcstats.com/fight-details/5bba49d88db7...
4,win,Francisco Trinaldo,John Makdessi,55,67,0,0,0,0,0,0,Lightweight,U-DEC,,3,00:05:00,0.0,http://ufcstats.com/fight-details/dc45c8d70e25...
5,win,Brandon Moreno,Jussier Formiga,33,15,0,1,0,1,1,3,Flyweight,U-DEC,,3,00:05:00,0.0,http://ufcstats.com/fight-details/b3c74554c871...
6,win,Amanda Ribas,Randa Markos,85,36,1,1,2,0,4,0,Women's Strawweight,U-DEC,,3,00:05:00,0.0,http://ufcstats.com/fight-details/ea303a8e6e31...
7,win,Elizeu Zaleski dos Santos,Aleksei Kunchenko,62,57,2,0,0,0,0,0,Welterweight,U-DEC,,3,00:05:00,0.0,http://ufcstats.com/fight-details/9fc6aba53508...
8,draw,Rani Yahya,Enrique Barzola,24,57,2,1,0,1,3,5,Bantamweight,M-DEC,,3,00:05:00,0.0,http://ufcstats.com/fight-details/e2503ed6a1f9...
9,win,Maryna Moroz,Mayra Bueno Silva,139,88,2,0,0,0,1,0,Women's Flyweight,U-DEC,,3,00:05:00,0.0,http://ufcstats.com/fight-details/2a1826612a7d...


# Start Data Pipeline

In [7]:
#Loop through main event pages (ie. http://www.ufcstats.com/statistics/events/completed)
#extract each events' url in all_events_url:

base_url = 'http://ufcstats.com/statistics/events/completed'

all_event_urls = []
page_index = 1

while(True):
    
    url = base_url + '?page={}'.format(page_index)
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    links = soup.find_all(class_='b-link b-link_style_black')
    
    if len(links) == 0:
        break
    
    for item in links:
        all_event_urls.append(item['href'])
    
    page_index += 1
    
    time.sleep(1)
        
#all_event_urls

KeyboardInterrupt: 

In [24]:
#Loop through all events and extract summary statistics:
all_events = []

for index, url in enumerate(all_event_urls):
    try:
        all_events.append(get_page_stats(url))
    except:
        all_events.append(index)
        print(index)
        
    time.sleep(5)

483
488
492
493
494
499
500
501
502
504
505
506
507
508
509


# Debug:

In [53]:
error_index = [item for item in all_events if type(item) != tuple]
error_urls = [all_event_urls[item] for item in error_index]

In [51]:
error_index

[483, 488, 492, 493, 494, 499, 500, 501, 502, 504, 505, 506, 507, 508, 509]

In [46]:
error_url = all_event_urls[error_index[2]]
error_url

'http://ufcstats.com/event-details/32a3025d5db456ae'

In [55]:
input_error_entries = []
for index, url in zip(error_index, error_urls):
    print(index, url)
    input_error_entries.append(get_page_stats(url))

483 http://ufcstats.com/event-details/cedfdf8d423d500c
488 http://ufcstats.com/event-details/1a1a4d7a29041d77
492 http://ufcstats.com/event-details/32a3025d5db456ae
493 http://ufcstats.com/event-details/4a01dc8376736ef5
494 http://ufcstats.com/event-details/749685d24e2cac50
499 http://ufcstats.com/event-details/96eff1a628adcc7f
500 http://ufcstats.com/event-details/9b5b5a75523728f3
501 http://ufcstats.com/event-details/6ceff86fae4f6b3b
502 http://ufcstats.com/event-details/aee8eecfc4bfb1e7
504 http://ufcstats.com/event-details/b63e800c18e011b5
505 http://ufcstats.com/event-details/31bbd46d57dfbcb7
506 http://ufcstats.com/event-details/5af480a3b2e1726b
507 http://ufcstats.com/event-details/1c3f5e85b59ec710
508 http://ufcstats.com/event-details/dedc3bb440d09554
509 http://ufcstats.com/event-details/b60391da771deefe


In [58]:
for index, data in zip(error_index, input_error_entries):
    all_events[index] = data
    

# To File:

In [153]:
#Assign date, location, attendnace from auxilliary fight stats to each entry:

for index in np.arange(0, len(all_events)):
    all_events[index][0]['date'] = all_events[index][1].loc['date']
    all_events[index][0]['location'] = all_events[index][1].loc['location']
    all_events[index][0]['attendance'] = all_events[index][1].loc['attendance']

In [155]:
#Combine all list elements into one combined pandas DataFrame:

event_level_data = pd.DataFrame([])
for item in all_events:
    event_level_data = pd.concat([event_level_data, item[0]], axis=0)

In [158]:
event_level_data.reset_index(drop=True, inplace=True)
event_level_data.to_csv('../Data/event_level_data.csv')