In [2]:
import re
import time
from datetime import datetime as dt
from html import unescape
from io import StringIO

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


# Initial Data Load & Preprocessing

In [2]:
# Match-level data: each observation is a match between two fighters.
processed_df = pd.read_csv('preprocessed_data.csv')
# Fighter-level data: each fighter's cumulative statistics.
fighter_stats = pd.read_csv('data.csv')

In [3]:
# Column names of the match-level frame.
processed_cols = processed_df.columns.tolist()
# Column names of the fighter statistics frame.
fighter_cols = fighter_stats.columns.tolist()


In [4]:
def select_cols_start(letter, list_str):
    """Return the strings in ``list_str`` that start with ``letter``.

    Parameters
    ----------
    letter : str
        Prefix to look for (may be longer than one character, e.g. ``'R_'``).
        It is compared literally, so characters such as ``.`` or ``+`` carry
        no special meaning.
    list_str : iterable of str
        Candidate strings, e.g. DataFrame column names.

    Returns
    -------
    list of str
        The matching strings, in their original order.
    """
    # A literal prefix test replaces the previous regex built from `letter`,
    # which treated the prefix as a pattern and mis-matched metacharacters.
    return [item for item in list_str if item.startswith(letter)]

In [5]:
# Columns of the match-level frame whose names start with 'R_' / 'B_'.
r_match_cols = select_cols_start('R_', processed_cols)
b_match_cols = select_cols_start('B_', processed_cols)

# The same prefix split for the fighter statistics frame.
r_fighter_cols = select_cols_start('R_', fighter_cols)
b_fighter_cols = select_cols_start('B_', fighter_cols)

# Web Scrape

### Setup URL Structure

In [6]:
# Download the completed-events index page and parse its HTML.
url = 'http://ufcstats.com/statistics/events/completed'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [7]:
# Event detail-page URLs found on the index page (one per listed event).
# The iteration variable is deliberately not called `url`, so the notebook-level
# `url` string is not overwritten with a BeautifulSoup Tag.
detail_urls = [
    event_link['href']
    for event_link in soup.find_all(class_='b-link b-link_style_black')
]

In [8]:
detail_urls[1]

'http://ufcstats.com/event-details/0b5b6876c2a4723f'

### Individual Fight Day Scrape

In [9]:
# Columns of the scraped fight table that are cast to int.
# NOTE(review): the original literal listed 'R_SUB' twice (the second entry
# silently replaced the first), so the mapping has eight keys and no 'B_SUB'
# entry -- TODO confirm whether 'B_SUB' was intended.
data_types = dict.fromkeys(
    ('R_STR', 'B_STR', 'R_TD', 'B_TD', 'R_SUB', 'R_PASS', 'B_PASS', 'ROUND'),
    int,
)

In [3]:
def remove_space_lines(text):
    """Replace every whitespace character in ``text`` with a single space.

    Newlines, tabs and other whitespace each become one ``' '``; runs are NOT
    collapsed, because callers split label and value on runs of two or more
    whitespace characters.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` with the same length, whitespace normalised to spaces.
    """
    # The previous pattern was the character class [\s\s+], which also matched
    # a literal '+' and turned it into a space.
    return re.sub(r'\s', ' ', text)

In [4]:
def find_belt(img_tag):
    """Tell whether an image tag points at a belt image (title-bout marker).

    Parameters
    ----------
    img_tag : mapping-like or None
        Object supporting ``img_tag['src']`` (e.g. a BeautifulSoup ``<img>``
        tag), or ``None`` when no image was found.

    Returns
    -------
    bool
        ``True`` when the ``src`` value contains ``'belt'``; ``False`` in every
        other case, including a missing tag or missing ``src`` attribute.
    """
    if img_tag is None:
        return False

    try:
        image_link = img_tag['src']
    except (KeyError, TypeError):
        # no src attribute, or the object is not subscriptable
        return False

    # Always return a real bool (the non-belt case used to fall through and
    # return None).
    return isinstance(image_link, str) and 'belt' in image_link

In [5]:
def get_fight_auxiliary(soup):
    """Extract the event-level details (date, location, attendance) of a page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed event page.

    Returns
    -------
    pandas.Series
        Indexed by ``'date'``, ``'location'``, ``'attendance'``. ``date`` is
        reformatted from ``'%B %d, %Y'`` to ``'%d-%m-%Y'``; ``attendance`` is an
        int with thousands separators removed, or ``''`` when no value is given.

    Raises
    ------
    ValueError
        If the page does not contain exactly three detail items, or the date
        or attendance text cannot be parsed.
    """
    expected_fields = ['date', 'location', 'attendance']

    values = []
    for item in soup.find_all('li', {'class': 'b-list__box-list-item'}):
        text = remove_space_lines(item.text).strip()
        # The value follows the label after a run of 2+ whitespace characters;
        # an item with no such run has no value.
        found = re.findall(r'\s\s+(.*)', text)
        values.append(found[0] if found else '')

    if len(values) != len(expected_fields):
        raise ValueError(
            'expected {} event detail items, found {}'.format(len(expected_fields), len(values))
        )

    # object dtype so the attendance entry can hold an int next to the strings
    table_series = pd.Series(values, index=expected_fields, dtype=object)

    if table_series['attendance'] != '':
        table_series['attendance'] = int(table_series['attendance'].replace(',', ''))

    table_series['date'] = dt.strptime(table_series['date'], '%B %d, %Y').strftime('%d-%m-%Y')

    return table_series

In [6]:
def get_page_stats(url):
    """Scrape the fight table of one event page.

    Parameters
    ----------
    url : str
        URL of an event-details page.

    Returns
    -------
    tuple
        ``(fights, auxiliary)`` where ``fights`` is a DataFrame with one row per
        fight (columns ``Winner`` ... ``TIME``, ``title_bout``, ``link``) and
        ``auxiliary`` is the Series returned by ``get_fight_auxiliary``.
    """
    n_cols = 16  # text cells per fight row once draw/NC extras are removed

    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    stat_table = soup.find_all('table')[0].contents  # contents of the main table
    # per the original notes the leading entries are empty strings; index 3 is
    # the html starting from the first table row
    table_data = stat_table[3]
    detail_data = table_data.find_all('p')  # table text lives in <p> tags
    auxiliary_data = get_fight_auxiliary(soup)

    contents = []           # text of every <p> element
    title_match_index = []  # positions of <p> elements that carry a belt image
    for index, item in enumerate(detail_data):
        if find_belt(item.find('img')):
            title_match_index.append(index)
        contents.append(item.text)

    contents = [remove_space_lines(text).strip() for text in contents]

    # A draw or NC row carries one extra leading tag; drop it so every fight
    # occupies exactly n_cols entries, and remember which fights were affected.
    draw_index = []
    for i in range(0, len(contents) - 10, n_cols):
        if contents[i] != 'win':
            draw_index.append(i // n_cols)
            contents.pop(i)

    # Links to the per-fight detail pages
    fight_links = [tag['href'] for tag in table_data.find_all('a', {'class': 'b-flag b-flag_style_green'})]

    draw_links = [tag['href'] for tag in table_data.find_all('a', {'class': 'b-flag b-flag_style_bordered'})]
    draw_links = list(dict.fromkeys(draw_links))  # drawn fights list their link twice

    for index, link in zip(draw_index, draw_links):
        fight_links.insert(index, link)

    # one observation per row
    formatted_contents = pd.DataFrame(np.array(contents).reshape((-1, n_cols)))

    # title_bout flag, sized to the number of fights actually scraped (it was
    # previously fixed at 16 entries regardless of the card size)
    n_fights = len(formatted_contents)
    titles = np.zeros(n_fights)
    for position in title_match_index:
        row = position // n_cols
        if row < n_fights:
            titles[row] = 1
    formatted_contents['title_bout'] = titles

    # second submission column is named B_SUB (it used to duplicate 'R_SUB')
    formatted_contents.columns = ['Winner', 'R_fighter', 'B_fighter', 'R_STR', 'B_STR',
                                  'R_TD', 'B_TD', 'R_SUB', 'B_SUB', 'R_PASS', 'B_PASS',
                                  'WEIGHT_CLASS', 'METHOD', 'DETAIL', 'ROUND', 'TIME', 'title_bout']

    # '--' marks a missing value; 99999 is the sentinel used so int casts succeed
    formatted_contents = formatted_contents.replace('--', 99999)
    # the module-level data_types mapping has no 'B_SUB' entry, so add it here
    dtype_map = {**data_types, 'B_SUB': int}
    formatted_contents = formatted_contents.astype(dtype_map)
    formatted_contents['TIME'] = formatted_contents['TIME'].apply(lambda x: dt.strptime(x, '%M:%S').time())
    formatted_contents['link'] = fight_links

    return (formatted_contents, auxiliary_data)

# Create Data Pipeline

    1. Extract all fight day / event URLs from the main page
    2. Extract all fights from a particular event / day | Input event url --> get_page_stats
    3. Within each event, extract the detailed statistics of every fight | Input fight url --> get_detailed_page_stats

In [7]:
#STEP 1: collect the detail-page URL of every completed event, page by page
base_url = 'http://ufcstats.com/statistics/events/completed'

all_event_urls = []
seen_event_urls = set()
page_index = 1

while True:

    url = base_url + '?page={}'.format(page_index)
    page = requests.get(url, timeout=30)  # do not hang forever on a stalled request
    soup = BeautifulSoup(page.content, 'html.parser')

    links = soup.find_all(class_='b-link b-link_style_black')

    new_urls = []
    for item in links:
        href = item['href']
        if href not in seen_event_urls:
            seen_event_urls.add(href)
            new_urls.append(href)

    # Stop when a page lists no events, or only events already collected.
    # The second condition guards against a site that keeps serving a non-empty
    # page for out-of-range indexes (presumably why this cell was previously
    # interrupted by hand -- TODO confirm).
    if not new_urls:
        break

    all_event_urls.extend(new_urls)
    page_index += 1

    time.sleep(1)  # be polite to the server

KeyboardInterrupt: 

In [20]:
class event_details(object):
    """Container for the scrape result of one event page.

    Attributes set on construction:
    all_details -- the tuple returned by get_page_stats(url)
    fight_df    -- first element of that tuple
    attributes  -- second element of that tuple
    """

    def __init__(self, url):
        scraped = get_page_stats(url)
        self.all_details = scraped
        self.fight_df, self.attributes = scraped[0], scraped[1]
    

In [24]:
all_events = []

for index, url in enumerate(all_event_urls):
    try:
        all_events.append(get_page_stats(url))
    except Exception as exc:
        # Keep the integer position as a placeholder so failed events can be
        # located and retried later. `Exception` (rather than a bare except)
        # lets KeyboardInterrupt stop this long-running loop.
        all_events.append(index)
        print(index, repr(exc))

    time.sleep(5)

483
488
492
493
494
499
500
501
502
504
505
506
507
508
509


# Debug:

In [53]:
# Failed scrapes were stored as their integer position instead of a result tuple.
error_index = [entry for entry in all_events if not isinstance(entry, tuple)]
# URLs of the events at those positions.
error_urls = [all_event_urls[position] for position in error_index]

In [51]:
error_index

[483, 488, 492, 493, 494, 499, 500, 501, 502, 504, 505, 506, 507, 508, 509]

In [46]:
# Pick one failing event (third entry of error_index) to inspect by hand.
error_url = all_event_urls[error_index[2]]
error_url

'http://ufcstats.com/event-details/32a3025d5db456ae'

In [55]:
# Re-run the scraper on every event that failed earlier; results are collected
# in the same order as error_index. No exception handling here: a failure
# stops the loop at the offending event, whose index/url was just printed.
input_error_entries = []
for index, url in zip(error_index, error_urls):
    print(index, url)
    input_error_entries.append(get_page_stats(url))

483 http://ufcstats.com/event-details/cedfdf8d423d500c
488 http://ufcstats.com/event-details/1a1a4d7a29041d77
492 http://ufcstats.com/event-details/32a3025d5db456ae
493 http://ufcstats.com/event-details/4a01dc8376736ef5
494 http://ufcstats.com/event-details/749685d24e2cac50
499 http://ufcstats.com/event-details/96eff1a628adcc7f
500 http://ufcstats.com/event-details/9b5b5a75523728f3
501 http://ufcstats.com/event-details/6ceff86fae4f6b3b
502 http://ufcstats.com/event-details/aee8eecfc4bfb1e7
504 http://ufcstats.com/event-details/b63e800c18e011b5
505 http://ufcstats.com/event-details/31bbd46d57dfbcb7
506 http://ufcstats.com/event-details/5af480a3b2e1726b
507 http://ufcstats.com/event-details/1c3f5e85b59ec710
508 http://ufcstats.com/event-details/dedc3bb440d09554
509 http://ufcstats.com/event-details/b60391da771deefe


In [58]:
# Replace each integer placeholder in all_events with its re-scraped result.
for position, scraped in zip(error_index, input_error_entries):
    all_events[position] = scraped
    

# To File:

In [153]:
# Copy the event-level fields onto every fight row of the matching event frame.
for fights, event_meta in all_events:
    for field in ('date', 'location', 'attendance'):
        fights[field] = event_meta.loc[field]

In [155]:
# Stack the per-event fight frames into one frame with a single concat call;
# concatenating inside a loop re-copies all accumulated rows on every pass.
event_frames = [item[0] for item in all_events]
# pd.concat raises on an empty list, so keep the empty-frame result in that case
event_level_data = pd.concat(event_frames, axis=0) if event_frames else pd.DataFrame([])

In [158]:
# Renumber the rows 0..n-1 (the per-event indexes repeat) and write to disk.
event_level_data = event_level_data.reset_index(drop=True)
event_level_data.to_csv('event_level_data.csv')

# Scrape detailed statistics for each fight of an event:

In [15]:
def get_totals_table(soup):
    """Build the per-fighter totals table of a fight page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed fight-details page.

    Returns
    -------
    pandas.DataFrame
        One row per fighter; the first column holds the fighter name, the rest
        hold the statistics, labelled with the column names of the first html
        table on the page.
    """
    html_table = soup.find_all('table')[0]

    # Wrap the markup in StringIO: passing literal html to read_html is deprecated.
    stats_table = pd.read_html(StringIO(str(html_table)))[0]
    table_columns = stats_table.columns
    # each cell holds both fighters' values separated by a double space
    total_statistics = [item.split('  ') for item in stats_table.loc[0][1:]]

    # Fighter names
    names_table = soup.find('td', {'class': 'b-fight-details__table-col l-page_align_left'})
    names = [remove_space_lines(item.text).strip() for item in names_table.find_all('p')]

    total_statistics.append(names)
    total_statistics = pd.DataFrame(total_statistics).T

    # The names were appended last; move that column to the front. The order is
    # derived from the actual width instead of assuming exactly ten columns.
    last = total_statistics.shape[1] - 1
    total_statistics = total_statistics[[last] + list(range(last))]
    total_statistics.columns = table_columns

    return total_statistics

def get_ss_table(soup):
    """Build the per-fighter table from the third html table of a fight page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed fight-details page.

    Returns
    -------
    pandas.DataFrame
        One row per fighter, labelled with that table's column names minus the
        first (fighter) column.
    """
    html_table = soup.find_all('table')[2]

    # Wrap the markup in StringIO: passing literal html to read_html is deprecated.
    stats_table = pd.read_html(StringIO(str(html_table)))[0]
    table_columns = stats_table.columns
    # each cell holds both fighters' values separated by a double space
    total_statistics = [item.split('  ') for item in stats_table.loc[0][1:]]

    total_statistics = pd.DataFrame(total_statistics).T
    total_statistics.columns = table_columns[1:]

    return total_statistics
    
def get_combined(soup):
    """Join the totals table and the third-table statistics side by side.

    Columns whose label already occurred are dropped (first occurrence wins),
    and the column labelled 'Sig. str' is removed from the result.
    """
    frames = [get_totals_table(soup), get_ss_table(soup)]
    merged = pd.concat(frames, axis = 1)

    first_occurrence = ~merged.columns.duplicated()
    merged = merged.loc[:, first_occurrence]

    return merged.drop('Sig. str', axis = 1)
 

### Split Countables:

In [16]:
import itertools

In [17]:
# Statistics reported as "<hits> of <attempts>".
countables = ['Sig. str.', 'Total str.', 'Td', 'Head', 'Body', 'Leg', 'Distance', 'Clinch', 'Ground']
hit_att = ['Hits', 'Attempts']

# '<statistic> Hits' followed by '<statistic> Attempts' for every statistic, in order.
countable_cols = ['{} {}'.format(stat, kind) for stat in countables for kind in hit_att]
#countable_cols

In [18]:
def fight_attributes(soup):
    """Extract round, time, format and referee from a fight page header.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed fight-details page.

    Returns
    -------
    pandas.Series
        Indexed by ``'rounds'`` (int), ``'time'`` (``datetime.time`` parsed from
        ``'%M:%S'``), ``'format'`` and ``'referee'``. A field whose header item
        has no value is ``None``.

    Raises
    ------
    ValueError
        If fewer than four header items are present on the page.
    """
    fields = ['rounds', 'time', 'format', 'referee']
    header_table = soup.find_all('i', {'class': 'b-fight-details__text-item'})[0:4]

    values = []
    for item in header_table:
        detail = remove_space_lines(item.text).strip()
        # the value follows the label after a run of 2+ whitespace characters
        found = re.findall(r'\s\s+(.*)', detail)
        # keep a placeholder for a missing value so later fields are not
        # shifted onto the wrong label
        values.append(found[0] if found else None)

    if len(values) != len(fields):
        raise ValueError(
            'expected {} header items, found {}'.format(len(fields), len(values))
        )

    # object dtype so int / time values can sit next to the strings
    table_series = pd.Series(values, index=fields, dtype=object)
    if table_series['rounds'] is not None:
        table_series['rounds'] = int(table_series['rounds'])
    if table_series['time'] is not None:
        table_series['time'] = dt.strptime(table_series['time'], '%M:%S').time()

    return table_series

def split_countable(combined_df, columns=None):
    """Split every ``'<hits> of <attempts>'`` cell into two separate columns.

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Frame whose cells are all strings of the form ``'x of y'``.
    columns : sequence of str, optional
        Names for the resulting columns (two per input column, in order).
        Defaults to the module-level ``countable_cols``.

    Returns
    -------
    pandas.DataFrame
        One row per input row; for each input column its two parts, as strings
        with surrounding whitespace removed.
    """
    if columns is None:
        columns = countable_cols

    split_stats = []
    for _, row in combined_df.iterrows():
        flat = []
        for cell in row:
            # 'x of y'.split('of') leaves 'x ' and ' y'; strip the padding
            flat.extend(part.strip() for part in cell.split('of'))
        split_stats.append(flat)

    split_df = pd.DataFrame(split_stats)
    split_df.columns = list(columns)

    return split_df
    
def get_detailed_page_stats(url):
    """Scrape one fight page.

    Returns a tuple ``(stats, attributes)``: ``stats`` is the combined
    per-fighter table with every column listed in ``countables`` replaced by
    its split columns; ``attributes`` is the Series from ``fight_attributes``.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # header details of the fight
    attributes = fight_attributes(soup)

    # per-fighter statistics
    fight_stats = get_combined(soup)

    # the "x of y" columns are split; everything else is kept as scraped
    countable_df = split_countable(fight_stats[countables])
    remaining = fight_stats.drop(countables, axis = 1)

    return (pd.concat([remaining, countable_df], axis = 1), attributes)
    

# Debug:

In [None]:
def get_page_stats(url):
    """Scrape the fight table of one event page.

    NOTE: this cell redefines ``get_page_stats`` from earlier in the notebook;
    after a top-to-bottom run this later definition is the one in effect, so it
    must handle '--' placeholders as well.

    Parameters
    ----------
    url : str
        URL of an event-details page.

    Returns
    -------
    tuple
        ``(fights, auxiliary)`` where ``fights`` is a DataFrame with one row per
        fight (columns ``Winner`` ... ``TIME``, ``title_bout``, ``link``) and
        ``auxiliary`` is the Series returned by ``get_fight_auxiliary``.
    """
    n_cols = 16  # text cells per fight row once draw/NC extras are removed

    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')

    stat_table = soup.find_all('table')[0].contents  # contents of the main table
    # per the original notes the leading entries are empty strings; index 3 is
    # the html starting from the first table row
    table_data = stat_table[3]
    detail_data = table_data.find_all('p')  # table text lives in <p> tags
    auxiliary_data = get_fight_auxiliary(soup)

    contents = []           # text of every <p> element
    title_match_index = []  # positions of <p> elements that carry a belt image
    for index, item in enumerate(detail_data):
        if find_belt(item.find('img')):
            title_match_index.append(index)
        contents.append(item.text)

    contents = [remove_space_lines(text).strip() for text in contents]

    # A draw or NC row carries one extra leading tag; drop it so every fight
    # occupies exactly n_cols entries, and remember which fights were affected.
    draw_index = []
    for i in range(0, len(contents) - 10, n_cols):
        if contents[i] != 'win':
            draw_index.append(i // n_cols)
            contents.pop(i)

    # Links to the per-fight detail pages
    fight_links = [tag['href'] for tag in table_data.find_all('a', {'class': 'b-flag b-flag_style_green'})]

    draw_links = [tag['href'] for tag in table_data.find_all('a', {'class': 'b-flag b-flag_style_bordered'})]
    draw_links = list(dict.fromkeys(draw_links))  # drawn fights list their link twice

    for index, link in zip(draw_index, draw_links):
        fight_links.insert(index, link)

    # one observation per row
    formatted_contents = pd.DataFrame(np.array(contents).reshape((-1, n_cols)))

    # title_bout flag, sized to the number of fights actually scraped (it was
    # previously fixed at 16 entries regardless of the card size)
    n_fights = len(formatted_contents)
    titles = np.zeros(n_fights)
    for position in title_match_index:
        row = position // n_cols
        if row < n_fights:
            titles[row] = 1
    formatted_contents['title_bout'] = titles

    # second submission column is named B_SUB (it used to duplicate 'R_SUB')
    formatted_contents.columns = ['Winner', 'R_fighter', 'B_fighter', 'R_STR', 'B_STR',
                                  'R_TD', 'B_TD', 'R_SUB', 'B_SUB', 'R_PASS', 'B_PASS',
                                  'WEIGHT_CLASS', 'METHOD', 'DETAIL', 'ROUND', 'TIME', 'title_bout']

    # '--' marks a missing value; 99999 is the sentinel used so int casts
    # succeed (this replacement was missing from this copy of the function)
    formatted_contents = formatted_contents.replace('--', 99999)
    # the module-level data_types mapping has no 'B_SUB' entry, so add it here
    dtype_map = {**data_types, 'B_SUB': int}
    formatted_contents = formatted_contents.astype(dtype_map)
    formatted_contents['TIME'] = formatted_contents['TIME'].apply(lambda x: dt.strptime(x, '%M:%S').time())
    formatted_contents['link'] = fight_links

    return (formatted_contents, auxiliary_data)

In [None]:
# Pick one event URL (position 21 of all_event_urls) to debug against.
link = all_event_urls[21]
link

In [None]:
# Fetch and parse the debug event page; this rebinds the notebook-level `soup`.
soup = BeautifulSoup(requests.get(link).content, 'html.parser')

In [None]:
# Sanity check of the label/value pattern: captures the text after the first
# run of two or more whitespace characters.
re.findall(r'\s\s+(.*)', 'Date:              September 07, 2019')[0]

In [None]:
# Event-level details of the debug page.
get_fight_auxiliary(soup)

In [None]:
# Full scrape of the debug event page (downloads the page again).
get_page_stats(link)

In [None]:
class fight_details(object):
    """Container for the scrape result of one fight page.

    Attributes set on construction:
    all_details -- the tuple returned by get_detailed_page_stats(url)
    fight_df    -- first element of that tuple
    attributes  -- second element of that tuple
    """

    def __init__(self, url):
        scraped = get_detailed_page_stats(url)
        self.all_details = scraped
        self.fight_df, self.attributes = scraped[0], scraped[1]

In [None]:
# NOTE(review): `test_links` is only assigned in a later cell, so this cell
# fails on a fresh top-to-bottom run -- move it below that assignment.
test = fight_details(test_links[0])

In [None]:
test.attributes

In [None]:
# Scrape the second event found on the index page; the result is the
# (fights DataFrame, auxiliary Series) tuple returned by get_page_stats.
test_page1 = get_page_stats(detail_urls[1])

In [None]:
test_page1

In [None]:
# get_page_stats returns a (fights DataFrame, auxiliary Series) tuple; the
# fight links live in the DataFrame's `link` column, so index the tuple first.
test_links = test_page1[0].link

In [None]:
# Scrape every fight of the test event, pausing one second between requests.
a = []
for item in test_links:
    a.append(fight_details(item))
    time.sleep(1)

In [None]:
a[1].attributes

In [None]:
# NOTE(review): `page1` is not defined in any visible cell; presumably left
# over from an earlier session (perhaps the frame in test_page1[0]) -- confirm.
link1 = page1.link[0]

In [None]:
# Detailed scrape of a single fight page.
get_detailed_page_stats(link1)