In [10]:
import itertools
import re
import time
from datetime import datetime as dt
from html import unescape
from io import StringIO

import lxml
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [11]:
# Load the event-level fight records; the first CSV column is used as the index.
events = pd.read_csv('../Data/event_level_data.csv', index_col = 0)
# Flag, per column, whether any value is missing.
events.isna().any()

Winner          False
R_fighter       False
B_fighter       False
R_STR           False
B_STR           False
R_TD            False
B_TD            False
R_SUB           False
B_SUB           False
R_PASS          False
B_PASS          False
WEIGHT_CLASS    False
METHOD          False
DETAIL           True
ROUND           False
TIME            False
title_bout      False
link            False
date            False
location        False
attendance       True
dtype: bool

In [12]:
events.head(5)

Unnamed: 0,Winner,R_fighter,B_fighter,R_STR,B_STR,R_TD,B_TD,R_SUB,B_SUB,R_PASS,...,WEIGHT_CLASS,METHOD,DETAIL,ROUND,TIME,title_bout,link,date,location,attendance
0,win,Charles Oliveira,Kevin Lee,43,41,0,2,2,0,0,...,Lightweight,SUB,Guillotine Choke,3,00:00:28,0.0,http://ufcstats.com/fight-details/e0b323dae5bf...,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0
1,win,Gilbert Burns,Demian Maia,13,4,0,2,0,0,0,...,Welterweight,KO/TKO,Punch,1,00:02:34,0.0,http://ufcstats.com/fight-details/5cee1d8f1e43...,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0
2,win,Renato Moicano,Damir Hadzovic,1,1,1,0,1,0,1,...,Lightweight,SUB,Rear Naked Choke,1,00:00:44,0.0,http://ufcstats.com/fight-details/c26a3f4c0833...,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0
3,win,Nikita Krylov,Johnny Walker,45,37,3,0,0,0,4,...,Light Heavyweight,U-DEC,,3,00:05:00,0.0,http://ufcstats.com/fight-details/5bba49d88db7...,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0
4,win,Francisco Trinaldo,John Makdessi,55,67,0,0,0,0,0,...,Lightweight,U-DEC,,3,00:05:00,0.0,http://ufcstats.com/fight-details/dc45c8d70e25...,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0


# Scrape Fight Details:

### Helper Functions:

In [13]:
def remove_space_lines(text):
    '''
    Input: text (str)
    Output: copy of text in which every whitespace character (space, tab,
            newline, ...) is replaced one-for-one by a single space

    Runs of whitespace are NOT collapsed: fight_attributes() relies on the
    resulting runs of two or more spaces to locate the value after a label.
    '''
    # The previous pattern r'[\s\s+]' was a character class, in which '+' is a
    # literal plus sign, so '+' characters were blanked out as well.
    return re.sub(r'\s', ' ', text)


def empty_page(soup):
    '''
    Input: beautifulsoup of a detailed fight stats page
    Output: True when the first node inside the fight-details section is the
            "Round-by-round stats not currently available." notice, else False
    '''
    unavailable_notice = '\n    Round-by-round stats not currently available.\n'

    section = soup.find('section', {'class': 'b-fight-details__section js-fight-section'})
    first_node = section.contents[0]

    return bool(first_node == unavailable_notice)


def empty_df():
    '''
    Output: pandas Series indexed by the module-level `template`, with every
            entry set to the placeholder value 99999

    `template` must already be defined when this function is called.
    '''
    placeholder_values = [99999 for _ in range(len(template))]
    return pd.Series(placeholder_values, index=template)

### Extract TOTALS & SIGNIFICANT STRIKES TABLE

In [14]:
def get_totals_table(soup):
    '''
    Input: beautifulsoup of detailed fight stats site (ie. http://www.ufcstats.com/fight-details/e0b323dae5bf4c90)
    Output: pandas DataFrame matching table labeled 'TOTALS': the fighter
            names in the first column followed by the statistics, using the
            column headers of the scraped table
    '''
    html_table = soup.find_all('table')[0]

    # read_html expects a path or file-like object; passing literal HTML as a
    # plain string is deprecated in recent pandas, so wrap it in StringIO.
    stats_table = pd.read_html(StringIO(str(html_table)))[0]
    table_columns = stats_table.columns

    # Each cell of the first row holds several values separated by a double
    # space. The first column is skipped here; names are scraped separately.
    fighter_stats = [cell.split('  ') for cell in stats_table.iloc[0, 1:]]

    # Scrape fighter names: one <p> element per name inside the names cell.
    names_cell = soup.find('td', {'class': 'b-fight-details__table-col l-page_align_left'})
    names = [remove_space_lines(tag.text).strip() for tag in names_cell.find_all('p')]

    # Put the names first so the columns line up with the table header, then
    # transpose so each split value gets its own row.
    totals = pd.DataFrame([names] + fighter_stats).T
    totals.columns = table_columns

    return totals

def get_ss_table(soup):
    '''
    Input: beautifulsoup of detailed fight stats site (ie. http://www.ufcstats.com/fight-details/e0b323dae5bf4c90)
    Output: pandas DataFrame matching table labeled 'SIGNIFICANT STRIKES',
            without the table's first column
    '''
    html_table = soup.find_all('table')[2]

    # read_html expects a path or file-like object; passing literal HTML as a
    # plain string is deprecated in recent pandas, so wrap it in StringIO.
    stats_table = pd.read_html(StringIO(str(html_table)))[0]
    table_columns = stats_table.columns

    # Each cell of the first row holds several values separated by a double
    # space; the first column is skipped.
    strike_stats = [cell.split('  ') for cell in stats_table.iloc[0, 1:]]

    # Transpose so each split value gets its own row, then label the columns
    # with the table header (minus the skipped first column).
    significant = pd.DataFrame(strike_stats).T
    significant.columns = table_columns[1:]

    return significant
    
def get_combined(soup):
    '''
    Input: beautifulsoup of detailed fight stats site
    Output: pandas DataFrame with the 'TOTALS' and 'SIGNIFICANT STRIKES'
            tables side by side, keeping only the first occurrence of any
            repeated column name and without the 'Sig. str' column
    '''
    tables = [get_totals_table(soup), get_ss_table(soup)]
    side_by_side = pd.concat(tables, axis = 1)

    # Keep the first of each repeated column label.
    first_occurrence = ~side_by_side.columns.duplicated()
    deduplicated = side_by_side.loc[:, first_occurrence]

    return deduplicated.drop('Sig. str', axis = 1)
 

In [15]:
def fight_attributes(soup):
    '''
    Input: beautifulsoup of detailed fight stats site
    Output: pandas Series indexed ['rounds', 'time', 'format', 'referee'];
            'rounds' is converted to int and 'time' to a datetime.time parsed
            as minutes:seconds, the other entries stay strings ('' when no
            value could be extracted)

    Raises ValueError when 'rounds' or 'time' cannot be parsed.
    '''
    # Only the first four header items are read.
    header_items = soup.find_all('i', {'class': 'b-fight-details__text-item'})[0:4]

    values = []
    for item in header_items:
        detail = remove_space_lines(item.text).strip()

        # The value is whatever follows the first run of two or more
        # whitespace characters; fall back to '' when there is no such run.
        matches = re.findall(r'\s\s+(.*)', detail)
        values.append(matches[0] if matches else '')

    table_series = pd.Series(values, index=['rounds', 'time', 'format', 'referee'])
    table_series['rounds'] = int(table_series['rounds'])
    table_series['time'] = dt.strptime(table_series['time'], '%M:%S').time()

    return table_series

### Split Countables:

Statistics listed in `countables` are extracted in the format `# of #`. We split each of them into "hits" and "attempts" so that fighter efficiency can be calculated.

In [16]:
# Statistics that are split into a "Hits" and an "Attempts" column.
countables = ['Sig. str.', 'Total str.', 'Td', 'Head', 'Body', 'Leg', 'Distance', 'Clinch', 'Ground']
hit_att = ['Hits', 'Attempts']

# '<statistic> Hits' followed by '<statistic> Attempts' for every countable,
# in the order the countables are listed.
countable_cols = [
    '{} {}'.format(statistic, suffix)
    for statistic in countables
    for suffix in hit_att
]
#countable_cols

In [17]:
def split_countable(combined_df):
    '''
    Input: pandas DataFrame whose cells are strings in the format '# of #'
    Output: pandas DataFrame with the same number of rows (fresh 0..n-1
            index) and two string columns per input column,
            '<column> Hits' and '<column> Attempts', in input column order

    The output labels are derived from the input's own column names, so they
    always match the data actually passed in. Surrounding whitespace is
    stripped from both parts ('41 of 80' -> '41', '80'). A cell without 'of'
    yields '' for the attempts part.
    '''
    labels = []
    for column in combined_df.columns:
        labels.extend('{} {}'.format(column, suffix) for suffix in ('Hits', 'Attempts'))

    rows = []
    for _, row in combined_df.iterrows():
        flat = []
        for cell in row:
            hits, _separator, attempts = cell.partition('of')
            flat.extend([hits.strip(), attempts.strip()])
        rows.append(flat)

    return pd.DataFrame(rows, columns=labels)
    
def df_single_row_format(combined_df, attr):
    '''
    Input: combined_df - pandas DataFrame with rows labelled 0 and 1
           attr - mapping / Series with keys 'rounds', 'time', 'format', 'referee'
    Output: pandas Series holding row 0 followed by row 1 (so the column
            labels appear twice in its index), then the four attributes
    '''
    fighter_rows = [combined_df.loc[row_label] for row_label in (0, 1)]
    flattened = pd.concat(fighter_rows, axis = 0)

    # Append the fight-level attributes after the per-fighter statistics.
    for key in ('rounds', 'time', 'format', 'referee'):
        flattened[key] = attr[key]

    return flattened
    
#PUT IT ALL TOGETHER:
def get_detailed_page_stats(url, timeout=30):
    '''
    Input: url - address of a detailed fight stats page
           timeout - seconds to wait for the server before giving up
    Output: pandas Series with the per-fighter statistics (both fighters
            flattened into one Series), the fight attributes 'rounds',
            'time', 'format', 'referee', and the 'url'

    When the page reports that round-by-round stats are unavailable, the
    statistics are replaced by the placeholder Series from empty_df(), which
    requires the module-level `template` to be defined already.

    Raises requests.HTTPError for an HTTP error status and a requests timeout
    exception when the server does not answer within `timeout` seconds.
    '''
    # Without an explicit timeout requests can wait on a stalled server forever.
    page = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    attributes = fight_attributes(soup)

    # If relevant statistics are unavailable return the placeholder Series.
    if empty_page(soup):
        placeholder = pd.concat([empty_df(), attributes], axis = 0)
        placeholder['url'] = url
        return placeholder

    combined_df = get_combined(soup)

    # Split the '# of #' columns into hits / attempts and swap them in for
    # the original combined columns.
    countable_df = split_countable(combined_df[countables])
    remaining_df = combined_df.drop(countables, axis = 1)
    full_df = pd.concat([remaining_df, countable_df], axis = 1)

    single_df = df_single_row_format(full_df, attributes)
    single_df['url'] = url

    return single_df
    

In [18]:
# Try the scraper on the first fight link.
test = get_detailed_page_stats(events.link[0])

In [20]:
test

Fighter                                                        Kevin Lee
KD                                                                     0
Sig. str. %                                                          51%
Td %                                                                 66%
Sub. att                                                               0
Pass                                                                   2
Rev.                                                                   1
Sig. str. Hits                                                       41 
Sig. str. Attempts                                                    80
Total str. Hits                                                      61 
Total str. Attempts                                                  100
Td Hits                                                               2 
Td Attempts                                                            3
Head Hits                                          

In [21]:
events.link[1]

'http://ufcstats.com/fight-details/5cee1d8f1e43d6f5'

In [22]:
# Scrape a second fight; its index labels are used to build `template`.
test_entry = get_detailed_page_stats(events.link[1])

In [23]:
# Drop the last five labels: the four fight attributes ('rounds', 'time',
# 'format', 'referee') and 'url', which get_detailed_page_stats appends after
# the per-fighter statistics. empty_df() reads this module-level name.
template = test_entry.index[:-5]

In [24]:
template

Index(['Fighter', 'KD', 'Sig. str. %', 'Td %', 'Sub. att', 'Pass', 'Rev.',
       'Sig. str. Hits', 'Sig. str. Attempts', 'Total str. Hits',
       'Total str. Attempts', 'Td Hits', 'Td Attempts', 'Head Hits',
       'Head Attempts', 'Body Hits', 'Body Attempts', 'Leg Hits',
       'Leg Attempts', 'Distance Hits', 'Distance Attempts', 'Clinch Hits',
       'Clinch Attempts', 'Ground Hits', 'Ground Attempts', 'Fighter', 'KD',
       'Sig. str. %', 'Td %', 'Sub. att', 'Pass', 'Rev.', 'Sig. str. Hits',
       'Sig. str. Attempts', 'Total str. Hits', 'Total str. Attempts',
       'Td Hits', 'Td Attempts', 'Head Hits', 'Head Attempts', 'Body Hits',
       'Body Attempts', 'Leg Hits', 'Leg Attempts', 'Distance Hits',
       'Distance Attempts', 'Clinch Hits', 'Clinch Attempts', 'Ground Hits',
       'Ground Attempts'],
      dtype='object')

# Data Pipeline:

Piecemeal Scrape:

In [25]:
# Ten evenly spaced positions from 0 to the number of links, rounded up to
# whole numbers.
split = np.ceil(np.linspace(0, len(events.link), 10)).astype(int)
split

array([   0,  619, 1237, 1855, 2473, 3092, 3710, 4328, 4946, 5564])

In [26]:
len(events)

5564

In [27]:
fight_details = []

for index, item in enumerate(events.link[0:10]):
    print(index)

    try:
        fight_details.append(get_detailed_page_stats(item))
    except Exception as exc:
        # Store the position (an int instead of a Series) as a placeholder so
        # the failed entries can be located and re-scraped afterwards, and
        # report why the scrape failed. Catching Exception (not a bare
        # except) lets Ctrl-C still interrupt the loop.
        print("error: {} ({!r})".format(item, exc))
        fight_details.append(index)

    # Random pause between requests.
    time.sleep(np.random.uniform(3, 10))

0
1
2
3
4
5
6
7
8
9


# Debug:

In [31]:
# NOTE: this binds a second name to the same list object (no copy), so item
# assignments made through `batch` are also visible through `fight_details`.
batch = fight_details


In [440]:
error_index = []
for index, item in enumerate(batch):
    try:
        len(item)
    except TypeError:
        # Failed scrapes were stored as plain ints, which have no len().
        error_index.append(index)
        print(item)

In [425]:
# Offset the positions within the batch by the batch's starting position.
# NOTE(review): split[5] presumably is the start of the batch that was scraped
# when this cell ran - confirm it matches the slice used in the scrape loop.
start_index = split[5]
event_errors = start_index + np.asarray(error_index)
event_errors



array([5317, 5318, 5354, 5382, 5383, 5391, 5399, 5400, 5440, 5441, 5449,
       5450, 5451, 5458, 5459, 5467, 5484, 5492, 5493, 5503, 5504, 5513,
       5514, 5523, 5533, 5534])

In [429]:
# Re-scrape every fight whose first attempt failed, pausing between requests.
fix_errors = []

for item in event_errors:
    retried = get_detailed_page_stats(events.link[item])
    fix_errors.append(retried)
    time.sleep(5)

fix_errors

[Fighter                                                            99999
 KD                                                                 99999
 Sig. str. %                                                        99999
 Td %                                                               99999
 Sub. att                                                           99999
 Pass                                                               99999
 Rev.                                                               99999
 Sig. str. Hits                                                     99999
 Sig. str. Attempts                                                 99999
 Total str. Hits                                                    99999
 Total str. Attempts                                                99999
 Td Hits                                                            99999
 Td Attempts                                                        99999
 Head Hits                            

In [439]:
# Put each re-scraped result back at the position of its failed placeholder.
for position, replacement in zip(error_index, fix_errors):
    batch[position] = replacement

# To File:

In [32]:
# Turn every scraped Series into a one-row frame and stack them with a single
# concat; growing a DataFrame with concat inside the loop copies all earlier
# rows on every iteration.
frames = [item.to_frame().T for item in batch]
batch_to_file = pd.concat(frames, axis = 0) if frames else pd.DataFrame()


In [33]:
batch_to_file

Unnamed: 0,Fighter,KD,Sig. str. %,Td %,Sub. att,Pass,Rev.,Sig. str. Hits,Sig. str. Attempts,Total str. Hits,...,Distance Attempts,Clinch Hits,Clinch Attempts,Ground Hits,Ground Attempts,rounds,time,format,referee,url
0,Kevin Lee,0,51%,66%,0,2,1,41,80,61,...,56,2,2,6,7,3,00:00:28,5 Rnd (5-5-5-5-5),Mike Beltran,http://ufcstats.com/fight-details/e0b323dae5bf...
0,Demian Maia,0,57%,100%,0,1,0,4,7,4,...,7,0,0,8,9,1,00:02:34,3 Rnd (5-5-5),Osiris Maia,http://ufcstats.com/fight-details/5cee1d8f1e43...
0,Renato Moicano,0,50%,100%,1,1,0,1,2,4,...,5,0,0,0,0,1,00:00:44,3 Rnd (5-5-5),Eduardo Herdy,http://ufcstats.com/fight-details/c26a3f4c0833...
0,Johnny Walker,0,74%,0%,0,2,1,37,50,91,...,18,2,3,33,37,3,00:05:00,3 Rnd (5-5-5),Mike Beltran,http://ufcstats.com/fight-details/5bba49d88db7...
0,Francisco Trinaldo,0,43%,0%,0,0,0,55,126,55,...,121,2,2,0,0,3,00:05:00,3 Rnd (5-5-5),Osiris Maia,http://ufcstats.com/fight-details/dc45c8d70e25...
0,Jussier Formiga,0,46%,11%,1,3,1,15,32,61,...,61,4,9,9,12,3,00:05:00,3 Rnd (5-5-5),Eduardo Herdy,http://ufcstats.com/fight-details/b3c74554c871...
0,Amanda Ribas,0,53%,33%,2,4,1,85,159,173,...,74,3,3,0,0,3,00:05:00,3 Rnd (5-5-5),Camila Albuquerque,http://ufcstats.com/fight-details/ea303a8e6e31...
0,Elizeu Zaleski dos Santos,0,39%,50%,0,0,0,62,156,74,...,101,8,10,1,1,3,00:05:00,3 Rnd (5-5-5),Osiris Maia,http://ufcstats.com/fight-details/9fc6aba53508...
0,Rani Yahya,0,53%,40%,0,3,1,24,45,58,...,36,1,1,46,49,3,00:05:00,3 Rnd (5-5-5),Julio Catarino,http://ufcstats.com/fight-details/e2503ed6a1f9...
0,Mayra Bueno Silva,0,60%,0%,0,0,0,88,146,92,...,203,10,18,18,26,3,00:05:00,3 Rnd (5-5-5),Camila Albuquerque,http://ufcstats.com/fight-details/2a1826612a7d...


In [447]:
# Replace the index with 0..n-1 in place, then write the batch to disk.
batch_to_file.reset_index(inplace = True, drop = True)
batch_path = '../Data/fight_details_batch6.csv'
batch_to_file.to_csv(batch_path)

# Debug & Combine:

In [8]:
#Load fight_details:
# NOTE(review): the batch file is written to '../Data/' elsewhere in this
# notebook but read from '../Data/batch/' here - confirm the files were moved.
data = [
    pd.read_csv('../Data/batch/fight_details_batch{}.csv'.format(i), index_col = 0)
    for i in np.arange(1, 7)
]

# Stack all batches with a single concat (instead of growing a DataFrame in a
# loop) and give the result a fresh 0..n-1 index.
df = pd.concat(data, ignore_index = True)

In [9]:
df.url

0       http://ufcstats.com/fight-details/e0b323dae5bf...
1       http://ufcstats.com/fight-details/e0b323dae5bf...
2       http://ufcstats.com/fight-details/e0b323dae5bf...
3       http://ufcstats.com/fight-details/e0b323dae5bf...
4       http://ufcstats.com/fight-details/e0b323dae5bf...
                              ...                        
5559    http://ufcstats.com/fight-details/ac7ca2ec38b9...
5560    http://ufcstats.com/fight-details/46acd54cc0c9...
5561    http://ufcstats.com/fight-details/cecdc0da5842...
5562    http://ufcstats.com/fight-details/2d2bbc86e941...
5563    http://ufcstats.com/fight-details/567a09fd200c...
Name: url, Length: 5564, dtype: object

In [8]:
# Rows whose 'Fighter' value is the placeholder string '99999'.
is_placeholder = df['Fighter'] == '99999'
error_99 = df.loc[is_placeholder, 'url']
error_99_index = error_99.index
print(len(error_99_index))

26


In [15]:
for link, index in zip(error_99, error_99_index):

    # Without an explicit timeout requests can wait on a stalled server forever.
    soup = BeautifulSoup(requests.get(link, timeout=30).content, 'html.parser')
    if empty_page(soup):
        print('empty')
    else:
        print('not_empty')
        # NOTE: get_detailed_page_stats downloads the page a second time.
        new_df = get_detailed_page_stats(link)
        # Suffix the second occurrence of each repeated label with '.1'
        # before the values are aligned to df's columns.
        new_df.index = new_df.index.where(~new_df.index.duplicated(), new_df.index + '.1')
        df.loc[index, : ] = new_df

not_empty
not_empty
not_empty
not_empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
empty
not_empty
empty
empty


In [17]:
# Write the combined fight details, including the rows repaired above, to disk.
df.to_csv('../Data/combined_fight_details.csv')