In [2]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
from html import unescape
from datetime import datetime as dt
import time
import lxml


# Initial Data Load & Preprocessing

In [3]:
processed_df = pd.read_csv('preprocessed_data.csv') #DF of matches, each observation is a match between two fighters
fighter_stats = pd.read_csv('data.csv') #fighter's cumulative statistics database

In [4]:
processed_cols = list(processed_df.columns) #all columns in preprocessed data
fighter_cols = list(fighter_stats.columns) #all columns in the fighter stats


In [5]:
#returns a sublist of strings that begin with letter from a list of strings
def select_cols_start(letter, list_str):
    pattern = re.compile(r'{}.*'.format(letter))
    col_matches = list(filter(None, [re.match(pattern, item) for item in list_str]))
    cols = [item.group(0) for item in col_matches]
    
    return cols

In [6]:
r_match_cols = select_cols_start('R_', processed_cols)
b_match_cols = select_cols_start('B_', processed_cols)

r_fighter_cols = select_cols_start('R_', fighter_cols)
b_fighter_cols = select_cols_start('B_', fighter_cols)

# Web Scrape

### Setup URL Structure

In [7]:
url = 'http://ufcstats.com/statistics/events/completed'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [9]:
#List of more urls to scrape:
detail_urls = [] #this grabs urls for each day recorded
for url in soup.find_all(class_='b-link b-link_style_black'):
    time.sleep(1)
    detail_urls.append(url['href'])

### Individual Fight Day Scrape

In [19]:
data_types = {
    'R_STR': int, 
    'B_STR': int,
    'R_TD': int, 
    'B_TD': int, 
    'R_SUB': int, 
    'R_SUB': int, 
    'R_PASS': int, 
    'B_PASS': int,
    'ROUND': int,
}

In [20]:
def remove_space_lines(text):
    pattern1 = re.compile(r'[\s\s+]')
    return re.sub(pattern1, ' ', text)

In [21]:
def get_page_stats(url):
    page = requests.get(url)
    site = BeautifulSoup(page.text, 'html.parser')
    
    stat_table = site.findAll('table')[0].contents #Contents of the main table in html
    
    table_data = stat_table[3] #first 2 indices are empty strings, table_data is html starting from first table row
    detail_data = table_data.find_all('p') #within table rows, there are <p> labels for table text
    
    contents = []
    
    for item in detail_data:
        contents.append(item.text) #contents is list of all text from each element of table
    
    #Clean up elements
    contents = list(map(lambda x: remove_space_lines(x), contents))
    contents = list(map(lambda x: x.strip(), contents)) 
    
    #each row of data is 16 elements, reformats: 1 observation per row
    formatted_contents = np.array(contents).reshape((-1, 16))
    formatted_contents = pd.DataFrame(formatted_contents)
    
    #the first row is a list of 'wins'
    formatted_contents.drop(0, axis = 1, inplace = True)
    
    #rename columns
    formatted_contents.columns = ['R_fighter', 'B_fighter', 'R_STR', 'B_STR', 
                               'R_TD', 'B_TD', 'R_SUB', 'R_SUB', 'R_PASS', 'B_PASS',
                              'WEIGHT_CLASS', 'METHOD', 'DETAIL', 'ROUND', 'TIME']
    
    #convert columns to appropriate data types
    formatted_contents = formatted_contents.astype(data_types)
    formatted_contents['TIME'] = formatted_contents['TIME'].apply(lambda x: dt.strptime(x, '%H:%M').time())
    
    return formatted_contents

In [22]:
page1 = get_page_stats(detail_urls[1])

In [23]:
page1

Unnamed: 0,R_fighter,B_fighter,R_STR,B_STR,R_TD,B_TD,R_SUB,R_SUB.1,R_PASS,B_PASS,WEIGHT_CLASS,METHOD,DETAIL,ROUND,TIME
0,Dan Hooker,Paul Felder,122,110,1,0,0,0,1,0,Lightweight,S-DEC,,5,05:00:00
1,Jimmy Crute,Michal Oleksiejczuk,2,3,8,0,1,0,4,0,Light Heavyweight,SUB,Kimura,1,03:29:00
2,Xiaonan Yan,Karolina Kowalkiewicz,93,38,5,0,0,1,3,0,Women's Strawweight,U-DEC,,3,05:00:00
3,Marcos Rogerio de Lima,Ben Sosoli,19,5,0,0,0,0,0,0,Heavyweight,KO/TKO,Punch,1,01:28:00
4,Brad Riddell,Magomed Mustafaev,38,28,1,8,0,0,2,0,Lightweight,S-DEC,,3,05:00:00
5,Zubaira Tukhugov,Kevin Aguilar,28,9,0,0,0,0,0,0,Featherweight,KO/TKO,Punches,1,03:21:00
6,Jalin Turner,Josh Culibao,55,12,0,0,3,0,4,0,Lightweight,KO/TKO,Punches,2,03:01:00
7,Jake Matthews,Emil Meek,33,50,4,0,0,0,3,0,Welterweight,U-DEC,,3,05:00:00
8,Kenan Song,Callan Potter,17,7,0,1,0,0,0,1,Welterweight,KO/TKO,Punch,1,02:20:00
9,Kai Kara-France,Tyson Nam,78,50,0,0,0,0,0,0,Flyweight,U-DEC,,3,05:00:00
