In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from bs4 import BeautifulSoup
import requests
import re
from datetime import datetime


In [21]:
# CREATING FUNCTION to get each fight urls from a fighter's URL
def get_fighter_fight_urls(fighter_url):
    page = requests.get(fighter_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    fight_urls = []
    
    fight_rows = soup.find_all('tr', class_='b-fight-details__table-row')
    for fight_row in fight_rows:
        onclick_value = fight_row.get('onclick')
        if onclick_value:
            match = re.search(r"'(http://www.ufcstats.com/fight-details/.+?)'", onclick_value)
            if match:
                fight_link = match.group(1)
                fight_urls.append(fight_link)
    
    return fight_urls


In [22]:
#Scrape fighter statistics
import datetime
def get_numeric_value(value_str):
    try:
        return float(value_str)
    except ValueError:
        return None

def convert_height(height_str):
    height_str = height_str.replace('"', '')
    feet, inches = map(int, height_str.split("'"))
    total_inches = feet * 12 + inches
    return total_inches

def convert_weight(weight_str):
    return float(weight_str.replace('lbs.', '').strip())

def convert_reach(reach_str):
    if reach_str == '--':
        return None
    return float(reach_str.replace('"', '').strip())

def convert_percentage(percentage_str):
    try:
        numeric_value = float(percentage_str.replace('%', '')) / 100
        return numeric_value
    except ValueError:
        return None

def fighter_stats(fighter_url):
    fighter_page = requests.get(fighter_url)
    fighter_soup = BeautifulSoup(fighter_page.content, 'html.parser')

    fighter_data = {}
    fighter_physical_stats = fighter_soup.find('div', class_='b-list__info-box')
    if fighter_physical_stats:
        physical_details = fighter_physical_stats.find_all('li', class_='b-list__box-list-item')
        for detail in physical_details:
            label = detail.find('i', class_='b-list__box-item-title').get_text(strip=True)
            value = detail.get_text(strip=True).replace(label, '').strip()

            if label == 'Height:':
                fighter_data[label] = convert_height(value)
            elif label == 'Weight:':
                fighter_data[label] = convert_weight(value)
            elif label == 'Reach:':
                fighter_data[label] = convert_reach(value)
            else:
                fighter_data[label] = value

    fighter_career_stats = fighter_soup.find('div', class_='b-list__info-box-left')
    if fighter_career_stats:
        career_stats = fighter_career_stats.find_all('li', class_='b-list__box-list-item')
        for stat in career_stats:
            label_element = stat.find('i', class_='b-list__box-item-title')
            value = stat.get_text(strip=True).replace(label_element.get_text(strip=True), '').strip()
            label = label_element.get_text(strip=True).rstrip(':')

            if label in ['Str. Acc.', 'Str. Def', 'TD Acc.', 'TD Def.']:
                new_label = f"{label} (%)"
                converted_value = convert_percentage(value)
                fighter_data[new_label] = converted_value
            else:
                fighter_data[label] = value

    cols_to_strip_percent = ['Str. Acc. (%)', 'Str. Def (%)', 'TD Acc. (%)', 'TD Def. (%)']
    fighter_df = pd.DataFrame([fighter_data])
    fighter_df[cols_to_strip_percent] = fighter_df[cols_to_strip_percent].replace('%', '', regex=True)
    
    
    fighter_df['DOB:'] = pd.to_datetime(fighter_df['DOB:'], format='%b %d, %Y')
    today = datetime.datetime.today()
    fighter_df['Age'] = (today - fighter_df['DOB:']).apply(lambda x: x.days // 365)
    fighter_df.drop(columns=['DOB:'], inplace=True)

    fighter_df.drop('', axis=1, inplace=True)
    return fighter_df

In [23]:
def get_opponent_stats(fighter_url):
    opponent_urls = []

    fighter_page = requests.get(fighter_url)
    fighter_soup = BeautifulSoup(fighter_page.content, 'html.parser')

    recent_fights_table = fighter_soup.find('table')
    recent_fights_rows = recent_fights_table.find_all('tr')

    for row in recent_fights_rows:
        fighter_column = row.find('td', class_='b-fight-details__table-col l-page_align_left')
        
        if fighter_column:
            fighter_links = fighter_column.find_all('a', class_='b-link b-link_style_black')
            if len(fighter_links) >= 2:
                opponent_url = fighter_links[1]['href']
                opponent_urls.append(opponent_url)
    
    opponent_stats_dfs = []
    for opponent_url in opponent_urls:
        opponent_stats_df = fighter_stats(opponent_url)
        opponent_stats_dfs.append(opponent_stats_df)
    
    combined_opponent_stats = pd.concat(opponent_stats_dfs, ignore_index=True)
    combined_opponent_stats.columns = ["Opponent " + col for col in combined_opponent_stats.columns]
    return combined_opponent_stats



In [24]:

fighter_url = 'http://www.ufcstats.com/fighter-details/029eaff01e6bb8f0'
x = get_opponent_stats(fighter_url)


In [25]:
def scrape_fight_data(fight_url):   
    fight_page = requests.get(fight_url)
    fight_soup = BeautifulSoup(fight_page.content, 'html.parser')

    data = []
    fight_table = fight_soup.find('tbody', class_='b-fight-details__table-body')

    fighter_divs = fight_soup.find_all('div', class_='b-fight-details__person')
    outcomes = [outcome_tag.get_text(strip=True) if (outcome_tag := fighter_div.find('i', class_='b-fight-details__person-status')) else None for fighter_div in fighter_divs]

    for row in fight_table.find_all('tr', class_='b-fight-details__table-row'):
        fighter_names = row.find_all('a', class_='b-link_style_black')
        fighters = [name.get_text(strip=True) for name in fighter_names]

        kd = [col.get_text(strip=True) for col in row.find_all('td')[1].find_all('p')]
        sig_str = [col.get_text(strip=True) for col in row.find_all('td')[2].find_all('p')]
        sig_str_percent = [col.get_text(strip=True) for col in row.find_all('td')[3].find_all('p')]
        total_str = [col.get_text(strip=True) for col in row.find_all('td')[4].find_all('p')]
        td = [col.get_text(strip=True) for col in row.find_all('td')[5].find_all('p')]
        td_percent = [col.get_text(strip=True) for col in row.find_all('td')[6].find_all('p')]
        sub_att = [col.get_text(strip=True) for col in row.find_all('td')[7].find_all('p')]
        rev = [col.get_text(strip=True) for col in row.find_all('td')[8].find_all('p')]
        ctrl = [col.get_text(strip=True) for col in row.find_all('td')[9].find_all('p')]

        for i in range(len(fighters)):
            fight_info = {
                'Fighter': fighters[i],
                'KD': kd[i],
                'Sig. str.': sig_str[i],
                'Sig. str. %': sig_str_percent[i],
                'Total str.': total_str[i],
                'Td': td[i],
                'Td %': td_percent[i],
                'Sub. att': sub_att[i],
                'Rev.': rev[i],
                'Ctrl': ctrl[i],
                'Fight URL': fight_url,
                'Outcome': outcomes[i] 
            }
            data.append(fight_info)

    df = pd.DataFrame(data)

    new_data = []
    new_columns = [
        'Fighter', 'Opponent', 'KD', 'Opponent KD', 'Sig. str.', 'Opponent Sig. str.',
        'Sig. str. %', 'Opponent Sig. str. %', 'Total str.', 'Opponent Total str.',
        'Td', 'Opponent Td', 'Td %', 'Opponent Td %', 'Sub. att', 'Opponent Sub. att',
        'Rev.', 'Opponent Rev.', 'Ctrl', 'Opponent Ctrl', 'Outcome'
    ]

    for i in range(0, len(df), 2):
        fighter_row = df.iloc[i]
        opponent_row = df.iloc[i + 1]

        new_row = [
            fighter_row['Fighter'], opponent_row['Fighter'],
            fighter_row['KD'], opponent_row['KD'],
            fighter_row['Sig. str.'], opponent_row['Sig. str.'],
            fighter_row['Sig. str. %'], opponent_row['Sig. str. %'],
            fighter_row['Total str.'], opponent_row['Total str.'],
            fighter_row['Td'], opponent_row['Td'],
            fighter_row['Td %'], opponent_row['Td %'],
            fighter_row['Sub. att'], opponent_row['Sub. att'],
            fighter_row['Rev.'], opponent_row['Rev.'],
            fighter_row['Ctrl'], opponent_row['Ctrl'],
            fighter_row['Outcome']
        ]

        new_data.append(new_row)

    fight_df = pd.DataFrame(new_data, columns=new_columns)

    fight_df.replace('---', None, inplace=True)


    try:
        fight_df['Sig. str. %'] = fight_df['Sig. str. %'].apply(convert_percentage)
        fight_df['Opponent Sig. str. %'] = fight_df['Opponent Sig. str. %'].apply(convert_percentage)
        fight_df['Td %'] = fight_df['Td %'].apply(convert_percentage)
        fight_df['Opponent Td %'] = fight_df['Opponent Td %'].apply(convert_percentage)
    except AttributeError:
        pass  


    fight_df[['Sig. Landed', 'Sig. Attempted']] = fight_df['Sig. str.'].str.split(' of ', expand=True)
    fight_df['Sig. Landed'] = fight_df['Sig. Landed'].astype(int)
    fight_df['Sig. Attempted'] = fight_df['Sig. Attempted'].astype(int)

    fight_df[["Opponent Sig. Landed", "Opponent Sig. Attempted"]] = fight_df["Opponent Sig. str."].str.split(' of ', expand=True)
    fight_df["Opponent Sig. Landed"] = fight_df["Opponent Sig. Landed"].astype(int)
    fight_df["Opponent Sig. Attempted"] = fight_df["Opponent Sig. Attempted"].astype(int)

    fight_df[['Total Landed', 'Total Attempted']] = fight_df['Total str.'].str.split(' of ', expand=True)
    fight_df['Total Landed'] = fight_df['Total Landed'].astype(int)
    fight_df['Total Attempted'] = fight_df['Total Attempted'].astype(int)

    fight_df[['Opponent Total Landed', 'Opponent Total Attempted']] = fight_df['Opponent Total str.'].str.split(' of ', expand=True)
    fight_df['Opponent Total Landed'] = fight_df['Opponent Total Landed'].astype(int)
    fight_df['Opponent Total Attempted'] = fight_df['Opponent Total Attempted'].astype(int)

    fight_df[['Td Landed', 'Td Attempted']] = fight_df['Td'].str.split(' of ', expand=True)
    fight_df['Td Landed'] = fight_df['Td Landed'].astype(int)
    fight_df['Td Attempted'] = fight_df['Td Attempted'].astype(int)

    fight_df[['Opponent Td Landed', 'Opponent Td Attempted']] = fight_df['Opponent Td'].str.split(' of ', expand=True)
    fight_df['Opponent Td Landed'] = fight_df['Opponent Td Landed'].astype(int)
    fight_df['Opponent Td Attempted'] = fight_df['Opponent Td Attempted'].astype(int)

    fight_df.drop(['Sig. str.', "Opponent Sig. str.", 'Total str.', 'Opponent Total str.', 'Td', 'Opponent Td'], axis=1, inplace=True)

    return fight_df

In [26]:
#CREATE FUNCTION TO SCRAPE PAST FIGHT RESULST FROM A FIGHTER'S URL PAGE
def past_fights(fighter_url):
    fighter_page = requests.get(fighter_url)
    fighter_soup = BeautifulSoup(fighter_page.content, 'html.parser')

    # Get searched fighter's name
    fighter_name = fighter_soup.find('span', class_='b-content__title-highlight').get_text(strip=True)
    
    #Create DF from all previous fights from searched fighter
    fight_urls = get_fighter_fight_urls(fighter_url)
    all_fight_dfs = []
    for fight_url in fight_urls:
        fight_df = scrape_fight_data(fight_url)
        all_fight_dfs.append(fight_df)

    combined_df = pd.concat(all_fight_dfs, ignore_index=True)

    #Add the fight information to DF
    fight_table = fighter_soup.find('tbody', class_='b-fight-details__table-body')

    each_fight_details = fight_table.find_all('tr')

    for idx, fight in enumerate(each_fight_details[1:]):
        event_info_elements = fight.find_all('td', class_='b-fight-details__table-col l-page_align_left')
        round_time_elements = fight.find_all('td', class_='b-fight-details__table-col')

        event_name = event_info_elements[1].find_all('p')[0].text.strip()
        event_date = event_info_elements[1].find_all('p')[1].text.strip()
        method_of_victory = event_info_elements[2].find('p').text.strip()
        rounds = round_time_elements[-2].find('p').text.strip()
        time = round_time_elements[-1].find('p').text.strip()

        combined_df.at[idx, 'Event Name'] = event_name
        combined_df.at[idx, 'Event Date'] = event_date
        combined_df.at[idx, 'Method of Victory'] = method_of_victory
        combined_df.at[idx, 'Rounds'] = rounds
        combined_df.at[idx, 'Time'] = time

    # Flip the data from Opponent to Fighter if the searched fighter is Opponent
    for index, row in combined_df.iterrows():
        if row['Opponent'] == fighter_name:
            combined_df.at[index, 'Fighter'] = row['Opponent']
            combined_df.at[index, 'Opponent'] = row['Fighter']

            combined_df.at[index, 'KD'] = row['Opponent KD']
            combined_df.at[index, 'Opponent KD'] = row['KD']

            combined_df.at[index, 'Sig. str. %'] = row['Opponent Sig. str. %']
            combined_df.at[index, 'Opponent Sig. str. %'] = row['Sig. str. %']

            combined_df.at[index, 'Td %'] = row['Opponent Td %']
            combined_df.at[index, 'Opponent Td %'] = row['Td %']

            combined_df.at[index, 'Sub. att'] = row['Opponent Sub. att']
            combined_df.at[index, 'Opponent Sub. att'] = row['Sub. att']

            combined_df.at[index, 'Rev.'] = row['Opponent Rev.']
            combined_df.at[index, 'Opponent Rev.'] = row['Rev.']

            combined_df.at[index, 'Ctrl'] = row['Opponent Ctrl']
            combined_df.at[index, 'Opponent Ctrl'] = row['Ctrl']

            if row['Outcome'] == 'W':
                combined_df.at[index, 'Outcome'] = 'L'
            elif row['Outcome'] == 'L':
                combined_df.at[index, 'Outcome'] = 'W'

            combined_df.at[index, 'Sig. Landed'] = row['Opponent Sig. Landed']
            combined_df.at[index, 'Opponent Sig. Landed'] = row['Sig. Landed']     

            combined_df.at[index, 'Sig. Attempted'] = row['Opponent Sig. Attempted']
            combined_df.at[index, 'Opponent Sig. Attempted'] = row['Sig. Attempted']  

            combined_df.at[index, 'Total Landed'] = row['Opponent Total Landed']
            combined_df.at[index, 'Opponent Total Landed'] = row['Total Landed']   

            combined_df.at[index, 'Total Attempted'] = row['Opponent Total Attempted']
            combined_df.at[index, 'Opponent Total Attempted'] = row['Total Attempted']     

            combined_df.at[index, 'Td Landed'] = row['Opponent Td Landed']
            combined_df.at[index, 'Opponent Td Landed'] = row['Td Landed']

            combined_df.at[index, 'Td Attempted'] = row['Opponent Td Attempted']
            combined_df.at[index, 'Opponent Td Attempted'] = row['Td Attempted']           
    
    #ADD OHTER INTERESTING DATA TO BE USED IN ANALYSIS

    #Add current streak
    current_streak = []
    streak_type = []

    win_streak = 0
    loss_streak = 0

    for index, row in combined_df.iterrows():
        outcome = row['Outcome']

        if outcome == 'W':
            win_streak += 1
            loss_streak = 0
        elif outcome == 'L':
            loss_streak += 1
            win_streak = 0
        else:
            win_streak = 0
            loss_streak = 0

        if win_streak > 0:
            current_streak.append(win_streak)
            streak_type.append('Win')
        elif loss_streak > 0:
            current_streak.append(loss_streak)
            streak_type.append('Loss')
        else:
            current_streak.append(0)
            streak_type.append('None')


    combined_df['Current Streak'] = current_streak
    combined_df['Streak Type'] = streak_type

    # Add how long between each fight for analysis
    
    combined_df['Event Date'] = pd.to_datetime(combined_df['Event Date'])

    combined_df = combined_df.sort_values(by='Event Date', ascending=True)

    combined_df['Days since last fight'] = (combined_df['Event Date'] - combined_df['Event Date'].shift(1)).dt.days

    combined_df['Days since last fight'].fillna(0, inplace=True)


    return combined_df



In [27]:
#CREATE A FUNCTION TO CREATE A DF FROM FIGHTER_URL
def full_past_fights(fighter_url):
    past_fights_df = past_fights(fighter_url)  

    opponent_stats_df = get_opponent_stats(fighter_url)

    combined_df = pd.concat([past_fights_df, opponent_stats_df], axis=1)

    return combined_df


In [28]:

fighter_url = 'http://www.ufcstats.com/fighter-details/029eaff01e6bb8f0'
fighter = full_past_fights(fighter_url)


In [29]:
fighter

Unnamed: 0,Fighter,Opponent,KD,Opponent KD,Sig. str. %,Opponent Sig. str. %,Td %,Opponent Td %,Sub. att,Opponent Sub. att,...,Opponent STANCE:,Opponent SLpM,Opponent Str. Acc. (%),Opponent SApM,Opponent Str. Def (%),Opponent TD Avg.,Opponent TD Acc. (%),Opponent TD Def. (%),Opponent Sub. Avg.,Opponent Age
30,Dustin Poirier,Danny Castillo,0,0,0.44,0.74,,0.27,5,0,...,Orthodox,2.57,0.42,1.96,0.62,2.95,0.38,0.63,0.2,44
29,Dustin Poirier,Zachary Micklewright,0,0,0.59,0.33,,,0,0,...,Southpaw,5.56,0.53,4.72,0.54,0.7,0.12,1.0,0.0,47
28,Dustin Poirier,Josh Grispi,0,0,0.52,0.37,,22%,0,2,...,Orthodox,1.44,0.41,3.71,0.5,1.97,0.36,0.16,2.2,34
27,Dustin Poirier,Jason Young,0,0,0.52,0.35,0.55,,0,0,...,Orthodox,2.97,0.37,3.2,0.5,1.98,0.66,0.55,0.0,37
26,Dustin Poirier,Pablo Garza,0,0,0.36,0.41,1.0,0.0,1,1,...,Orthodox,3.37,0.44,2.7,0.5,0.94,0.25,0.15,1.3,39
25,Dustin Poirier,Max Holloway,0,0,0.44,0.34,0.33,,3,0,...,Orthodox,7.17,0.47,4.75,0.59,0.27,0.53,0.84,0.3,31
24,Dustin Poirier,Chan Sung Jung,0,0,0.44,0.48,0.0,1.0,0,3,...,Orthodox,3.93,0.41,4.52,0.52,0.7,0.44,0.72,0.6,36
23,Dustin Poirier,Jonathan Brookins,0,0,0.47,0.48,0.0,0.0,1,0,...,Southpaw,2.69,0.39,3.63,0.56,1.94,0.17,0.8,0.3,38
22,Dustin Poirier,Cub Swanson,0,0,0.41,0.46,0.22,1.0,0,0,...,Orthodox,4.7,0.5,3.84,0.59,1.09,0.52,0.61,0.4,39
21,Dustin Poirier,Erik Koch,2,0,0.58,0.43,33%,,2,5,...,Southpaw,2.33,0.42,2.82,0.47,1.02,0.42,0.81,1.0,34


In [None]:
# Function to convert '0:00' format to seconds
def convert_ctrl_to_seconds(ctrl_time):
    minutes, seconds = map(int, ctrl_time.split(':'))
    return minutes * 60 + seconds

In [None]:
def preprocessing_data(fighter_url):
    fighter_df = full_past_fights(fighter_url)

    fighter_df.drop(['Fighter', 'Opponent', 'Event Name', 'Event Date', 'Time'], axis=1, inplace=True)

    fighter_encoded = pd.get_dummies(fighter_df, columns=['Method of Victory', "Opponent STANCE:", "Streak Type"])

    outcome_dummies = pd.get_dummies(fighter_encoded['Outcome'], prefix='Outcome', dummy_na=True)
    fighter_encoded = pd.concat([fighter_encoded, outcome_dummies], axis=1)
    fighter_encoded.drop(['Outcome'], axis=1, inplace=True)
    
    fighter_encoded['Ctrl (in seconds)'] = fighter_encoded['Ctrl'].apply(convert_ctrl_to_seconds)
    fighter_encoded['Opponent Ctrl (in seconds)'] = fighter_encoded['Opponent Ctrl'].apply(convert_ctrl_to_seconds)
    fighter_encoded.drop(['Ctrl', 'Opponent Ctrl'], axis=1, inplace=True)
    
    return fighter_encoded