In [136]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
from html import unescape
from datetime import datetime as dt
import time
import lxml
import itertools

# Event-Level Statistics:

In [137]:
#New Fight Details:
url = 'http://www.ufcstats.com/event-details/898337ef520fe4d3'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [138]:
#Set up datatype dictionary: 

data_types = {
    'R_STR': int, 
    'B_STR': int,
    'R_TD': int, 
    'B_TD': int, 
    'R_SUB': int, 
    'R_SUB': int, 
    'R_PASS': int, 
    'B_PASS': int,
    'ROUND': int,
}


def remove_space_lines(text):
    pattern1 = re.compile(r'[\s\s+]')
    return re.sub(pattern1, ' ', text)

#Determine if observation is a title-bout
def find_belt(img_tag):
    try:
        image_link = img_tag['src']
        if re.match(r'.*belt.*', image_link) != None:
            return True
    except:
        return False
    
    
def get_fight_auxiliary(soup):
    '''
    Input: beautifulsoup of an event url: (ie. http://www.ufcstats.com/event-details/53278852bcd91e11)
    Outputs: pandas Series
        date, location, attendance
    '''
    
    table = []
    
    auxiliary_table = soup.find_all('li', {'class': 'b-list__box-list-item'})
    for item in auxiliary_table:
        attribute = remove_space_lines(item.text).strip()

        #If attribute is missing, replace with ''
        try:
            attribute = re.findall(r'\s\s+(.*)', attribute)[0]
        except:
            attribute = '' 
        
        table.append(attribute)
        
    table_series = pd.Series(table)
    table_series.index = ['date', 'location', 'attendance']
    
    if table_series['attendance'] != '':
        table_series['attendance'] = re.sub(',', '', table_series['attendance'])
        table_series['attendance'] = int(table_series['attendance'])
    
    table_series['date'] = dt.strptime(table_series['date'], '%B %d, %Y').strftime('%d-%m-%Y')

    return table_series


def get_page_stats(url):
    
    #Given url of list of events, returns list of event details:
    #ie. return summary statistics for all fights on page like: http://www.ufcstats.com/event-details/53278852bcd91e11
    
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    stat_table = soup.findAll('table')[0].contents #Contents of the main table in html
    
    table_data = stat_table[3] #first 2 indices are empty strings, table_data is html starting from first table row
    detail_data = table_data.find_all('p') #within table rows, there are <p> labels for table text
    auxiliary_data = get_fight_auxiliary(soup) #Returns date, location, attendance of event
    
    image_data = table_data.find_all('img') #get image links to find belt for 
    
    contents = [] #table contents
    title_match_index = [] #used later to track which fights are title_bouts
    
    #Loop through elements of detail_data (html table) to scrape fight details:
    for index, item in enumerate(detail_data):
        
        #find image of belt == title_bout
        image = item.find('img')
        if find_belt(image):
            title_match_index.append(index) #grab index of fight in which belt appears
            
        #contents is list of all text from each element of table     
        contents.append(item.text)  

    
    #Clean up elements
    contents = list(map(lambda x: remove_space_lines(x), contents))
    contents = list(map(lambda x: x.strip(), contents)) 
    
    draw_index = []
    
    #When there's a draw or NC, additional tags are created --> remove the tag to reformat correctly   
    for i in np.arange(0, len(contents)-10, 16):

        if contents[i] != 'win':
            
            #Get the index of the match that was drawn & remove that element
            draw_index.append(np.floor_divide(i, 16)) 
            contents.pop(i)
                    
    #Extract links to more detailed fight statistics
    fight_links = table_data.find_all('a', {'class': 'b-flag b-flag_style_green'})
    fight_links = [item['href'] for item in fight_links]
    
    draw_links = table_data.find_all('a', {'class': 'b-flag b-flag_style_bordered'})
    draw_links = [item['href'] for item in draw_links]
    draw_links = list(dict.fromkeys(draw_links)) #Remove duplicate links from the drawn fights
    
    for index, link in zip(draw_index, draw_links):
        fight_links.insert(index, link)
    
    #each row of data is 16 elements: reformats 1 observation per row
    formatted_contents = np.array(contents).reshape((-1, 16))
    formatted_contents = pd.DataFrame(formatted_contents)
    
    #the first row is a list of 'wins'
    #formatted_contents.drop(0, axis = 1, inplace = True)
    
    #Run a floor_divide to put the image of the belt in the correct fight

    title_match = np.floor_divide(title_match_index, 16) 

    #Initialize title_bout column with all 0's
    titles = np.zeros(16)
    if len(title_match) != 0:
        titles[title_match] = 1
    
    title_series = pd.Series(titles)
    
    formatted_contents['title_bout'] = title_series
    
    #rename columns
    formatted_contents.columns = ['Winner', 'R_fighter', 'B_fighter', 'R_STR', 'B_STR', 
                               'R_TD', 'B_TD', 'R_SUB', 'R_SUB', 'R_PASS', 'B_PASS',
                              'WEIGHT_CLASS', 'METHOD', 'DETAIL', 'ROUND', 'TIME', 'title_bout']
    
    #convert columns to appropriate data types
    formatted_contents.replace('--', 99999, inplace = True)
    formatted_contents = formatted_contents.astype(data_types)
    formatted_contents['TIME'] = formatted_contents['TIME'].apply(lambda x: dt.strptime(x, '%M:%S').time())
    formatted_contents['link'] = fight_links
    
    return (formatted_contents, auxiliary_data)

In [139]:
event_level = get_page_stats(url)

event_level[0]['date'] = event_level[1].loc['date']
event_level[0]['location'] = event_level[1].loc['location']
event_level[0]['attendance'] = event_level[1].loc['attendance']

# Event-Detail Statistics:

In [140]:
def empty_page(soup):
    
    sections = soup.find('section', {'class': 'b-fight-details__section js-fight-section'})
    
    if sections.contents[0] == '\n    Round-by-round stats not currently available.\n': 
        return True
    return False


def empty_df():
    temp = pd.Series([99999]*len(template), index=template)
    return temp




def get_totals_table(soup):
    '''
    Input: beautifulsoup of detailed fight stats site (ie. http://www.ufcstats.com/fight-details/e0b323dae5bf4c90)
    Output: pandas DataFrame matching table labeled 'TOTALS'
    
    '''

    html_table = soup.find_all('table')[0]

    #Scrape Statistics: separated by double-space
    stats_table = pd.read_html(str(html_table))[0]
    table_columns = stats_table.columns
    total_statistics = [item.split('  ') for item in stats_table.loc[0][1:]]
    
    #Scrape Fighter names
    names_table = soup.find('td', {'class': 'b-fight-details__table-col l-page_align_left'})    
    names = [remove_space_lines(item.text).strip() for item in names_table.find_all('p')]
    
    #Append together, rearrange columns, and rename columns:
    total_statistics.append(names)

    total_statistics = pd.DataFrame(total_statistics).T
    total_statistics = total_statistics[[9,0,1,2,3,4,5,6,7,8]]
    total_statistics.columns = table_columns
    
    return total_statistics

def get_ss_table(soup):
    '''
    Input: beautifulsoup of detailed fight stats site (ie. http://www.ufcstats.com/fight-details/e0b323dae5bf4c90)
    Output: pandas DataFrame matching table labeled 'SIGNIFICANT STRIKES'
    
    '''
    html_table = soup.find_all('table')[2]

    
    #Scrape Statistics: separated by double-space
    stats_table = pd.read_html(str(html_table))[0]
    table_columns = stats_table.columns
    total_statistics = [item.split('  ') for item in stats_table.loc[0][1:]]
    
    #Append together, rearrange columns, and rename columns:
    #total_statistics.append(names)

    total_statistics = pd.DataFrame(total_statistics).T
    total_statistics.columns = table_columns[1:]
    
    return total_statistics
    
def get_combined(soup):
    name_stats = get_totals_table(soup)
    significant_stats = get_ss_table(soup)

    combined = pd.concat([name_stats, significant_stats], axis = 1)
    combined = combined.loc[:, ~combined.columns.duplicated()]
    combined.drop('Sig. str', axis = 1, inplace = True)

    return combined



def fight_attributes(soup):
        
    header_table = soup.find_all('i', {'class': 'b-fight-details__text-item'})[0:4]
    
    table = []
    for item in header_table:

        detail = remove_space_lines(item.text).strip()
        
        try:
            table.append(re.findall(r'\s\s+(.*)', detail)[0])
        except:
            table.append('')
     
    table_series = pd.Series(table, index=['rounds', 'time', 'format', 'referee'])
    #table_series = table_series.astype(data_types)
    table_series['rounds'] = int(table_series['rounds'])
    table_series['time'] = dt.strptime(table_series['time'], '%M:%S').time()
        

    return table_series




In [141]:
countables = ['Sig. str.', 'Total str.', 'Td', 'Head', 'Body', 'Leg', 'Distance', 'Clinch', 'Ground']
hit_att = ['Hits', 'Attempts']

countable_cols = list(itertools.product(countables, hit_att))
countable_cols = ['{} {}'.format(item[0], item[1]) for item in countable_cols]
#countable_cols


def split_countable(combined_df):
    
    split = combined_df.apply(lambda x: x.apply(lambda y: y.split('of')))
    
    split_stats = []
    for index, series in split.iterrows():
        split_stats.append(list(series.apply(pd.Series).stack()))
        
    split_df = pd.DataFrame(split_stats)
    split_df.columns = countable_cols
    
    return split_df
    
def df_single_row_format(combined_df, attr):
    
    df = pd.concat([combined_df.loc[0], combined_df.loc[1]], axis = 0)
    df['rounds'] = attr['rounds']
    df['time'] = attr['time']
    df['format'] = attr['format']
    df['referee'] = attr['referee']
    
    return df
    
#PUT IT ALL TOGETHER:
def get_detailed_page_stats(url):
    
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    #get Attributes:
    attributes = fight_attributes(soup)
    
    
    #If relevant statistics are unavailable return an 'unavailable_df'    
    if empty_page(soup) == True:
        empty = empty_df()   

        combined_df = pd.concat([empty, attributes], axis = 0)
        
        combined_df['url'] = url
        return combined_df
        
    
    else:
     
    #Get fight_df:

        combined_df = get_combined(soup)
        split_df = combined_df[countables]


        combined_df.drop(countables, axis = 1, inplace = True)

        countable_df = split_countable(split_df)

        combined_df = pd.concat([combined_df, countable_df], axis = 1)

        single_df = df_single_row_format(combined_df, attributes)

        single_df['url'] = url

        return single_df   
    

In [142]:
event_links = event_level[0]['link']
event_links

0     http://www.ufcstats.com/fight-details/d395828f...
1     http://www.ufcstats.com/fight-details/4e77cc2c...
2     http://www.ufcstats.com/fight-details/fcfaa28e...
3     http://www.ufcstats.com/fight-details/02f5621f...
4     http://www.ufcstats.com/fight-details/f0d4652f...
5     http://www.ufcstats.com/fight-details/b9abcce2...
6     http://www.ufcstats.com/fight-details/593f3b05...
7     http://www.ufcstats.com/fight-details/f2520dd5...
8     http://www.ufcstats.com/fight-details/3f810c34...
9     http://www.ufcstats.com/fight-details/08244974...
10    http://www.ufcstats.com/fight-details/b70dfff1...
Name: link, dtype: object

In [143]:
fight_details = []

for index, item in enumerate(event_links):
    
    try:
        print(index)
        fight_details.append(get_detailed_page_stats(item))
        
    except:
        print("error: {}".format(item))
        fight_details.append(index)
        
    time.sleep(1)

0
1
2
3
4
5
6
7
8
9
10


In [144]:
batch_to_file = pd.DataFrame()

for item in fight_details:
    temp = item.to_frame().T
    batch_to_file = pd.concat([batch_to_file, temp], axis = 0)

# Data Clean & Combine:

In [259]:
df = batch_to_file.copy()
df.reset_index(drop = True, inplace = True)
df.columns = df.columns.where(~df.columns.duplicated(), df.columns + '.1')

In [260]:
df['Sig. str. %'] = df['Sig. str. %'].apply(lambda x: re.sub('%', '', x))
df['Td %'] = df['Td %'].apply(lambda x: re.sub('%', '', x))

df['Sig. str. %.1'] = df['Sig. str. %.1'].apply(lambda x: re.sub('%', '', x))
df['Td %.1'] = df['Td %.1'].apply(lambda x: re.sub('%', '', x))

df['Sig. str. %'] = df['Sig. str. %'].astype(float)
df['Sig. str. %'] = df['Sig. str. %'] / 100

df['Td %'] = df['Td %'].astype(float)
df['Td %'] = df['Td %'] / 100

df['Sig. str. %.1'] = df['Sig. str. %.1'].astype(float)
df['Sig. str. %.1'] = df['Sig. str. %.1'] / 100

df['Td %.1'] = df['Td %.1'].astype(float)
df['Td %.1'] = df['Td %.1'] / 100


In [261]:
combined_df = pd.concat([event_level[0], df], axis = 1)

drop_columns = ['R_STR', 'B_STR', 'R_TD', 'B_TD',
       'R_SUB', 'R_SUB', 'R_PASS', 'B_PASS']
combined_df.drop(drop_columns, axis = 1, inplace = True)
combined_df.drop(['ROUND', 'TIME', 'link'], axis = 1, inplace = True)

In [262]:
winner = []
for index, row in combined_df.iterrows():
    
    if row['Winner'] != 'win':
        winner.append(99999)
    
    elif (row['R_fighter'] == row['Fighter']):
        winner.append(0)
    else:
        winner.append(1)
        
combined_df['label'] = winner

#Reformat match format:

round_times = combined_df['format'].apply(lambda x: re.search(r'\(.*\)', x))
round_times = round_times.apply(lambda x: x.group().strip('()').split('-') if x != None else [1])
round_times = round_times.apply(lambda x: np.array(x, dtype = 'float'))
round_times[0:5]

#Returns time in minutes given datetime object:
def to_minutes(time):
    
    try:
        set_time = dt.strptime(time, '%H:%M:%S')
    except:
        pass
    
    return time.minute + time.second / 60

#Calculate total match time
round_dot = []

for num_rounds, time, item in zip(combined_df['rounds'], combined_df['time'], round_times):
    
    minutes = to_minutes(time)
    
    round_indicator = np.zeros(len(item))
    round_indicator[0: num_rounds-1] = 1
    
    round_time = round(np.dot(round_indicator, item) + minutes, 2)
    
    round_dot.append(round_time)
    
combined_df['match_time'] = round_dot

combined_df.reset_index(drop = True, inplace = True)
combined_df

Unnamed: 0,Winner,R_fighter,B_fighter,WEIGHT_CLASS,METHOD,DETAIL,title_bout,date,location,attendance,...,Clinch Attempts.1,Ground Hits.1,Ground Attempts.1,rounds,time,format,referee,url,label,match_time
0,win,Justin Gaethje,Tony Ferguson,Lightweight,KO/TKO,Punch,1.0,09-05-2020,"Jacksonville, Florida, USA",0,...,0,0,0,5,00:03:39,5 Rnd (5-5-5-5-5),Herb Dean,http://www.ufcstats.com/fight-details/d395828f...,1,23.65
1,win,Henry Cejudo,Dominick Cruz,Bantamweight,KO/TKO,Knee,1.0,09-05-2020,"Jacksonville, Florida, USA",0,...,3,0,0,2,00:04:58,5 Rnd (5-5-5-5-5),Keith Peterson,http://www.ufcstats.com/fight-details/4e77cc2c...,0,9.97
2,win,Francis Ngannou,Jairzinho Rozenstruik,Heavyweight,KO/TKO,Punch,0.0,09-05-2020,"Jacksonville, Florida, USA",0,...,0,0,0,1,00:00:20,3 Rnd (5-5-5),Dan Miragliotta,http://www.ufcstats.com/fight-details/fcfaa28e...,0,0.33
3,win,Calvin Kattar,Jeremy Stephens,Featherweight,KO/TKO,Elbow,0.0,09-05-2020,"Jacksonville, Florida, USA",0,...,4,4,6,2,00:02:49,3 Rnd (5-5-5),Jason Herzog,http://www.ufcstats.com/fight-details/02f5621f...,1,7.82
4,win,Greg Hardy,Yorgan De Castro,Heavyweight,U-DEC,,0.0,09-05-2020,"Jacksonville, Florida, USA",0,...,0,0,0,3,00:05:00,3 Rnd (5-5-5),Herb Dean,http://www.ufcstats.com/fight-details/f0d4652f...,0,15.0
5,win,Anthony Pettis,Donald Cerrone,Welterweight,U-DEC,,0.0,09-05-2020,"Jacksonville, Florida, USA",0,...,5,0,0,3,00:05:00,3 Rnd (5-5-5),Keith Peterson,http://www.ufcstats.com/fight-details/b9abcce2...,0,15.0
6,win,Aleksei Oleinik,Fabricio Werdum,Heavyweight,S-DEC,,0.0,09-05-2020,"Jacksonville, Florida, USA",0,...,20,3,3,3,00:05:00,3 Rnd (5-5-5),Herb Dean,http://www.ufcstats.com/fight-details/593f3b05...,0,15.0
7,win,Carla Esparza,Michelle Waterson,Women's Strawweight,S-DEC,,0.0,09-05-2020,"Jacksonville, Florida, USA",0,...,6,0,0,3,00:05:00,3 Rnd (5-5-5),Dan Miragliotta,http://www.ufcstats.com/fight-details/f2520dd5...,0,15.0
8,win,Vicente Luque,Niko Price,Welterweight,KO/TKO,Punches,0.0,09-05-2020,"Jacksonville, Florida, USA",0,...,19,4,4,3,00:03:37,3 Rnd (5-5-5),Jason Herzog,http://www.ufcstats.com/fight-details/3f810c34...,0,13.62
9,win,Bryce Mitchell,Charles Rosa,Featherweight,U-DEC,,0.0,09-05-2020,"Jacksonville, Florida, USA",0,...,0,1,2,3,00:05:00,3 Rnd (5-5-5),Keith Peterson,http://www.ufcstats.com/fight-details/08244974...,0,15.0


# Add in Fighter Information:

In [266]:
fighters = pd.read_csv('Data/fighter.csv', index_col = 0)

In [300]:
player_columns = ['DOB', 'Height', 'Reach', 'Stance', 'Weight',
                 'DOB.1', 'Height.1', 'Reach.1', 'Stance.1', 'Weight.1',]

R_B_combined = pd.DataFrame()

#Given 2 players in a match, retrieve their corresponding personal attributes (height, weight, reach, etc.)

for index, row in combined_df.iterrows():
    
    rand = np.random.uniform(0, 1)
    print(rand)
    if rand >= .5:
        R_fighter = row['Fighter']
        B_fighter = row['Fighter.1']
    else:
        R_fighter = row['Fighter.1']
        B_fighter = row['Fighter']

    R_stats = fighters[fighters['Name'] == R_fighter].copy()
    B_stats = fighters[fighters['Name'] == B_fighter].copy()
    
    R_stats.reset_index(drop = True, inplace = True)
    B_stats.reset_index(drop = True, inplace = True)
    
    combined = pd.concat([R_stats, B_stats], axis = 1)
    R_B_combined = pd.concat([R_B_combined, combined], axis = 0)

    
    
R_B_combined.drop('Name', axis = 1, inplace = True)
R_B_combined.columns = player_columns
R_B_combined.reset_index(drop = True, inplace = True)

data = pd.concat([combined_df, R_B_combined], axis = 1)
data.shape
data[['R_fighter', 'Fighter']]

0.6320720612556471
0.5284743741755307
0.6635533090317531
0.37428415120316016
0.42997991143003633
0.7640950521791939
0.3107023053839063
0.7693173101118197
0.4561559851653221
0.20175403252689283
0.03925348746421553


Unnamed: 0,R_fighter,Fighter
0,Justin Gaethje,Tony Ferguson
1,Henry Cejudo,Henry Cejudo
2,Francis Ngannou,Francis Ngannou
3,Calvin Kattar,Jeremy Stephens
4,Greg Hardy,Greg Hardy
5,Anthony Pettis,Anthony Pettis
6,Aleksei Oleinik,Aleksei Oleinik
7,Carla Esparza,Carla Esparza
8,Vicente Luque,Vicente Luque
9,Bryce Mitchell,Bryce Mitchell


# Append to existing Fight's Data:

In [268]:
cumulative_fights = pd.read_csv('Data/fights.csv', index_col = 0)
cumulative_fights

Unnamed: 0,Winner,R_fighter,B_fighter,WEIGHT_CLASS,METHOD,DETAIL,title_bout,date,location,attendance,...,DOB,Height,Reach,Stance,Weight,DOB.1,Height.1,Reach.1,Stance.1,Weight.1
0,win,Charles Oliveira,Kevin Lee,Lightweight,SUB,Guillotine Choke,0.0,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0,...,04-09-1992,69,77,Orthodox,170,17-10-1989,61,74,Orthodox,155
1,win,Gilbert Burns,Demian Maia,Welterweight,KO/TKO,Punch,0.0,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0,...,06-11-1977,73,72,Southpaw,170,20-07-1986,61,71,Orthodox,170
2,win,Renato Moicano,Damir Hadzovic,Lightweight,SUB,Rear Naked Choke,0.0,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0,...,21-05-1989,61,72,Orthodox,155,08-08-1986,69,70,Orthodox,155
3,win,Nikita Krylov,Johnny Walker,Light Heavyweight,U-DEC,,0.0,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0,...,30-03-1992,78,82,Orthodox,205,07-03-1992,75,77,Orthodox,205
4,win,Francisco Trinaldo,John Makdessi,Lightweight,U-DEC,,0.0,14-03-2020,"Brasilia, Distrito Federal, Brazil",0.0,...,24-08-1978,69,70,Southpaw,155,03-05-1985,68,68,Orthodox,155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5538,win,Gerard Gordeau,Kevin Rosier,Open Weight,KO/TKO,,0.0,12-11-1993,"Denver, Colorado, USA",2800.0,...,30-03-1959,77,0,Orthodox,216,0,76,0,Orthodox,275
5539,win,Ken Shamrock,Patrick Smith,Open Weight,SUB,Heel Hook,0.0,12-11-1993,"Denver, Colorado, USA",2800.0,...,11-02-1964,73,72,Orthodox,205,28-08-1963,74,0,Orthodox,225
5540,win,Royce Gracie,Art Jimmerson,Open Weight,SUB,Other,0.0,12-11-1993,"Denver, Colorado, USA",2800.0,...,12-12-1966,73,0,Southpaw,175,04-08-1963,73,0,Orthodox,196
5541,win,Kevin Rosier,Zane Frazier,Open Weight,KO/TKO,,0.0,12-11-1993,"Denver, Colorado, USA",2800.0,...,0,76,0,Orthodox,275,0,77,0,Orthodox,250


In [269]:
append_new_df = pd.concat([data, cumulative_fights], axis = 0)
append_new_df.reset_index(drop=True, inplace = True)

In [270]:
append_new_df.to_csv('Data/test_data.csv')

# Update Relevant Personals:

In [304]:
new_fight_names = [name for name in data['R_fighter']] + [name for name in data['B_fighter']]

In [305]:
pd.set_option('display.max_rows', 100)

df = append_new_df.copy()
for col in df.columns:
    try:
        df[col] = df[col].astype('float')
    except:
        pass

In [306]:
#Relevant columns when breaking out data for specific fighters

fight_columns = ['Winner', 'R_fighter', 'Fighter', 'KD',
       'Sig. str. %', 'Td %', 'Sub. att', 'Pass', 'Rev.', 'Sig. str. Hits',
       'Sig. str. Attempts', 'Total str. Hits', 'Total str. Attempts',
       'Td Hits', 'Td Attempts', 'Head Hits', 'Head Attempts', 'Body Hits',
       'Body Attempts', 'Leg Hits', 'Leg Attempts', 'Distance Hits',
       'Distance Attempts', 'Clinch Hits', 'Clinch Attempts', 'Ground Hits',
       'Ground Attempts', 'DOB', 'Height', 'Reach',
       'Stance', 'Weight', 'date', 'match_time']

fight_columns_1 = ['Winner','R_fighter','Fighter.1', 'KD.1', 'Sig. str. %.1', 'Td %.1',
       'Sub. att.1', 'Pass.1', 'Rev..1', 'Sig. str. Hits.1',
       'Sig. str. Attempts.1', 'Total str. Hits.1', 'Total str. Attempts.1',
       'Td Hits.1', 'Td Attempts.1', 'Head Hits.1', 'Head Attempts.1',
       'Body Hits.1', 'Body Attempts.1', 'Leg Hits.1', 'Leg Attempts.1',
       'Distance Hits.1', 'Distance Attempts.1', 'Clinch Hits.1',
       'Clinch Attempts.1', 'Ground Hits.1', 'Ground Attempts.1', 'DOB.1', 'Height.1', 'Reach.1', 'Stance.1',
       'Weight.1', 'date', 'match_time']

def get_fight_attributes(index):
    
    temp = df.loc[index]
    return temp[['Fighter', 'Fighter.1', 'title_bout',
                'date', 'location', 'attendance', 'rounds', 'time', 'format', 'referee']]
    

def get_fights(fighter):
    
    data = df[(df['Fighter'] == fighter) | (df['Fighter.1'] == fighter)]
    
    return data


#get_personal returns fight statistics only fighter  
def get_personal(fights, fighter):
    
    a = fights[fights['Fighter'] == fighter]
    a1 = fights[fights['Fighter.1'] == fighter]
    
    a = a[fight_columns].copy()
    a1 = a1[fight_columns_1].copy()

    a1.columns = fight_columns  
    
    combined = pd.concat([a, a1], axis = 0)
    combined.sort_index(ascending = False, inplace = True)
    
    return combined

#Calculate age of fighter given DOB:
def calc_age(personal):
    try:
        dob = dt.strptime(personal['DOB'].iloc[0], '%d-%m-%Y')
        match_dates = personal['date'].apply(lambda x: dt.strptime(x, '%d-%m-%Y'))

        ages = match_dates.apply(lambda x: round(((x - dob).days)/365,2))
    except:
        ages = pd.Series(float(0.0), index=personal.index)
        
    ages = ages.rename('age')

    return ages

#Calculate a fighter's # of Wins & Losses:
def calc_WL(personal, shift = True):
        
    wins = personal['R_fighter'].apply(lambda x: x == personal['Fighter'].iloc[0])
    losses = personal['R_fighter'].apply(lambda x: x != personal['Fighter'].iloc[0])
    draws = pd.Series(0, index = wins.index)
    
    for idx in wins.index:
        if personal['Winner'].loc[idx] != 'win':
            draws[idx] = 1
            wins[idx] = False
            losses[idx] = False
    
    #Winning/Losing Streak Algorith:
    switch = []
    for item in np.arange(1, len(wins)):
        if wins.iloc[item] == wins.iloc[item-1]:
            switch.append(False)
        else:
            switch.append(True)
    
    if wins.iloc[0]:
        streak = [1]
    else:
        streak = [-1]

    for item, swap in zip(wins, switch):
        if swap:
            streak.append(1 * -np.sign(streak[-1]))
        else:
            streak.append(streak[-1] + np.sign(streak[-1])*1)

            
    streak = pd.Series(streak, index = wins.index)
    #End streak algorithm
        
    
    wl = pd.DataFrame([np.cumsum(losses), np.cumsum(wins), np.cumsum(draws)]).T
    wl = pd.concat([wl, streak], axis = 1)
    wl.columns = ['loss', 'win', 'draws', 'streak']
    
    if shift:
        wl = wl.shift(1)
    
    wl.iloc[0, :] = 0

    return wl

#Calculate a fighter's cumulative time in the ring (across all matches):
def calc_time(personal, shift = True):

    time = np.cumsum(personal['match_time'])
    initial = time.iloc[0]
    
    if shift:
        time = time.shift(1)
    
    time.iloc[0] = initial
    time = time.rename('cum_match_time')

    return time

#Calculate cumulative fight statistics for each fighter
def calc_stats(personal, shift = True):
    stat_columns = ['KD', 'Sub. att', 'Pass',
       'Rev.', 'Sig. str. Hits', 'Sig. str. Attempts', 'Total str. Hits',
       'Total str. Attempts', 'Td Hits', 'Td Attempts', 'Head Hits',
       'Head Attempts', 'Body Hits', 'Body Attempts', 'Leg Hits',
       'Leg Attempts', 'Distance Hits', 'Distance Attempts', 'Clinch Hits',
       'Clinch Attempts', 'Ground Hits', 'Ground Attempts']
    
    stats = personal[stat_columns]
    
    cumulative_columns = ['cum_{}'.format(item) for item in stat_columns]
    
    stats = np.cumsum(stats)
    initial = stats.iloc[0, :].copy()
    
    if shift:
        stats = stats.shift(1)
    
    stats.iloc[0, :] = initial
    stats.columns = cumulative_columns
    
    return stats


#The calculated personal is the personal_df with all cumulative statistics included

def get_calculated_personal(fighter, shift = True):
    
    if (len(df[df['Fighter'] == fighter]) == 0) & (len(df[df['Fighter.1'] == fighter]) == 0):
        return None
    
    
    personal = get_personal(df, fighter)
    
    ages = calc_age(personal)
    
    wl = calc_WL(personal, shift)
    round_times = calc_time(personal, shift)
    stats = calc_stats(personal, shift)
    
    final = pd.concat([personal, ages, wl, round_times, stats], axis = 1)
    
    return final



In [307]:
new_personals = []
final_df = pd.DataFrame()

for name in new_fight_names:
    new_personals.append(get_calculated_personal(name))
    
for personal in new_personals:
    final_df = final_df.append(personal.iloc[-1, :])
    
final_df = final_df[new_personals[0].columns]

In [308]:
cumulative_columns = ['cum_KD', 'cum_Sub. att', 'cum_Pass', 'cum_Rev.',
       'cum_Sig. str. Hits', 'cum_Sig. str. Attempts', 'cum_Total str. Hits',
       'cum_Total str. Attempts', 'cum_Td Hits', 'cum_Td Attempts',
       'cum_Head Hits', 'cum_Head Attempts', 'cum_Body Hits',
       'cum_Body Attempts', 'cum_Leg Hits', 'cum_Leg Attempts',
       'cum_Distance Hits', 'cum_Distance Attempts', 'cum_Clinch Hits',
       'cum_Clinch Attempts', 'cum_Ground Hits', 'cum_Ground Attempts']

#Use cumulative statistics / 10 minutes of match time (10 min. is global match time average across all fighters)
for item in cumulative_columns:
    a = round(final_df[item] / final_df['cum_match_time'] * 10,2)
    final_df['avg_{}'.format(item)] = a

In [309]:
drop_columns = ['KD', 'Sig. str. %', 'Td %',
       'Sub. att', 'Pass', 'Rev.', 'Sig. str. Hits', 'Sig. str. Attempts',
       'Total str. Hits', 'Total str. Attempts', 'Td Hits', 'Td Attempts',
       'Head Hits', 'Head Attempts', 'Body Hits', 'Body Attempts', 'Leg Hits',
       'Leg Attempts', 'Distance Hits', 'Distance Attempts', 'Clinch Hits',
       'Clinch Attempts', 'Ground Hits', 'Ground Attempts']

final_df.drop(drop_columns, axis = 1, inplace = True)

sig_str = final_df['cum_Sig. str. Hits'] / final_df['cum_Sig. str. Attempts']
hits = final_df['cum_Total str. Hits'] / final_df['cum_Total str. Attempts']
tds = final_df['cum_Td Hits'] / final_df['cum_Td Attempts']
head = final_df['cum_Head Hits'] / final_df['cum_Head Attempts']
leg = final_df['cum_Leg Hits'] / final_df['cum_Leg Attempts']
body = final_df['cum_Body Hits'] / final_df['cum_Body Attempts']
distance = final_df['cum_Distance Hits'] / final_df['cum_Distance Attempts']
clinch = final_df['cum_Clinch Hits'] / final_df['cum_Clinch Attempts']
ground = final_df['cum_Ground Hits'] / final_df['cum_Ground Attempts']

efficiency_stats = [sig_str, hits, tds, head, leg, body, distance, clinch, ground]
efficiency_columns = ['eff_sig_str', 'eff_hits', 'eff_tds', 'eff_head', 'eff_leg', 'eff_body', 
                      'eff_distance', 'eff_clinch', 'eff_ground']

for item in efficiency_stats:
    item.fillna(0, inplace = True)
    
for name, item in zip(efficiency_columns, efficiency_stats):
    final_df[name] = item
    final_df[name] = final_df[name].apply(lambda x: round(x, 2))

    
drop_columns = ['cum_KD', 'cum_Sub. att', 'cum_Pass', 'cum_Rev.',
       'cum_Sig. str. Hits', 'cum_Sig. str. Attempts', 'cum_Total str. Hits',
       'cum_Total str. Attempts', 'cum_Td Hits', 'cum_Td Attempts',
       'cum_Head Hits', 'cum_Head Attempts', 'cum_Body Hits',
       'cum_Body Attempts', 'cum_Leg Hits', 'cum_Leg Attempts',
       'cum_Distance Hits', 'cum_Distance Attempts', 'cum_Clinch Hits',
       'cum_Clinch Attempts', 'cum_Ground Hits', 'cum_Ground Attempts']

final_df.drop(drop_columns, axis = 1, inplace = True)

In [310]:
final_columns = ['Winner', 'R_fighter', 'Fighter', 'DOB', 'Height', 'Reach', 'Stance',
       'Weight', 'date', 'match_time', 'age', 'loss', 'win', 'draws', 'streak',
       'cum_match_time', 'avg_cum_KD', 'avg_cum_Sub. att', 'avg_cum_Pass',
       'avg_cum_Rev.', 'avg_cum_Sig. str. Hits', 'avg_cum_Sig. str. Attempts',
       'avg_cum_Total str. Hits', 'avg_cum_Total str. Attempts',
       'avg_cum_Td Hits', 'avg_cum_Td Attempts', 'avg_cum_Head Hits',
       'avg_cum_Head Attempts', 'avg_cum_Body Hits', 'avg_cum_Body Attempts',
       'avg_cum_Leg Hits', 'avg_cum_Leg Attempts', 'avg_cum_Distance Hits',
       'avg_cum_Distance Attempts', 'avg_cum_Clinch Hits',
       'avg_cum_Clinch Attempts', 'avg_cum_Ground Hits',
       'avg_cum_Ground Attempts', 'eff_sig_str', 'eff_hits', 'eff_tds',
       'eff_head', 'eff_leg', 'eff_body', 'eff_distance', 'eff_clinch',
       'eff_ground']

final_columns_1 = ['{}.1'.format(item) for item in final_columns]

pd.set_option('display.max_rows', 100)

reformat_final_df = pd.DataFrame()

#For every fight_index, there are exactly two fighers that participated. Combine them into one fight observation:
for item in set(final_df.index):
    temp = pd.DataFrame(final_df.loc[item].iloc[0, :]).T
    temp1 = pd.DataFrame(final_df.loc[item].iloc[1, :]).T

    temp.columns = final_columns
    temp1.columns = final_columns_1
    
    combined = pd.concat([temp, temp1], axis = 1)
    reformat_final_df = pd.concat([reformat_final_df, combined], axis = 0)
    
drop_columns = ['Winner.1', 'R_fighter.1', 'match_time', 'match_time.1', 'DOB', 'DOB.1', 'date', 'date.1']
reformat_final_df.drop(drop_columns, axis = 1, inplace = True)

In [311]:
combined_df['num_rounds'] = combined_df['format'].apply(lambda x: int(x[0]) if x[0] != 'N' else 1)
combined_df.reset_index(drop = True, inplace = True)

keep_columns = ['WEIGHT_CLASS', 'title_bout', 'location', 'attendance', 'num_rounds']
event_attr = combined_df[keep_columns].copy()

reformat_final_df = pd.concat([reformat_final_df, event_attr], axis = 1)

winner = []
for index, row in reformat_final_df.iterrows():
    
    if row['Winner'] != 'win':
        winner.append(99999)
    
    elif (row['R_fighter'] == row['Fighter']):
        winner.append(0)
    else:
        winner.append(1)
        
reformat_final_df['label'] = winner

reformat_final_df['matches'] = reformat_final_df['win'] + reformat_final_df['loss'] + reformat_final_df['draws']+1
reformat_final_df['matches.1'] = reformat_final_df['win.1'] + reformat_final_df['loss.1'] + reformat_final_df['draws.1']+1

In [312]:
reformat_final_df

Unnamed: 0,Winner,R_fighter,Fighter,Height,Reach,Stance,Weight,age,loss,win,...,eff_clinch.1,eff_ground.1,WEIGHT_CLASS,title_bout,location,attendance,num_rounds,label,matches,matches.1
0,win,Justin Gaethje,Justin Gaethje,61,76,Orthodox,155,31.5,2,4,...,0.6,0.8,Lightweight,1.0,"Jacksonville, Florida, USA",0,5,0,7,17
1,win,Henry Cejudo,Henry Cejudo,68,68,Orthodox,135,33.27,2,9,...,0.81,0.72,Bantamweight,1.0,"Jacksonville, Florida, USA",0,5,0,12,7
2,win,Francis Ngannou,Francis Ngannou,76,83,Orthodox,250,33.7,2,9,...,0.75,1.0,Heavyweight,0.0,"Jacksonville, Florida, USA",0,3,0,12,5
3,win,Calvin Kattar,Calvin Kattar,61,72,Orthodox,145,32.14,2,4,...,0.72,0.65,Featherweight,0.0,"Jacksonville, Florida, USA",0,3,0,7,33
4,win,Greg Hardy,Greg Hardy,77,80,Orthodox,265,31.8,2,2,...,0.0,0.0,Heavyweight,0.0,"Jacksonville, Florida, USA",0,3,0,6,2
5,win,Anthony Pettis,Anthony Pettis,73,73,Orthodox,155,33.3,9,9,...,0.65,0.69,Welterweight,0.0,"Jacksonville, Florida, USA",0,3,0,19,35
6,win,Aleksei Oleinik,Aleksei Oleinik,74,80,Orthodox,240,42.92,4,7,...,0.71,0.63,Heavyweight,0.0,"Jacksonville, Florida, USA",0,3,0,12,17
7,win,Carla Esparza,Carla Esparza,61,63,,115,32.6,4,6,...,0.85,0.85,Women's Strawweight,0.0,"Jacksonville, Florida, USA",0,3,0,11,9
8,win,Vicente Luque,Vicente Luque,61,75,Orthodox,170,28.47,3,10,...,0.48,0.67,Welterweight,0.0,"Jacksonville, Florida, USA",0,3,0,14,11
9,win,Bryce Mitchell,Bryce Mitchell,61,70,Southpaw,145,25.61,0,3,...,0.48,0.55,Featherweight,0.0,"Jacksonville, Florida, USA",0,3,0,4,7


In [320]:
#to_viz = pd.read_csv('Data/to_vis.csv', index_col = 0)
#to_viz = to_viz.append(reformat_final_df)
#to_viz.reset_index(drop=True, inplace = True)
#to_viz.to_csv('Data/to_vis.csv')

In [329]:
#Difference is measured as fighter 0 - fighter 1

reformat_final_df['Reach_diff'] = reformat_final_df['Reach'] - reformat_final_df['Reach.1']
reformat_final_df['Weight_diff'] = reformat_final_df['Weight'] - reformat_final_df['Weight.1']
reformat_final_df['Height_diff'] = reformat_final_df['Height'] - reformat_final_df['Height.1']

reformat_final_df['round_age'] = reformat_final_df['age'].apply(lambda x: round(x,0))
reformat_final_df['round_age.1'] = reformat_final_df['age.1'].apply(lambda x: round(x,0))

In [330]:
reformat_final_df

Unnamed: 0,Winner,R_fighter,Fighter,Height,Reach,Stance,Weight,age,loss,win,...,attendance,num_rounds,label,matches,matches.1,Reach_diff,Weight_diff,Height_diff,round_age,round_age.1
0,win,Justin Gaethje,Justin Gaethje,61,76,Orthodox,155,31.5,2,4,...,0,5,0,7,17,6,0,0,32.0,36.0
1,win,Henry Cejudo,Henry Cejudo,68,68,Orthodox,135,33.27,2,9,...,0,5,0,12,7,4,0,4,33.0,35.0
2,win,Francis Ngannou,Francis Ngannou,76,83,Orthodox,250,33.7,2,9,...,0,3,0,12,5,5,8,0,34.0,32.0
3,win,Calvin Kattar,Calvin Kattar,61,72,Orthodox,145,32.14,2,4,...,0,3,0,7,33,1,0,-8,32.0,34.0
4,win,Greg Hardy,Greg Hardy,77,80,Orthodox,265,31.8,2,2,...,0,3,0,6,2,6,15,5,32.0,33.0
5,win,Anthony Pettis,Anthony Pettis,73,73,Orthodox,155,33.3,9,9,...,0,3,0,19,35,1,0,12,33.0,37.0
6,win,Aleksei Oleinik,Aleksei Oleinik,74,80,Orthodox,240,42.92,4,7,...,0,3,0,12,17,3,9,-2,43.0,43.0
7,win,Carla Esparza,Carla Esparza,61,63,,115,32.6,4,6,...,0,3,0,11,9,1,0,-2,33.0,34.0
8,win,Vicente Luque,Vicente Luque,61,75,Orthodox,170,28.47,3,10,...,0,3,0,14,11,-1,0,-11,28.0,31.0
9,win,Bryce Mitchell,Bryce Mitchell,61,70,Southpaw,145,25.61,0,3,...,0,3,0,4,7,1,0,-8,26.0,34.0


In [350]:
update_df = pd.read_csv('Data/to_model.csv', index_col = 0)
update_df = pd.concat([reformat_final_df, update_df], axis = 0)
update_df.reset_index(drop = True, inplace = True)
update_df

Unnamed: 0,Winner,R_fighter,Fighter,Height,Reach,Stance,Weight,age,loss,win,...,attendance,num_rounds,label,matches,matches.1,Reach_diff,Weight_diff,Height_diff,round_age,round_age.1
0,win,Justin Gaethje,Justin Gaethje,61,76,Orthodox,155,31.5,2,4,...,0.0,5,0,7,17,6,0,0,32.0,36.0
1,win,Henry Cejudo,Henry Cejudo,68,68,Orthodox,135,33.27,2,9,...,0.0,5,0,12,7,4,0,4,33.0,35.0
2,win,Francis Ngannou,Francis Ngannou,76,83,Orthodox,250,33.7,2,9,...,0.0,3,0,12,5,5,8,0,34.0,32.0
3,win,Calvin Kattar,Calvin Kattar,61,72,Orthodox,145,32.14,2,4,...,0.0,3,0,7,33,1,0,-8,32.0,34.0
4,win,Greg Hardy,Greg Hardy,77,80,Orthodox,265,31.8,2,2,...,0.0,3,0,6,2,6,15,5,32.0,33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5456,win,Gerard Gordeau,Gerard Gordeau,77,77.3347,Orthodox,216,34.65,0,1,...,2800.0,1,0,2,2,-4.39899,-59,1,35.0,0.0
5457,win,Ken Shamrock,Patrick Smith,74,77.5172,Orthodox,225,30.23,0,0,...,2800.0,1,1,1,1,5.51721,20,1,30.0,30.0
5458,win,Royce Gracie,Royce Gracie,73,73.4723,Southpaw,175,26.94,0,0,...,2800.0,1,0,1,1,-1.62681,-21,0,27.0,30.0
5459,win,Kevin Rosier,Kevin Rosier,76,81.7337,Orthodox,275,0,0,0,...,2800.0,1,0,1,1,1.7651,25,-1,0.0,0.0


In [355]:
update_df.to_csv('Data/to_model.csv')