In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
from html import unescape
from datetime import datetime as dt
import time
import lxml


# Data load:

In [2]:
processed_df = pd.read_csv('preprocessed_data.csv') #DF of matches, each observation is a match between two fighters
fighter_stats = pd.read_csv('data.csv') #fighter's cumulative statistics database
fighter_raw_db = pd.read_csv('raw_fighter_details.csv')


In [3]:
print(processed_df.head(5))
print('\n')
print(fighter_stats.head(5))

  Winner  title_bout  no_of_rounds  B_current_lose_streak  \
0    Red        True             5                    0.0   
1    Red        True             5                    0.0   
2    Red       False             3                    0.0   
3   Blue       False             3                    0.0   
4   Blue       False             3                    0.0   

   B_current_win_streak  B_draw  B_avg_BODY_att  B_avg_BODY_landed  \
0                   4.0     0.0        9.200000           6.000000   
1                   3.0     0.0       14.600000           9.100000   
2                   3.0     0.0       15.354839          11.322581   
3                   4.0     0.0       17.000000          14.000000   
4                   1.0     0.0       17.000000          14.500000   

   B_avg_CLINCH_att  B_avg_CLINCH_landed  ...  \
0          0.200000             0.000000  ...   
1         11.800000             7.300000  ...   
2          6.741935             4.387097  ...   
3         13.750

### Initial Preprocessing:

In [4]:
processed_cols = list(processed_df.columns) #all columns in preprocessed data
fighter_cols = list(fighter_stats.columns) #all columns in the fighter stats


In [5]:
#returns a sublist of strings that begin with letter from a list of strings
def select_cols_start(letter, list_str):
    pattern = re.compile(r'{}.*'.format(letter))
    col_matches = list(filter(None, [re.match(pattern, item) for item in list_str]))
    cols = [item.group(0) for item in col_matches]
    
    return cols

In [6]:
r_match_cols = select_cols_start('R_', processed_cols)
b_match_cols = select_cols_start('B_', processed_cols)

r_fighter_cols = select_cols_start('R_', fighter_cols)
b_fighter_cols = select_cols_start('B_', fighter_cols)

# Web Scrape

### Setup URL Structure

In [7]:
url = 'http://ufcstats.com/statistics/events/completed'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

In [8]:
#List of more urls to scrape:
detail_urls = [] #this grabs urls for each day recorded
for url in soup.find_all(class_='b-link b-link_style_black'):
    time.sleep(1)
    detail_urls.append(url['href'])


List containing URLs to each fight day (multiple fights / day):

In [9]:
detail_urls

['http://ufcstats.com/event-details/fc9a9559a05f2704',
 'http://ufcstats.com/event-details/33b2f68ef95252e0',
 'http://ufcstats.com/event-details/5df17b3620145578',
 'http://ufcstats.com/event-details/b26d3e3746fb4024',
 'http://ufcstats.com/event-details/44aa652b181bcf68',
 'http://ufcstats.com/event-details/0c1773639c795466',
 'http://ufcstats.com/event-details/74fefd43f073cd2f',
 'http://ufcstats.com/event-details/4565d435005319c0',
 'http://ufcstats.com/event-details/b09890ba7ce1d1e2',
 'http://ufcstats.com/event-details/81ca2c245b19b3c5',
 'http://ufcstats.com/event-details/8d5daf67983b65ba',
 'http://ufcstats.com/event-details/fd87b1bbfcde9d5e',
 'http://ufcstats.com/event-details/df05aa15b2d66f57',
 'http://ufcstats.com/event-details/3ae10ac4df3df05c',
 'http://ufcstats.com/event-details/0941df56f6ac954b',
 'http://ufcstats.com/event-details/3cf68c1d17f66af7',
 'http://ufcstats.com/event-details/1bf49bf829964144',
 'http://ufcstats.com/event-details/94a5aaf573f780ad',
 'http://u

### Scrape for individual URL observation

In [163]:
day = BeautifulSoup(requests.get(detail_urls[3]).text, 'html.parser')

In [164]:
#Grab each individual Table Row:

In [165]:
stat_table = day.findAll('table')[0].contents

In [166]:
#Observe that relevant statistics are in 4th element (index: 3) 
for index, item in enumerate(stat_table):
    print(index)
    print(item)
    print('\n')

0




1
<thead class="b-fight-details__table-head">
<tr class="b-fight-details__table-row">
<th class="b-fight-details__table-col">
    W/L
  </th>
<th class="b-fight-details__table-col l-page_align_left">
    Fighter
  </th>
<th class="b-fight-details__table-col">
    Str
  </th>
<th class="b-fight-details__table-col">
    Td
  </th>
<th class="b-fight-details__table-col">
    Sub
  </th>
<th class="b-fight-details__table-col">
    Pass
  </th>
<th class="b-fight-details__table-col l-page_align_left">
    Weight class
  </th>
<th class="b-fight-details__table-col l-page_align_left">
    Method
  </th>
<th class="b-fight-details__table-col">
    Round
  </th>
<th class="b-fight-details__table-col">
    Time
  </th>
</tr>
</thead>


2




3
<tbody class="b-fight-details__table-body">
<tr class="b-fight-details__table-row b-fight-details__table-row__hover js-fight-details-click" data-link="http://ufcstats.com/fight-details/fae99a4089f8abb0" onclick="doNav('http://ufcstats.com/fight-detai

In [167]:
table_data = stat_table[3]
detail_data = table_data.find_all('p')
img_table = table_data.find_all('img')

In [168]:
img_table[0]['src']

'http://1e49bc5171d173577ecd-1323f4090557a33db01577564f60846c.r80.cf1.rackcdn.com/belt.png'

In [169]:
re.match(r'.*belt.*', img_table[0]['src'])

<re.Match object; span=(0, 89), match='http://1e49bc5171d173577ecd-1323f4090557a33db0157>

In [170]:
detail_data[3].text

'\n               \n               104\n\n          '

In [171]:
detail_data

[<p class="b-fight-details__table-text">
 <a class="b-flag b-flag_style_green" href="http://ufcstats.com/fight-details/fae99a4089f8abb0"><i class="b-flag__inner"><i class="b-flag__text">win<i class="b-flag__corner"></i><i class="b-flag__corner-substrate"></i></i></i></a>
 </p>, <p class="b-fight-details__table-text">
 <a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/07f72a2a7591b409">
               Jon Jones
             </a>
 </p>, <p class="b-fight-details__table-text">
 <a class="b-link b-link_style_black" href="http://ufcstats.com/fighter-details/2e19380f34871c6a">
               Dominick Reyes
             </a>
 </p>, <p class="b-fight-details__table-text">
                
                104
 
           </p>, <p class="b-fight-details__table-text">
 
             
             116
           </p>, <p class="b-fight-details__table-text">
             
             2
           </p>, <p class="b-fight-details__table-text">
             
            

In [172]:
contents = []

In [173]:
def find_belt(img_tag):
    try:
        image_link = img_tag['src']
        if re.match(r'.*belt.*', image_link) != None:
            return True
    except:
        return False

In [174]:
for index, item in enumerate(detail_data):
    a = [index, item.find('img')]
    print(find_belt(a[1]))
    contents.append(item.text)

False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
None
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
None
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
None
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fal

In [175]:
contents

['\nwin\n',
 '\n\n              Jon Jones\n            \n',
 '\n\n              Dominick Reyes\n            \n',
 '\n               \n               104\n\n          ',
 '\n\n            \n            116\n          ',
 '\n            \n            2\n          ',
 '\n            \n            0\n          ',
 '\n            \n            0\n          ',
 '\n            \n            0\n          ',
 '\n            \n            0\n          ',
 '\n            \n            0\n          ',
 '\n        Light Heavyweight\n\n',
 '\n        \n        \n\n        U-DEC\n\n      ',
 '\n',
 '\n\n        5\n      ',
 '\n\n        \n        5:00\n      ',
 '\nwin\n',
 '\n\n              Valentina Shevchenko\n            \n',
 '\n\n              Katlyn Chookagian\n            \n',
 '\n               \n               40\n\n          ',
 '\n\n            \n            17\n          ',
 '\n            \n            3\n          ',
 '\n            \n            0\n          ',
 '\n            \n    

In [176]:
def remove_space_lines(text):
    pattern1 = re.compile(r'[\s\s+]')
    return re.sub(pattern1, ' ', text)

In [177]:
contents = list(map(lambda x: remove_space_lines(x), contents))
contents = list(map(lambda x: x.strip(), contents))

In [178]:
contents_series = pd.Series(contents)
contents_series[contents_series == 'win'].index
contents_series

0                 win
1           Jon Jones
2      Dominick Reyes
3                 104
4                 116
            ...      
187     Featherweight
188             U-DEC
189                  
190                 3
191              5:00
Length: 192, dtype: object

In [179]:
formatted_contents = np.array(contents).reshape(-1, 16)

In [180]:
formatted_contents.shape

(12, 16)

In [181]:
formatted_contents = pd.DataFrame(formatted_contents[0:formatted_contents.shape[1]])

In [182]:
formatted_contents.drop(0, axis = 1, inplace = True)

In [183]:
formatted_contents.columns = ['R_fighter', 'B_fighter', 'R_STR', 'B_STR', 
                               'R_TD', 'B_TD', 'R_SUB', 'B_SUB', 'R_PASS', 'B_PASS',
                              'WEIGHT_CLASS', 'METHOD', 'DETAIL', 'ROUND', 'TIME']

In [184]:
formatted_contents['TIME'] = formatted_contents['TIME'].apply(lambda x: dt.strptime(x, '%H:%M').time())

In [185]:
formatted_contents['TIME']

0     05:00:00
1     01:03:00
2     01:59:00
3     05:00:00
4     05:00:00
5     05:00:00
6     05:00:00
7     00:27:00
8     01:41:00
9     00:38:00
10    05:00:00
11    05:00:00
Name: TIME, dtype: object

In [208]:
data_types = {
    'R_STR': int, 
    'B_STR': int,
    'R_TD': int, 
    'B_TD': int, 
    'R_SUB': int, 
    'R_SUB': int, 
    'R_PASS': int, 
    'B_PASS': int,
    'ROUND': int,
    'title_bout': int
}

In [209]:
def get_page_stats(site):
    stat_table = site.findAll('table')[0].contents #Contents of the main table in html
    
    table_data = stat_table[3] #first 2 indices are empty strings, table_data is html starting from first table row
    detail_data = table_data.find_all('p') #within table rows, there are <p> labels for table text
    image_data = table_data.find_all('img') #get image links to find belt for 
    
    contents = []
    title_match_index = []
    
    for index, item in enumerate(detail_data):
        image = item.find('img')
        if find_belt(image):
            title_match_index.append(index)
        contents.append(item.text) #contents is list of all text from each element of table  
    
    #Clean up elements
    contents = list(map(lambda x: remove_space_lines(x), contents))
    contents = list(map(lambda x: x.strip(), contents)) 
    
    #each row of data is 16 elements, reformats: 1 observation per row
    formatted_contents = np.array(contents).reshape((-1, 16))
    formatted_contents = pd.DataFrame(formatted_contents)

    
    
    #the first row is a list of 'wins'
    formatted_contents.drop(0, axis = 1, inplace = True)
    
        
    title_match = np.floor_divide(title_match_index, 16)
    titles = np.zeros(16)
    titles[title_match] = 1
    title_series = pd.Series(titles)
    
    formatted_contents['title_bout'] = title_series
    
    #rename columns
    formatted_contents.columns = ['R_fighter', 'B_fighter', 'R_STR', 'B_STR', 
                               'R_TD', 'B_TD', 'R_SUB', 'R_SUB', 'R_PASS', 'B_PASS',
                              'WEIGHT_CLASS', 'METHOD', 'DETAIL', 'ROUND', 'TIME', 'title_bout']
    
    #convert columsn to appropriate data types
    formatted_contents = formatted_contents.astype(data_types)
    formatted_contents['TIME'] = formatted_contents['TIME'].apply(lambda x: dt.strptime(x, '%H:%M').time())
    
    return formatted_contents

In [210]:
test = get_page_stats(day)

In [211]:
test

Unnamed: 0,R_fighter,B_fighter,R_STR,B_STR,R_TD,B_TD,R_SUB,R_SUB.1,R_PASS,B_PASS,WEIGHT_CLASS,METHOD,DETAIL,ROUND,TIME,title_bout
0,Jon Jones,Dominick Reyes,104,116,2,0,0,0,0,0,Light Heavyweight,U-DEC,,5,05:00:00,1
1,Valentina Shevchenko,Katlyn Chookagian,40,17,3,0,0,0,2,0,Women's Flyweight,KO/TKO,Punches,3,01:03:00,1
2,Justin Tafa,Juan Adams,10,7,0,0,0,0,0,0,Heavyweight,KO/TKO,Punch,1,01:59:00,0
3,Dan Ige,Mirsad Bektic,43,17,1,3,0,1,1,6,Featherweight,S-DEC,,3,05:00:00,0
4,Derrick Lewis,Ilir Latifi,20,5,0,3,0,0,0,2,Heavyweight,U-DEC,,3,05:00:00,0
5,Trevin Giles,James Krause,71,44,0,1,1,4,3,1,Middleweight,S-DEC,,3,05:00:00,0
6,Lauren Murphy,Andrea Lee,80,104,2,0,0,1,1,0,Women's Flyweight,S-DEC,,3,05:00:00,0
7,Khaos Williams,Alex Morono,10,1,0,0,0,0,0,0,Welterweight,KO/TKO,Punch,1,00:27:00,0
8,Mario Bautista,Miles Johns,18,13,0,0,0,0,0,0,Bantamweight,KO/TKO,Flying Knee,2,01:41:00,0
9,Journey Newson,Domingo Pilarte,6,1,0,0,0,0,0,0,Bantamweight,KO/TKO,Punch,1,00:38:00,0


In [193]:
a = np.zeros(16)

In [195]:
a[test[1]] = 2

In [197]:
pd.Series(a)

0     2.0
1     2.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
12    0.0
13    0.0
14    0.0
15    0.0
dtype: float64

# Combine Scrape with Fight Database

In [32]:
fighter_stats.head(5)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Henry Cejudo,Marlon Moraes,Marc Goddard,2019-06-08,"Chicago, Illinois, USA",Red,True,Bantamweight,5,0.0,...,2.0,0.0,0.0,8.0,Orthodox,162.56,162.56,135.0,31.0,32.0
1,Valentina Shevchenko,Jessica Eye,Robert Madrigal,2019-06-08,"Chicago, Illinois, USA",Red,True,Women's Flyweight,5,0.0,...,0.0,2.0,0.0,5.0,Southpaw,165.1,167.64,125.0,32.0,31.0
2,Tony Ferguson,Donald Cerrone,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Red,False,Lightweight,3,0.0,...,3.0,6.0,1.0,14.0,Orthodox,180.34,193.04,155.0,36.0,35.0
3,Jimmie Rivera,Petr Yan,Kevin MacDonald,2019-06-08,"Chicago, Illinois, USA",Blue,False,Bantamweight,3,0.0,...,1.0,0.0,0.0,6.0,Orthodox,162.56,172.72,135.0,26.0,29.0
4,Tai Tuivasa,Blagoy Ivanov,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Blue,False,Heavyweight,3,0.0,...,2.0,0.0,0.0,3.0,Southpaw,187.96,190.5,264.0,32.0,26.0


In [109]:
list(processed_df.columns)

['Winner',
 'title_bout',
 'no_of_rounds',
 'B_current_lose_streak',
 'B_current_win_streak',
 'B_draw',
 'B_avg_BODY_att',
 'B_avg_BODY_landed',
 'B_avg_CLINCH_att',
 'B_avg_CLINCH_landed',
 'B_avg_DISTANCE_att',
 'B_avg_DISTANCE_landed',
 'B_avg_GROUND_att',
 'B_avg_GROUND_landed',
 'B_avg_HEAD_att',
 'B_avg_HEAD_landed',
 'B_avg_KD',
 'B_avg_LEG_att',
 'B_avg_LEG_landed',
 'B_avg_PASS',
 'B_avg_REV',
 'B_avg_SIG_STR_att',
 'B_avg_SIG_STR_landed',
 'B_avg_SIG_STR_pct',
 'B_avg_SUB_ATT',
 'B_avg_TD_att',
 'B_avg_TD_landed',
 'B_avg_TD_pct',
 'B_avg_TOTAL_STR_att',
 'B_avg_TOTAL_STR_landed',
 'B_longest_win_streak',
 'B_losses',
 'B_avg_opp_BODY_att',
 'B_avg_opp_BODY_landed',
 'B_avg_opp_CLINCH_att',
 'B_avg_opp_CLINCH_landed',
 'B_avg_opp_DISTANCE_att',
 'B_avg_opp_DISTANCE_landed',
 'B_avg_opp_GROUND_att',
 'B_avg_opp_GROUND_landed',
 'B_avg_opp_HEAD_att',
 'B_avg_opp_HEAD_landed',
 'B_avg_opp_KD',
 'B_avg_opp_LEG_att',
 'B_avg_opp_LEG_landed',
 'B_avg_opp_PASS',
 'B_avg_opp_REV',
 

In [34]:
fighter_raw_db.head(5)

Unnamed: 0,fighter_name,Height,Weight,Reach,Stance,DOB
0,AJ Fonseca,"5' 4""",145 lbs.,,,
1,AJ Matthews,"5' 11""",185 lbs.,,,
2,AJ McKee,"5' 10""",145 lbs.,,,
3,AJ Siscoe,"5' 7""",135 lbs.,,,
4,Aalon Cruz,"6' 0""",145 lbs.,,,


### Get Fighter's Most Recent Statistics

In [38]:
fighter_stats['date'] = fighter_stats['date'].apply(lambda x: dt.strptime(x, '%Y-%m-%d'))

In [68]:
def get_recent_fight(fighter):
    #r & b are dataframes with relevant statistics of fighter's fights
    r = fighter_stats[fighter_stats['R_fighter'] == fighter]
    b = fighter_stats[fighter_stats['B_fighter'] == fighter]
    
    #Since this function selects the most recent fight of the fighter, we need to distinguish whether the result is for the 
    #red or blue fighter:
    color = None
    
    #If fighter does not have a recent fight in the directory, return an empty dataframe
    if (r.shape[0] == 0) & (b.shape[0] == 0):
        return [r, color]
    
    #Combine r & b, sort by date, and select the most recent fight:
    #Determine the color of the ring for fighter's most recent fight:
    try: 
        latest_r = max(r['date'])
    except:
        latest_r = pd.Timestamp('1900-01-01')
    try:
        latest_b = max(b['date'])
    except:
        latest_b = pd.Timestamp('1900-01-01')
   
    if latest_r > latest_b:
        color = 'Red'
    else:
        color = 'Blue'
    
    combined_df = pd.concat([r, b], axis = 0)
    combined_df = combined_df.sort_values('date', ascending = False)

    return [combined_df.iloc[0, ], color]

        

In [79]:
def get_fight_statistics(color, fight):
    if color == 'Red':
        letter = 'R_'
    else:
        letter = 'B_'
    pattern = re.compile(r'{}.*'.format(letter))
    col_matches = list(filter(None, [re.match(pattern, item) for item in fight.index]))    
    cols = [item.group(0) for item in col_matches]
    
    
    return fight[cols]
        

In [80]:
def get_recent_stats(fighter):
    recent_fight = get_recent_fight(fighter)
    color = recent_fight[1]
    
    return get_fight_statistics(color, recent_fight[0])
    

In [13]:
site_text = str(day1.find_all(class_='b-statistics__table-events'))

In [14]:
site_text

'[]'

In [104]:
#given a fight url, returns a dataframe 
def extract_fights_df(fight_url):
    fight_day = requests.get(fight_url)
    fight_soup = bs.BeautifulSoup(fight_day.text, 'html.parser')
    
    #grab the table using pandas read_html
    a = pd.read_html(str(soup.find(class_='b-fight-details__table b-fight-details__table_style_margin-top b-fight-details__table_type_event-details js-fight-table')))
    df = a[0]
    
    #expand the names... .we'll need to concatenate them manually to match...
    df = df.drop(['Str', 'Td', 'Sub', 'Pass'], axis = 1)
 
    return df

In [148]:
#Let's extract fights from multiple pages:
combined_df = pd.DataFrame()
for url in detail_urls:
    combined_df = pd.concat([combined_df, extract_fights_df(url)], axis = 0)



In [150]:
#save scraped fights to CSV
combined_df.to_csv('scraped_fights.csv')

In [166]:
combined_df = combined_df.reset_index(drop = True)

In [189]:
def split_name(name):
    indices = [(m.start(0), m.end(0)) for m in re.finditer(r'\s\s', name)][0]
    name1 = name[:indices[0]]
    name2 = name[indices[1]:]
    return [name1, name2]

In [158]:
winner = []
loser = []

for item in combined_df['Fighter']:
    names = split_name(item)
    winner.append(names[0])
    loser.append(names[1])
    

In [159]:
combined_df['R_fighter'] = winner
combined_df['B_fighter'] = loser

In [167]:
combined_df


Unnamed: 0,W/L,Fighter,Weight class,Method,Round,Time,R_fighter,B_fighter
0,win,Jan Blachowicz Jacare Souza,Light Heavyweight,S-DEC,5,5:00,Jan Blachowicz,Jacare Souza
1,draw draw,Mauricio Rua Paul Craig,Light Heavyweight,S-DEC,3,5:00,Mauricio Rua,Paul Craig
2,win,Charles Oliveira Jared Gordon,Lightweight,KO/TKO Punch,1,1:26,Charles Oliveira,Jared Gordon
3,win,Andre Muniz Antonio Arroyo,Middleweight,U-DEC,3,5:00,Andre Muniz,Antonio Arroyo
4,win,Wellington Turman Markus Perez,Middleweight,U-DEC,3,5:00,Wellington Turman,Markus Perez
...,...,...,...,...,...,...,...,...
288,win,Michel Pereira Danny Roberts,Welterweight,KO/TKO Flying Knee,1,1:47,Michel Pereira,Danny Roberts
289,win,Grant Dawson Michael Trizano,Featherweight,SUB Rear Naked Choke,2,2:27,Grant Dawson,Michael Trizano
290,win,Ed Herman Patrick Cummins,Light Heavyweight,KO/TKO Knee,1,3:39,Ed Herman,Patrick Cummins
291,win,Zak Cummings Trevin Giles,Middleweight,SUB Guillotine Choke,3,4:01,Zak Cummings,Trevin Giles


In [6]:
def remove_space(text):
    new_text = unescape(text)
    return re.sub(r'\s\s+'," ", new_text)

In [192]:
recent_fight_stats('Jan Blachowicz')

R_fighter                Jan Blachowicz
R_current_lose_streak                 0
R_current_win_streak                  4
R_draw                                0
R_avg_BODY_att                     13.5
                              ...      
R_Stance                       Orthodox
R_Height_cms                     187.96
R_Reach_cms                      198.12
R_Weight_lbs                        205
R_age                                35
Name: 161, Length: 69, dtype: object

In [23]:
def extract_recent_fight(fighter): #This function returns all the stats from fighter's most recent fight

    try:
        R_df = pd.DataFrame(fighter_stats[fighter_stats['R_fighter'] == fighter])
    except:
        print('Fighter not in list R')
    
    try: 
        B_df = pd.DataFrame(fighter_stats[fighter_stats['B_fighter'] == fighter])
    except:
        print('Fighter not in list B')
        
    combined_df = pd.concat([R_df, B_df], axis = 0)
    combined_df = combined_df.sort_values('date', ascending = False)
        
    return combined_df.iloc[0, ].copy()
    
    
    
    

In [24]:
def recent_fight_stats(fighter): #This function returns the stats for only the fighter's most recent stats:
    fight = extract_recent_fight(fighter)
    if fight['R_fighter'] == fighter:
        return fight[fighter_R]
        
    if fight['B_fighter'] == fighter:    
        return fight[fighter_B]


In [186]:
#These functions take the fighter's most recent stats and changes the colnames to match the final template:

def name_columns_R(fighter):
    data = pd.DataFrame(recent_fight_stats(fighter)).transpose()
    data.columns = fighter_R
    
    return data
    
def name_columns_B(fighter):
    data = pd.DataFrame(recent_fight_stats(fighter)).transpose()
    data.columns = fighter_B
    
    return data
    
    

In [195]:
fighter_R_stats = []

for fighter in combined_df['R_fighter']:
    try:
        fighter_R_stats.append(name_columns_R(fighter))
    except:
        print(fighter)

Andre Muniz
Wellington Turman
Tracy Cortez
Shamil Gamzatov
Ciryl Gane
Loma Lookboonmee
Raphael Pessoa
Sean Woodson
Sean Brady
Brendan Allen
Tanner Boser
Amanda Ribas
Miguel Baeza
Yorgan De Castro
Brad Riddell
Mark Madsen
Makhmud Muradov
Giga Chikadze
Jack Shore
Tristan Connelly
Miles Johns
Hunter Azure
Ottman Azaitar
Jingliang Li
Kenan Song
Mizuki Inoue
Da Un Jung
Heili Alateng
Karol Rosa
Khama Worthy
Eduardo Garagorri
Rodolfo Vieira
Ciryl Gane
Miranda Granger
Brianna Van Buren
Julia Avila
Dalcha Lungiambula
Amanda Ribas
Deron Winn
Xiaonan Yan


In [202]:
fighter_R_stats[1]
pd.concat([fighter_R_stats[0], fighter_R_stats[1]], axis = 0)

Unnamed: 0,R_fighter,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_BODY_att,R_avg_BODY_landed,R_avg_CLINCH_att,R_avg_CLINCH_landed,R_avg_DISTANCE_att,R_avg_DISTANCE_landed,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age
161,Jan Blachowicz,0,4,0,13.5,9.9,10.0,7.2,68.2,29.6,...,3,1,2,0,6,Orthodox,187.96,198.12,205,35
262,Mauricio Rua,1,0,0,8.33333,6.11111,9.72222,6.77778,55.6111,26.0,...,1,7,0,0,9,Orthodox,185.42,193.04,205,37


In [188]:
name_columns_B('Jan Blachowicz')

Unnamed: 0,B_fighter,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,B_avg_DISTANCE_att,B_avg_DISTANCE_landed,...,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,B_age
161,Jan Blachowicz,0,4,0,13.5,9.9,10,7.2,68.2,29.6,...,3,1,2,0,6,Orthodox,187.96,198.12,205,35


In [185]:
pd.DataFrame(recent_fight_stats('Jan Blachowicz')).transpose()

Unnamed: 0,R_fighter,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_BODY_att,R_avg_BODY_landed,R_avg_CLINCH_att,R_avg_CLINCH_landed,R_avg_DISTANCE_att,R_avg_DISTANCE_landed,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age
161,Jan Blachowicz,0,4,0,13.5,9.9,10,7.2,68.2,29.6,...,3,1,2,0,6,Orthodox,187.96,198.12,205,35


In [179]:
pd.DataFrame(stats)

Unnamed: 0,R_fighter,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_BODY_att,R_avg_BODY_landed,R_avg_CLINCH_att,R_avg_CLINCH_landed,R_avg_DISTANCE_att,R_avg_DISTANCE_landed,...,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,B_age
161,Jan Blachowicz,0.0,4.0,0.0,13.500000,9.900000,10.000000,7.200000,68.200000,29.600000,...,,,,,,,,,,
262,Mauricio Rua,1.0,0.0,0.0,8.333333,6.111111,9.722222,6.777778,55.611111,26.000000,...,,,,,,,,,,
29,Charles Oliveira,0.0,4.0,0.0,8.608696,5.739130,6.652174,4.782609,29.434783,12.000000,...,,,,,,,,,,
404,James Krause,0.0,4.0,0.0,9.888889,6.666667,3.333333,1.777778,108.333333,49.666667,...,,,,,,,,,,
208,Ricardo Ramos,0.0,3.0,0.0,13.666667,7.666667,6.666667,5.000000,92.666667,35.666667,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,Orthodox,185.42,185.42,170.0,25.0
34,,,,,,,,,,,...,1.0,0.0,0.0,0.0,1.0,Switch,177.80,182.88,145.0,25.0
35,,,,,,,,,,,...,2.0,3.0,4.0,0.0,10.0,Orthodox,185.42,195.58,205.0,38.0
36,Zak Cummings,0.0,1.0,0.0,8.900000,5.400000,8.300000,4.900000,76.900000,21.600000,...,,,,,,,,,,


In [165]:
combined_df['R_fighter']

0        Jan Blachowicz
1          Mauricio Rua
2      Charles Oliveira
3           Andre Muniz
4     Wellington Turman
            ...        
8        Michel Pereira
9          Grant Dawson
10            Ed Herman
11         Zak Cummings
12           Julio Arce
Name: R_fighter, Length: 293, dtype: object

In [310]:
#detail_urls = detail_urls[:-3] #grab the urls up to the new matches

In [27]:
testdf = extract_fights_df(detail_urls[0])

In [145]:
testdf1 = extract_fights_df(detail_urls[1])

In [147]:
pd.concat([testdf, testdf1], axis = 0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,B_fighter,Fighter,Method,R_fighter,Round,Time,W/L,Weight class
0,Jacare Souza,Jan Blachowicz Jacare Souza,S-DEC,Jan Blachowicz,5,5:00,win,Light Heavyweight
1,Paul Craig,Mauricio Rua Paul Craig,S-DEC,Mauricio Rua,3,5:00,draw draw,Light Heavyweight
2,Jared Gordon,Charles Oliveira Jared Gordon,KO/TKO Punch,Charles Oliveira,1,1:26,win,Lightweight
3,Antonio Arroyo,Andre Muniz Antonio Arroyo,U-DEC,Andre Muniz,3,5:00,win,Middleweight
4,Markus Perez,Wellington Turman Markus Perez,U-DEC,Wellington Turman,3,5:00,win,Middleweight
5,Sergio Moraes,James Krause Sergio Moraes,KO/TKO Punch,James Krause,3,4:19,win,Welterweight
6,Eduardo Garagorri,Ricardo Ramos Eduardo Garagorri,SUB Rear Naked Choke,Ricardo Ramos,1,3:57,win,Featherweight
7,Bobby Green,Francisco Trinaldo Bobby Green,U-DEC,Francisco Trinaldo,3,5:00,win,Lightweight
8,Warlley Alves,Randy Brown Warlley Alves,SUB Triangle Choke,Randy Brown,2,1:22,win,Welterweight
9,Renan Barao,Douglas Silva de Andrade Renan Barao,U-DEC,Douglas Silva de Andrade,3,5:00,win,Featherweight


In [31]:
testdf

Unnamed: 0,W/L,Fighter,Weight class,Method,Round,Time,R_fighter,B_fighter
0,win,Jan Blachowicz Jacare Souza,Light Heavyweight,S-DEC,5,5:00,Jan Blachowicz,Jacare Souza
1,draw draw,Mauricio Rua Paul Craig,Light Heavyweight,S-DEC,3,5:00,Mauricio Rua,Paul Craig
2,win,Charles Oliveira Jared Gordon,Lightweight,KO/TKO Punch,1,1:26,Charles Oliveira,Jared Gordon
3,win,Andre Muniz Antonio Arroyo,Middleweight,U-DEC,3,5:00,Andre Muniz,Antonio Arroyo
4,win,Wellington Turman Markus Perez,Middleweight,U-DEC,3,5:00,Wellington Turman,Markus Perez
5,win,James Krause Sergio Moraes,Welterweight,KO/TKO Punch,3,4:19,James Krause,Sergio Moraes
6,win,Ricardo Ramos Eduardo Garagorri,Featherweight,SUB Rear Naked Choke,1,3:57,Ricardo Ramos,Eduardo Garagorri
7,win,Francisco Trinaldo Bobby Green,Lightweight,U-DEC,3,5:00,Francisco Trinaldo,Bobby Green
8,win,Randy Brown Warlley Alves,Welterweight,SUB Triangle Choke,2,1:22,Randy Brown,Warlley Alves
9,win,Douglas Silva de Andrade Renan Barao,Featherweight,U-DEC,3,5:00,Douglas Silva de Andrade,Renan Barao


In [136]:
r = recent_fight_stats('Jan Blachowicz')
b = recent_fight_stats('Jacare Souza')
r = pd.DataFrame(r).transpose().reset_index(drop = True)
b = pd.DataFrame(b).transpose().reset_index(drop = True)
b.columns = fighter_B



In [141]:
first_fight = pd.concat([r, b], axis = 1)

In [144]:
pd.concat([testdf, first_fight], axis = 1)

Unnamed: 0,W/L,Fighter,Weight class,Method,Round,Time,R_fighter,B_fighter,R_fighter.1,R_current_lose_streak,...,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,B_age
0,win,Jan Blachowicz Jacare Souza,Light Heavyweight,S-DEC,5,5:00,Jan Blachowicz,Jacare Souza,Jan Blachowicz,0.0,...,1.0,4.0,4.0,0.0,9.0,Orthodox,185.42,182.88,185.0,39.0
1,draw draw,Mauricio Rua Paul Craig,Light Heavyweight,S-DEC,3,5:00,Mauricio Rua,Paul Craig,,,...,,,,,,,,,,
2,win,Charles Oliveira Jared Gordon,Lightweight,KO/TKO Punch,1,1:26,Charles Oliveira,Jared Gordon,,,...,,,,,,,,,,
3,win,Andre Muniz Antonio Arroyo,Middleweight,U-DEC,3,5:00,Andre Muniz,Antonio Arroyo,,,...,,,,,,,,,,
4,win,Wellington Turman Markus Perez,Middleweight,U-DEC,3,5:00,Wellington Turman,Markus Perez,,,...,,,,,,,,,,
5,win,James Krause Sergio Moraes,Welterweight,KO/TKO Punch,3,4:19,James Krause,Sergio Moraes,,,...,,,,,,,,,,
6,win,Ricardo Ramos Eduardo Garagorri,Featherweight,SUB Rear Naked Choke,1,3:57,Ricardo Ramos,Eduardo Garagorri,,,...,,,,,,,,,,
7,win,Francisco Trinaldo Bobby Green,Lightweight,U-DEC,3,5:00,Francisco Trinaldo,Bobby Green,,,...,,,,,,,,,,
8,win,Randy Brown Warlley Alves,Welterweight,SUB Triangle Choke,2,1:22,Randy Brown,Warlley Alves,,,...,,,,,,,,,,
9,win,Douglas Silva de Andrade Renan Barao,Featherweight,U-DEC,3,5:00,Douglas Silva de Andrade,Renan Barao,,,...,,,,,,,,,,


In [13]:
winner = []
loser = []

for item in combined_df['Fighter']:
    names = split_name(item)
    winner.append(names[0])
    loser.append(names[1])
    

NameError: name 'combined_df' is not defined

In [289]:
combined_df['Winner'] = winner
combined_df['Loser'] = loser

ValueError: Length of values does not match length of index

In [278]:
combined_df = combined_df.reset_index(drop = True)

In [279]:
combined_df

Unnamed: 0,W/L,Fighter,Weight class,Method,Round,Time,Winner,Loser
0,win,Jan Blachowicz Jacare Souza,Light Heavyweight,S-DEC,5,5:00,Jan Blachowicz,Jacare Souza
1,draw draw,Mauricio Rua Paul Craig,Light Heavyweight,S-DEC,3,5:00,Mauricio Rua,Paul Craig
2,win,Charles Oliveira Jared Gordon,Lightweight,KO/TKO Punch,1,1:26,Charles Oliveira,Jared Gordon
3,win,Andre Muniz Antonio Arroyo,Middleweight,U-DEC,3,5:00,Andre Muniz,Antonio Arroyo
4,win,Wellington Turman Markus Perez,Middleweight,U-DEC,3,5:00,Wellington Turman,Markus Perez
...,...,...,...,...,...,...,...,...
70,win,Deiveson Figueiredo Tim Elliott,Flyweight,SUB Guillotine Choke,1,3:08,Deiveson Figueiredo,Tim Elliott
71,win,Marlon Vera Andre Ewell,Bantamweight,KO/TKO Elbows,3,3:17,Marlon Vera,Andre Ewell
72,win,Miguel Baeza Hector Aldana,Welterweight,KO/TKO Kick,2,2:32,Miguel Baeza,Hector Aldana
73,win,Marvin Vettori Andrew Sanchez,Middleweight,U-DEC,3,5:00,Marvin Vettori,Andrew Sanchez


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [14]:
a = pd.read_html(str(soup.find(class_='b-fight-details__table b-fight-details__table_style_margin-top b-fight-details__table_type_event-details js-fight-table')))
df = a[0]

ValueError: No tables found

In [116]:
df = pd.concat([df, df['Fighter'].str.split(" ", expand = True)], axis = 1)

In [117]:
df = df.drop(['Str', 'Td', 'Sub', 'Pass'], axis = 1)

In [118]:
df

Unnamed: 0,W/L,Fighter,Weight class,Method,Round,Time,0,1,2,3,4,5,6
0,win,Jan Blachowicz Jacare Souza,Light Heavyweight,S-DEC,5,5:00,Jan,Blachowicz,,Jacare,Souza,,
1,draw draw,Mauricio Rua Paul Craig,Light Heavyweight,S-DEC,3,5:00,Mauricio,Rua,,Paul,Craig,,
2,win,Charles Oliveira Jared Gordon,Lightweight,KO/TKO Punch,1,1:26,Charles,Oliveira,,Jared,Gordon,,
3,win,Andre Muniz Antonio Arroyo,Middleweight,U-DEC,3,5:00,Andre,Muniz,,Antonio,Arroyo,,
4,win,Wellington Turman Markus Perez,Middleweight,U-DEC,3,5:00,Wellington,Turman,,Markus,Perez,,
5,win,James Krause Sergio Moraes,Welterweight,KO/TKO Punch,3,4:19,James,Krause,,Sergio,Moraes,,
6,win,Ricardo Ramos Eduardo Garagorri,Featherweight,SUB Rear Naked Choke,1,3:57,Ricardo,Ramos,,Eduardo,Garagorri,,
7,win,Francisco Trinaldo Bobby Green,Lightweight,U-DEC,3,5:00,Francisco,Trinaldo,,Bobby,Green,,
8,win,Randy Brown Warlley Alves,Welterweight,SUB Triangle Choke,2,1:22,Randy,Brown,,Warlley,Alves,,
9,win,Douglas Silva de Andrade Renan Barao,Featherweight,U-DEC,3,5:00,Douglas,Silva,de,Andrade,,Renan,Barao


In [337]:
fighter_stats.head(5)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Henry Cejudo,Marlon Moraes,Marc Goddard,2019-06-08,"Chicago, Illinois, USA",Red,True,Bantamweight,5,0.0,...,2.0,0.0,0.0,8.0,Orthodox,162.56,162.56,135.0,31.0,32.0
1,Valentina Shevchenko,Jessica Eye,Robert Madrigal,2019-06-08,"Chicago, Illinois, USA",Red,True,Women's Flyweight,5,0.0,...,0.0,2.0,0.0,5.0,Southpaw,165.1,167.64,125.0,32.0,31.0
2,Tony Ferguson,Donald Cerrone,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Red,False,Lightweight,3,0.0,...,3.0,6.0,1.0,14.0,Orthodox,180.34,193.04,155.0,36.0,35.0
3,Jimmie Rivera,Petr Yan,Kevin MacDonald,2019-06-08,"Chicago, Illinois, USA",Blue,False,Bantamweight,3,0.0,...,1.0,0.0,0.0,6.0,Orthodox,162.56,172.72,135.0,26.0,29.0
4,Tai Tuivasa,Blagoy Ivanov,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Blue,False,Heavyweight,3,0.0,...,2.0,0.0,0.0,3.0,Southpaw,187.96,190.5,264.0,32.0,26.0


In [345]:
fighter_stats[fighter_stats['R_fighter'] == "Jan Blachowicz"]

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
161,Jan Blachowicz,Thiago Santos,Herb Dean,2019-02-23,"Prague, Czech Republic",Blue,False,Light Heavyweight,5,0.0,...,1.0,2.0,0.0,6.0,Orthodox,187.96,198.12,205.0,35.0,35.0
374,Jan Blachowicz,Nikita Krylov,Marc Goddard,2018-09-15,"Moscow, Moscow, Russia",Red,False,Light Heavyweight,3,1.0,...,1.0,1.0,0.0,5.0,Orthodox,187.96,198.12,205.0,26.0,35.0
818,Jan Blachowicz,Devin Clark,Marc Goddard,2017-10-21,"Gdansk, Poland",Red,False,Light Heavyweight,3,0.0,...,1.0,0.0,0.0,2.0,Orthodox,187.96,198.12,205.0,27.0,34.0
1773,Jan Blachowicz,Corey Anderson,Herb Dean,2015-09-05,"Las Vegas, Nevada, USA",Blue,False,Light Heavyweight,3,1.0,...,1.0,0.0,0.0,1.0,Orthodox,187.96,198.12,205.0,25.0,32.0


In [382]:
columns_str = list(fighter_stats.columns)

In [386]:
col_names_R = list(filter(None, [re.match(r'R.*', item) for item in columns_str])) 
col_names_R = [item.group(0) for item in col_names_R]

col_names_B = list(filter(None, [re.match(r'B.*', item) for item in columns_str])) 
col_names_B = [item.group(0) for item in col_names_B]

In [400]:
all_fighters_R = []
all_fighters_B = []

for fighter in testdf['R_fighter']:
    try:
        all_fighters_R.append(fighter_stats[col_names_R][fighter_stats['R_fighter'] == fighter].iloc[0,])
    except:
        try:
            all_fighters_R.append(fighter_stats[col_names_R][fighter_stats['B_fighter'] == fighter].iloc[0,])
        except:
            print(fighter)

for fighter in testdf['B_fighter']:
    try:
        all_fighters_B.append(fighter_stats[col_names_B][fighter_stats['B_fighter'] == fighter].iloc[0,])
    except:
        try:
            all_fighters_B.append(fighter_stats[col_names_B][fighter_stats['R_fighter'] == fighter].iloc[0,])
        except:
            print(fighter)



Andre Muniz
Wellington Turman
Tracy Cortez
Antonio Arroyo
Eduardo Garagorri
Isabela de Padua
Vanessa Melo


In [401]:
pd.DataFrame(all_fighters_R)

Unnamed: 0,R_fighter,Referee,R_current_lose_streak,R_current_win_streak,R_draw,R_avg_BODY_att,R_avg_BODY_landed,R_avg_CLINCH_att,R_avg_CLINCH_landed,R_avg_DISTANCE_att,...,R_win_by_Decision_Unanimous,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,R_age
161,Jan Blachowicz,Herb Dean,0.0,4.0,0.0,13.5,9.9,10.0,7.2,68.2,...,3.0,1.0,2.0,0.0,6.0,Orthodox,187.96,198.12,205.0,35.0
262,Mauricio Rua,Steve Perceval,1.0,0.0,0.0,8.333333,6.111111,9.722222,6.777778,55.611111,...,1.0,7.0,0.0,0.0,9.0,Orthodox,185.42,193.04,205.0,37.0
29,Charles Oliveira,Todd Ronald Anderson,0.0,4.0,0.0,8.608696,5.73913,6.652174,4.782609,29.434783,...,1.0,0.0,13.0,0.0,14.0,Orthodox,177.8,187.96,155.0,29.0
404,James Krause,Kevin MacDonald,0.0,4.0,0.0,9.888889,6.666667,3.333333,1.777778,108.333333,...,3.0,1.0,2.0,0.0,6.0,Orthodox,187.96,185.42,170.0,32.0
208,Ricardo Ramos,Joao Claudio Soares,0.0,3.0,0.0,13.666667,7.666667,6.666667,5.0,92.666667,...,1.0,1.0,0.0,0.0,3.0,Orthodox,175.26,182.88,135.0,23.0
365,Francisco Trinaldo,Marc Goddard,1.0,0.0,0.0,13.176471,10.470588,10.0,7.235294,63.470588,...,6.0,2.0,2.0,1.0,12.0,Southpaw,175.26,177.8,155.0,40.0
452,Randy Brown,Herb Dean,0.0,1.0,0.0,7.166667,5.333333,9.833333,8.166667,53.666667,...,2.0,1.0,1.0,0.0,4.0,Orthodox,190.5,198.12,170.0,28.0
230,Douglas Silva de Andrade,Mark Smith,0.0,1.0,0.0,15.0,8.8,4.8,2.2,136.0,...,2.0,1.0,0.0,0.0,3.0,Orthodox,170.18,172.72,135.0,33.0
217,Joanne Calderwood,Blake Grice,0.0,1.0,0.0,20.428571,12.285714,14.285714,9.714286,91.142857,...,2.0,1.0,1.0,0.0,4.0,Orthodox,167.64,165.1,125.0,33.0


In [402]:
pd.DataFrame(all_fighters_B)

Unnamed: 0,B_fighter,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,B_avg_DISTANCE_att,B_avg_DISTANCE_landed,...,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,B_age
323,Jacare Souza,1.0,0.0,0.0,8.636364,5.363636,5.272727,3.090909,27.363636,10.818182,...,1.0,3.0,4.0,0.0,8.0,Orthodox,185.42,182.88,185.0,38.0
103,Paul Craig,1.0,0.0,0.0,8.6,6.8,6.4,5.4,26.2,11.0,...,0.0,0.0,2.0,0.0,2.0,Orthodox,190.5,193.04,205.0,31.0
812,Jared Gordon,0.0,1.0,0.0,7.0,5.0,5.0,4.0,65.0,27.0,...,0.0,1.0,0.0,0.0,1.0,Orthodox,175.26,172.72,155.0,29.0
406,Markus Perez,0.0,1.0,0.0,9.0,7.5,7.5,6.0,19.5,7.5,...,0.0,0.0,1.0,0.0,1.0,Orthodox,185.42,185.42,185.0,30.0
45,Sergio Moraes,1.0,0.0,0.0,8.25,5.25,1.833333,0.916667,64.5,27.083333,...,2.0,1.0,3.0,0.0,8.0,Orthodox,182.88,182.88,170.0,36.0
667,Bobby Green,4.0,0.0,0.0,20.375,13.25,8.625,7.125,106.0,45.0,...,1.0,1.0,1.0,0.0,4.0,Orthodox,177.8,180.34,155.0,31.0
404,Warlley Alves,0.0,2.0,0.0,12.25,8.625,8.5,6.125,54.375,22.875,...,2.0,0.0,3.0,1.0,6.0,Orthodox,180.34,182.88,170.0,27.0
899,Renan Barao,0.0,1.0,0.0,19.083333,10.416667,9.833333,5.75,142.0,46.583333,...,4.0,2.0,3.0,0.0,9.0,Orthodox,167.64,177.8,135.0,30.0


In [419]:
a = testdf.merge(pd.DataFrame(all_fighters_R), left_on='R_fighter', right_on='R_fighter')

In [420]:
a.iloc[0].values

array(['win', 'Jan Blachowicz  Jacare Souza', 'Light Heavyweight',
       'S-DEC', 5, '5:00', 'Jan Blachowicz', 'Jacare Souza',
       'Jan Blachowicz', 'Jacare Souza', 'Herb Dean', 0.0, 4.0, 0.0, 13.5,
       9.9, 10.0, 7.2, 68.2, 29.6, 10.9, 7.8, 72.5, 31.8, 0.3, 3.1, 2.9,
       1.2, 0.0, 89.1, 44.6, 0.519, 0.3, 2.4, 1.3, 0.546, 111.1, 63.9,
       4.0, 4.0, 9.5, 5.7, 8.1, 5.8, 54.0, 17.3, 17.7, 13.8, 64.4, 26.9,
       0.0, 5.9, 4.3, 1.2, 0.1, 79.8, 36.9, 0.41, 0.1, 2.7, 1.2, 0.31,
       114.9, 67.8, 26.0, 736.1, 0.0, 0.0, 0.0, 3.0, 1.0, 2.0, 0.0, 6.0,
       'Orthodox', 187.96, 198.12, 205.0, 35.0], dtype=object)

In [423]:
b = a.merge(pd.DataFrame(all_fighters_B), left_on='B_fighter', right_on='B_fighter')

In [424]:
b

Unnamed: 0,W/L,Fighter,Weight class,Method,Round,Time,Winner,Loser,R_fighter,B_fighter,...,B_win_by_Decision_Unanimous,B_win_by_KO/TKO,B_win_by_Submission,B_win_by_TKO_Doctor_Stoppage,B_wins,B_Stance,B_Height_cms,B_Reach_cms,B_Weight_lbs,B_age
0,win,Jan Blachowicz Jacare Souza,Light Heavyweight,S-DEC,5,5:00,Jan Blachowicz,Jacare Souza,Jan Blachowicz,Jacare Souza,...,1.0,3.0,4.0,0.0,8.0,Orthodox,185.42,182.88,185.0,38.0
1,draw draw,Mauricio Rua Paul Craig,Light Heavyweight,S-DEC,3,5:00,Mauricio Rua,Paul Craig,Mauricio Rua,Paul Craig,...,0.0,0.0,2.0,0.0,2.0,Orthodox,190.5,193.04,205.0,31.0
2,win,Charles Oliveira Jared Gordon,Lightweight,KO/TKO Punch,1,1:26,Charles Oliveira,Jared Gordon,Charles Oliveira,Jared Gordon,...,0.0,1.0,0.0,0.0,1.0,Orthodox,175.26,172.72,155.0,29.0
3,win,James Krause Sergio Moraes,Welterweight,KO/TKO Punch,3,4:19,James Krause,Sergio Moraes,James Krause,Sergio Moraes,...,2.0,1.0,3.0,0.0,8.0,Orthodox,182.88,182.88,170.0,36.0
4,win,Francisco Trinaldo Bobby Green,Lightweight,U-DEC,3,5:00,Francisco Trinaldo,Bobby Green,Francisco Trinaldo,Bobby Green,...,1.0,1.0,1.0,0.0,4.0,Orthodox,177.8,180.34,155.0,31.0
5,win,Randy Brown Warlley Alves,Welterweight,SUB Triangle Choke,2,1:22,Randy Brown,Warlley Alves,Randy Brown,Warlley Alves,...,2.0,0.0,3.0,1.0,6.0,Orthodox,180.34,182.88,170.0,27.0
6,win,Douglas Silva de Andrade Renan Barao,Featherweight,U-DEC,3,5:00,Douglas Silva de Andrade,Renan Barao,Douglas Silva de Andrade,Renan Barao,...,4.0,2.0,3.0,0.0,9.0,Orthodox,167.64,177.8,135.0,30.0


In [427]:
processed_df

Unnamed: 0,Winner,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,...,weight_class_Women's Strawweight,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,Red,True,5,0.0,4.0,0.0,9.200000,6.000000,0.200000,0.000000,...,0,0,1,0,0,0,0,1,0,0
1,Red,True,5,0.0,3.0,0.0,14.600000,9.100000,11.800000,7.300000,...,0,0,1,0,0,0,0,0,1,0
2,Red,False,3,0.0,3.0,0.0,15.354839,11.322581,6.741935,4.387097,...,0,0,1,0,0,0,0,1,0,0
3,Blue,False,3,0.0,4.0,0.0,17.000000,14.000000,13.750000,11.000000,...,0,0,0,0,0,1,0,1,0,0
4,Blue,False,3,0.0,1.0,0.0,17.000000,14.500000,2.500000,2.000000,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3587,Red,False,1,0.0,1.0,0.0,1.000000,1.000000,0.000000,0.000000,...,0,0,0,0,1,0,0,0,1,0
3588,Red,False,1,0.0,1.0,0.0,0.000000,0.000000,1.000000,1.000000,...,0,0,1,0,0,0,0,1,0,0
3589,Red,True,1,0.0,2.0,0.0,0.500000,0.500000,0.000000,0.000000,...,0,0,1,0,0,0,0,0,1,0
3590,Red,False,1,0.0,1.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0,0,1,0,0,0,0,0,1,0
