In [1]:
import numpy as np
import pandas as pd
import re
from datetime import datetime

In [2]:
df_fighters = pd.read_csv('fighters.csv')
df_cards = pd.read_csv('fightcards.csv')

## Initial Exploration

In [3]:
df_fighters.head()

Unnamed: 0,DoB,SApM,SLpM,height,name,reach,record,stance,strAcc,strDef,subAvg,tdAcc,tdAvg,tdDef,weight
0,Jun 15 1989,6.11,1.11,"6' 0""",Justin Frazier,75.0,10-3-0,Southpaw,66%,4%,0.0,0%,0.0,0%,265
1,Feb 07 1989,8.28,2.99,"5' 9""",Gleidson Cutis,,7-4-0,Orthodox,52%,59%,0.0,0%,0.0,0%,155
2,Jul 03 1982,2.5,1.47,"6' 1""",Xavier Foupa-Pokam,,32-22-0,Open Stance,43%,49%,0.0,0%,0.0,16%,185
3,--,3.29,0.71,"6' 6""",Chuck Grigsby,,23-10-0,Orthodox,16%,46%,0.0,0%,0.0,66%,205
4,--,3.14,0.79,"6' 0""",Jason Gilliam,,14-9-0,Orthodox,33%,42%,0.0,0%,0.0,0%,170


In [4]:
df_cards.head()

Unnamed: 0,card_name,f1,f1_sig_strike_per,f1_sig_strike_total,f1_td_attempt,f1_td_succeed,f2,f2_sig_strike_per,f2_sig_strike_total,f2_td_attempt,f2_td_succeed,fight_date,fights_location,round_format,round_fought,weight_class,winner,winning_method
0,UFC Fight Night: Santos vs. Teixeira,Gustavo Lopez,60%,43,3,2,Anthony Birchak,42%,40,0,0,November 07 2020,"Las Vegas, Nevada, USA",3,1,Bantamweight,Gustavo Lopez,SUB
1,UFC Fight Night: Hall vs. Silva,Jason Witt,20%,5,0,0,Cole Williams,64%,31,3,2,October 31 2020,"Las Vegas, Nevada, USA",3,2,Welterweight,Jason Witt,SUB
2,UFC Fight Night: Hall vs. Silva,Dustin Jacoby,57%,38,0,0,Justin Ledet,32%,31,0,0,October 31 2020,"Las Vegas, Nevada, USA",3,1,Light Heavyweight,Dustin Jacoby,KO/TKO
3,UFC Fight Night: Hall vs. Silva,Miles Johns,45%,105,7,0,Kevin Natividad,24%,137,1,0,October 31 2020,"Las Vegas, Nevada, USA",3,3,Bantamweight,Miles Johns,KO/TKO
4,UFC 255: Figueiredo vs. Perez,Sasha Palatnikov,48%,206,9,1,Louis Cosce,52%,237,1,1,November 21 2020,"Las Vegas, Nevada, USA",3,3,Welterweight,Sasha Palatnikov,KO/TKO


In [5]:
df_fighters.dtypes

DoB        object
SApM      float64
SLpM      float64
height     object
name       object
reach     float64
record     object
stance     object
strAcc     object
strDef     object
subAvg    float64
tdAcc      object
tdAvg     float64
tdDef      object
weight     object
dtype: object

In [6]:
df_cards.dtypes

card_name              object
f1                     object
f1_sig_strike_per      object
f1_sig_strike_total     int64
f1_td_attempt           int64
f1_td_succeed           int64
f2                     object
f2_sig_strike_per      object
f2_sig_strike_total     int64
f2_td_attempt           int64
f2_td_succeed           int64
fight_date             object
fights_location        object
round_format            int64
round_fought            int64
weight_class           object
winner                 object
winning_method         object
dtype: object

In [7]:
df_cards.isnull().sum()

card_name              0
f1                     0
f1_sig_strike_per      0
f1_sig_strike_total    0
f1_td_attempt          0
f1_td_succeed          0
f2                     0
f2_sig_strike_per      0
f2_sig_strike_total    0
f2_td_attempt          0
f2_td_succeed          0
fight_date             0
fights_location        0
round_format           0
round_fought           0
weight_class           0
winner                 0
winning_method         0
dtype: int64

In [8]:
df_fighters.isnull().sum()

DoB          0
SApM         0
SLpM         0
height       0
name         0
reach     1982
record       0
stance     877
strAcc       0
strDef       0
subAvg       0
tdAcc        0
tdAvg        0
tdDef        0
weight       0
dtype: int64

In [9]:
print(df_fighters.apply(lambda col: col.nunique()))

DoB       2524
SApM       758
SLpM       650
height      27
name      3713
reach       27
record    1170
stance       5
strAcc      85
strDef      86
subAvg      94
tdAcc       83
tdAvg      529
tdDef       93
weight     113
dtype: int64


In [10]:
# check for fighter with the same name
df_fighters[df_fighters.duplicated(subset='name', keep=False)]

Unnamed: 0,DoB,SApM,SLpM,height,name,reach,record,stance,strAcc,strDef,subAvg,tdAcc,tdAvg,tdDef,weight
433,Feb 06 1965,0.4,0.0,"5' 11""",Michael McDonald,,1-1-0,Orthodox,0%,50%,0.0,0%,0.0,0%,205
446,Jan 15 1991,2.76,2.69,"5' 9""",Michael McDonald,70.0,17-4-0,Orthodox,42%,57%,1.4,66%,1.09,52%,135
1308,May 02 1983,3.67,4.0,"6' 2""",Tony Johnson,76.0,7-2-0,Orthodox,92%,22%,0.0,0%,0.0,90%,205
1318,--,4.73,2.0,"6' 1""",Tony Johnson,,11-3-0,,53%,31%,0.0,22%,2.0,0%,265
1916,Jul 21 1986,4.46,2.44,"5' 10""",Joey Gomez,73.0,6-2-0,Orthodox,28%,55%,0.0,100%,0.62,50%,135
2092,Aug 29 1989,3.33,3.73,"5' 10""",Joey Gomez,71.0,7-1-0,Orthodox,49%,50%,0.0,28%,2.0,0%,155
2272,Oct 07 1992,6.76,6.24,"6' 0""",Mike Davis,72.0,9-2-0,Orthodox,53%,57%,0.0,33%,1.39,69%,145
2404,--,0.0,0.0,--,Mike Davis,,2-0-0,,0%,0%,0.0,0%,0.0,0%,--
3194,Mar 16 1990,3.23,2.98,"5' 4""",Bruno Silva,65.0,12-5-2 (1 NC),Orthodox,46%,58%,0.0,31%,2.89,64%,125
3300,Jul 13 1989,0.21,4.0,"6' 0""",Bruno Silva,74.0,20-6-0,Orthodox,73%,0%,0.0,0%,0.0,100%,185


## Data cleaning

#### df_fighters cleaning

Quite a lot of preprocessing will need to be done here. First, a few fighters have the same name there. We will add elements to their name to distinguish them. The easiest way would be by adding their weights, since fortunately, the fighters with the same name in our list here have different weight class. We also have 2 Mike Davis, with the 2nd one missing a lot of data. therefore, we will drop him altogether

In [11]:
# change certain name so that we don't have duplication
df_fighters.iloc[446, 4] = "Michael McDonald 135"
df_fighters.iloc[1318, 4] = "Tony Johnson 265"
df_fighters.iloc[2092, 4] = "Joey Gomez 155"
df_fighters.iloc[3300, 4] = "Bruno Silva 185"
df_fighters.drop([2404], inplace=True)

We can see that in both of the table, there are a lot of percentages that are in string. We will therefore create a function to convert those to decimal

In [12]:
def p2d(df, columns):
    for column in columns:
        df[column] = df[column].str.strip('%')
        df[column] = pd.to_numeric(df[column]) / 100


In [13]:
p2d(df_fighters, ['strAcc', 'strDef', 'tdAcc', 'tdDef'])


In [14]:
df_fighters.columns

Index(['DoB', 'SApM', 'SLpM', 'height', 'name', 'reach', 'record', 'stance',
       'strAcc', 'strDef', 'subAvg', 'tdAcc', 'tdAvg', 'tdDef', 'weight'],
      dtype='object')

In [15]:

# Some fighters do not have statistics available, and we will remove those fighters.
fighters_clean = df_fighters.loc[~(
                               (df_fighters["strDef"] == 0) &
                               (df_fighters["tdAvg"] == 0) &
                               (df_fighters["tdAcc"] == 0) &
                               (df_fighters["tdDef"] == 0) &
                               (df_fighters["subAvg"] == 0))].copy()

In [16]:
per_missing_dob = fighters_clean[fighters_clean['DoB'] == '--']['DoB'].count() / len(fighters_clean)
print('the percentage of missing Date of Birth is: {:.2f}'.format(per_missing_dob))

the percentage of missing Date of Birth is: 0.11


Although the percentage of missing date of birth is quite high, we would remove these fighters anyways since the lack of birth date often mean that the fighter only fought 1 match in UFC, and was quite irrelevant to the sport. Also, it often means that these fighters fought in the very early days of ufc. And as the fan knows, the sport has changed dramatically since then. Including these fighters might add more random noise to our dataset than helping

In [17]:
fighters_clean = fighters_clean[~(fighters_clean['DoB'] == '--')].copy()

In [18]:
print("Initially, there are {} fighers in total, after clean up: {} fighers".format(len(df_fighters), len(fighters_clean)))

Initially, there are 3717 fighers in total, after clean up: 2643 fighers


In [19]:
def get_birth_year(dob):
    return datetime.strptime(dob, '%b %d %Y').year

fighters_clean['born_year'] = fighters_clean['DoB'].apply(lambda x: get_birth_year(x))
fighters_clean.drop(['DoB'], inplace=True, axis=1)

In [20]:
fighters_clean.set_index('name', inplace=True)

In [21]:
fighters_clean.head()

Unnamed: 0_level_0,SApM,SLpM,height,reach,record,stance,strAcc,strDef,subAvg,tdAcc,tdAvg,tdDef,weight,born_year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Justin Frazier,6.11,1.11,"6' 0""",75.0,10-3-0,Southpaw,0.66,0.04,0.0,0.0,0.0,0.0,265,1989
Gleidson Cutis,8.28,2.99,"5' 9""",,7-4-0,Orthodox,0.52,0.59,0.0,0.0,0.0,0.0,155,1989
Xavier Foupa-Pokam,2.5,1.47,"6' 1""",,32-22-0,Open Stance,0.43,0.49,0.0,0.0,0.0,0.16,185,1982
Mirko Filipovic,1.89,2.11,"6' 2""",73.0,35-11-2 (1 NC),Southpaw,0.5,0.63,0.3,0.4,0.19,0.78,230,1974
Jordan Johnson,2.64,3.45,"6' 2""",79.0,10-0-0,Orthodox,0.47,0.53,1.2,0.42,3.25,1.0,205,1988


From my experience, reach is 1 of the most important factor that would determine the strategy and outcome of a fight. yet as we see in the first look of the data, the data is missing for more than a thoudsand of fighters. 

The method we will use to move forward is as follow: Reach is often a function of height (although not perfect, this is the closest thing we have from the available data). Therefore, we will try to find another fighter that have the same height as the fighter with missing data and use that fighter height as our value.
In some cases, we will find multiple fighters with the same height, but different reach. for the sake of simplicity, we will simply get the first non-NaN value

In [23]:
# fetch first non-NA value of each height
height_ref = fighters_clean.groupby('height')['reach'].median()
height_ref

height
--        70.0
5' 0"     61.0
5' 1"     62.0
5' 10"    72.0
5' 11"    73.0
5' 2"     62.0
5' 3"     64.0
5' 4"     65.0
5' 5"     66.0
5' 6"     67.0
5' 7"     69.0
5' 8"     70.0
5' 9"     71.0
6' 0"     74.0
6' 1"     75.0
6' 10"     NaN
6' 11"    84.0
6' 2"     75.0
6' 3"     77.0
6' 4"     78.5
6' 5"     79.0
6' 6"     79.0
6' 7"     80.0
6' 8"     80.0
7' 2"      NaN
7' 5"      NaN
Name: reach, dtype: float64

In [24]:
# apply and check the amount of missing data left
fighters_clean['reach'] = fighters_clean['reach'].fillna(fighters_clean['height'].map(height_ref))
fighters_clean['reach'].isna().sum()

5

In [25]:
# with only 5, we can now drop those rows
fighters_clean.dropna(subset=['reach'], inplace=True)
fighters_clean['reach'].isna().sum()

0

In [26]:
# stance
fighters_clean['stance'].unique()

array(['Southpaw', 'Orthodox', 'Open Stance', 'Switch', nan, 'Sideways'],
      dtype=object)

In [27]:
# we will simply fill the missing stance with Open Stance
fighters_clean['stance'].fillna('Open Stance', inplace=True)

In [28]:
# convert height from inch to cm
def convert_to_cms(X):
    if X is np.NaN:
        return X
    elif len(X.split("'")) == 2:
        feet = float(X.split("'")[0])
        inches = int(X.split("'")[1].replace(' ', '').replace('"',''))
        return (feet * 30.48) + (inches * 2.54)
    else:
        return float(X.replace('"','')) * 2.54

In [29]:
# we have some missing data that hide under the form of "--"
fighters_clean[fighters_clean['height'] == '--']

Unnamed: 0_level_0,SApM,SLpM,height,reach,record,stance,strAcc,strDef,subAvg,tdAcc,tdAvg,tdDef,weight,born_year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Amador Ramirez,2.07,4.93,--,70.0,5-4-0,Open Stance,0.51,0.69,0.0,0.33,1.0,0.0,135,1990
Matt Ricehouse,4.8,3.7,--,70.0,6-1-0,Open Stance,0.44,0.47,0.0,0.22,1.0,0.81,155,1987
Logan Nail,2.27,1.93,--,70.0,1-1-0,Open Stance,0.51,0.39,0.0,0.0,0.0,0.37,185,1989
Neal Ewing,1.93,2.27,--,70.0,6-0-0,Open Stance,0.6,0.48,0.0,0.62,5.0,0.0,185,1985
Lee Higgins,3.68,1.02,--,70.0,2-1-0,Open Stance,0.26,0.4,0.0,0.0,0.0,0.0,155,1980
Hiroshi Izumi,2.65,1.95,--,70.0,4-2-0,Orthodox,0.37,0.66,0.5,0.7,3.35,1.0,205,1982
Joe Duarte,4.0,2.27,--,70.0,10-4-0,Open Stance,0.38,0.53,1.0,0.5,3.0,0.69,155,1977
TJ Cook,3.18,2.3,--,70.0,13-5-0,Open Stance,0.47,0.54,0.0,0.5,1.01,0.0,205,1982
Edward Faaloloto,6.25,2.28,--,70.0,2-5-0,Open Stance,0.32,0.44,0.0,0.25,1.01,0.33,155,1984
Maka Watson,1.6,0.93,--,70.0,4-2-0,Open Stance,0.37,0.22,0.0,1.0,2.0,0.33,155,1984


In [30]:
# after some research, most of them have a height of 5'7". So we will replace these values in
feet = "5' "
inches = '7"'
height = feet + inches
fighters_clean['height'].replace({"--": height}, inplace=True)

In [31]:
fighters_clean['height'] = fighters_clean['height'].apply(convert_to_cms)


In [32]:
# split the record
fighters_clean['record'] = fighters_clean['record'].str.replace(' \(', '-(', regex=True)
fighters_clean[['win', 'lose', 'draw', 'nc']] = fighters_clean['record'].str.split('-', expand=True)

def split_nc(nc):
    return re.findall(r"\d+", nc, re.IGNORECASE)[0]
    
fighters_clean['nc'] = fighters_clean['nc'].apply(lambda x: split_nc(x) if x is not None else 0)
fighters_clean.drop(['record'], axis=1, inplace=True)

fighters_clean.head()

Unnamed: 0_level_0,SApM,SLpM,height,reach,stance,strAcc,strDef,subAvg,tdAcc,tdAvg,tdDef,weight,born_year,win,lose,draw,nc
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Justin Frazier,6.11,1.11,182.88,75.0,Southpaw,0.66,0.04,0.0,0.0,0.0,0.0,265,1989,10,3,0,0
Gleidson Cutis,8.28,2.99,175.26,71.0,Orthodox,0.52,0.59,0.0,0.0,0.0,0.0,155,1989,7,4,0,0
Xavier Foupa-Pokam,2.5,1.47,185.42,75.0,Open Stance,0.43,0.49,0.0,0.0,0.0,0.16,185,1982,32,22,0,0
Mirko Filipovic,1.89,2.11,187.96,73.0,Southpaw,0.5,0.63,0.3,0.4,0.19,0.78,230,1974,35,11,2,1
Jordan Johnson,2.64,3.45,187.96,79.0,Orthodox,0.47,0.53,1.2,0.42,3.25,1.0,205,1988,10,0,0,0


In [33]:
# deal with missing weight hiding as '--'
# first check out how many are left
len(fighters_clean[fighters_clean['weight'] == '--'])

0

In [34]:
# we can jsut drop them
fighters_clean =  fighters_clean[fighters_clean['weight'] != '--']

In [35]:
# finally convert the relevant rows from string to int
def string_2_int(df, columns):
    for column in columns:
        df[column] = df[column].astype(int)
        
        
string_2_int(fighters_clean, ['win', 'lose', 'draw', 'nc', 'weight'])

In [36]:
fighters_clean.dtypes

SApM         float64
SLpM         float64
height       float64
reach        float64
stance        object
strAcc       float64
strDef       float64
subAvg       float64
tdAcc        float64
tdAvg        float64
tdDef        float64
weight         int32
born_year      int64
win            int32
lose           int32
draw           int32
nc             int32
dtype: object

#### df_cards cleaning

In [37]:
p2d(df_cards, ['f1_sig_strike_per', 'f2_sig_strike_per'])
df_cards

Unnamed: 0,card_name,f1,f1_sig_strike_per,f1_sig_strike_total,f1_td_attempt,f1_td_succeed,f2,f2_sig_strike_per,f2_sig_strike_total,f2_td_attempt,f2_td_succeed,fight_date,fights_location,round_format,round_fought,weight_class,winner,winning_method
0,UFC Fight Night: Santos vs. Teixeira,Gustavo Lopez,0.60,43,3,2,Anthony Birchak,0.42,40,0,0,November 07 2020,"Las Vegas, Nevada, USA",3,1,Bantamweight,Gustavo Lopez,SUB
1,UFC Fight Night: Hall vs. Silva,Jason Witt,0.20,5,0,0,Cole Williams,0.64,31,3,2,October 31 2020,"Las Vegas, Nevada, USA",3,2,Welterweight,Jason Witt,SUB
2,UFC Fight Night: Hall vs. Silva,Dustin Jacoby,0.57,38,0,0,Justin Ledet,0.32,31,0,0,October 31 2020,"Las Vegas, Nevada, USA",3,1,Light Heavyweight,Dustin Jacoby,KO/TKO
3,UFC Fight Night: Hall vs. Silva,Miles Johns,0.45,105,7,0,Kevin Natividad,0.24,137,1,0,October 31 2020,"Las Vegas, Nevada, USA",3,3,Bantamweight,Miles Johns,KO/TKO
4,UFC 255: Figueiredo vs. Perez,Sasha Palatnikov,0.48,206,9,1,Louis Cosce,0.52,237,1,1,November 21 2020,"Las Vegas, Nevada, USA",3,3,Welterweight,Sasha Palatnikov,KO/TKO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6134,UFC 265: Lewis vs. Gane,Manel Kape,0.65,29,0,0,Ode Osbourne,0.47,38,0,0,August 07 2021,"Houston, Texas, USA",3,1,Flyweight,Manel Kape,KO/TKO
6135,UFC 265: Lewis vs. Gane,Vince Morales,0.38,196,4,1,Drako Rodriguez,0.36,151,5,3,August 07 2021,"Houston, Texas, USA",3,3,Bantamweight,Vince Morales,U-DEC
6136,UFC 265: Lewis vs. Gane,Song Yadong,0.45,257,3,0,Casey Kenney,0.41,190,5,1,August 07 2021,"Houston, Texas, USA",3,3,Bantamweight,Song Yadong,S-DEC
6137,UFC 265: Lewis vs. Gane,Vicente Luque,0.33,12,1,1,Michael Chiesa,0.58,12,0,0,August 07 2021,"Houston, Texas, USA",3,1,Welterweight,Vicente Luque,SUB


In [38]:
# reuse split birth year function to get year of fight
def get_fight_year(dof):
    return datetime.strptime(dof, '%B %d %Y').year

df_cards['fight_year'] = df_cards['fight_date'].apply(lambda x: get_fight_year(x))
df_cards.drop(['fight_date'], axis=1, inplace=True)

In [39]:
df_cards

Unnamed: 0,card_name,f1,f1_sig_strike_per,f1_sig_strike_total,f1_td_attempt,f1_td_succeed,f2,f2_sig_strike_per,f2_sig_strike_total,f2_td_attempt,f2_td_succeed,fights_location,round_format,round_fought,weight_class,winner,winning_method,fight_year
0,UFC Fight Night: Santos vs. Teixeira,Gustavo Lopez,0.60,43,3,2,Anthony Birchak,0.42,40,0,0,"Las Vegas, Nevada, USA",3,1,Bantamweight,Gustavo Lopez,SUB,2020
1,UFC Fight Night: Hall vs. Silva,Jason Witt,0.20,5,0,0,Cole Williams,0.64,31,3,2,"Las Vegas, Nevada, USA",3,2,Welterweight,Jason Witt,SUB,2020
2,UFC Fight Night: Hall vs. Silva,Dustin Jacoby,0.57,38,0,0,Justin Ledet,0.32,31,0,0,"Las Vegas, Nevada, USA",3,1,Light Heavyweight,Dustin Jacoby,KO/TKO,2020
3,UFC Fight Night: Hall vs. Silva,Miles Johns,0.45,105,7,0,Kevin Natividad,0.24,137,1,0,"Las Vegas, Nevada, USA",3,3,Bantamweight,Miles Johns,KO/TKO,2020
4,UFC 255: Figueiredo vs. Perez,Sasha Palatnikov,0.48,206,9,1,Louis Cosce,0.52,237,1,1,"Las Vegas, Nevada, USA",3,3,Welterweight,Sasha Palatnikov,KO/TKO,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6134,UFC 265: Lewis vs. Gane,Manel Kape,0.65,29,0,0,Ode Osbourne,0.47,38,0,0,"Houston, Texas, USA",3,1,Flyweight,Manel Kape,KO/TKO,2021
6135,UFC 265: Lewis vs. Gane,Vince Morales,0.38,196,4,1,Drako Rodriguez,0.36,151,5,3,"Houston, Texas, USA",3,3,Bantamweight,Vince Morales,U-DEC,2021
6136,UFC 265: Lewis vs. Gane,Song Yadong,0.45,257,3,0,Casey Kenney,0.41,190,5,1,"Houston, Texas, USA",3,3,Bantamweight,Song Yadong,S-DEC,2021
6137,UFC 265: Lewis vs. Gane,Vicente Luque,0.33,12,1,1,Michael Chiesa,0.58,12,0,0,"Houston, Texas, USA",3,1,Welterweight,Vicente Luque,SUB,2021


In [40]:
#randomly swap f1 and f2 for half of the dataset so that 50% of f2 are winners
swap_indices = np.random.choice(len(df_cards), size= len(df_cards) //2, replace = False)
df_cards.iloc[swap_indices, [1, 6]] = df_cards.iloc[swap_indices, [6, 1]].values


In [41]:
# quick check
df_cards["winner"] = df_cards["winner"] == df_cards["f1"]
df_cards["winner"] = df_cards["winner"].astype(int)
df_cards["winner"].value_counts()

1    3070
0    3069
Name: winner, dtype: int64

In [42]:
# we had to change some names earlier due to duplication, we will do the same here
cards_clean = df_cards.copy()
for col in ['f1', 'f2']:
    cards_clean.loc[(cards_clean[col] == 'Michael McDonald') & 
                    (cards_clean['weight_class'] == 'Bantamweight'), col] = "Michael McDonald 135"
    
    cards_clean.loc[(cards_clean[col] == 'Tony Johnson') & 
                    (cards_clean['weight_class'] == 'Heavyweight'), col] = "Tony Johnson 265"
    
    cards_clean.loc[(cards_clean[col] == 'Joey Gomez') & 
                    (cards_clean['weight_class'] == 'Welterweight'), col] = "Joey Gomez 155"
    
    cards_clean.loc[(cards_clean[col] == 'Bruno Silva') & 
                    (cards_clean['weight_class'] == 'Light Heavyweight'), col] = "Bruno Silva 185"
    
  

In [43]:
cards_clean

Unnamed: 0,card_name,f1,f1_sig_strike_per,f1_sig_strike_total,f1_td_attempt,f1_td_succeed,f2,f2_sig_strike_per,f2_sig_strike_total,f2_td_attempt,f2_td_succeed,fights_location,round_format,round_fought,weight_class,winner,winning_method,fight_year
0,UFC Fight Night: Santos vs. Teixeira,Gustavo Lopez,0.60,43,3,2,Anthony Birchak,0.42,40,0,0,"Las Vegas, Nevada, USA",3,1,Bantamweight,1,SUB,2020
1,UFC Fight Night: Hall vs. Silva,Jason Witt,0.20,5,0,0,Cole Williams,0.64,31,3,2,"Las Vegas, Nevada, USA",3,2,Welterweight,1,SUB,2020
2,UFC Fight Night: Hall vs. Silva,Justin Ledet,0.57,38,0,0,Dustin Jacoby,0.32,31,0,0,"Las Vegas, Nevada, USA",3,1,Light Heavyweight,0,KO/TKO,2020
3,UFC Fight Night: Hall vs. Silva,Miles Johns,0.45,105,7,0,Kevin Natividad,0.24,137,1,0,"Las Vegas, Nevada, USA",3,3,Bantamweight,1,KO/TKO,2020
4,UFC 255: Figueiredo vs. Perez,Louis Cosce,0.48,206,9,1,Sasha Palatnikov,0.52,237,1,1,"Las Vegas, Nevada, USA",3,3,Welterweight,0,KO/TKO,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6134,UFC 265: Lewis vs. Gane,Ode Osbourne,0.65,29,0,0,Manel Kape,0.47,38,0,0,"Houston, Texas, USA",3,1,Flyweight,0,KO/TKO,2021
6135,UFC 265: Lewis vs. Gane,Vince Morales,0.38,196,4,1,Drako Rodriguez,0.36,151,5,3,"Houston, Texas, USA",3,3,Bantamweight,1,U-DEC,2021
6136,UFC 265: Lewis vs. Gane,Casey Kenney,0.45,257,3,0,Song Yadong,0.41,190,5,1,"Houston, Texas, USA",3,3,Bantamweight,0,S-DEC,2021
6137,UFC 265: Lewis vs. Gane,Michael Chiesa,0.33,12,1,1,Vicente Luque,0.58,12,0,0,"Houston, Texas, USA",3,1,Welterweight,0,SUB,2021


In [44]:
all_fighters = fighters_clean.index.tolist()
all_fighters[:10]

['Justin Frazier',
 'Gleidson Cutis',
 'Xavier Foupa-Pokam',
 'Mirko Filipovic',
 'Jordan Johnson',
 'Martin Kampmann',
 'Darren Elkins',
 'Austen Lane',
 'Rachael Ostovich',
 'Travis Lutter']

In [45]:
# drop the fights that don't have the figther in the fighters_clean df
cards_clean = cards_clean.loc[(cards_clean["f1"].isin(all_fighters)) &
                              (cards_clean["f2"].isin(all_fighters))]
cards_clean.reset_index(inplace=True, drop=True)

In [46]:
cards_clean.head()

Unnamed: 0,card_name,f1,f1_sig_strike_per,f1_sig_strike_total,f1_td_attempt,f1_td_succeed,f2,f2_sig_strike_per,f2_sig_strike_total,f2_td_attempt,f2_td_succeed,fights_location,round_format,round_fought,weight_class,winner,winning_method,fight_year
0,UFC Fight Night: Santos vs. Teixeira,Gustavo Lopez,0.6,43,3,2,Anthony Birchak,0.42,40,0,0,"Las Vegas, Nevada, USA",3,1,Bantamweight,1,SUB,2020
1,UFC Fight Night: Hall vs. Silva,Jason Witt,0.2,5,0,0,Cole Williams,0.64,31,3,2,"Las Vegas, Nevada, USA",3,2,Welterweight,1,SUB,2020
2,UFC Fight Night: Hall vs. Silva,Justin Ledet,0.57,38,0,0,Dustin Jacoby,0.32,31,0,0,"Las Vegas, Nevada, USA",3,1,Light Heavyweight,0,KO/TKO,2020
3,UFC Fight Night: Hall vs. Silva,Miles Johns,0.45,105,7,0,Kevin Natividad,0.24,137,1,0,"Las Vegas, Nevada, USA",3,3,Bantamweight,1,KO/TKO,2020
4,UFC 255: Figueiredo vs. Perez,Louis Cosce,0.48,206,9,1,Sasha Palatnikov,0.52,237,1,1,"Las Vegas, Nevada, USA",3,3,Welterweight,0,KO/TKO,2020


In [47]:
print("we had {} cards initially. After clean up: {} cards".format(len(df_cards), len(cards_clean)))

we had 6139 cards initially. After clean up: 5948 cards


In [48]:
# get data of fighter 1 and fighter 2
f1_data = fighters_clean.loc[cards_clean['f1']]
f1_data = f1_data.add_suffix('_f1')
f2_data = fighters_clean.loc[cards_clean['f2']]
f2_data = f2_data.add_suffix('_f2')

In [49]:
f1_data.head()

Unnamed: 0_level_0,SApM_f1,SLpM_f1,height_f1,reach_f1,stance_f1,strAcc_f1,strDef_f1,subAvg_f1,tdAcc_f1,tdAvg_f1,tdDef_f1,weight_f1,born_year_f1,win_f1,lose_f1,draw_f1,nc_f1
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Gustavo Lopez,4.65,2.24,165.1,67.0,Orthodox,0.41,0.51,0.5,0.5,1.07,0.27,135,1989,12,6,0,0
Jason Witt,3.1,2.63,177.8,70.0,Orthodox,0.49,0.5,1.3,0.45,6.46,1.0,170,1986,19,7,0,0
Justin Ledet,4.82,3.35,193.04,80.0,Orthodox,0.39,0.49,0.2,0.5,0.23,0.42,205,1988,9,4,0,1
Miles Johns,2.5,4.03,170.18,66.0,Orthodox,0.53,0.71,0.0,0.3,0.99,0.92,135,1994,12,1,0,0
Louis Cosce,9.3,8.22,175.26,71.0,Orthodox,0.48,0.48,0.0,0.11,1.07,0.0,170,1995,7,1,0,0


In [50]:
f2_data.head()

Unnamed: 0_level_0,SApM_f2,SLpM_f2,height_f2,reach_f2,stance_f2,strAcc_f2,strDef_f2,subAvg_f2,tdAcc_f2,tdAvg_f2,tdDef_f2,weight_f2,born_year_f2,win_f2,lose_f2,draw_f2,nc_f2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Anthony Birchak,3.67,3.93,172.72,69.0,Orthodox,0.36,0.56,0.5,0.14,0.48,0.61,135,1986,16,8,0,0
Cole Williams,5.34,0.31,182.88,73.0,Orthodox,0.27,0.23,0.0,0.0,0.0,0.25,170,1983,11,3,0,0
Dustin Jacoby,3.36,4.32,190.5,76.0,Orthodox,0.49,0.58,0.0,0.4,0.35,0.58,205,1988,15,5,1,0
Kevin Natividad,4.02,2.63,167.64,70.0,Orthodox,0.24,0.54,0.0,0.0,0.0,1.0,145,1993,9,3,0,0
Sasha Palatnikov,5.44,6.69,185.42,72.0,Switch,0.53,0.52,0.0,1.0,0.72,0.7,170,1989,6,4,0,0


In [51]:
# join the 2 dataframe
f1_data.reset_index(inplace=True, drop=True)
f2_data.reset_index(inplace=True, drop=True)
final_df = pd.concat([cards_clean, f1_data, f2_data], axis=1, sort=False)

In [52]:
final_df['f1_age_when_fight'] = final_df['fight_year'] - final_df['born_year_f1']
final_df['f2_age_when_fight'] = final_df['fight_year'] - final_df['born_year_f2']


In [53]:
final_df.head()

Unnamed: 0,card_name,f1,f1_sig_strike_per,f1_sig_strike_total,f1_td_attempt,f1_td_succeed,f2,f2_sig_strike_per,f2_sig_strike_total,f2_td_attempt,...,tdAvg_f2,tdDef_f2,weight_f2,born_year_f2,win_f2,lose_f2,draw_f2,nc_f2,f1_age_when_fight,f2_age_when_fight
0,UFC Fight Night: Santos vs. Teixeira,Gustavo Lopez,0.6,43,3,2,Anthony Birchak,0.42,40,0,...,0.48,0.61,135,1986,16,8,0,0,31,34
1,UFC Fight Night: Hall vs. Silva,Jason Witt,0.2,5,0,0,Cole Williams,0.64,31,3,...,0.0,0.25,170,1983,11,3,0,0,34,37
2,UFC Fight Night: Hall vs. Silva,Justin Ledet,0.57,38,0,0,Dustin Jacoby,0.32,31,0,...,0.35,0.58,205,1988,15,5,1,0,32,32
3,UFC Fight Night: Hall vs. Silva,Miles Johns,0.45,105,7,0,Kevin Natividad,0.24,137,1,...,0.0,1.0,145,1993,9,3,0,0,26,27
4,UFC 255: Figueiredo vs. Perez,Louis Cosce,0.48,206,9,1,Sasha Palatnikov,0.52,237,1,...,0.72,0.7,170,1989,6,4,0,0,25,31


In [54]:
# last checks
print(final_df.isna().sum())
print(final_df.dtypes)

card_name              0
f1                     0
f1_sig_strike_per      0
f1_sig_strike_total    0
f1_td_attempt          0
f1_td_succeed          0
f2                     0
f2_sig_strike_per      0
f2_sig_strike_total    0
f2_td_attempt          0
f2_td_succeed          0
fights_location        0
round_format           0
round_fought           0
weight_class           0
winner                 0
winning_method         0
fight_year             0
SApM_f1                0
SLpM_f1                0
height_f1              0
reach_f1               0
stance_f1              0
strAcc_f1              0
strDef_f1              0
subAvg_f1              0
tdAcc_f1               0
tdAvg_f1               0
tdDef_f1               0
weight_f1              0
born_year_f1           0
win_f1                 0
lose_f1                0
draw_f1                0
nc_f1                  0
SApM_f2                0
SLpM_f2                0
height_f2              0
reach_f2               0
stance_f2              0


In [55]:
final_df.to_csv('cleaned_dataset.csv', index=False)