<a href="https://colab.research.google.com/github/joshhawkins119/UFC_Fight_Outcome_Predictor/blob/main/UFC_Data_Clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [68]:
from google.colab import drive
drive.mount('/content/gdrive') 


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


IMPORT LIBRARIES

In [69]:
# Data Prep Imports 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import sys
import datetime 
import seaborn as sns
import datetime
from pytz import utc, timezone
import time
import re
from functools import reduce
import holidays
from dateutil.easter import *


plt.style.use('fivethirtyeight')
%matplotlib inline

IMPORT FIGHTER DATA

In [70]:
#Fighter Data

fighter_data = pd.read_csv('/content/gdrive/MyDrive/Supervised ML/UFC Fight Predictor/ufc_fighter_data.csv', index_col=0, low_memory=False)
fighter_data.isnull().sum()

DOB           0
Draws         0
First         0
Height        0
Last          0
Losses        0
Reach         0
SApM          0
SLpM          0
STANCE      865
Str.Acc.      0
Str.Def       0
Sub.Avg.      0
TDAcc.        0
TDAvg.        0
TDDef.        0
Weight        0
Wins          0
dtype: int64

In [71]:
fighter_data[['Height', 'Weight', 'Reach']]

Unnamed: 0,Height,Weight,Reach
0,--,155lbs.,--
1,"5'11""",155lbs.,--
2,"6'0""",265lbs.,--
3,"6'3""",235lbs.,"76"""
4,"5'6""",145lbs.,--
...,...,...,...
3661,--,170lbs.,--
3662,--,145lbs.,--
3663,"5'9""",185lbs.,--
3664,"5'7""",155lbs.,"70"""


DATA CLEANING AND ENRICHMENT

In [72]:
#fighter full name
fighter_data['Fighter_Name'] = fighter_data['First'] + ' ' + fighter_data['Last']   
fighter_data = fighter_data.drop(['First', 'Last'], axis=1)
            
# Number of Fights
fighter_data['Draws'] = fighter_data['Draws'].str.replace(r" \(.*\)","") # Remove NC by removing all '()' data
for col in ['Wins', 'Losses', 'Draws']:   # data type to int
    fighter_data[col] = fighter_data[col].astype(int)
fighter_data['Num_Fights'] = fighter_data['Wins']+fighter_data['Losses']+fighter_data['Draws']


#Drop rows where Total Fights are Zero
fighter_data = fighter_data[fighter_data.Num_Fights != 0]


# Must have at least 5 fights
fighter_data = fighter_data[fighter_data.Num_Fights > 4]


#Transform height into decimal 
fighter_data['Height'] = fighter_data['Height'].str.replace('"',"") 
fighter_data['Height'] = fighter_data['Height'].str.replace('--', '0')
fighter_data['Height_Feet'] = fighter_data['Height'].str.split("'").str[0].fillna(0).astype(int)
fighter_data['Height_Inches'] = fighter_data['Height'].str.split("'").str[1].fillna(0).astype(int)
fighter_data['Height_Inches'] = round(fighter_data['Height_Inches']/12, 1)

fighter_data['Height'] = fighter_data['Height_Feet'] + fighter_data['Height_Inches']
fighter_data = fighter_data.drop(['Height_Feet', 'Height_Inches'], axis=1)
fighter_data['Height'] = fighter_data['Height'].replace(0, np.nan)
fighter_data = fighter_data.dropna(subset=['Height'])


#Transform Weight into decimal
fighter_data['Weight'] = fighter_data['Weight'].str.replace('lbs.', '')
fighter_data['Weight'] = fighter_data['Weight'].str.replace('--', '0')
fighter_data['Weight'] = fighter_data['Weight'].astype(float)

#Win-to-Loss-Ratio (success percentage)
fighter_data['Success_Perc'] = round(fighter_data['Wins']/fighter_data['Num_Fights'], 2)


#Change Percent to Decimal
for col in ['Str.Acc.', 'Str.Def', 'TDAcc.', 'TDDef.']:   # data type to int
    fighter_data[col] = fighter_data[col].str.replace('%', '')
    fighter_data[col] = fighter_data[col].astype(int)
    fighter_data[col] = round(fighter_data[col]/100, 2)
    
    
#Reach
fighter_data.Reach = fighter_data.Reach.str.replace('"', '')
fighter_data['Reach'] = fighter_data['Reach'].str.replace('--', '0')
fighter_data['Reach'] = fighter_data['Reach'].astype(int)
fighter_data['Reach'] = fighter_data['Reach'].replace(0, np.nan)

#Stance
fighter_data['STANCE'] = fighter_data['STANCE'].fillna('None')
fighter_data['STANCE'] = fighter_data['STANCE'].replace('None', 'Orthodox')

#Age 
#Replace month acronym 
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Jan', '1 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Feb', '2 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Mar', '3 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Apr', '4 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('May','5 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Jun','6 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Jul','7 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Aug','8 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Sep','9 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Oct','10 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Nov','11 ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace('Dec','12 ')
#Convert string to date
fighter_data['DOB'] = fighter_data['DOB'].str.replace(',', ' ')
fighter_data['DOB'] = fighter_data['DOB'].str.replace(' ', '/')
fighter_data['DOB'] = fighter_data['DOB'].replace('--', np.nan)
fighter_data = fighter_data.dropna(subset=['DOB'])

#Subtract birth date from current date
fighter_data['DOB'] = pd.to_datetime(fighter_data['DOB'], format = '%m/%d/%Y')
fighter_data['AGE'] = datetime.datetime.now() - fighter_data['DOB']

#Convert to string to only keep days, then convert back to int
fighter_data['AGE'] = fighter_data['AGE'].astype(str)
fighter_data['AGE'] = fighter_data['AGE'].str.split(" days").str[0].fillna(0).astype(int)

# days / 365 = years
fighter_data['AGE'] = round(fighter_data['AGE']/365, 1)


fighter_data.sort_values(by='Wins', ascending=False)

Unnamed: 0,DOB,Draws,Height,Losses,Reach,SApM,SLpM,STANCE,Str.Acc.,Str.Def,Sub.Avg.,TDAcc.,TDAvg.,TDDef.,Weight,Wins,Fighter_Name,Num_Fights,Success_Perc,AGE
1063,1977-05-29,10,6.0,53,,0.00,0.00,Orthodox,0.00,0.00,0.0,0.00,0.00,0.00,240.0,253,Travis Fulton,316,0.80,44.2
3012,1958-06-08,1,6.2,19,,0.00,0.00,Southpaw,0.00,0.00,0.0,0.00,0.00,0.00,250.0,101,Dan Severn,121,0.83,63.2
1433,1975-08-25,5,6.1,22,74.0,1.99,1.19,Orthodox,0.39,0.55,1.1,0.38,1.83,0.29,185.0,91,Jeremy Horn,118,0.77,45.9
3571,1978-03-15,0,6.2,21,,4.03,0.48,Orthodox,0.30,0.41,1.2,0.57,4.84,0.44,205.0,75,Travis Wiuff,96,0.78,43.4
2171,1976-01-12,8,5.8,42,,1.66,0.88,Orthodox,0.40,0.49,1.9,0.41,1.82,0.05,181.0,60,Ikuhisa Minowa,110,0.55,45.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3270,1968-09-19,0,6.4,4,,5.69,3.40,Orthodox,0.54,0.29,0.0,0.33,0.67,0.55,280.0,1,Yoshihiro Takayama,5,0.20,52.9
44,1970-06-10,1,5.2,3,,0.00,0.00,Orthodox,0.00,0.00,0.0,0.00,0.00,0.00,145.0,1,Alfonso Alcarez,5,0.20,51.2
1500,1968-08-05,1,6.0,5,,2.99,1.34,Orthodox,0.30,0.59,0.0,0.18,2.74,0.00,185.0,1,Tokimitsu Ishizawa,7,0.14,53.0
116,1967-03-28,1,5.9,5,,3.13,0.00,Orthodox,0.00,0.25,0.0,0.00,0.00,0.00,205.0,0,Yoji Anjo,6,0.00,54.4


IMPUTE MISSING VALUES FOR HEIGHT AND REACH

In [73]:
# impute values with mean where null/0
fighter_data["Height"] = fighter_data.groupby(["Weight"])['Height'].transform(lambda x: x.fillna(x.mean())).round(2)
fighter_data["Reach"] = fighter_data.groupby(['Height', "Weight"])["Reach"].transform(lambda x: x.fillna(x.mean())).round(2)

#Where reach is still null, replace value with height
fighter_data['Reach'] = fighter_data['Reach'].fillna(0)
fighter_data['Reach'] = fighter_data.apply(lambda x: x['Height'] if x['Reach']==0 else x['Reach'], axis=1)

fighter_data

Unnamed: 0,DOB,Draws,Height,Losses,Reach,SApM,SLpM,STANCE,Str.Acc.,Str.Def,Sub.Avg.,TDAcc.,TDAvg.,TDDef.,Weight,Wins,Fighter_Name,Num_Fights,Success_Perc,AGE
1,1983-07-03,0,5.9,6,72.43,4.41,3.29,Orthodox,0.38,0.57,0.0,0.00,0.00,0.77,155.0,4,Danny Abbadi,10,0.40,38.1
3,1981-09-02,0,6.2,5,76.00,2.45,2.45,Orthodox,0.44,0.58,0.2,0.24,1.23,0.47,235.0,20,Shamil Abdurakhimov,25,0.80,39.9
5,1991-11-27,0,5.9,2,71.00,4.49,3.80,Orthodox,0.33,0.56,0.0,0.50,0.33,0.00,170.0,6,Daichi Abe,8,0.75,29.7
6,1978-06-30,0,5.9,4,74.76,3.15,2.80,Southpaw,0.55,0.48,1.3,0.57,3.47,0.50,185.0,10,Papy Abedi,14,0.71,43.1
7,1984-04-27,0,5.9,1,74.76,3.98,3.79,Orthodox,0.31,0.68,0.7,0.42,2.13,1.00,185.0,5,Ricardo Abreu,6,0.83,37.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3655,1997-03-21,0,6.1,3,75.00,1.67,2.13,Orthodox,0.39,0.65,0.0,0.11,0.33,0.68,155.0,12,Fares Ziam,15,0.80,24.4
3657,1977-03-01,2,6.2,10,76.43,1.60,1.47,Orthodox,0.35,0.44,0.5,0.25,0.50,0.74,205.0,21,James Zikic,33,0.64,44.4
3659,1982-07-01,0,5.5,4,68.00,1.63,2.57,Southpaw,0.61,0.47,0.8,0.65,2.77,0.42,145.0,10,Cat Zingano,14,0.71,39.1
3664,1992-04-04,0,5.6,1,70.00,1.80,3.93,Orthodox,0.52,0.61,1.0,0.00,0.00,0.57,155.0,13,Allan Zuniga,14,0.93,29.3


In [74]:
# ONE HOT FOR STANCE AND 

def make_hots(df, col):
  hots = pd.get_dummies(df[col])
  df = pd.concat([df, hots], axis=1)
  return df

fighter_data = make_hots(fighter_data, 'STANCE')


In [75]:
fighter_list = fighter_data.Fighter_Name.unique().tolist()

IMPORT FIGHT DATA

In [76]:
fight_data = pd.read_csv('/content/gdrive/MyDrive/Supervised ML/UFC Fight Predictor/ufc_fight_data.csv', index_col=0)

REFORMAT DATES AND ONLY KEEP RECORDS WITH DEFINITE WINS (NO NC OR DRAWS)

In [77]:
#Name cleaning function

def remove_middle_name(x):
  count = 0
  for i in x:
    if(i.isspace()):
        count=count+1
  if count > 1:
    return x.split(' ')[0]+' '+x.split(' ')[-1]
  else:
    return x

# test = remove_middle_name(name)
# test

In [78]:
#Only fights with fighters from fighter data
# (This version throws warning) fight_data = fight_data[(fight_data.Losing_Fighter_Name.isin(fighter_list))|(fight_data.Winning_Fighter_Name.isin(fighter_list))]

#Re-Format names
fight_data['Winning_Fighter_Name'] = fight_data['Winning_Fighter_Name'].apply(remove_middle_name)
fight_data['Losing_Fighter_Name'] = fight_data['Losing_Fighter_Name'].apply(remove_middle_name)


win_data = fight_data[fight_data.Losing_Fighter_Name.isin(fighter_list)]
lose_data = fight_data[fight_data.Winning_Fighter_Name.isin(fighter_list)]
fight_data = pd.concat([win_data, lose_data])
fight_data = fight_data.drop_duplicates()

#date formating
fight_data['Date'] = fight_data['Date'].str.replace(',', '')
fight_data['Date'] = pd.to_datetime(fight_data['Date'], format="%B %d %Y")
fight_data = fight_data.rename(columns={'Method_Detail':'Finish_Detail'})
fight_data['Finish_Detail'] = fight_data['Finish_Detail'].fillna('None')

# Get rid of Draws and No Contests
fight_data = fight_data[fight_data.Winning_Fighter_Name != 'nc']
fight_data = fight_data[fight_data.Winning_Fighter_Name != 'draw']

fight_data = fight_data.sort_values(by=['Date'], ascending=False)

fight_data

Unnamed: 0,Attendance,Date,Location,Losing_Fighter_Name,Losing_Pass,Losing_STR,Losing_Sub,Losing_TD,Method,Finish_Detail,Round,Time,Weight_Class,Winning_Fighter_Name,Winning_Pass,Winning_STR,Winning_Sub,Winning_TD
0,USA,2021-07-24,"Vegas,",Cory Sandhagen,1,0,0,128,S-DEC,,5,5:00,Bantamweight,TJ Dillashaw,0,0,2,110
2,USA,2021-07-24,"Vegas,",Darrick Minner,3,0,2,19,KO/TKO,Punches,2,3:48,Featherweight,Darren Elkins,0,0,1,20
3,USA,2021-07-24,"Vegas,",Miranda Maverick,0,0,1,47,S-DEC,,3,5:00,Women's Flyweight,Maycee Barber,0,0,1,36
4,USA,2021-07-24,"Vegas,",Randy Costa,0,0,0,69,KO/TKO,Punches,2,2:11,Bantamweight,Adrian Yanez,0,1,0,64
5,USA,2021-07-24,"Vegas,",Punahele Soriano,0,0,0,66,U-DEC,,3,5:00,Middleweight,Brendan Allen,0,0,0,94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6196,USA,1994-03-11,"Denver,",Jason DeLucia,0,0,1,0,SUB,Armbar,1,1:07,Open Weight,Royce Gracie,1,0,0,0
6195,USA,1994-03-11,"Denver,",Johnny Rhodes,0,0,0,4,SUB,Guillotine Choke,1,1:07,Open Weight,Patrick Smith,1,0,0,5
6194,USA,1994-03-11,"Denver,",Remco Pardoel,0,0,0,0,SUB,Gi Choke,1,1:31,Open Weight,Royce Gracie,1,0,1,0
6193,USA,1994-03-11,"Denver,",Patrick Smith,0,0,0,1,KO/TKO,Punches,1,1:17,Open Weight,Royce Gracie,0,0,1,4


In [79]:
fight_data.loc[fight_data.Losing_Fighter_Name == 'Chan Jung']

Unnamed: 0,Attendance,Date,Location,Losing_Fighter_Name,Losing_Pass,Losing_STR,Losing_Sub,Losing_TD,Method,Finish_Detail,Round,Time,Weight_Class,Winning_Fighter_Name,Winning_Pass,Winning_STR,Winning_Sub,Winning_TD
385,Emirates,2020-10-17,United,Chan Jung,0,0,0,62,U-DEC,,5,5:00,Featherweight,Brian Ortega,0,2,3,127
1350,USA,2018-11-10,"Denver,",Chan Jung,0,0,0,126,KO/TKO,Elbow,5,4:59,Featherweight,Yair Rodriguez,0,1,0,119
3838,Brazil,2013-08-03,de,Chan Jung,0,0,0,17,KO/TKO,Punches,4,2:00,Featherweight,Jose Aldo,0,0,5,32


In [80]:
fight_data['Weight_Class'].value_counts()

Lightweight              1098
Welterweight             1088
Middleweight              830
Heavyweight               583
Featherweight             562
Light Heavyweight         561
Bantamweight              477
Flyweight                 232
Women's Strawweight       200
Women's Bantamweight      152
Women's Flyweight         125
Open Weight                70
Catch Weight               38
Women's Featherweight      17
Super Heavyweight           1
Name: Weight_Class, dtype: int64

CREATE WIN AND LOSS DATAFRAMES AND CONCATENATE

In [81]:
# 1. Make two copies of the dataframe - one with winner as fighter, one with loser as fighter - THEN Concat
# 2. Create binary win column

win_temp_df = fight_data.copy()
win_temp_df['Winner'] = win_temp_df['Winning_Fighter_Name']
win_temp_df = win_temp_df.rename(columns={'Winning_Fighter_Name':'Fighter', 
                                          'Losing_Fighter_Name':'Opponent',
                                          'Winning_Pass':'Fighter_Pass',
                                          'Winning_STR':'Fighter_STR',
                                          'Winning_Sub':'Fighter_Sub',
                                          'Winning_TD':'Fighter_TD',
                                          'Losing_Pass':'Opponent_Pass',
                                          'Losing_STR':'Opponent_STR',
                                          'Losing_Sub':'Opponent_Sub',
                                          'Losing_TD':'Opponent_TD',
                                         })

loss_temp_df = fight_data.copy()
loss_temp_df['Winner'] = loss_temp_df['Winning_Fighter_Name']
loss_temp_df = loss_temp_df.rename(columns={'Losing_Fighter_Name':'Fighter', 
                                            'Winning_Fighter_Name':'Opponent',
                                            'Winning_Pass':'Opponent_Pass',
                                            'Winning_STR':'Opponent_STR',
                                            'Winning_Sub':'Opponent_Sub',
                                            'Winning_TD':'Opponent_TD',
                                            'Losing_Pass':'Fighter_Pass',
                                            'Losing_STR':'Fighter_STR',
                                            'Losing_Sub':'Fighter_Sub',
                                            'Losing_TD':'Fighter_TD',      
                                           })

fight_training_df = pd.concat([win_temp_df, loss_temp_df], sort=True)
fight_training_df = fight_training_df.drop_duplicates()

#IF Fighter == Winner, then win = 1 else 0
fight_training_df['Win'] = np.where(fight_training_df['Fighter'] == fight_training_df['Winner'], 1, 0)

fight_training_df = fight_training_df.sort_values(by=['Date'], ascending=False)
fight_training_df.loc[fight_training_df.Fighter.str.contains('Isl')].sort_values(by=['Date'], ascending=False)


Unnamed: 0,Attendance,Date,Fighter,Fighter_Pass,Fighter_STR,Fighter_Sub,Fighter_TD,Finish_Detail,Location,Method,Opponent,Opponent_Pass,Opponent_STR,Opponent_Sub,Opponent_TD,Round,Time,Weight_Class,Winner,Win
11,USA,2021-07-17,Islam Makhachev,1,0,3,61,Rear Naked Choke,"Vegas,",SUB,Thiago Moises,1,0,1,13,4,2:38,Lightweight,Islam Makhachev,1
201,USA,2021-03-06,Islam Makhachev,2,0,3,15,Arm Triangle,"Vegas,",SUB,Drew Dober,0,0,0,10,3,1:37,Lightweight,Islam Makhachev,1
910,Emirates,2019-09-07,Islam Makhachev,0,1,0,43,,United,U-DEC,Davi Ramos,0,0,0,7,3,5:00,Lightweight,Islam Makhachev,1
1116,Russia,2019-04-20,Islam Makhachev,0,0,4,14,,Saint,U-DEC,Arman Tsarukyan,0,0,1,13,3,5:00,Lightweight,Islam Makhachev,1
1469,Canada,2018-07-28,Islam Makhachev,1,0,1,7,Armbar,"Calgary,",SUB,Kajan Johnson,0,0,0,7,1,4:43,Lightweight,Islam Makhachev,1
1725,USA,2018-01-20,Islam Makhachev,0,1,0,3,Punch,"Boston,",KO/TKO,Gleison Tibau,0,0,0,2,1,0:57,Lightweight,Islam Makhachev,1
2155,USA,2017-02-11,Islam Makhachev,0,0,5,43,,New,U-DEC,Nik Lentz,1,0,0,13,3,5:00,Lightweight,Islam Makhachev,1
2355,USA,2016-09-17,Islam Makhachev,2,0,4,12,,"Hidalgo,",U-DEC,Chris Wade,2,0,0,5,3,5:00,Lightweight,Islam Makhachev,1
2801,USA,2015-10-03,Islam Makhachev,0,0,0,4,Punch,"Houston,",KO/TKO,Adriano Martins,0,1,0,3,1,1:46,Lightweight,Adriano Martins,0
2987,USA,2015-05-23,Islam Makhachev,1,0,3,32,Rear Naked Choke,"Vegas,",SUB,Leo Kuntz,0,0,0,7,2,2:38,Lightweight,Islam Makhachev,1


CREATE WIN-RELATED HISTORICAL ROLLING STAT COLUMNS AND ONE HOTS!!!

In [82]:
fight_training_df = fight_training_df.sort_values(by=['Date'], ascending=True)


# Win Stats
fight_training_df['Won_Last_Fight'] = fight_training_df.groupby(['Fighter'])['Win'].shift(1, axis=0).astype(float)
fight_training_df['Won_Fight_Before_Last'] = fight_training_df.groupby(['Fighter'])['Win'].shift(2, axis=0).astype(float)
fight_training_df['no_ufc_fights'] = fight_training_df.groupby(['Fighter'])['Date'].transform(lambda x: x.rolling(len(fight_training_df), 0).count()).astype(int)
fight_training_df['win_total'] = fight_training_df.groupby(['Fighter'])['Win'].transform(lambda x: x.rolling(len(fight_training_df), 0).sum()).astype(int)
fight_training_df['win_percentage'] = round(fight_training_df['win_total']/fight_training_df['no_ufc_fights'], 2)
fight_training_df['wins_last_3'] = fight_training_df.groupby(['Fighter'])['Win'].transform(lambda x: x.rolling(3, 0).sum()).astype(int) ##
fight_training_df['wins_last_3_percentage'] = round(fight_training_df['wins_last_3']/3, 2) ##
fight_training_df['Won_Last_Two_Fights'] = fight_training_df.groupby(['Fighter'])['Win'].transform(lambda x: x.rolling(2, 0).sum()).astype(int)
fight_training_df['Winning_Streak'] = fight_training_df.groupby(['Fighter'])['Win'].transform(lambda x: x.rolling(3, 0).sum()).astype(int)

# Months Since Last Fight
fight_training_df['Date_of_Last_Fight'] = fight_training_df.groupby(['Fighter'])['Date'].shift(1, axis=0)
fight_training_df['Months_Since_Last_Fight'] = fight_training_df['Date'] - fight_training_df['Date_of_Last_Fight']
fight_training_df['Months_Since_Last_Fight'] = fight_training_df['Months_Since_Last_Fight'].astype(str)
fight_training_df['Months_Since_Last_Fight'] = fight_training_df['Months_Since_Last_Fight'].str.replace('NaT', '0')
fight_training_df['Months_Since_Last_Fight'] = fight_training_df['Months_Since_Last_Fight'].str.split(" ").str[0].astype(int)
fight_training_df['Months_Since_Last_Fight'] = round(fight_training_df['Months_Since_Last_Fight']/30.5, 2)


def to_binary_last_two(df):
    if df['Won_Last_Two_Fights'] == 2:
        return 1
    else:
        return 0
    
def to_binary_streak(df):
    if df['Winning_Streak'] == 3:
        return 1
    else:
        return 0
    
fight_training_df['Won_Last_Two_Fights'] = fight_training_df.apply(to_binary_last_two, axis=1)
fight_training_df['Winning_Streak'] = fight_training_df.apply(to_binary_streak, axis=1)



# STR, Pass, Sub, TD
def make_rolling_perc(col, df, goup_col, denom_col):
    #df[col] = df[col].str.replace('--', '0')
    df[col] = df[col].astype(int)
    df[col+'_total'] = df.groupby(goup_col)[col].transform(lambda x: x.rolling(len(df), 0).sum()).astype(int)
    df[col+'_perc'] = round(df[col+'_total']/df[denom_col], 2)
    return df 

def make_rolling_avg(col, df, goup_col):
    df[col] = df[col].str.replace('--', '0')
    df[col] = df[col].astype(int)
    df[col+'_avg'] = df.groupby(goup_col)[col].transform(lambda x: x.rolling(len(df), 0).mean()).astype(float).round(2)
    return df 
   
for column in ['Fighter_Pass', 'Fighter_STR', 'Fighter_Sub', 'Fighter_TD', 'Round']: #'Time'
    fight_training_df = make_rolling_avg(column, fight_training_df, 'Fighter')
    
    
# Methods
method_hots = pd.get_dummies(fight_training_df.Method)
fight_training_df = pd.concat([fight_training_df, method_hots], axis=1)
for col in method_hots.columns:
    fight_training_df = make_rolling_perc(col, fight_training_df, 'Fighter', 'no_ufc_fights')


# Male or Female Fight one hots
def is_female(df):
    if 'Women' in df['Weight_Class']:
        return 1
    else:
        return 0
    
def is_male(df):
    if 'Women' in df['Weight_Class']:
        return 0
    else:
        return 1
    
fight_training_df['Male'] = fight_training_df.apply(is_male, axis=1)
fight_training_df['Female'] = fight_training_df.apply(is_female, axis=1)


    
# Time
fight_training_df['Minutes'] = fight_training_df['Time'].str.split(":").str[0].fillna(0).astype(int)
fight_training_df['Seconds'] = fight_training_df['Time'].str.split(":").str[1].fillna(0).astype(int)

fight_training_df['Fight_Time_in_Sec'] = fight_training_df['Seconds'] + (fight_training_df['Minutes']*60)
fight_training_df['Fight_Time_in_Sec_avg'] = fight_training_df.groupby('Fighter')['Fight_Time_in_Sec'].transform(lambda x: x.rolling(len(fight_training_df), 0).mean()).astype(float).round(2)
##
fight_training_df['Fight_Time_in_Min'] = round(fight_training_df['Fight_Time_in_Sec']/60, 2)
fight_training_df['Fight_Time_in_Min_avg'] = fight_training_df.groupby('Fighter')['Fight_Time_in_Min'].transform(lambda x: x.rolling(len(fight_training_df), 0).mean()).astype(float).round(2)
fight_training_df['Min_in_Octogon_Total'] = fight_training_df.groupby('Fighter')['Fight_Time_in_Min'].transform(lambda x: x.rolling(len(fight_training_df), 0).sum()).astype(float).round(2)
##

# Drops
drop_list = ['Fighter_Pass', 'Fighter_STR', 'Fighter_Sub', 'Fighter_TD', 'Finish_Detail',
            'Opponent_Pass', 'Opponent_STR', 'Opponent_Sub', 'Opponent_TD', 'Minutes', 'Seconds',
             'Time', 'Location', 'Attendance', 'Method']
fight_training_df = fight_training_df.drop(drop_list, axis=1)

# Drop one hot errors
one_hot_errors = ['0', '1', '2', '3']
for i in one_hot_errors:
  try:
    fight_training_df = fight_training_df.drop(one_hot_errors, axis=1)
  except:
    pass


fight_training_df.loc[fight_training_df.Fighter.str.contains('Isl')].sort_values(by=['Date'], ascending=False) 

Unnamed: 0,Date,Fighter,Opponent,Round,Weight_Class,Winner,Win,Won_Last_Fight,Won_Fight_Before_Last,no_ufc_fights,win_total,win_percentage,wins_last_3,wins_last_3_percentage,Won_Last_Two_Fights,Winning_Streak,Date_of_Last_Fight,Months_Since_Last_Fight,Fighter_Pass_avg,Fighter_STR_avg,Fighter_Sub_avg,Fighter_TD_avg,Round_avg,DQ,KO/TKO,M-DEC,S-DEC,SUB,U-DEC,DQ_total,DQ_perc,KO/TKO_total,KO/TKO_perc,M-DEC_total,M-DEC_perc,S-DEC_total,S-DEC_perc,SUB_total,SUB_perc,U-DEC_total,U-DEC_perc,Male,Female,Fight_Time_in_Sec,Fight_Time_in_Sec_avg,Fight_Time_in_Min,Fight_Time_in_Min_avg,Min_in_Octogon_Total
11,2021-07-17,Islam Makhachev,Thiago Moises,4,Lightweight,Islam Makhachev,1,1.0,1.0,10,9,0.9,3,1.0,1,1,2021-03-06,4.36,0.7,0.2,2.3,23.4,2.4,0,0,0,0,1,0,0,0.0,2,0.2,0,0.0,0,0.0,4,0.4,4,0.4,1,0,158,205.9,2.63,3.43,34.32
201,2021-03-06,Islam Makhachev,Drew Dober,3,Lightweight,Islam Makhachev,1,1.0,1.0,9,8,0.89,3,1.0,1,1,2019-09-07,17.9,0.67,0.22,2.22,19.22,2.22,0,0,0,0,1,0,0,0.0,2,0.22,0,0.0,0,0.0,3,0.33,4,0.44,1,0,97,211.22,1.62,3.52,31.69
910,2019-09-07,Islam Makhachev,Davi Ramos,3,Lightweight,Islam Makhachev,1,1.0,1.0,8,7,0.88,3,1.0,1,1,2019-04-20,4.59,0.5,0.25,2.12,19.75,2.12,0,0,0,0,0,1,0,0.0,2,0.25,0,0.0,0,0.0,2,0.25,4,0.5,1,0,300,225.5,5.0,3.76,30.07
1116,2019-04-20,Islam Makhachev,Arman Tsarukyan,3,Lightweight,Islam Makhachev,1,1.0,1.0,7,6,0.86,3,1.0,1,1,2018-07-28,8.72,0.57,0.14,2.43,16.43,2.0,0,0,0,0,0,1,0,0.0,2,0.29,0,0.0,0,0.0,2,0.29,3,0.43,1,0,300,214.86,5.0,3.58,25.07
1469,2018-07-28,Islam Makhachev,Kajan Johnson,1,Lightweight,Islam Makhachev,1,1.0,1.0,6,5,0.83,3,1.0,1,1,2018-01-20,6.2,0.67,0.17,2.17,16.83,1.83,0,0,0,0,1,0,0,0.0,2,0.33,0,0.0,0,0.0,2,0.33,2,0.33,1,0,283,200.67,4.72,3.34,20.07
1725,2018-01-20,Islam Makhachev,Gleison Tibau,1,Lightweight,Islam Makhachev,1,1.0,1.0,5,4,0.8,3,1.0,1,1,2017-02-11,11.25,0.6,0.2,2.4,18.8,2.0,0,1,0,0,0,0,0,0.0,2,0.4,0,0.0,0,0.0,1,0.2,2,0.4,1,0,57,184.2,0.95,3.07,15.35
2155,2017-02-11,Islam Makhachev,Nik Lentz,3,Lightweight,Islam Makhachev,1,1.0,0.0,4,3,0.75,2,0.67,1,0,2016-09-17,4.82,0.75,0.0,3.0,22.75,2.25,0,0,0,0,0,1,0,0.0,1,0.25,0,0.0,0,0.0,1,0.25,2,0.5,1,0,300,216.0,5.0,3.6,14.4
2355,2016-09-17,Islam Makhachev,Chris Wade,3,Lightweight,Islam Makhachev,1,0.0,1.0,3,2,0.67,2,0.67,0,0,2015-10-03,11.48,1.0,0.0,2.33,16.0,2.0,0,0,0,0,0,1,0,0.0,1,0.33,0,0.0,0,0.0,1,0.33,1,0.33,1,0,300,188.0,5.0,3.13,9.4
2801,2015-10-03,Islam Makhachev,Adriano Martins,1,Lightweight,Adriano Martins,0,1.0,,2,1,0.5,1,0.33,0,0,2015-05-23,4.36,0.5,0.0,1.5,18.0,1.5,0,1,0,0,0,0,0,0.0,1,0.5,0,0.0,0,0.0,1,0.5,0,0.0,1,0,106,132.0,1.77,2.2,4.4
2987,2015-05-23,Islam Makhachev,Leo Kuntz,2,Lightweight,Islam Makhachev,1,,,1,1,1.0,1,0.33,0,0,NaT,0.0,1.0,0.0,3.0,32.0,2.0,0,0,0,0,1,0,0,0.0,0,0.0,0,0.0,0,0.0,1,1.0,0,0.0,1,0,158,158.0,2.63,2.63,2.63


FIX DATA TYPES AND GET AGE -- Possible Restructure


In [83]:
# Clean and merge fight and fighter data
fighter_join = fighter_data[['Fighter_Name', 'DOB', 'Height', 'Reach', 'STANCE', 'Weight']]
fighter_join = fighter_join.rename({'Fighter_Name':'Fighter'}, axis=1)
fighter_join = fighter_join.dropna(subset=['DOB'], axis=0)

fight_training_df = fight_training_df.merge(fighter_join, on='Fighter', how='left')

# calculate fighter age
fight_training_df['Fighter_Age'] = (fight_training_df.Date - fight_training_df.DOB)#.astype(str)
fight_training_df = fight_training_df[pd.notnull(fight_training_df['Fighter_Age'])]
fight_training_df['Fighter_Age'] = fight_training_df['Fighter_Age'].astype(str)
fight_training_df['Fighter_Age'] = fight_training_df['Fighter_Age'].str.split(" ").str[0].astype(int)
fight_training_df['Fighter_Age'] = round(fight_training_df['Fighter_Age']/365, 2)
fight_training_df = fight_training_df.drop_duplicates()

# Dtype Corrections
fight_training_df['Weight'] = fight_training_df['Weight'].astype(float)

fight_training_df

Unnamed: 0,Date,Fighter,Opponent,Round,Weight_Class,Winner,Win,Won_Last_Fight,Won_Fight_Before_Last,no_ufc_fights,win_total,win_percentage,wins_last_3,wins_last_3_percentage,Won_Last_Two_Fights,Winning_Streak,Date_of_Last_Fight,Months_Since_Last_Fight,Fighter_Pass_avg,Fighter_STR_avg,Fighter_Sub_avg,Fighter_TD_avg,Round_avg,DQ,KO/TKO,M-DEC,S-DEC,SUB,U-DEC,DQ_total,DQ_perc,KO/TKO_total,KO/TKO_perc,M-DEC_total,M-DEC_perc,S-DEC_total,S-DEC_perc,SUB_total,SUB_perc,U-DEC_total,U-DEC_perc,Male,Female,Fight_Time_in_Sec,Fight_Time_in_Sec_avg,Fight_Time_in_Min,Fight_Time_in_Min_avg,Min_in_Octogon_Total,DOB,Height,Reach,STANCE,Weight,Fighter_Age
1,1994-03-11,Jason DeLucia,Royce Gracie,1,Open Weight,Royce Gracie,0,,,1,0,0.00,0,0.00,0,0,NaT,0.00,0.00,0.00,1.00,0.00,1.00,0,0,0,0,1,0,0,0.0,0,0.00,0,0.0,0,0.00,1,1.00,0,0.00,1,0,67,67.00,1.12,1.12,1.12,1969-07-24,5.9,5.9,Southpaw,190.0,24.65
7,1994-03-11,Patrick Smith,Royce Gracie,1,Open Weight,Royce Gracie,0,,,1,0,0.00,0,0.00,0,0,NaT,0.00,0.00,0.00,0.00,1.00,1.00,0,1,0,0,0,0,0,0.0,1,1.00,0,0.0,0,0.00,0,0.00,0,0.00,1,0,77,77.00,1.28,1.28,1.28,1963-08-28,6.2,77.0,Orthodox,225.0,30.56
8,1994-03-11,Royce Gracie,Minoki Ichihara,1,Open Weight,Royce Gracie,1,,,1,1,1.00,1,0.33,0,0,NaT,0.00,2.00,0.00,1.00,2.00,1.00,0,0,0,0,1,0,0,0.0,0,0.00,0,0.0,0,0.00,1,1.00,0,0.00,1,0,308,308.00,5.13,5.13,5.13,1966-12-12,6.1,6.1,Southpaw,175.0,27.26
9,1994-03-11,Royce Gracie,Jason DeLucia,1,Open Weight,Royce Gracie,1,1.0,,2,2,1.00,2,0.67,1,0,1994-03-11,0.00,1.50,0.00,0.50,1.00,1.00,0,0,0,0,1,0,0,0.0,0,0.00,0,0.0,0,0.00,2,1.00,0,0.00,1,0,67,187.50,1.12,3.12,6.25,1966-12-12,6.1,6.1,Southpaw,175.0,27.26
10,1994-03-11,Jason DeLucia,Scott Baker,1,Open Weight,Jason DeLucia,1,0.0,,2,1,0.50,1,0.33,0,0,1994-03-11,0.00,2.50,0.00,0.50,1.50,1.00,0,0,0,0,1,0,0,0.0,0,0.00,0,0.0,0,0.00,2,1.00,0,0.00,1,0,401,234.00,6.68,3.90,7.80,1969-07-24,5.9,5.9,Southpaw,190.0,24.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12070,2021-07-24,Brendan Allen,Punahele Soriano,3,Middleweight,Brendan Allen,1,1.0,0.0,6,5,0.83,2,0.67,1,0,2021-04-24,2.98,0.83,0.17,0.67,37.50,2.00,0,0,0,0,0,1,0,0.0,2,0.33,0,0.0,0,0.00,2,0.33,2,0.33,1,0,300,248.67,5.00,4.14,24.86,1995-12-28,6.2,75.0,Orthodox,185.0,25.59
12071,2021-07-24,Adrian Yanez,Randy Costa,2,Bantamweight,Adrian Yanez,1,1.0,1.0,3,3,1.00,3,1.00,1,1,2021-03-20,4.13,0.00,1.33,0.00,40.67,2.00,0,1,0,0,0,0,0,0.0,3,1.00,0,0.0,0,0.00,0,0.00,0,0.00,1,0,131,108.00,2.18,1.80,5.40,1993-11-29,5.6,70.0,Orthodox,135.0,27.67
12072,2021-07-24,Maycee Barber,Miranda Maverick,3,Women's Flyweight,Maycee Barber,1,0.0,0.0,6,4,0.67,1,0.33,0,0,2021-02-13,5.28,0.17,0.00,0.83,41.50,2.33,0,0,0,1,0,0,0,0.0,3,0.50,0,0.0,1,0.17,0,0.00,2,0.33,0,1,300,231.00,5.00,3.85,23.11,1998-05-18,5.4,65.0,Switch,125.0,23.20
12073,2021-07-24,Darrick Minner,Darren Elkins,2,Featherweight,Darren Elkins,0,1.0,1.0,4,2,0.50,2,0.67,0,0,2021-02-20,5.05,2.25,0.25,1.75,20.75,2.00,0,1,0,0,0,0,0,0.0,1,0.25,0,0.0,0,0.00,2,0.50,1,0.25,1,0,228,169.50,3.80,2.82,11.30,1990-04-28,5.6,69.0,Orthodox,145.0,31.26


In [84]:
future_df_cols = fight_training_df.columns.tolist()

['Date',
 'Fighter',
 'Opponent',
 'Round',
 'Weight_Class',
 'Winner',
 'Win',
 'Won_Last_Fight',
 'Won_Fight_Before_Last',
 'no_ufc_fights',
 'win_total',
 'win_percentage',
 'wins_last_3',
 'wins_last_3_percentage',
 'Won_Last_Two_Fights',
 'Winning_Streak',
 'Date_of_Last_Fight',
 'Months_Since_Last_Fight',
 'Fighter_Pass_avg',
 'Fighter_STR_avg',
 'Fighter_Sub_avg',
 'Fighter_TD_avg',
 'Round_avg',
 'DQ',
 'KO/TKO',
 'M-DEC',
 'S-DEC',
 'SUB',
 'U-DEC',
 'DQ_total',
 'DQ_perc',
 'KO/TKO_total',
 'KO/TKO_perc',
 'M-DEC_total',
 'M-DEC_perc',
 'S-DEC_total',
 'S-DEC_perc',
 'SUB_total',
 'SUB_perc',
 'U-DEC_total',
 'U-DEC_perc',
 'Male',
 'Female',
 'Fight_Time_in_Sec',
 'Fight_Time_in_Sec_avg',
 'Fight_Time_in_Min',
 'Fight_Time_in_Min_avg',
 'Min_in_Octogon_Total',
 'DOB',
 'Height',
 'Reach',
 'STANCE',
 'Weight',
 'Fighter_Age']

SEND OUT CLEAN FUTURE DF

In [85]:

# #Write out 
fight_training_df.to_csv('/content/gdrive/MyDrive/Supervised ML/UFC Fight Predictor/ufc_clean_future_data.csv')


SHIFT ROWS UP 1 TO MIMIC REALITY

In [86]:
# shift stats up by one fight
keep_list = ['Attendance', 'Date', 'Year', 'Month', 'Date_of_Last_Fight', 'Fighter', 'Location', 'Opponent', 'Win',
             'Winner', 'Months_Since_Last_Fight', 'Male', 'Female', 'Weight_Class', 'Fighter_Name', 'DOB', 'Height', 'Reach', 'STANCE', 'Weight', 'Fighter_Age']
shift_cols = [i for i in fight_training_df.columns.tolist() if i not in keep_list]
for col in shift_cols:
    fight_training_df[col+'_AOLF'] = fight_training_df.groupby(['Fighter'])[col].shift(1, axis=0)


ONLY KEEP FIGHTS WHERE BOTH FIGHTER HAS AT LEAST 3 FIGHTS

In [87]:
fight_training_df = fight_training_df[pd.notnull(fight_training_df['Won_Last_Fight_AOLF'])]
fight_training_df = fight_training_df[pd.notnull(fight_training_df['Won_Fight_Before_Last_AOLF'])]

fight_training_df.loc[fight_training_df.Fighter.str.contains('Isl')].sort_values(by=['Date'], ascending=False) 

Unnamed: 0,Date,Fighter,Opponent,Round,Weight_Class,Winner,Win,Won_Last_Fight,Won_Fight_Before_Last,no_ufc_fights,win_total,win_percentage,wins_last_3,wins_last_3_percentage,Won_Last_Two_Fights,Winning_Streak,Date_of_Last_Fight,Months_Since_Last_Fight,Fighter_Pass_avg,Fighter_STR_avg,Fighter_Sub_avg,Fighter_TD_avg,Round_avg,DQ,KO/TKO,M-DEC,S-DEC,SUB,U-DEC,DQ_total,DQ_perc,KO/TKO_total,KO/TKO_perc,M-DEC_total,M-DEC_perc,S-DEC_total,S-DEC_perc,SUB_total,SUB_perc,U-DEC_total,...,Weight,Fighter_Age,Round_AOLF,Won_Last_Fight_AOLF,Won_Fight_Before_Last_AOLF,no_ufc_fights_AOLF,win_total_AOLF,win_percentage_AOLF,wins_last_3_AOLF,wins_last_3_percentage_AOLF,Won_Last_Two_Fights_AOLF,Winning_Streak_AOLF,Fighter_Pass_avg_AOLF,Fighter_STR_avg_AOLF,Fighter_Sub_avg_AOLF,Fighter_TD_avg_AOLF,Round_avg_AOLF,DQ_AOLF,KO/TKO_AOLF,M-DEC_AOLF,S-DEC_AOLF,SUB_AOLF,U-DEC_AOLF,DQ_total_AOLF,DQ_perc_AOLF,KO/TKO_total_AOLF,KO/TKO_perc_AOLF,M-DEC_total_AOLF,M-DEC_perc_AOLF,S-DEC_total_AOLF,S-DEC_perc_AOLF,SUB_total_AOLF,SUB_perc_AOLF,U-DEC_total_AOLF,U-DEC_perc_AOLF,Fight_Time_in_Sec_AOLF,Fight_Time_in_Sec_avg_AOLF,Fight_Time_in_Min_AOLF,Fight_Time_in_Min_avg_AOLF,Min_in_Octogon_Total_AOLF
12050,2021-07-17,Islam Makhachev,Thiago Moises,4,Lightweight,Islam Makhachev,1,1.0,1.0,10,9,0.9,3,1.0,1,1,2021-03-06,4.36,0.7,0.2,2.3,23.4,2.4,0,0,0,0,1,0,0,0.0,2,0.2,0,0.0,0,0.0,4,0.4,4,...,155.0,29.74,3.0,1.0,1.0,9.0,8.0,0.89,3.0,1.0,1.0,1.0,0.67,0.22,2.22,19.22,2.22,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.22,0.0,0.0,0.0,0.0,3.0,0.33,4.0,0.44,97.0,211.22,1.62,3.52,31.69
11675,2021-03-06,Islam Makhachev,Drew Dober,3,Lightweight,Islam Makhachev,1,1.0,1.0,9,8,0.89,3,1.0,1,1,2019-09-07,17.9,0.67,0.22,2.22,19.22,2.22,0,0,0,0,1,0,0,0.0,2,0.22,0,0.0,0,0.0,3,0.33,4,...,155.0,29.38,3.0,1.0,1.0,8.0,7.0,0.88,3.0,1.0,1.0,1.0,0.5,0.25,2.12,19.75,2.12,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.25,0.0,0.0,0.0,0.0,2.0,0.25,4.0,0.5,300.0,225.5,5.0,3.76,30.07
10290,2019-09-07,Islam Makhachev,Davi Ramos,3,Lightweight,Islam Makhachev,1,1.0,1.0,8,7,0.88,3,1.0,1,1,2019-04-20,4.59,0.5,0.25,2.12,19.75,2.12,0,0,0,0,0,1,0,0.0,2,0.25,0,0.0,0,0.0,2,0.25,4,...,155.0,27.88,3.0,1.0,1.0,7.0,6.0,0.86,3.0,1.0,1.0,1.0,0.57,0.14,2.43,16.43,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.29,0.0,0.0,0.0,0.0,2.0,0.29,3.0,0.43,300.0,214.86,5.0,3.58,25.07
9882,2019-04-20,Islam Makhachev,Arman Tsarukyan,3,Lightweight,Islam Makhachev,1,1.0,1.0,7,6,0.86,3,1.0,1,1,2018-07-28,8.72,0.57,0.14,2.43,16.43,2.0,0,0,0,0,0,1,0,0.0,2,0.29,0,0.0,0,0.0,2,0.29,3,...,155.0,27.5,1.0,1.0,1.0,6.0,5.0,0.83,3.0,1.0,1.0,1.0,0.67,0.17,2.17,16.83,1.83,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.33,0.0,0.0,0.0,0.0,2.0,0.33,2.0,0.33,283.0,200.67,4.72,3.34,20.07
9188,2018-07-28,Islam Makhachev,Kajan Johnson,1,Lightweight,Islam Makhachev,1,1.0,1.0,6,5,0.83,3,1.0,1,1,2018-01-20,6.2,0.67,0.17,2.17,16.83,1.83,0,0,0,0,1,0,0,0.0,2,0.33,0,0.0,0,0.0,2,0.33,2,...,155.0,26.77,1.0,1.0,1.0,5.0,4.0,0.8,3.0,1.0,1.0,1.0,0.6,0.2,2.4,18.8,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.4,0.0,0.0,0.0,0.0,1.0,0.2,2.0,0.4,57.0,184.2,0.95,3.07,15.35
8690,2018-01-20,Islam Makhachev,Gleison Tibau,1,Lightweight,Islam Makhachev,1,1.0,1.0,5,4,0.8,3,1.0,1,1,2017-02-11,11.25,0.6,0.2,2.4,18.8,2.0,0,1,0,0,0,0,0,0.0,2,0.4,0,0.0,0,0.0,1,0.2,2,...,155.0,26.25,3.0,1.0,0.0,4.0,3.0,0.75,2.0,0.67,1.0,0.0,0.75,0.0,3.0,22.75,2.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,1.0,0.25,2.0,0.5,300.0,216.0,5.0,3.6,14.4
7854,2017-02-11,Islam Makhachev,Nik Lentz,3,Lightweight,Islam Makhachev,1,1.0,0.0,4,3,0.75,2,0.67,1,0,2016-09-17,4.82,0.75,0.0,3.0,22.75,2.25,0,0,0,0,0,1,0,0.0,1,0.25,0,0.0,0,0.0,1,0.25,2,...,155.0,25.31,3.0,0.0,1.0,3.0,2.0,0.67,2.0,0.67,0.0,0.0,1.0,0.0,2.33,16.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.33,0.0,0.0,0.0,0.0,1.0,0.33,1.0,0.33,300.0,188.0,5.0,3.13,9.4


CREATE NEW DATA FRAME FROM OPPONENT PERSPECTIVE


In [88]:
# Create copy of Training DF as Opponent DF for join

opponent_training_df = fight_training_df.copy()
opponent_training_df = opponent_training_df.drop(['Win', 'Winner', 'Male', 'Female'], axis=1)
cols = opponent_training_df.columns.tolist()

opponent_training_df.columns = ['Opponent_' + str(col) for col in cols]
opponent_training_df = opponent_training_df.rename(columns={'Opponent_Date':'Date', 'Opponent_Fighter':'Opponent', 'Opponent_Opponent':'Fighter', 
                                                            'Opponent_Fighter_Age':'Opponent_Age'})
opponent_training_df = opponent_training_df.drop_duplicates()
opponent_training_df = opponent_training_df.reset_index(drop=True)
opponent_training_df[opponent_training_df['Opponent'] == 'Patrick Smith']

Unnamed: 0,Date,Opponent,Fighter,Opponent_Round,Opponent_Weight_Class,Opponent_Won_Last_Fight,Opponent_Won_Fight_Before_Last,Opponent_no_ufc_fights,Opponent_win_total,Opponent_win_percentage,Opponent_wins_last_3,Opponent_wins_last_3_percentage,Opponent_Won_Last_Two_Fights,Opponent_Winning_Streak,Opponent_Date_of_Last_Fight,Opponent_Months_Since_Last_Fight,Opponent_Fighter_Pass_avg,Opponent_Fighter_STR_avg,Opponent_Fighter_Sub_avg,Opponent_Fighter_TD_avg,Opponent_Round_avg,Opponent_DQ,Opponent_KO/TKO,Opponent_M-DEC,Opponent_S-DEC,Opponent_SUB,Opponent_U-DEC,Opponent_DQ_total,Opponent_DQ_perc,Opponent_KO/TKO_total,Opponent_KO/TKO_perc,Opponent_M-DEC_total,Opponent_M-DEC_perc,Opponent_S-DEC_total,Opponent_S-DEC_perc,Opponent_SUB_total,Opponent_SUB_perc,Opponent_U-DEC_total,Opponent_U-DEC_perc,Opponent_Fight_Time_in_Sec,...,Opponent_Weight,Opponent_Age,Opponent_Round_AOLF,Opponent_Won_Last_Fight_AOLF,Opponent_Won_Fight_Before_Last_AOLF,Opponent_no_ufc_fights_AOLF,Opponent_win_total_AOLF,Opponent_win_percentage_AOLF,Opponent_wins_last_3_AOLF,Opponent_wins_last_3_percentage_AOLF,Opponent_Won_Last_Two_Fights_AOLF,Opponent_Winning_Streak_AOLF,Opponent_Fighter_Pass_avg_AOLF,Opponent_Fighter_STR_avg_AOLF,Opponent_Fighter_Sub_avg_AOLF,Opponent_Fighter_TD_avg_AOLF,Opponent_Round_avg_AOLF,Opponent_DQ_AOLF,Opponent_KO/TKO_AOLF,Opponent_M-DEC_AOLF,Opponent_S-DEC_AOLF,Opponent_SUB_AOLF,Opponent_U-DEC_AOLF,Opponent_DQ_total_AOLF,Opponent_DQ_perc_AOLF,Opponent_KO/TKO_total_AOLF,Opponent_KO/TKO_perc_AOLF,Opponent_M-DEC_total_AOLF,Opponent_M-DEC_perc_AOLF,Opponent_S-DEC_total_AOLF,Opponent_S-DEC_perc_AOLF,Opponent_SUB_total_AOLF,Opponent_SUB_perc_AOLF,Opponent_U-DEC_total_AOLF,Opponent_U-DEC_perc_AOLF,Opponent_Fight_Time_in_Sec_AOLF,Opponent_Fight_Time_in_Sec_avg_AOLF,Opponent_Fight_Time_in_Min_AOLF,Opponent_Fight_Time_in_Min_avg_AOLF,Opponent_Min_in_Octogon_Total_AOLF
0,1994-03-11,Patrick Smith,Ray Wizard,1,Open Weight,1.0,1.0,4,3,0.75,3,1.0,1,1,1994-03-11,0.0,0.5,0.0,0.0,5.0,1.0,0,0,0,0,1,0,0,0.0,2,0.5,0,0.0,0,0.0,2,0.5,0,0.0,58,...,225.0,30.56,1.0,1.0,0.0,3.0,2.0,0.67,2.0,0.67,1.0,0.0,0.33,0.0,0.0,6.33,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.67,0.0,0.0,0.0,0.0,1.0,0.33,0.0,0.0,67.0,58.0,1.12,0.97,2.9
10,1995-07-14,Patrick Smith,Rudyard Moncayo,1,Open Weight,1.0,1.0,5,4,0.8,3,1.0,1,1,1994-03-11,16.07,0.6,0.2,0.4,5.2,1.0,0,0,0,0,1,0,0,0.0,2,0.4,0,0.0,0,0.0,3,0.6,0,0.0,68,...,225.0,31.9,1.0,1.0,1.0,4.0,3.0,0.75,3.0,1.0,1.0,1.0,0.5,0.0,0.0,5.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.5,0.0,0.0,0.0,0.0,2.0,0.5,0.0,0.0,58.0,58.0,0.97,0.97,3.87


In [89]:
opponent_training_df.loc[opponent_training_df.Fighter.str.contains('Isl')].sort_values(by=['Date'], ascending=False) 

Unnamed: 0,Date,Opponent,Fighter,Opponent_Round,Opponent_Weight_Class,Opponent_Won_Last_Fight,Opponent_Won_Fight_Before_Last,Opponent_no_ufc_fights,Opponent_win_total,Opponent_win_percentage,Opponent_wins_last_3,Opponent_wins_last_3_percentage,Opponent_Won_Last_Two_Fights,Opponent_Winning_Streak,Opponent_Date_of_Last_Fight,Opponent_Months_Since_Last_Fight,Opponent_Fighter_Pass_avg,Opponent_Fighter_STR_avg,Opponent_Fighter_Sub_avg,Opponent_Fighter_TD_avg,Opponent_Round_avg,Opponent_DQ,Opponent_KO/TKO,Opponent_M-DEC,Opponent_S-DEC,Opponent_SUB,Opponent_U-DEC,Opponent_DQ_total,Opponent_DQ_perc,Opponent_KO/TKO_total,Opponent_KO/TKO_perc,Opponent_M-DEC_total,Opponent_M-DEC_perc,Opponent_S-DEC_total,Opponent_S-DEC_perc,Opponent_SUB_total,Opponent_SUB_perc,Opponent_U-DEC_total,Opponent_U-DEC_perc,Opponent_Fight_Time_in_Sec,...,Opponent_Weight,Opponent_Age,Opponent_Round_AOLF,Opponent_Won_Last_Fight_AOLF,Opponent_Won_Fight_Before_Last_AOLF,Opponent_no_ufc_fights_AOLF,Opponent_win_total_AOLF,Opponent_win_percentage_AOLF,Opponent_wins_last_3_AOLF,Opponent_wins_last_3_percentage_AOLF,Opponent_Won_Last_Two_Fights_AOLF,Opponent_Winning_Streak_AOLF,Opponent_Fighter_Pass_avg_AOLF,Opponent_Fighter_STR_avg_AOLF,Opponent_Fighter_Sub_avg_AOLF,Opponent_Fighter_TD_avg_AOLF,Opponent_Round_avg_AOLF,Opponent_DQ_AOLF,Opponent_KO/TKO_AOLF,Opponent_M-DEC_AOLF,Opponent_S-DEC_AOLF,Opponent_SUB_AOLF,Opponent_U-DEC_AOLF,Opponent_DQ_total_AOLF,Opponent_DQ_perc_AOLF,Opponent_KO/TKO_total_AOLF,Opponent_KO/TKO_perc_AOLF,Opponent_M-DEC_total_AOLF,Opponent_M-DEC_perc_AOLF,Opponent_S-DEC_total_AOLF,Opponent_S-DEC_perc_AOLF,Opponent_SUB_total_AOLF,Opponent_SUB_perc_AOLF,Opponent_U-DEC_total_AOLF,Opponent_U-DEC_perc_AOLF,Opponent_Fight_Time_in_Sec_AOLF,Opponent_Fight_Time_in_Sec_avg_AOLF,Opponent_Fight_Time_in_Min_AOLF,Opponent_Fight_Time_in_Min_avg_AOLF,Opponent_Min_in_Octogon_Total_AOLF
6780,2021-07-17,Thiago Moises,Islam Makhachev,4,Lightweight,1.0,1.0,7,4,0.57,2,0.67,0,0,2021-02-27,4.59,1.14,0.0,1.0,31.29,3.0,0,0,0,0,1,0,0,0.0,0,0.0,0,0.0,0,0.0,2,0.29,5,0.71,158,...,155.0,26.34,3.0,1.0,1.0,6.0,4.0,0.67,3.0,1.0,1.0,1.0,1.17,0.0,1.0,34.33,2.83,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.17,5.0,0.83,300.0,254.17,5.0,4.24,25.42
6537,2021-03-06,Drew Dober,Islam Makhachev,3,Lightweight,1.0,1.0,15,9,0.6,2,0.67,0,0,2020-05-13,9.74,0.07,0.33,0.47,36.4,2.0,0,0,0,0,1,0,0,0.0,5,0.33,0,0.0,0,0.0,5,0.33,5,0.33,97,...,155.0,32.4,2.0,1.0,1.0,14.0,9.0,0.64,3.0,1.0,1.0,1.0,0.07,0.36,0.5,38.29,1.93,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.36,0.0,0.0,0.0,0.0,4.0,0.29,5.0,0.36,265.0,201.07,4.42,3.35,46.93
5653,2019-09-07,Davi Ramos,Islam Makhachev,3,Lightweight,1.0,1.0,6,4,0.67,2,0.67,0,0,2019-05-18,3.67,0.5,0.0,1.67,28.33,2.33,0,0,0,0,0,1,0,0.0,0,0.0,0,0.0,0,0.0,3,0.5,3,0.5,300,...,155.0,32.86,3.0,1.0,1.0,5.0,4.0,0.8,3.0,1.0,1.0,1.0,0.6,0.0,2.0,32.6,2.2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.6,2.0,0.4,300.0,204.4,5.0,3.41,17.03
4960,2018-07-28,Kajan Johnson,Islam Makhachev,1,Lightweight,1.0,1.0,6,4,0.67,2,0.67,0,0,2018-03-17,4.36,0.17,0.33,1.5,28.67,2.67,0,0,0,0,1,0,0,0.0,2,0.33,0,0.0,1,0.17,1,0.17,2,0.33,283,...,155.0,34.29,3.0,1.0,1.0,5.0,4.0,0.8,3.0,1.0,1.0,1.0,0.2,0.4,1.8,33.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.4,0.0,0.0,1.0,0.2,0.0,0.0,2.0,0.4,300.0,214.0,5.0,3.57,17.84
4626,2018-01-20,Gleison Tibau,Islam Makhachev,1,Lightweight,0.0,0.0,27,16,0.59,0,0.0,0,0,2015-11-07,26.39,0.63,0.07,3.11,21.22,2.37,0,1,0,0,0,0,1,0.04,4,0.15,0,0.0,7,0.26,6,0.22,9,0.33,57,...,155.0,34.31,1.0,0.0,1.0,26.0,16.0,0.62,1.0,0.33,0.0,0.0,0.65,0.08,3.23,21.96,2.42,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.04,3.0,0.12,0.0,0.0,7.0,0.27,6.0,0.23,9.0,0.35,105.0,249.23,1.75,4.15,108.01
4075,2017-02-11,Nik Lentz,Islam Makhachev,3,Lightweight,1.0,1.0,16,11,0.69,2,0.67,0,0,2016-09-10,5.05,1.12,0.06,3.31,40.0,2.75,0,0,0,0,0,1,0,0.0,3,0.19,0,0.0,2,0.12,2,0.12,9,0.56,300,...,145.0,32.52,2.0,1.0,0.0,15.0,11.0,0.73,2.0,0.67,1.0,0.0,1.13,0.07,3.53,41.8,2.73,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.2,0.0,0.0,2.0,0.13,2.0,0.13,8.0,0.53,257.0,266.4,4.28,4.44,66.6
3835,2016-09-17,Chris Wade,Islam Makhachev,3,Lightweight,0.0,1.0,6,4,0.67,1,0.33,0,0,2016-05-08,4.33,0.67,0.17,1.83,19.0,2.33,0,0,0,0,0,1,0,0.0,0,0.0,0,0.0,0,0.0,2,0.33,4,0.67,300,...,155.0,28.99,3.0,1.0,1.0,5.0,4.0,0.8,2.0,0.67,0.0,0.0,0.4,0.2,2.2,21.8,2.2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.4,3.0,0.6,300.0,248.4,5.0,4.14,20.7
3253,2015-10-03,Adriano Martins,Islam Makhachev,1,Lightweight,1.0,1.0,5,4,0.8,3,1.0,1,1,2015-02-22,7.31,0.2,0.4,0.8,10.6,1.6,0,1,0,0,0,0,0,0.0,3,0.6,0,0.0,1,0.2,1,0.2,0,0.0,106,...,155.0,32.82,3.0,1.0,0.0,4.0,3.0,0.75,2.0,0.67,1.0,0.0,0.25,0.25,1.0,12.5,1.75,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.5,0.0,0.0,1.0,0.25,1.0,0.25,0.0,0.0,300.0,222.5,5.0,3.71,14.84


MERGE FIGTHER PERSPECTIVE DATAFRAME WITH OPPONENT PERSPECTIVE DATAFRAME AND CLEAN

In [90]:
# Merge and strip spaces in columns
ufc_training_df = fight_training_df.merge(opponent_training_df, how='left', on=['Date', 'Opponent', 'Fighter']) 
ufc_training_df = ufc_training_df.drop('Winner', axis=1)
ufc_training_df = ufc_training_df.dropna(subset=['Opponent_DOB'], axis=0)
ufc_training_df.columns = ufc_training_df.columns.to_series().apply(lambda x: x.strip())

ufc_training_df

Unnamed: 0,Date,Fighter,Opponent,Round,Weight_Class,Win,Won_Last_Fight,Won_Fight_Before_Last,no_ufc_fights,win_total,win_percentage,wins_last_3,wins_last_3_percentage,Won_Last_Two_Fights,Winning_Streak,Date_of_Last_Fight,Months_Since_Last_Fight,Fighter_Pass_avg,Fighter_STR_avg,Fighter_Sub_avg,Fighter_TD_avg,Round_avg,DQ,KO/TKO,M-DEC,S-DEC,SUB,U-DEC,DQ_total,DQ_perc,KO/TKO_total,KO/TKO_perc,M-DEC_total,M-DEC_perc,S-DEC_total,S-DEC_perc,SUB_total,SUB_perc,U-DEC_total,U-DEC_perc,...,Opponent_Weight,Opponent_Age,Opponent_Round_AOLF,Opponent_Won_Last_Fight_AOLF,Opponent_Won_Fight_Before_Last_AOLF,Opponent_no_ufc_fights_AOLF,Opponent_win_total_AOLF,Opponent_win_percentage_AOLF,Opponent_wins_last_3_AOLF,Opponent_wins_last_3_percentage_AOLF,Opponent_Won_Last_Two_Fights_AOLF,Opponent_Winning_Streak_AOLF,Opponent_Fighter_Pass_avg_AOLF,Opponent_Fighter_STR_avg_AOLF,Opponent_Fighter_Sub_avg_AOLF,Opponent_Fighter_TD_avg_AOLF,Opponent_Round_avg_AOLF,Opponent_DQ_AOLF,Opponent_KO/TKO_AOLF,Opponent_M-DEC_AOLF,Opponent_S-DEC_AOLF,Opponent_SUB_AOLF,Opponent_U-DEC_AOLF,Opponent_DQ_total_AOLF,Opponent_DQ_perc_AOLF,Opponent_KO/TKO_total_AOLF,Opponent_KO/TKO_perc_AOLF,Opponent_M-DEC_total_AOLF,Opponent_M-DEC_perc_AOLF,Opponent_S-DEC_total_AOLF,Opponent_S-DEC_perc_AOLF,Opponent_SUB_total_AOLF,Opponent_SUB_perc_AOLF,Opponent_U-DEC_total_AOLF,Opponent_U-DEC_perc_AOLF,Opponent_Fight_Time_in_Sec_AOLF,Opponent_Fight_Time_in_Sec_avg_AOLF,Opponent_Fight_Time_in_Min_AOLF,Opponent_Fight_Time_in_Min_avg_AOLF,Opponent_Min_in_Octogon_Total_AOLF
13,1995-12-16,Dan Severn,Oleg Taktarov,2,Open Weight,1,0.0,1.0,8,6,0.75,2,0.67,0,0,1995-07-14,5.08,0.50,0.00,0.88,5.38,1.12,0,0,0,0,0,1,0,0.0,1,0.12,0,0.0,0,0.00,6,0.75,1,0.12,...,210.0,28.33,1.0,1.0,1.0,5.0,4.0,0.80,3.0,1.00,1.0,1.0,1.40,0.00,0.00,0.80,1.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.20,0.0,0.0,0.0,0.00,4.0,0.80,0.0,0.00,9.0,307.00,0.15,5.12,25.58
14,1995-12-16,Oleg Taktarov,Dan Severn,2,Open Weight,0,1.0,1.0,6,4,0.67,2,0.67,0,0,1995-07-14,5.08,1.33,0.00,0.00,2.00,1.17,0,0,0,0,0,1,0,0.0,1,0.17,0,0.0,0,0.00,4,0.67,1,0.17,...,250.0,37.55,1.0,1.0,1.0,7.0,5.0,0.71,2.0,0.67,0.0,0.0,0.57,0.00,1.00,2.71,1.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.14,0.0,0.0,0.0,0.00,6.0,0.86,0.0,0.00,134.0,254.29,2.23,4.24,29.67
16,1995-12-16,Marco Ruas,Oleg Taktarov,1,Open Weight,0,1.0,1.0,5,4,0.80,2,0.67,0,0,1995-12-16,0.00,1.20,0.20,0.40,11.80,1.00,0,0,0,0,0,1,0,0.0,1,0.20,0,0.0,0,0.00,3,0.60,1,0.20,...,210.0,28.33,2.0,1.0,1.0,6.0,4.0,0.67,2.0,0.67,0.0,0.0,1.33,0.00,0.00,2.00,1.17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.17,0.0,0.0,0.0,0.00,4.0,0.67,1.0,0.17,180.0,285.83,3.00,4.76,28.58
17,1995-12-16,Oleg Taktarov,Marco Ruas,1,Open Weight,1,0.0,1.0,7,5,0.71,2,0.67,0,0,1995-12-16,0.00,1.29,0.00,0.00,2.57,1.14,0,0,0,0,0,1,0,0.0,1,0.14,0,0.0,0,0.00,4,0.57,2,0.29,...,210.0,34.92,1.0,1.0,1.0,4.0,4.0,1.00,3.0,1.00,1.0,1.0,1.50,0.25,0.50,11.25,1.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.00,3.0,0.75,0.0,0.00,159.0,476.50,2.65,7.94,31.76
22,1996-05-17,Dan Severn,Ken Shamrock,3,Open Weight,1,1.0,1.0,11,9,0.82,3,1.00,1,1,1995-12-16,5.02,0.55,0.00,0.82,10.55,1.27,0,0,0,1,0,0,0,0.0,1,0.09,0,0.0,1,0.09,7,0.64,2,0.18,...,205.0,32.28,1.0,1.0,1.0,4.0,4.0,1.00,3.0,1.00,1.0,1.0,1.25,0.00,0.75,3.50,1.00,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.00,3.0,0.75,0.0,0.00,264.0,240.25,4.40,4.01,16.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6793,2021-07-24,Cory Sandhagen,TJ Dillashaw,5,Bantamweight,0,1.0,1.0,9,7,0.78,2,0.67,0,0,2021-02-06,5.51,0.33,0.44,0.44,57.11,2.22,0,0,0,1,0,0,0,0.0,4,0.44,0,0.0,2,0.22,2,0.22,1,0.11,...,135.0,35.48,1.0,1.0,1.0,16.0,12.0,0.75,2.0,0.67,0.0,0.0,0.75,0.56,1.31,62.81,2.69,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.56,0.0,0.0,2.0,0.12,1.0,0.06,4.0,0.25,32.0,194.94,0.53,3.25,51.97
6795,2021-07-24,Raulian Paiva,Kyler Phillips,3,Bantamweight,1,1.0,1.0,5,3,0.60,3,1.00,1,1,2020-07-11,12.39,0.20,0.20,0.40,45.40,2.40,0,0,1,0,0,0,0,0.0,2,0.40,1,0.2,1,0.20,0,0.00,1,0.20,...,135.0,26.13,3.0,1.0,1.0,3.0,3.0,1.00,3.0,1.00,1.0,1.0,0.33,0.00,2.33,60.00,2.67,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.33,0.0,0.0,0.0,0.00,0.0,0.00,2.0,0.67,300.0,214.67,5.00,3.58,10.73
6797,2021-07-24,Julio Arce,Andre Ewell,2,Bantamweight,1,0.0,1.0,6,4,0.67,2,0.67,0,0,2019-11-02,20.66,0.50,0.17,0.50,51.33,2.83,0,1,0,0,0,0,0,0.0,2,0.33,0,0.0,2,0.33,1,0.17,1,0.17,...,135.0,33.53,3.0,1.0,1.0,7.0,4.0,0.57,2.0,0.67,0.0,0.0,0.14,0.14,0.29,64.71,3.00,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.14,0.0,0.0,3.0,0.43,1.0,0.14,2.0,0.29,300.0,278.43,5.00,4.64,32.48
6800,2021-07-24,Darrick Minner,Darren Elkins,2,Featherweight,0,1.0,1.0,4,2,0.50,2,0.67,0,0,2021-02-20,5.05,2.25,0.25,1.75,20.75,2.00,0,1,0,0,0,0,0,0.0,1,0.25,0,0.0,0,0.00,2,0.50,1,0.25,...,145.0,37.21,3.0,0.0,0.0,23.0,15.0,0.65,1.0,0.33,0.0,0.0,0.96,0.09,2.17,42.30,2.61,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.22,0.0,0.0,2.0,0.09,3.0,0.13,13.0,0.57,142.0,242.22,2.37,4.04,92.85


CREATE COLUMNs THAT TRACK AVG WIN % Of OPPONENTS FOR BOTH FIGHTER AND OPPONENT


In [91]:
ufc_training_df['avg_level_of_opponents_faced_AOLF'] = ufc_training_df.groupby(['Fighter'])['Opponent_win_percentage_AOLF'].transform(lambda x: x.rolling(len(ufc_training_df), 0).mean()).astype(float)
ufc_training_df['Opponent_avg_level_of_opponents_faced_AOLF'] = ufc_training_df.groupby(['Opponent'])['win_percentage_AOLF'].transform(lambda x: x.rolling(len(ufc_training_df), 0).mean()).astype(float)
ufc_training_df['avg_level_of_opponents_faced'] = ufc_training_df.groupby(['Fighter'])['Opponent_win_percentage'].transform(lambda x: x.rolling(len(ufc_training_df), 0).mean()).astype(float)
ufc_training_df['Opponent_avg_level_of_opponents_faced'] = ufc_training_df.groupby(['Opponent'])['win_percentage'].transform(lambda x: x.rolling(len(ufc_training_df), 0).mean()).astype(float)


In [92]:
#Month and Year integers

ufc_training_df['Year'] = pd.DatetimeIndex(ufc_training_df['Date']).year
ufc_training_df['Month'] = pd.DatetimeIndex(ufc_training_df['Date']).month

In [93]:
#TEST
ufc_training_df[['Date', 'Fighter', 'win_percentage_AOLF', 'Opponent', 'Opponent_win_percentage_AOLF', 'Win']]

Unnamed: 0,Date,Fighter,win_percentage_AOLF,Opponent,Opponent_win_percentage_AOLF,Win
13,1995-12-16,Dan Severn,0.71,Oleg Taktarov,0.80,1
14,1995-12-16,Oleg Taktarov,0.80,Dan Severn,0.71,0
16,1995-12-16,Marco Ruas,1.00,Oleg Taktarov,0.67,0
17,1995-12-16,Oleg Taktarov,0.67,Marco Ruas,1.00,1
22,1996-05-17,Dan Severn,0.80,Ken Shamrock,1.00,1
...,...,...,...,...,...,...
6793,2021-07-24,Cory Sandhagen,0.88,TJ Dillashaw,0.75,0
6795,2021-07-24,Raulian Paiva,0.50,Kyler Phillips,1.00,1
6797,2021-07-24,Julio Arce,0.60,Andre Ewell,0.57,1
6800,2021-07-24,Darrick Minner,0.67,Darren Elkins,0.65,0


CREATE DIFF COLS

In [94]:
#Calculate Residuals (fighter_col - opponent col)

#REMOVE SHIFT COLS
ufc_training_df = ufc_training_df.drop(shift_cols, axis=1)

#get numeric colums
columns_to_change = [i for i in ufc_training_df.columns.tolist() if i not in 
                     ['Date', 'Fighter', 'Opponent', 'Win', 'Date_of_Last_Fight', 'Male', 'Female',	'DOB', 'STANCE', 'Weight_Class',
                      'Opponent_Weight_Class', 'Opponent_Date_of_Last_Fight', 'Opponent_DOB', 'Opponent_STANCE', 'Year', 'Month',
                      'Fighter_Age', 'Opponent_Age']]

#only need non-opponent side because opponent will be dealt with in the function
columns_to_change_2 = [i for i in columns_to_change if 'Opponent' not in i]

#residual function
def create_residual_col(df, col):
  opponent_col = str('Opponent_'+col)
  df[str('Diff_'+col)] = (df[col] - df[opponent_col])
  df = df.drop([col, opponent_col], axis=1, inplace=True)

#alternate df with residuals only
diff_df = ufc_training_df.copy()
[create_residual_col(diff_df,i) for i in columns_to_change_2]
diff_df['Diff_age'] = diff_df['Fighter_Age'] - diff_df['Opponent_Age']
diff_df.drop(['Fighter_Age', 'Opponent_Age'], axis=1, inplace=True)

# drop extra columns
final_drop_cols = [i for i in columns_to_change if 'Opponent' in i]
final_drop_cols = [i for i in final_drop_cols if 'AOLF' not in i]
#final_drop_cols = [i for i in final_drop_cols if i not in ['Won_Last_Fight', 'Won_Fight_Before_Last', 'Opponent_Won_Last_Fight', 'Opponent_Won_Fight_Before_Last']]
final_drop_cols = [i for i in final_drop_cols if i not in ['Opponent_avg_level_of_opponents_faced', 'Opponent_Months_Since_Last_Fight', 'Opponent_Height', 'Opponent_Reach', 'Opponent_Weight']]
final_drop_cols = final_drop_cols + ['Diff_avg_level_of_opponents_faced']
diff_df = diff_df.drop(final_drop_cols, axis=1)


diff_df


Unnamed: 0,Date,Fighter,Opponent,Weight_Class,Win,Date_of_Last_Fight,Male,Female,DOB,STANCE,Opponent_Weight_Class,Opponent_Date_of_Last_Fight,Opponent_DOB,Opponent_STANCE,Year,Month,Diff_Months_Since_Last_Fight,Diff_Height,Diff_Reach,Diff_Weight,Diff_Round_AOLF,Diff_Won_Last_Fight_AOLF,Diff_Won_Fight_Before_Last_AOLF,Diff_no_ufc_fights_AOLF,Diff_win_total_AOLF,Diff_win_percentage_AOLF,Diff_wins_last_3_AOLF,Diff_wins_last_3_percentage_AOLF,Diff_Won_Last_Two_Fights_AOLF,Diff_Winning_Streak_AOLF,Diff_Fighter_Pass_avg_AOLF,Diff_Fighter_STR_avg_AOLF,Diff_Fighter_Sub_avg_AOLF,Diff_Fighter_TD_avg_AOLF,Diff_Round_avg_AOLF,Diff_DQ_AOLF,Diff_KO/TKO_AOLF,Diff_M-DEC_AOLF,Diff_S-DEC_AOLF,Diff_SUB_AOLF,Diff_U-DEC_AOLF,Diff_DQ_total_AOLF,Diff_DQ_perc_AOLF,Diff_KO/TKO_total_AOLF,Diff_KO/TKO_perc_AOLF,Diff_M-DEC_total_AOLF,Diff_M-DEC_perc_AOLF,Diff_S-DEC_total_AOLF,Diff_S-DEC_perc_AOLF,Diff_SUB_total_AOLF,Diff_SUB_perc_AOLF,Diff_U-DEC_total_AOLF,Diff_U-DEC_perc_AOLF,Diff_Fight_Time_in_Sec_AOLF,Diff_Fight_Time_in_Sec_avg_AOLF,Diff_Fight_Time_in_Min_AOLF,Diff_Fight_Time_in_Min_avg_AOLF,Diff_Min_in_Octogon_Total_AOLF,Diff_avg_level_of_opponents_faced_AOLF,Diff_age
13,1995-12-16,Dan Severn,Oleg Taktarov,Open Weight,1,1995-07-14,1,0,1958-06-08,Southpaw,Open Weight,1995-07-14,1967-08-26,Orthodox,1995,12,0.00,0.2,71.75,40.0,0.0,0.0,0.0,2.0,1.0,-0.09,-1.0,-0.33,-1.0,-1.0,-0.83,0.00,1.00,1.91,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.06,0.0,0.0,0.0,0.00,2.0,0.06,0.0,0.00,125.0,-52.71,2.08,-0.88,4.09,0.090000,9.22
14,1995-12-16,Oleg Taktarov,Dan Severn,Open Weight,0,1995-07-14,1,0,1967-08-26,Orthodox,Open Weight,1995-07-14,1958-06-08,Southpaw,1995,12,0.00,-0.2,-71.75,-40.0,0.0,0.0,0.0,-2.0,-1.0,0.09,1.0,0.33,1.0,1.0,0.83,0.00,-1.00,-1.91,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.0,0.00,-2.0,-0.06,0.0,0.00,-125.0,52.71,-2.08,0.88,-4.09,-0.090000,-9.22
16,1995-12-16,Marco Ruas,Oleg Taktarov,Open Weight,0,1995-12-16,1,0,1961-01-23,Orthodox,Open Weight,1995-12-16,1967-08-26,Orthodox,1995,12,0.00,0.1,0.10,0.0,-1.0,0.0,0.0,-2.0,0.0,0.33,1.0,0.33,1.0,1.0,0.17,0.25,0.50,9.25,-0.17,0.0,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,0.08,0.0,0.0,0.0,0.00,-1.0,0.08,-1.0,-0.17,-21.0,190.67,-0.35,3.18,3.18,-0.185000,6.59
17,1995-12-16,Oleg Taktarov,Marco Ruas,Open Weight,1,1995-12-16,1,0,1967-08-26,Orthodox,Open Weight,1995-12-16,1961-01-23,Orthodox,1995,12,0.00,-0.1,-0.10,0.0,1.0,0.0,0.0,2.0,0.0,-0.33,-1.0,-0.33,-1.0,-1.0,-0.17,-0.25,-0.50,-9.25,0.17,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,-0.08,0.0,0.0,0.0,0.00,1.0,-0.08,1.0,0.17,21.0,-190.67,0.35,-3.18,-3.18,0.185000,-6.59
22,1996-05-17,Dan Severn,Ken Shamrock,Open Weight,1,1995-12-16,1,0,1958-06-08,Southpaw,Open Weight,1996-02-16,1964-02-11,Orthodox,1996,5,2.04,0.1,5.75,45.0,0.0,0.0,0.0,6.0,4.0,-0.20,0.0,0.00,0.0,0.0,-0.65,0.00,0.15,6.50,0.10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.15,0.0,0.0,0.0,0.00,4.0,-0.05,2.0,0.20,-203.0,69.85,-3.38,1.16,35.67,0.100000,5.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6793,2021-07-24,Cory Sandhagen,TJ Dillashaw,Bantamweight,0,2021-02-06,1,0,1992-04-20,Switch,Bantamweight,2019-01-19,1986-02-07,Orthodox,2021,7,-24.56,0.4,3.00,0.0,0.0,0.0,-1.0,-8.0,-5.0,0.13,0.0,0.00,1.0,0.0,-0.50,-0.06,-0.81,-14.56,-0.81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,-0.06,0.0,0.0,-1.0,0.00,1.0,0.19,-3.0,-0.13,-4.0,-35.06,-0.06,-0.58,-30.64,-0.113788,-6.20
6795,2021-07-24,Raulian Paiva,Kyler Phillips,Bantamweight,1,2020-07-11,1,0,1995-10-17,Orthodox,Bantamweight,2021-03-06,1995-06-12,Orthodox,2021,7,7.80,0.0,-3.00,-10.0,0.0,0.0,-1.0,1.0,-1.0,-0.50,-1.0,-0.33,0.0,-1.0,-0.08,0.25,-2.33,-16.00,-0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.17,0.0,0.0,1.0,0.25,0.0,0.00,-1.0,-0.42,0.0,49.83,0.00,0.83,6.90,0.500000,-0.34
6797,2021-07-24,Julio Arce,Andre Ewell,Bantamweight,1,2019-11-02,1,0,1989-10-27,Southpaw,Bantamweight,2021-02-13,1988-01-21,Southpaw,2021,7,15.38,-0.1,-5.00,10.0,0.0,0.0,-1.0,-2.0,-1.0,0.03,-1.0,-0.34,0.0,0.0,0.46,0.06,0.31,-11.51,0.00,0.0,0.0,0.0,1.0,0.0,-1.0,0.0,0.0,0.0,0.06,0.0,0.0,-1.0,-0.03,0.0,0.06,-1.0,-0.09,0.0,-41.63,0.00,-0.69,-12.74,-0.149167,-1.77
6800,2021-07-24,Darrick Minner,Darren Elkins,Featherweight,0,2021-02-20,1,0,1990-04-28,Orthodox,Featherweight,2020-11-07,1984-05-16,Orthodox,2021,7,-3.44,-0.2,-2.00,0.0,0.0,1.0,0.0,-20.0,-13.0,0.02,1.0,0.34,1.0,0.0,1.04,0.24,-0.50,-20.97,-0.61,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,-5.0,-0.22,0.0,0.0,-2.0,-0.09,-1.0,0.54,-12.0,-0.24,158.0,-92.22,2.63,-1.54,-85.35,-0.064706,-5.95


WRITE OUT CLEAN DATA FOR MODEL TRAINING

In [95]:
# Write out clean data -- diff version

diff_df.to_csv('/content/gdrive/MyDrive/Supervised ML/UFC Fight Predictor/ufc_clean_data_diffs.csv')

In [96]:
# Write out clean data -- full version

ufc_training_df.to_csv('/content/gdrive/MyDrive/Supervised ML/UFC Fight Predictor/ufc_clean_data.csv')
