# The Next Step in our process is Organizing

We have all the fights ever and all the fighters ever, so next we need to append all the fighters statistics to each fight and then make it easier for PyTorch to understand.

In [23]:
import pandas as pd # Our beloved to work with data
import numpy as np # In case we need to do some math

# Making the Columns

### For the Fighters Dataset

* We need the win percentage instead of their record
* We need their first and last name in one column
* We do not need their nickname, win, loss, draw columns, and eventually not their first and last name columns so drop those

In [24]:
fighters = pd.read_csv('fighters.csv')

In [25]:
# Win Percentage Calulcations
fighters['WinPercentage'] = fighters['Win'] / (fighters['Win'] + fighters['Loss'] + fighters['Draw'])
fighters['Experience'] = fighters['Win'] + fighters['Loss'] + fighters['Draw']
# Getting rid of unnecessary columns
fighters.drop(columns=['Draw', 'Loss', 'Win', 'Nickname'], inplace=True)

In [26]:
fighters.head()

Unnamed: 0,First,Last,HT,WT,Reach,WinPercentage,Experience
0,Tom,Aaron,--,155 lbs.,--,0.625,8
1,Danny,Abbadi,"5' 11""",155 lbs.,--,0.4,10
2,Nariman,Abbasov,"5' 8""",155 lbs.,"66.0""",0.875,32
3,David,Abbott,"6' 0""",265 lbs.,--,0.4,25
4,Hamdy,Abdelwahab,"6' 2""",264 lbs.,"72.0""",1.0,5


In [27]:
# Combining first and last names
fighters['Name'] = fighters['First'] + ' ' + fighters['Last']
fighters.drop(columns=['First', 'Last'], inplace=True)
fighters.head()

Unnamed: 0,HT,WT,Reach,WinPercentage,Experience,Name
0,--,155 lbs.,--,0.625,8,Tom Aaron
1,"5' 11""",155 lbs.,--,0.4,10,Danny Abbadi
2,"5' 8""",155 lbs.,"66.0""",0.875,32,Nariman Abbasov
3,"6' 0""",265 lbs.,--,0.4,25,David Abbott
4,"6' 2""",264 lbs.,"72.0""",1.0,5,Hamdy Abdelwahab


### For the fights dataset

* Need to rearrange the order sometimes so the left column isnt always the winner and also so that we can keep track of that

In [28]:
fights = pd.read_csv('fights.csv')
fights.head()

Unnamed: 0.1,Unnamed: 0,Winners,Losers
0,0,Tom Aspinall,Marcin Tybura
1,1,Julija Stoliarenko,Molly McCann
2,2,Nathaniel Wood,Andre Fili
3,3,Paul Craig,Andre Muniz
4,4,Fares Ziam,Jai Herbert


# OUTLIERS

Before we move on I want to remove a couple big upsets in the UFC that will definitely skew our model. I am going to remove the following fights:

* Serra vs St. Pierre 1
* Holm vs Rousey
* Weidman vs Silva
* Pena vs Nunes
* Werdum vs Emilianenko
* Dillashaw vs Barao
* Diaz vs McGregor

This list may increase depending on how the model performs.

In [29]:
fights_to_remove = [
    ('Matt Serra', 'Georges St. Pierre'),
    ('Holly Holm', 'Ronda Rousey'),
    ('Chris Weidman', 'Anderson Silva'),
    ('Julianna Pena', 'Amanda Nunes'),
    ('Fabrico Werdum', 'Fedor Emilianenko'),
    ('T.J. Dillashaw', 'Renan Barao'),
    ('Nate Diaz', 'Conor McGregor')
]

# Filter out the fights to be removed
filtered_fights = fights[~fights.apply(lambda row: (row['Winners'], row['Losers']) in fights_to_remove, axis=1)]

In [30]:
# Creating a new column for the winner
fighter_1 = []
fighter_2 = []
winner = []

# Randomizing and keeping track of the winner
for index, row in fights.iterrows():
    # random number either 0 or 1
    number = np.random.randint(0, 2)
    
    # If its zero then we do not need to switch the winner
    if number == 0:
        fighter_1.append(row['Winners'])
        fighter_2.append(row['Losers'])
        winner.append(0)
        
    # If its one then we need to switch the order  
    else:
        fighter_1.append(row['Losers'])
        fighter_2.append(row['Winners'])
        winner.append(1)

# Making a new dataframe with the new columns
organized = pd.DataFrame({'Fighter 1': fighter_1, 'Fighter 2': fighter_2, 'Winner': winner})
organized.head()

Unnamed: 0,Fighter 1,Fighter 2,Winner
0,Marcin Tybura,Tom Aspinall,1
1,Molly McCann,Julija Stoliarenko,1
2,Andre Fili,Nathaniel Wood,1
3,Paul Craig,Andre Muniz,0
4,Fares Ziam,Jai Herbert,0


#### SUCCESS

# Assigning everything

We in short need to add all the players STATS to each fight, which sounds very intimidating (at least it did to me) BUT maybe it is not.

In [31]:
fighters_1 = fighters.add_suffix('_1')
fighters_2 = fighters.add_suffix('_2')

In [32]:
# I think its working? I am not 100% confident but I think it is
attempt_1 = pd.merge(organized, fighters_1, left_on='Fighter 1', right_on='Name_1')
attempt_1

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,Name_1
0,Marcin Tybura,Tom Aspinall,1,"6' 3""",249 lbs.,"78.0""",0.750000,32,Marcin Tybura
1,Marcin Tybura,Alexandr Romanov,0,"6' 3""",249 lbs.,"78.0""",0.750000,32,Marcin Tybura
2,Marcin Tybura,Alexander Volkov,1,"6' 3""",249 lbs.,"78.0""",0.750000,32,Marcin Tybura
3,Marcin Tybura,Walt Harris,0,"6' 3""",249 lbs.,"78.0""",0.750000,32,Marcin Tybura
4,Marcin Tybura,Greg Hardy,0,"6' 3""",249 lbs.,"78.0""",0.750000,32,Marcin Tybura
...,...,...,...,...,...,...,...,...,...
7239,Minoki Ichihara,Royce Gracie,1,"5' 7""",178 lbs.,--,0.000000,1,Minoki Ichihara
7240,Alberta Cerra Leon,Remco Pardoel,1,"5' 8""",238 lbs.,--,0.000000,1,Alberta Cerra Leon
7241,Orlando Wiet,Robert Lucarelli,0,"5' 10""",170 lbs.,--,0.166667,6,Orlando Wiet
7242,Thaddeus Luster,Frank Hamaker,1,"6' 3""",210 lbs.,--,0.000000,1,Thaddeus Luster


In [33]:
attempt_2 = pd.merge(attempt_1, fighters_2, left_on='Fighter 2', right_on='Name_2')
attempt_2

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,Name_1,HT_2,WT_2,Reach_2,WinPercentage_2,Experience_2,Name_2
0,Marcin Tybura,Tom Aspinall,1,"6' 3""",249 lbs.,"78.0""",0.750000,32,Marcin Tybura,"6' 5""",256 lbs.,"78.0""",0.8125,16,Tom Aspinall
1,Andrei Arlovski,Tom Aspinall,1,"6' 3""",240 lbs.,"77.0""",0.607143,56,Andrei Arlovski,"6' 5""",256 lbs.,"78.0""",0.8125,16,Tom Aspinall
2,Jake Collier,Tom Aspinall,1,"6' 3""",230 lbs.,"78.0""",0.590909,22,Jake Collier,"6' 5""",256 lbs.,"78.0""",0.8125,16,Tom Aspinall
3,Serghei Spivac,Tom Aspinall,1,"6' 3""",260 lbs.,"78.0""",0.842105,19,Serghei Spivac,"6' 5""",256 lbs.,"78.0""",0.8125,16,Tom Aspinall
4,Alan Baudot,Tom Aspinall,1,"6' 3""",243 lbs.,"79.0""",0.666667,12,Alan Baudot,"6' 5""",256 lbs.,"78.0""",0.8125,16,Tom Aspinall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7242,Jason DeLucia,Scott Baker,0,"5' 11""",190 lbs.,--,0.600000,55,Jason DeLucia,"6' 0""",210 lbs.,--,0.5000,2,Scott Baker
7243,Johnny Rhodes,Fred Ettish,0,"6' 0""",210 lbs.,--,0.666667,3,Johnny Rhodes,"6' 0""",180 lbs.,--,0.5000,2,Fred Ettish
7244,Johnny Rhodes,David Levicki,0,"6' 0""",210 lbs.,--,0.666667,3,Johnny Rhodes,"6' 5""",275 lbs.,--,0.2500,4,David Levicki
7245,Orlando Wiet,Robert Lucarelli,0,"5' 10""",170 lbs.,--,0.166667,6,Orlando Wiet,"6' 2""",245 lbs.,--,0.0000,1,Robert Lucarelli


In [34]:
fight_data = attempt_2.drop(columns=['Name_1', 'Name_2'])
fight_data.head()

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,HT_2,WT_2,Reach_2,WinPercentage_2,Experience_2
0,Marcin Tybura,Tom Aspinall,1,"6' 3""",249 lbs.,"78.0""",0.75,32,"6' 5""",256 lbs.,"78.0""",0.8125,16
1,Andrei Arlovski,Tom Aspinall,1,"6' 3""",240 lbs.,"77.0""",0.607143,56,"6' 5""",256 lbs.,"78.0""",0.8125,16
2,Jake Collier,Tom Aspinall,1,"6' 3""",230 lbs.,"78.0""",0.590909,22,"6' 5""",256 lbs.,"78.0""",0.8125,16
3,Serghei Spivac,Tom Aspinall,1,"6' 3""",260 lbs.,"78.0""",0.842105,19,"6' 5""",256 lbs.,"78.0""",0.8125,16
4,Alan Baudot,Tom Aspinall,1,"6' 3""",243 lbs.,"79.0""",0.666667,12,"6' 5""",256 lbs.,"78.0""",0.8125,16


# Cleaning

* replace all '--' values with 'NaN'
* convert feet to inches
* get rid of any row that has NaN value

In [35]:
# Getting rid of the '--' values cause there are stupid and annoying
for column in fight_data.columns:
    fight_data[column] = fight_data[column].replace({'--': np.nan})

In [36]:
# Height Conversion Function
def convert_height(height):
    
    # Ignore it if it already happened
    if type(height) == float:
        return height
    
    # Otherwise strip the feet and inches and convert it to inches
    else:
        feet_inches = height.split("'")
        feet = float(feet_inches[0])
        inches = float(feet_inches[1].replace('"', ''))
        return (feet * 12) + inches

In [37]:
# Doing the easier stuff that I know will work first whicfh means no more words in the numbers
fight_data['WT_1'] = fight_data['WT_1'].str.replace(' lbs.', '').astype(float)
fight_data['WT_2'] = fight_data['WT_2'].str.replace(' lbs.', '').astype(float)

fight_data['Reach_1'] = fight_data['Reach_1'].str.replace('"', '').astype(float)
fight_data['Reach_2'] = fight_data['Reach_2'].str.replace('"', '').astype(float)

In [38]:
fight_data.head()

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,HT_2,WT_2,Reach_2,WinPercentage_2,Experience_2
0,Marcin Tybura,Tom Aspinall,1,"6' 3""",249.0,78.0,0.75,32,"6' 5""",256.0,78.0,0.8125,16
1,Andrei Arlovski,Tom Aspinall,1,"6' 3""",240.0,77.0,0.607143,56,"6' 5""",256.0,78.0,0.8125,16
2,Jake Collier,Tom Aspinall,1,"6' 3""",230.0,78.0,0.590909,22,"6' 5""",256.0,78.0,0.8125,16
3,Serghei Spivac,Tom Aspinall,1,"6' 3""",260.0,78.0,0.842105,19,"6' 5""",256.0,78.0,0.8125,16
4,Alan Baudot,Tom Aspinall,1,"6' 3""",243.0,79.0,0.666667,12,"6' 5""",256.0,78.0,0.8125,16


In [39]:
# Making all the heights into inches
fight_data['HT_1'] = fight_data['HT_1'].apply(convert_height)
fight_data['HT_2'] = fight_data['HT_2'].apply(convert_height)

In [40]:
# LASTY NO NULL VALUES
fight_data.dropna(inplace=True)
fight_data.tail()

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,HT_2,WT_2,Reach_2,WinPercentage_2,Experience_2
7018,Eric Schafer,Rob MacDonald,0,75.0,185.0,76.0,0.583333,24,75.0,205.0,78.0,0.555556,9
7019,Jason Lambert,Rob MacDonald,0,75.0,185.0,75.0,0.666667,39,75.0,205.0,78.0,0.555556,9
7024,Justin Buchholz,Corey Hill,0,72.0,155.0,73.0,0.6,25,76.0,155.0,80.0,0.4,15
7048,Rameau Thierry Sokoudjou,Kazuhiro Nakamura,0,70.0,205.0,78.0,0.5,36,71.0,205.0,70.0,0.617647,34
7066,Jason Dent,Roger Huerta,1,70.0,155.0,71.0,0.628571,35,69.0,155.0,70.0,0.69697,33


**SUCCESS**

# Caluclate the differences

Lastly to make it easier on our neural network we just want to see the difference from fighter_1 minus fighter_2

In [41]:
HeightDiff = fight_data['HT_1'] - fight_data['HT_2']
WeightDiff = fight_data['WT_1'] - fight_data['WT_2']
ReachDiff = fight_data['Reach_1'] - fight_data['Reach_2']
WinDiff = fight_data['WinPercentage_1'] - fight_data['WinPercentage_2']
ExpDiff = fight_data['Experience_1'] - fight_data['Experience_2']

In [42]:
final_data = pd.DataFrame({'HeightDiff': HeightDiff, 'WeightDiff': WeightDiff, 'ReachDiff': ReachDiff,
                           'WinDiff': WinDiff, 'ExpDiff': ExpDiff, 'Winner': fight_data['Winner']})
final_data

Unnamed: 0,HeightDiff,WeightDiff,ReachDiff,WinDiff,ExpDiff,Winner
0,-2.0,-7.0,0.0,-0.062500,16,1
1,-2.0,-16.0,-1.0,-0.205357,40,1
2,-2.0,-26.0,0.0,-0.221591,6,1
3,-2.0,4.0,0.0,0.029605,3,1
4,-2.0,-13.0,1.0,-0.145833,-4,1
...,...,...,...,...,...,...
7018,0.0,-20.0,-2.0,0.027778,15,0
7019,0.0,-20.0,-3.0,0.111111,30,0
7024,-4.0,0.0,-7.0,0.200000,10,0
7048,-1.0,0.0,8.0,-0.117647,2,0


In [43]:
final_data['Winner'].value_counts()

Winner
1    3099
0    3078
Name: count, dtype: int64

In [44]:
final_data.to_csv('ufc_fights.csv', index=False)

# SUCCESS

It appears all has worked and the data is ready to be trained! now lets hope I get good results