# The Next Step in our process is Organizing

We have all the fights ever and all the fighters ever, so next we need to append all the fighters statistics to each fight and then make it easier for PyTorch to understand.

In [1]:
import pandas as pd # Our beloved to work with data
import numpy as np # In case we need to do some math

# Making the Columns

### For the Fighters Dataset

* We need the win percentage instead of their record
* We need their first and last name in one column
* We do not need their nickname, win, loss, draw columns, and eventually not their first and last name columns so drop those

In [2]:
fighters = pd.read_csv('fighters.csv')

In [3]:
# Win Percentage Calulcations
fighters['WinPercentage'] = fighters['Win'] / (fighters['Win'] + fighters['Loss'] + fighters['Draw'])
fighters['Experience'] = fighters['Win'] + fighters['Loss'] + fighters['Draw']
fighters['DrawPercentage'] = fighters['Draw'] / (fighters['Win'] + fighters['Loss'] + fighters['Draw'])
fighters['LossPercentage'] = fighters['Loss'] / (fighters['Win'] + fighters['Loss'] + fighters['Draw'])
# Getting rid of unnecessary columns
fighters.drop(columns=['Draw', 'Loss', 'Win', 'Nickname'], inplace=True)

In [4]:
fighters.head()

Unnamed: 0,First,Last,HT,WT,Reach,WinPercentage,Experience,DrawPercentage,LossPercentage
0,Tom,Aaron,--,155 lbs.,--,0.625,8,0.0,0.375
1,Danny,Abbadi,"5' 11""",155 lbs.,--,0.4,10,0.0,0.6
2,Nariman,Abbasov,"5' 8""",155 lbs.,"66.0""",0.875,32,0.0,0.125
3,David,Abbott,"6' 0""",265 lbs.,--,0.4,25,0.0,0.6
4,Hamdy,Abdelwahab,"6' 2""",264 lbs.,"72.0""",1.0,5,0.0,0.0


In [5]:
# Combining first and last names
fighters['Name'] = fighters['First'] + ' ' + fighters['Last']
fighters.drop(columns=['First', 'Last'], inplace=True)
fighters.head()

Unnamed: 0,HT,WT,Reach,WinPercentage,Experience,DrawPercentage,LossPercentage,Name
0,--,155 lbs.,--,0.625,8,0.0,0.375,Tom Aaron
1,"5' 11""",155 lbs.,--,0.4,10,0.0,0.6,Danny Abbadi
2,"5' 8""",155 lbs.,"66.0""",0.875,32,0.0,0.125,Nariman Abbasov
3,"6' 0""",265 lbs.,--,0.4,25,0.0,0.6,David Abbott
4,"6' 2""",264 lbs.,"72.0""",1.0,5,0.0,0.0,Hamdy Abdelwahab


### For the fights dataset

* Need to rearrange the order sometimes so the left column isnt always the winner and also so that we can keep track of that

In [6]:
fights = pd.read_csv('fights.csv')
fights.head()

Unnamed: 0.1,Unnamed: 0,Winners,Losers
0,0,Tom Aspinall,Marcin Tybura
1,1,Julija Stoliarenko,Molly McCann
2,2,Nathaniel Wood,Andre Fili
3,3,Paul Craig,Andre Muniz
4,4,Fares Ziam,Jai Herbert


# OUTLIERS

Before we move on I want to remove a couple big upsets in the UFC that will definitely skew our model. I am going to remove the following fights:

* Serra vs St. Pierre 1
* Holm vs Rousey
* Weidman vs Silva
* Pena vs Nunes
* Werdum vs Emilianenko
* Dillashaw vs Barao
* Diaz vs McGregor

This list may increase depending on how the model performs.

In [7]:
fights_to_remove = [
    ('Matt Serra', 'Georges St. Pierre'),
    ('Holly Holm', 'Ronda Rousey'),
    ('Chris Weidman', 'Anderson Silva'),
    ('Julianna Pena', 'Amanda Nunes'),
    ('Fabrico Werdum', 'Fedor Emilianenko'),
    ('T.J. Dillashaw', 'Renan Barao'),
    ('Nate Diaz', 'Conor McGregor')
]

# Filter out the fights to be removed
filtered_fights = fights[~fights.apply(lambda row: (row['Winners'], row['Losers']) in fights_to_remove, axis=1)]

In [8]:
# Creating a new column for the winner
fighter_1 = []
fighter_2 = []
winner = []

# Randomizing and keeping track of the winner
for index, row in fights.iterrows():
    # random number either 0 or 1
    number = np.random.randint(0, 2)
    
    # If its zero then we do not need to switch the winner
    if number == 0:
        fighter_1.append(row['Winners'])
        fighter_2.append(row['Losers'])
        winner.append(0)
        
    # If its one then we need to switch the order  
    else:
        fighter_1.append(row['Losers'])
        fighter_2.append(row['Winners'])
        winner.append(1)

# Making a new dataframe with the new columns
organized = pd.DataFrame({'Fighter 1': fighter_1, 'Fighter 2': fighter_2, 'Winner': winner})
organized.head()

Unnamed: 0,Fighter 1,Fighter 2,Winner
0,Marcin Tybura,Tom Aspinall,1
1,Julija Stoliarenko,Molly McCann,0
2,Andre Fili,Nathaniel Wood,1
3,Paul Craig,Andre Muniz,0
4,Jai Herbert,Fares Ziam,1


#### SUCCESS

# Assigning everything

We in short need to add all the players STATS to each fight, which sounds very intimidating (at least it did to me) BUT maybe it is not.

In [9]:
fighters_1 = fighters.add_suffix('_1')
fighters_2 = fighters.add_suffix('_2')

In [10]:
# I think its working? I am not 100% confident but I think it is
attempt_1 = pd.merge(organized, fighters_1, left_on='Fighter 1', right_on='Name_1')
attempt_1

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,DrawPercentage_1,LossPercentage_1,Name_1
0,Marcin Tybura,Tom Aspinall,1,"6' 3""",249 lbs.,"78.0""",0.75,32,0.0,0.25,Marcin Tybura
1,Marcin Tybura,Blagoy Ivanov,0,"6' 3""",249 lbs.,"78.0""",0.75,32,0.0,0.25,Marcin Tybura
2,Marcin Tybura,Alexandr Romanov,0,"6' 3""",249 lbs.,"78.0""",0.75,32,0.0,0.25,Marcin Tybura
3,Marcin Tybura,Greg Hardy,0,"6' 3""",249 lbs.,"78.0""",0.75,32,0.0,0.25,Marcin Tybura
4,Marcin Tybura,Augusto Sakai,1,"6' 3""",249 lbs.,"78.0""",0.75,32,0.0,0.25,Marcin Tybura
...,...,...,...,...,...,...,...,...,...,...,...
7246,Robert Lucarelli,Orlando Wiet,1,"6' 2""",245 lbs.,--,0.00,1,0.0,1.00,Robert Lucarelli
7247,Thaddeus Luster,Frank Hamaker,1,"6' 3""",210 lbs.,--,0.00,1,0.0,1.00,Thaddeus Luster
7248,David Levicki,Johnny Rhodes,1,"6' 5""",275 lbs.,--,0.25,4,0.0,0.75,David Levicki
7249,Ray Wizard,Patrick Smith,1,--,--,--,0.00,1,0.0,1.00,Ray Wizard


In [11]:
attempt_2 = pd.merge(attempt_1, fighters_2, left_on='Fighter 2', right_on='Name_2')
attempt_2

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,DrawPercentage_1,LossPercentage_1,Name_1,HT_2,WT_2,Reach_2,WinPercentage_2,Experience_2,DrawPercentage_2,LossPercentage_2,Name_2
0,Marcin Tybura,Tom Aspinall,1,"6' 3""",249 lbs.,"78.0""",0.750000,32,0.0,0.250000,Marcin Tybura,"6' 5""",256 lbs.,"78.0""",0.812500,16,0.0,0.187500,Tom Aspinall
1,Jake Collier,Tom Aspinall,1,"6' 3""",230 lbs.,"78.0""",0.590909,22,0.0,0.409091,Jake Collier,"6' 5""",256 lbs.,"78.0""",0.812500,16,0.0,0.187500,Tom Aspinall
2,Curtis Blaydes,Tom Aspinall,0,"6' 4""",265 lbs.,"80.0""",0.809524,21,0.0,0.190476,Curtis Blaydes,"6' 5""",256 lbs.,"78.0""",0.812500,16,0.0,0.187500,Tom Aspinall
3,Alan Baudot,Tom Aspinall,1,"6' 3""",243 lbs.,"79.0""",0.666667,12,0.0,0.333333,Alan Baudot,"6' 5""",256 lbs.,"78.0""",0.812500,16,0.0,0.187500,Tom Aspinall
4,Andrei Arlovski,Tom Aspinall,1,"6' 3""",240 lbs.,"77.0""",0.607143,56,0.0,0.392857,Andrei Arlovski,"6' 5""",256 lbs.,"78.0""",0.812500,16,0.0,0.187500,Tom Aspinall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7242,David Levicki,Johnny Rhodes,1,"6' 5""",275 lbs.,--,0.250000,4,0.0,0.750000,David Levicki,"6' 0""",210 lbs.,--,0.666667,3,0.0,0.333333,Johnny Rhodes
7243,Patrick Smith,Scott Morris,0,"6' 2""",225 lbs.,--,0.540541,37,0.0,0.459459,Patrick Smith,"5' 10""",210 lbs.,--,0.666667,3,0.0,0.333333,Scott Morris
7244,Sean Daugherty,Scott Morris,1,"6' 0""",175 lbs.,--,0.000000,2,0.0,1.000000,Sean Daugherty,"5' 10""",210 lbs.,--,0.666667,3,0.0,0.333333,Scott Morris
7245,Robert Lucarelli,Orlando Wiet,1,"6' 2""",245 lbs.,--,0.000000,1,0.0,1.000000,Robert Lucarelli,"5' 10""",170 lbs.,--,0.166667,6,0.0,0.833333,Orlando Wiet


In [12]:
fight_data = attempt_2.drop(columns=['Name_1', 'Name_2'])
fight_data.head()

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,DrawPercentage_1,LossPercentage_1,HT_2,WT_2,Reach_2,WinPercentage_2,Experience_2,DrawPercentage_2,LossPercentage_2
0,Marcin Tybura,Tom Aspinall,1,"6' 3""",249 lbs.,"78.0""",0.75,32,0.0,0.25,"6' 5""",256 lbs.,"78.0""",0.8125,16,0.0,0.1875
1,Jake Collier,Tom Aspinall,1,"6' 3""",230 lbs.,"78.0""",0.590909,22,0.0,0.409091,"6' 5""",256 lbs.,"78.0""",0.8125,16,0.0,0.1875
2,Curtis Blaydes,Tom Aspinall,0,"6' 4""",265 lbs.,"80.0""",0.809524,21,0.0,0.190476,"6' 5""",256 lbs.,"78.0""",0.8125,16,0.0,0.1875
3,Alan Baudot,Tom Aspinall,1,"6' 3""",243 lbs.,"79.0""",0.666667,12,0.0,0.333333,"6' 5""",256 lbs.,"78.0""",0.8125,16,0.0,0.1875
4,Andrei Arlovski,Tom Aspinall,1,"6' 3""",240 lbs.,"77.0""",0.607143,56,0.0,0.392857,"6' 5""",256 lbs.,"78.0""",0.8125,16,0.0,0.1875


# Cleaning

* replace all '--' values with 'NaN'
* convert feet to inches
* get rid of any row that has NaN value

In [13]:
# Getting rid of the '--' values cause there are stupid and annoying
for column in fight_data.columns:
    fight_data[column] = fight_data[column].replace({'--': np.nan})

In [14]:
# Height Conversion Function
def convert_height(height):
    
    # Ignore it if it already happened
    if type(height) == float:
        return height
    
    # Otherwise strip the feet and inches and convert it to inches
    else:
        feet_inches = height.split("'")
        feet = float(feet_inches[0])
        inches = float(feet_inches[1].replace('"', ''))
        return (feet * 12) + inches

In [15]:
# Doing the easier stuff that I know will work first whicfh means no more words in the numbers
fight_data['WT_1'] = fight_data['WT_1'].str.replace(' lbs.', '').astype(float)
fight_data['WT_2'] = fight_data['WT_2'].str.replace(' lbs.', '').astype(float)

fight_data['Reach_1'] = fight_data['Reach_1'].str.replace('"', '').astype(float)
fight_data['Reach_2'] = fight_data['Reach_2'].str.replace('"', '').astype(float)

In [16]:
fight_data.head()

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,DrawPercentage_1,LossPercentage_1,HT_2,WT_2,Reach_2,WinPercentage_2,Experience_2,DrawPercentage_2,LossPercentage_2
0,Marcin Tybura,Tom Aspinall,1,"6' 3""",249.0,78.0,0.75,32,0.0,0.25,"6' 5""",256.0,78.0,0.8125,16,0.0,0.1875
1,Jake Collier,Tom Aspinall,1,"6' 3""",230.0,78.0,0.590909,22,0.0,0.409091,"6' 5""",256.0,78.0,0.8125,16,0.0,0.1875
2,Curtis Blaydes,Tom Aspinall,0,"6' 4""",265.0,80.0,0.809524,21,0.0,0.190476,"6' 5""",256.0,78.0,0.8125,16,0.0,0.1875
3,Alan Baudot,Tom Aspinall,1,"6' 3""",243.0,79.0,0.666667,12,0.0,0.333333,"6' 5""",256.0,78.0,0.8125,16,0.0,0.1875
4,Andrei Arlovski,Tom Aspinall,1,"6' 3""",240.0,77.0,0.607143,56,0.0,0.392857,"6' 5""",256.0,78.0,0.8125,16,0.0,0.1875


In [17]:
# Making all the heights into inches
fight_data['HT_1'] = fight_data['HT_1'].apply(convert_height)
fight_data['HT_2'] = fight_data['HT_2'].apply(convert_height)

In [18]:
# LASTY NO NULL VALUES
fight_data.dropna(inplace=True)
fight_data.tail()

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Experience_1,DrawPercentage_1,LossPercentage_1,HT_2,WT_2,Reach_2,WinPercentage_2,Experience_2,DrawPercentage_2,LossPercentage_2
7000,Jess Liaudin,David Bielkheden,1,69.0,170.0,72.0,0.645161,31,0.0,0.354839,70.0,155.0,71.0,0.666667,36,0.0,0.333333
7002,Jason Lambert,Terry Martin,0,75.0,185.0,75.0,0.666667,39,0.0,0.333333,68.0,185.0,71.0,0.676471,34,0.0,0.323529
7007,Rameau Thierry Sokoudjou,Kazuhiro Nakamura,0,70.0,205.0,78.0,0.5,36,0.0,0.5,71.0,205.0,70.0,0.617647,34,0.0,0.382353
7009,James Irvin,Mike Kyle,1,74.0,205.0,75.0,0.62963,27,0.0,0.37037,76.0,205.0,77.0,0.594595,37,0.027027,0.378378
7052,Phil Baroni,Matt Lindland,1,69.0,170.0,72.0,0.454545,33,0.0,0.545455,72.0,185.0,74.0,0.709677,31,0.0,0.290323


**SUCCESS**

# Caluclate the differences

Lastly to make it easier on our neural network we just want to see the difference from fighter_1 minus fighter_2

In [19]:
HeightDiff = fight_data['HT_1'] - fight_data['HT_2']
WeightDiff = fight_data['WT_1'] - fight_data['WT_2']
ReachDiff = fight_data['Reach_1'] - fight_data['Reach_2']
WinDiff = fight_data['WinPercentage_1'] - fight_data['WinPercentage_2']
ExpDiff = fight_data['Experience_1'] - fight_data['Experience_2']
DrawDiff = fight_data['DrawPercentage_1'] - fight_data['DrawPercentage_2']
LossDiff = fight_data['LossPercentage_1'] - fight_data['LossPercentage_2']

In [20]:
final_data = pd.DataFrame({'HeightDiff': HeightDiff, 'WeightDiff': WeightDiff, 'ReachDiff': ReachDiff,
                           'WinDiff': WinDiff, 'ExpDiff': ExpDiff, 'DrawDiff': DrawDiff, 'LossDiff':LossDiff, 'Winner': fight_data['Winner']})
final_data

Unnamed: 0,HeightDiff,WeightDiff,ReachDiff,WinDiff,ExpDiff,DrawDiff,LossDiff,Winner
0,-2.0,-7.0,0.0,-0.062500,16,0.000000,0.062500,1
1,-2.0,-26.0,0.0,-0.221591,6,0.000000,0.221591,1
2,-1.0,9.0,2.0,-0.002976,5,0.000000,0.002976,0
3,-2.0,-13.0,1.0,-0.145833,-4,0.000000,0.145833,1
4,-2.0,-16.0,-1.0,-0.205357,40,0.000000,0.205357,1
...,...,...,...,...,...,...,...,...
7000,-1.0,15.0,1.0,-0.021505,-5,0.000000,0.021505,1
7002,7.0,0.0,4.0,-0.009804,5,0.000000,0.009804,0
7007,-1.0,0.0,8.0,-0.117647,2,0.000000,0.117647,0
7009,-2.0,0.0,-2.0,0.035035,-10,-0.027027,-0.008008,1


In [21]:
final_data['Winner'].value_counts()

Winner
0    3128
1    3049
Name: count, dtype: int64

In [22]:
final_data.to_csv('ufc_fights.csv', index=False)

# SUCCESS

It appears all has worked and the data is ready to be trained! now lets hope I get good results