# The Next Step in our process is Organizing

We have all the fights ever and all the fighters ever, so next we need to append all the fighters statistics to each fight and then make it easier for PyTorch to understand.

In [1]:
import pandas as pd # Our beloved to work with data
import matplotlib.pyplot as plt # In case we need to see anything
import numpy as np # In case we need to do some math

# Making the Columns

### For the Fighters Dataset

* We need the win percentage instead of their record
* We need their first and last name in one column
* We do not need their nickname, win, loss, draw columns, and eventually not their first and last name columns so drop those

In [2]:
fighters = pd.read_csv('fighters.csv')

In [3]:
# Win Percentage Calulcations
fighters['WinPercentage'] = fighters['Win'] / (fighters['Win'] + fighters['Loss'] + fighters['Draw'])
# Getting rid of unnecessary columns
fighters.drop(columns=['Draw', 'Loss', 'Win', 'Nickname'], inplace=True)

In [4]:
fighters.head()

Unnamed: 0,First,Last,HT,WT,Reach,WinPercentage
0,Tom,Aaron,--,155 lbs.,--,0.625
1,Danny,Abbadi,"5' 11""",155 lbs.,--,0.4
2,Nariman,Abbasov,"5' 8""",155 lbs.,"66.0""",0.875
3,David,Abbott,"6' 0""",265 lbs.,--,0.4
4,Hamdy,Abdelwahab,"6' 2""",264 lbs.,"72.0""",1.0


In [5]:
# Combining first and last names
fighters['Name'] = fighters['First'] + ' ' + fighters['Last']
fighters.drop(columns=['First', 'Last'], inplace=True)
fighters.head()

Unnamed: 0,HT,WT,Reach,WinPercentage,Name
0,--,155 lbs.,--,0.625,Tom Aaron
1,"5' 11""",155 lbs.,--,0.4,Danny Abbadi
2,"5' 8""",155 lbs.,"66.0""",0.875,Nariman Abbasov
3,"6' 0""",265 lbs.,--,0.4,David Abbott
4,"6' 2""",264 lbs.,"72.0""",1.0,Hamdy Abdelwahab


### For the fights dataset

* Need to rearrange the order sometimes so the left column isnt always the winner and also so that we can keep track of that

In [6]:
fights = pd.read_csv('fights.csv')
fights.head()

Unnamed: 0,Winners,Losers
0,Tom Aspinall,Marcin Tybura
1,Julija Stoliarenko,Molly McCann
2,Nathaniel Wood,Andre Fili
3,Paul Craig,Andre Muniz
4,Fares Ziam,Jai Herbert


In [7]:
# Creating a new column for the winner
fighter_1 = []
fighter_2 = []
winner = []

# Randomizing and keeping track of the winner
for index, row in fights.iterrows():
    # random number either 0 or 1
    number = np.random.randint(0, 2)
    
    # If its zero then we do not need to switch the winner
    if number == 0:
        fighter_1.append(row['Winners'])
        fighter_2.append(row['Losers'])
        winner.append(0)
        
    # If its one then we need to switch the order  
    else:
        fighter_1.append(row['Losers'])
        fighter_2.append(row['Winners'])
        winner.append(1)

# Making a new dataframe with the new columns
organized = pd.DataFrame({'Fighter 1': fighter_1, 'Fighter 2': fighter_2, 'Winner': winner})
organized.head()

Unnamed: 0,Fighter 1,Fighter 2,Winner
0,Tom Aspinall,Marcin Tybura,0
1,Julija Stoliarenko,Molly McCann,0
2,Nathaniel Wood,Andre Fili,0
3,Paul Craig,Andre Muniz,0
4,Jai Herbert,Fares Ziam,1


#### SUCCESS

# Assigning everything

We in short need to add all the players STATS to each fight, which sounds very intimidating (at least it did to me) BUT maybe it is not.

In [8]:
fighters_1 = fighters.add_suffix('_1')
fighters_2 = fighters.add_suffix('_2')

In [9]:
# I think its working? I am not 100% confident but I think it is
attempt_1 = pd.merge(organized, fighters_1, left_on='Fighter 1', right_on='Name_1')
attempt_1

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Name_1
0,Tom Aspinall,Marcin Tybura,0,"6' 5""",256 lbs.,"78.0""",0.812500,Tom Aspinall
1,Tom Aspinall,Alan Baudot,0,"6' 5""",256 lbs.,"78.0""",0.812500,Tom Aspinall
2,Julija Stoliarenko,Molly McCann,0,"5' 7""",125 lbs.,"66.0""",0.523810,Julija Stoliarenko
3,Julija Stoliarenko,Alexis Davis,1,"5' 7""",125 lbs.,"66.0""",0.523810,Julija Stoliarenko
4,Julija Stoliarenko,Julia Avila,1,"5' 7""",125 lbs.,"66.0""",0.523810,Julija Stoliarenko
...,...,...,...,...,...,...,...,...
7241,Scott Morris,Sean Daugherty,0,"5' 10""",210 lbs.,--,0.666667,Scott Morris
7242,Minoki Ichihara,Royce Gracie,1,"5' 7""",178 lbs.,--,0.000000,Minoki Ichihara
7243,Alberta Cerra Leon,Remco Pardoel,1,"5' 8""",238 lbs.,--,0.000000,Alberta Cerra Leon
7244,Robert Lucarelli,Orlando Wiet,1,"6' 2""",245 lbs.,--,0.000000,Robert Lucarelli


In [10]:
attempt_2 = pd.merge(attempt_1, fighters_2, left_on='Fighter 2', right_on='Name_2')
attempt_2

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,Name_1,HT_2,WT_2,Reach_2,WinPercentage_2,Name_2
0,Tom Aspinall,Marcin Tybura,0,"6' 5""",256 lbs.,"78.0""",0.812500,Tom Aspinall,"6' 3""",249 lbs.,"78.0""",0.75,Marcin Tybura
1,Blagoy Ivanov,Marcin Tybura,1,"5' 11""",250 lbs.,"73.0""",0.760000,Blagoy Ivanov,"6' 3""",249 lbs.,"78.0""",0.75,Marcin Tybura
2,Andrei Arlovski,Marcin Tybura,1,"6' 3""",240 lbs.,"77.0""",0.607143,Andrei Arlovski,"6' 3""",249 lbs.,"78.0""",0.75,Marcin Tybura
3,Serghei Spivac,Marcin Tybura,1,"6' 3""",260 lbs.,"78.0""",0.842105,Serghei Spivac,"6' 3""",249 lbs.,"78.0""",0.75,Marcin Tybura
4,Alexandr Romanov,Marcin Tybura,1,"6' 2""",262 lbs.,"75.0""",0.894737,Alexandr Romanov,"6' 3""",249 lbs.,"78.0""",0.75,Marcin Tybura
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7242,Joe Charles,Kevin Rosier,0,"6' 1""",260 lbs.,--,0.315789,Joe Charles,"6' 4""",275 lbs.,--,0.25,Kevin Rosier
7243,Johnny Rhodes,Fred Ettish,0,"6' 0""",210 lbs.,--,0.666667,Johnny Rhodes,"6' 0""",180 lbs.,--,0.50,Fred Ettish
7244,Johnny Rhodes,David Levicki,0,"6' 0""",210 lbs.,--,0.666667,Johnny Rhodes,"6' 5""",275 lbs.,--,0.25,David Levicki
7245,Scott Morris,Sean Daugherty,0,"5' 10""",210 lbs.,--,0.666667,Scott Morris,"6' 0""",175 lbs.,--,0.00,Sean Daugherty


In [11]:
fight_data = attempt_2.drop(columns=['Name_1', 'Name_2'])
fight_data.head()

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,HT_2,WT_2,Reach_2,WinPercentage_2
0,Tom Aspinall,Marcin Tybura,0,"6' 5""",256 lbs.,"78.0""",0.8125,"6' 3""",249 lbs.,"78.0""",0.75
1,Blagoy Ivanov,Marcin Tybura,1,"5' 11""",250 lbs.,"73.0""",0.76,"6' 3""",249 lbs.,"78.0""",0.75
2,Andrei Arlovski,Marcin Tybura,1,"6' 3""",240 lbs.,"77.0""",0.607143,"6' 3""",249 lbs.,"78.0""",0.75
3,Serghei Spivac,Marcin Tybura,1,"6' 3""",260 lbs.,"78.0""",0.842105,"6' 3""",249 lbs.,"78.0""",0.75
4,Alexandr Romanov,Marcin Tybura,1,"6' 2""",262 lbs.,"75.0""",0.894737,"6' 3""",249 lbs.,"78.0""",0.75


# Cleaning

* replace all '--' values with 'NaN'
* convert feet to inches
* get rid of any row that has NaN value

In [12]:
# Getting rid of the '--' values cause there are stupid and annoying
for column in fight_data.columns:
    fight_data[column] = fight_data[column].replace({'--': np.nan})

In [23]:
# Height Conversion Function
def convert_height(height):
    
    # Ignore it if it already happened
    if type(height) == float:
        return height
    
    # Otherwise strip the feet and inches and convert it to inches
    else:
        feet_inches = height.split("'")
        feet = float(feet_inches[0])
        inches = float(feet_inches[1].replace('"', ''))
        return (feet * 12) + inches

In [14]:
# Doing the easier stuff that I know will work first whicfh means no more words in the numbers
fight_data['WT_1'] = fight_data['WT_1'].str.replace(' lbs.', '').astype(float)
fight_data['WT_2'] = fight_data['WT_2'].str.replace(' lbs.', '').astype(float)

fight_data['Reach_1'] = fight_data['Reach_1'].str.replace('"', '').astype(float)
fight_data['Reach_2'] = fight_data['Reach_2'].str.replace('"', '').astype(float)

In [15]:
fight_data.head()

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,HT_2,WT_2,Reach_2,WinPercentage_2
0,Tom Aspinall,Marcin Tybura,0,"6' 5""",256.0,78.0,0.8125,"6' 3""",249.0,78.0,0.75
1,Blagoy Ivanov,Marcin Tybura,1,"5' 11""",250.0,73.0,0.76,"6' 3""",249.0,78.0,0.75
2,Andrei Arlovski,Marcin Tybura,1,"6' 3""",240.0,77.0,0.607143,"6' 3""",249.0,78.0,0.75
3,Serghei Spivac,Marcin Tybura,1,"6' 3""",260.0,78.0,0.842105,"6' 3""",249.0,78.0,0.75
4,Alexandr Romanov,Marcin Tybura,1,"6' 2""",262.0,75.0,0.894737,"6' 3""",249.0,78.0,0.75


In [24]:
# Making all the heights into inches
fight_data['HT_1'] = fight_data['HT_1'].apply(convert_height)
fight_data['HT_2'] = fight_data['HT_2'].apply(convert_height)

In [29]:
# LASTY NO NULL VALUES
fight_data.dropna(inplace=True)
fight_data.tail()

Unnamed: 0,Fighter 1,Fighter 2,Winner,HT_1,WT_1,Reach_1,WinPercentage_1,HT_2,WT_2,Reach_2,WinPercentage_2
6999,Dale Hartt,Corey Hill,0,70.0,155.0,69.0,0.666667,76.0,155.0,80.0,0.4
7011,Eddie Sanchez,Justin McCully,1,74.0,230.0,74.0,0.652174,73.0,225.0,73.0,0.611111
7014,Luke Cummo,Luigi Fioravanti,1,72.0,170.0,74.0,0.5,68.0,170.0,70.0,0.634146
7019,Kazuhiro Nakamura,Rameau Thierry Sokoudjou,1,71.0,205.0,70.0,0.617647,70.0,205.0,78.0,0.5
7026,David Heath,Renato Sobral,1,71.0,205.0,72.0,0.72,73.0,205.0,75.0,0.770833


**SUCCESS**

# Caluclate the differences

Lastly to make it easier on our neural network we just want to see the difference from fighter_1 minus fighter_2

In [30]:
HeightDiff = fight_data['HT_1'] - fight_data['HT_2']
WeightDiff = fight_data['WT_1'] - fight_data['WT_2']
ReachDiff = fight_data['Reach_1'] - fight_data['Reach_2']
WinDiff = fight_data['WinPercentage_1'] - fight_data['WinPercentage_2']

In [32]:
final_data = pd.DataFrame({'HeightDiff': HeightDiff, 'WeightDiff': WeightDiff, 'ReachDiff': ReachDiff, 'WinDiff': WinDiff, 'Winner': fight_data['Winner']})
final_data

Unnamed: 0,HeightDiff,WeightDiff,ReachDiff,WinDiff,Winner
0,2.0,7.0,0.0,0.062500,0
1,-4.0,1.0,-5.0,0.010000,1
2,0.0,-9.0,-1.0,-0.142857,1
3,0.0,11.0,0.0,0.092105,1
4,-1.0,13.0,-3.0,0.144737,1
...,...,...,...,...,...
6999,-6.0,0.0,-11.0,0.266667,0
7011,1.0,5.0,1.0,0.041063,1
7014,4.0,0.0,4.0,-0.134146,1
7019,1.0,0.0,-8.0,0.117647,1


In [33]:
final_data['Winner'].value_counts()

Winner
1    3135
0    3042
Name: count, dtype: int64

In [34]:
final_data.to_csv('ufc_fights.csv', index=False)

# SUCCESS

It appears all has worked and the data is ready to be trained! now lets hope I get good results