# Jonathan Halverson
# Thursday, April 6, 2017
# Correcting the winner-loser order

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# NOTE(review): 'halverson' looks like a user-defined style sheet rather than one
# shipped with matplotlib -- presumably it must be installed locally; confirm.
plt.style.use('halverson')

In [2]:
# Widen the pandas display limits used when rendering the wide stats tables.
for option, value in (('display.width', 1000), ('display.max_columns', 100)):
    pd.set_option(option, value)

Read in the raw individual fight data. Note that Fighter1 is not necessarily the winner and Fighter2 may not be the loser. Join with fights.csv to get correct order.

In [3]:
# Raw per-fight statistics; the 'Date' column is parsed into datetimes on load.
iofile = 'data/fightmetric_individual_fights/detailed_stats_individual_fights_RAW.csv'
df = pd.read_csv(iofile, parse_dates=['Date'], header=0)
df.head(n=3)

Unnamed: 0,Date,Fighter1,Knockdowns1,SigStrikesLanded1,SigStrikesAttempted1,TotStrikesLanded1,TotStrikesAttempted1,TakedownLanded1,TakedownAttempted1,SubsAttempted1,Pass1,Reversal1,Fighter2,Knockdowns2,SigStrikesLanded2,SigStrikesAttempted2,TotStrikesLanded2,TotStrikesAttempted2,TakedownLanded2,TakedownAttempted2,SubsAttempted2,Pass2,Reversal2
0,2017-02-11,Holly Holm,0.0,77.0,182.0,122.0,235.0,0.0,9.0,0.0,0.0,0.0,Germaine de Randamie,0.0,80.0,150.0,144.0,223.0,0.0,1.0,0.0,0.0,0.0
1,2017-02-11,Anderson Silva,0.0,43.0,78.0,54.0,89.0,0.0,0.0,0.0,0.0,0.0,Derek Brunson,0.0,54.0,101.0,118.0,184.0,2.0,10.0,0.0,0.0,0.0
2,2017-02-11,Jacare Souza,0.0,7.0,18.0,15.0,30.0,1.0,2.0,1.0,2.0,0.0,Tim Boetsch,0.0,6.0,10.0,8.0,12.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Summarise column dtypes and non-null counts of the raw stats frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4068 entries, 0 to 4067
Data columns (total 23 columns):
Date                    4042 non-null datetime64[ns]
Fighter1                4042 non-null object
Knockdowns1             4042 non-null float64
SigStrikesLanded1       4042 non-null float64
SigStrikesAttempted1    4042 non-null float64
TotStrikesLanded1       4042 non-null float64
TotStrikesAttempted1    4042 non-null float64
TakedownLanded1         4042 non-null float64
TakedownAttempted1      4042 non-null float64
SubsAttempted1          4042 non-null float64
Pass1                   4042 non-null float64
Reversal1               4042 non-null float64
Fighter2                4042 non-null object
Knockdowns2             4042 non-null float64
SigStrikesLanded2       4042 non-null float64
SigStrikesAttempted2    4042 non-null float64
TotStrikesLanded2       4042 non-null float64
TotStrikesAttempted2    4042 non-null float64
TakedownLanded2         4042 non-null float64
TakedownAttemp

We see that 26 rows are null. These are the fights (of the 4068) that FightMetric did not analyze.

In [5]:
# Size of the subset of rows that have no Fighter1 entry.
df[df.Fighter1.isnull()].shape

(26, 23)

Rename the duplicate name:

In [6]:
# Rename the second instance of the duplicated name 'Dong Hyun Kim' to
# 'Dong Hyun Kim 2'. The three affected bouts are identified by the opponent
# and by which side of the row (Fighter1 or Fighter2) carries the name.
#
# DataFrame.set_value is deprecated (and removed in pandas 1.0), so the
# assignment is done with a boolean mask through .loc instead.
duplicate_name = 'Dong Hyun Kim'
second_instance = 'Dong Hyun Kim 2'
bouts_of_second_instance = [
    # (column holding the duplicate name, column holding the opponent, opponent)
    ('Fighter1', 'Fighter2', "Brendan O'Reilly"),
    ('Fighter2', 'Fighter1', 'Polo Reyes'),
    ('Fighter2', 'Fighter1', 'Dominique Steele'),
]
for name_col, opponent_col, opponent in bouts_of_second_instance:
    is_bout = (df[name_col] == duplicate_name) & (df[opponent_col] == opponent)
    df.loc[is_bout, name_col] = second_instance

In [7]:
# Display every stats row in which the renamed fighter appears on either side.
ftr = 'Dong Hyun Kim 2'
in_bout = df.Fighter1.eq(ftr) | df.Fighter2.eq(ftr)
df[in_bout]

Unnamed: 0,Date,Fighter1,Knockdowns1,SigStrikesLanded1,SigStrikesAttempted1,TotStrikesLanded1,TotStrikesAttempted1,TakedownLanded1,TakedownAttempted1,SubsAttempted1,Pass1,Reversal1,Fighter2,Knockdowns2,SigStrikesLanded2,SigStrikesAttempted2,TotStrikesLanded2,TotStrikesAttempted2,TakedownLanded2,TakedownAttempted2,SubsAttempted2,Pass2,Reversal2
101,2016-12-03,Dong Hyun Kim 2,0.0,21.0,32.0,98.0,116.0,4.0,4.0,0.0,8.0,1.0,Brendan O'Reilly,0.0,25.0,38.0,70.0,85.0,0.0,2.0,1.0,1.0,2.0
365,2016-06-04,Polo Reyes,2.0,135.0,299.0,147.0,312.0,0.0,0.0,0.0,2.0,0.0,Dong Hyun Kim 2,0.0,83.0,197.0,99.0,214.0,2.0,3.0,0.0,2.0,0.0
596,2015-11-28,Dominique Steele,1.0,36.0,61.0,84.0,114.0,4.0,4.0,0.0,1.0,0.0,Dong Hyun Kim 2,0.0,19.0,42.0,52.0,79.0,0.0,3.0,0.0,0.0,0.0


In [8]:
# Descriptive statistics (count, mean, std, quartiles, extremes) of the numeric columns.
df.describe()

Unnamed: 0,Knockdowns1,SigStrikesLanded1,SigStrikesAttempted1,TotStrikesLanded1,TotStrikesAttempted1,TakedownLanded1,TakedownAttempted1,SubsAttempted1,Pass1,Reversal1,Knockdowns2,SigStrikesLanded2,SigStrikesAttempted2,TotStrikesLanded2,TotStrikesAttempted2,TakedownLanded2,TakedownAttempted2,SubsAttempted2,Pass2,Reversal2
count,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0,4042.0
mean,0.258288,32.106383,72.081148,54.038347,96.610589,1.314201,2.973281,0.556408,1.478971,0.143741,0.164523,26.142751,65.084612,42.596734,83.513112,0.896338,2.728847,0.380257,0.827066,0.141267
std,0.526541,27.127688,62.866,44.256326,74.061143,1.854986,3.67402,0.99636,2.370043,0.432955,0.433557,25.201473,60.793013,37.986226,69.40816,1.508498,3.729893,0.83959,1.647636,0.425707
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,11.0,24.0,19.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,18.0,12.0,26.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,26.0,57.0,45.0,84.0,1.0,2.0,0.0,0.0,0.0,0.0,19.0,49.0,33.0,69.0,0.0,1.0,0.0,0.0,0.0
75%,0.0,46.0,101.0,77.0,142.0,2.0,4.0,1.0,2.0,0.0,0.0,38.0,95.75,63.0,126.0,1.0,4.0,0.0,1.0,0.0
max,5.0,238.0,454.0,361.0,464.0,21.0,27.0,10.0,26.0,5.0,3.0,206.0,495.0,355.0,497.0,12.0,33.0,7.0,14.0,3.0


Drop the null rows:

In [9]:
# Discard every row that contains a missing value, then report the remaining size.
df = df.dropna(axis=0, how='any')
df.shape

(4042, 23)

### Load the fights

In [10]:
# Fight results (Winner / Loser / Outcome ...); 'Date' is parsed into datetimes on load.
iofile = 'data/fightmetric_cards/fightmetric_fights_CLEAN_3-6-2017.csv'
fights = pd.read_csv(iofile, parse_dates=['Date'], header=0)
fights.head(n=3)

Unnamed: 0,Winner,Outcome,Loser,WeightClass,Method,MethodNotes,Round,Time,Event,Date,Location
0,Germaine de Randamie,def.,Holly Holm,Women's Featherweight,U-DEC,,5,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,"New York, New York, USA"
1,Anderson Silva,def.,Derek Brunson,Middleweight,U-DEC,,3,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,"New York, New York, USA"
2,Jacare Souza,def.,Tim Boetsch,Middleweight,SUB,Kimura,1,3:41,UFC 208: Holm vs. De Randamie,2017-02-11,"New York, New York, USA"


In [11]:
# (rows, columns) of the fights table.
fights.shape

(4068, 11)

What are the 26 fights that were not analyzed?

In [12]:
# Collect the fights that have no matching row in the stats frame.
# A stats row matches a fight when the date agrees and the two fighters agree
# in either order (Fighter1/Fighter2 are not necessarily winner/loser).
#
# The (date, fighter, fighter) triples of df are stored once in a set, in both
# orientations, so each fight needs a single membership test instead of a full
# boolean scan of df.
analyzed_bouts = set()
for date, first, second in zip(df.Date, df.Fighter1, df.Fighter2):
    analyzed_bouts.add((date, first, second))
    analyzed_bouts.add((date, second, first))

not_analyzed = []
for index, row in fights.iterrows():
    if (row['Date'], row['Winner'], row['Loser']) not in analyzed_bouts:
        not_analyzed.append(row.values)
pd.DataFrame(not_analyzed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Ian Freeman,def.,Nate Schroeder,Heavyweight,KO/TKO,Knees,2,2:13,UFC 26: Ultimate Field Of Dreams,2000-06-09,"Cedar Rapids, Iowa, USA"
1,Shonie Carter,def.,Adrian Serrano,Welterweight,U-DEC,,2,5:00,UFC 26: Ultimate Field Of Dreams,2000-06-09,"Cedar Rapids, Iowa, USA"
2,Travis Fulton,def.,David Dodd,Heavyweight,U-DEC,,2,5:00,UFC 21: Return of the Champions,1999-07-16,"Cedar Rapids, Iowa, USA"
3,Tulio Palhares,def.,Adriano Santos,Middleweight,KO/TKO,,1,9:00,UFC - Ultimate Brazil,1998-10-16,"Sao Paulo, Brazil"
4,Cesar Marscucci,def.,Paulo Santos,Lightweight,KO/TKO,,1,0:27,UFC - Ultimate Brazil,1998-10-16,"Sao Paulo, Brazil"
5,Andre Roberts,def.,Harry Moskowitz,Heavyweight,U-DEC,,2,3:00,UFC 17: Redemption,1998-05-15,"Mobile, Alabama, USA"
6,Chris Brennan,def.,Courtney Turner,Lightweight,SUB,Armbar,1,1:20,UFC 16: Battle in the Bayou,1998-03-13,"New Orleans, Louisiana, USA"
7,Laverne Clark,def.,Josh Stewart,Lightweight,KO/TKO,,1,1:15,UFC 16: Battle in the Bayou,1998-03-13,"New Orleans, Louisiana, USA"
8,Justin Martin,def.,Eric Martin,Heavyweight,SUB,Heel Hook,1,0:14,UFC 12: Judgement Day,1997-02-07,"Dothan, Alabama, USA"
9,Nick Sanzo,def.,Jackie Lee,Lightweight,KO/TKO,,1,0:48,UFC 12: Judgement Day,1997-02-07,"Dothan, Alabama, USA"


We will drop the above fights when doing the analysis.

### Find the winner and loser by joining the stats dataframe with the fights dataframe:

In [13]:
# List every fight in which this fighter is recorded as winner or loser.
ftr = 'Kazushi Sakuraba'
fights[fights[['Winner', 'Loser']].eq(ftr).any(axis=1)]

Unnamed: 0,Winner,Outcome,Loser,WeightClass,Method,MethodNotes,Round,Time,Event,Date,Location
3906,Kazushi Sakuraba,def.,Marcus Silveira,Heavyweight,SUB,Armbar,1,3:44,UFC - Ultimate Japan,1997-12-21,"Yokohama, Kanagawa, Japan"
3909,Kazushi Sakuraba,no contest,Marcus Silveira,Heavyweight,Overturned,,1,1:51,UFC - Ultimate Japan,1997-12-21,"Yokohama, Kanagawa, Japan"


In [14]:
# Re-order each stats row so that the Fighter1 block belongs to the winner and
# the Fighter2 block to the loser, using the fights table as the reference.

# Number of fights recorded per (date, winner, loser) triple. One dict lookup
# per stats row replaces a boolean scan of the whole fights frame.
bout_counts = {}
for key in zip(fights.Date, fights.Winner, fights.Loser):
    bout_counts[key] = bout_counts.get(key, 0) + 1

xf = []
for index, row in df.iterrows():
    values = list(row.values)
    date, fighter1, fighter2 = row['Date'], row['Fighter1'], row['Fighter2']
    x = bout_counts.get((date, fighter1, fighter2), 0)  # Fighter1 is the winner
    y = bout_counts.get((date, fighter2, fighter1), 0)  # Fighter2 is the winner
    if x == 1:
        # already in winner-loser order
        xf.append(values)
    elif y == 1:
        # Swap the two fighter blocks: position 0 is Date, positions 1-11 are
        # Fighter1 plus its stats, positions 12-22 are Fighter2 plus its stats.
        xf.append(values[:1] + values[12:23] + values[1:12])
    else:
        # Reached whenever neither orientation matches exactly one fight; the
        # row is kept unchanged. The message names the only case seen in this
        # data set. print(...) with one argument runs on Python 2 and 3 alike.
        print('Sakuraba fought Silveira twice in one night')
        xf.append(values)

Sakuraba fought Silveira twice in one night
Sakuraba fought Silveira twice in one night


In [15]:
# Build a DataFrame from the re-ordered rows, reusing the column labels of df.
# Note that the name xf is rebound here from the list of rows to the new frame.
xf = pd.DataFrame(xf, columns=df.columns)
xf.head(10)

Unnamed: 0,Date,Fighter1,Knockdowns1,SigStrikesLanded1,SigStrikesAttempted1,TotStrikesLanded1,TotStrikesAttempted1,TakedownLanded1,TakedownAttempted1,SubsAttempted1,Pass1,Reversal1,Fighter2,Knockdowns2,SigStrikesLanded2,SigStrikesAttempted2,TotStrikesLanded2,TotStrikesAttempted2,TakedownLanded2,TakedownAttempted2,SubsAttempted2,Pass2,Reversal2
0,2017-02-11,Germaine de Randamie,0.0,80.0,150.0,144.0,223.0,0.0,1.0,0.0,0.0,0.0,Holly Holm,0.0,77.0,182.0,122.0,235.0,0.0,9.0,0.0,0.0,0.0
1,2017-02-11,Anderson Silva,0.0,43.0,78.0,54.0,89.0,0.0,0.0,0.0,0.0,0.0,Derek Brunson,0.0,54.0,101.0,118.0,184.0,2.0,10.0,0.0,0.0,0.0
2,2017-02-11,Jacare Souza,0.0,7.0,18.0,15.0,30.0,1.0,2.0,1.0,2.0,0.0,Tim Boetsch,0.0,6.0,10.0,8.0,12.0,0.0,0.0,0.0,0.0,0.0
3,2017-02-11,Glover Teixeira,0.0,24.0,53.0,113.0,179.0,3.0,3.0,1.0,11.0,0.0,Jared Cannonier,0.0,21.0,46.0,40.0,66.0,0.0,0.0,0.0,0.0,0.0
4,2017-02-11,Dustin Poirier,0.0,97.0,216.0,122.0,243.0,3.0,3.0,0.0,1.0,1.0,Jim Miller,0.0,71.0,135.0,83.0,150.0,1.0,6.0,3.0,1.0,0.0
5,2017-02-11,Belal Muhammad,0.0,71.0,152.0,135.0,236.0,2.0,5.0,0.0,2.0,0.0,Randy Brown,0.0,52.0,145.0,65.0,158.0,0.0,0.0,0.0,0.0,1.0
6,2017-02-11,Wilson Reis,0.0,34.0,84.0,52.0,107.0,6.0,9.0,2.0,6.0,0.0,Ulka Sasaki,0.0,34.0,105.0,69.0,145.0,0.0,1.0,0.0,2.0,2.0
7,2017-02-11,Islam Makhachev,0.0,43.0,58.0,98.0,127.0,5.0,5.0,0.0,12.0,0.0,Nik Lentz,0.0,13.0,35.0,35.0,61.0,0.0,2.0,1.0,0.0,0.0
8,2017-02-11,Rick Glenn,0.0,52.0,115.0,60.0,123.0,0.0,11.0,0.0,0.0,0.0,Phillipe Nover,0.0,50.0,102.0,108.0,165.0,0.0,2.0,0.0,0.0,0.0
9,2017-02-11,Ryan LaFlare,1.0,29.0,91.0,51.0,116.0,1.0,4.0,0.0,2.0,0.0,Roan Carneiro,0.0,19.0,50.0,25.0,56.0,0.0,4.0,0.0,2.0,0.0


In [16]:
# (rows, columns) of the re-ordered stats frame.
xf.shape

(4042, 23)

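Note that fx has 2 rows too many (4044 instead of 4042) because Sakuraba and Silveira met twice on the same date, so each of their two fights matches both of their stats rows: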

In [17]:
# Joining on the winner's name alone pairs every fight with every stats row of
# that winner, so the intermediate frame is very large; afterwards keep only
# the pairs whose date and loser agree as well.
fx = fights.merge(xf, how='left', left_on='Winner', right_on='Fighter1')
same_date = fx.Date_x == fx.Date_y
same_loser = fx.Loser == fx.Fighter2
fx = fx[same_date & same_loser]
fx.shape

(4044, 34)

In [18]:
# Inspect the merged rows involving this fighter (as winner or loser).
ftr = 'Kazushi Sakuraba'
fx.loc[fx.Winner.eq(ftr) | fx.Loser.eq(ftr)]

Unnamed: 0,Winner,Outcome,Loser,WeightClass,Method,MethodNotes,Round,Time,Event,Date_x,Location,Date_y,Fighter1,Knockdowns1,SigStrikesLanded1,SigStrikesAttempted1,TotStrikesLanded1,TotStrikesAttempted1,TakedownLanded1,TakedownAttempted1,SubsAttempted1,Pass1,Reversal1,Fighter2,Knockdowns2,SigStrikesLanded2,SigStrikesAttempted2,TotStrikesLanded2,TotStrikesAttempted2,TakedownLanded2,TakedownAttempted2,SubsAttempted2,Pass2,Reversal2
27535,Kazushi Sakuraba,def.,Marcus Silveira,Heavyweight,SUB,Armbar,1,3:44,UFC - Ultimate Japan,1997-12-21,"Yokohama, Kanagawa, Japan",1997-12-21,Kazushi Sakuraba,0.0,1.0,2.0,2.0,3.0,1.0,1.0,2.0,1.0,0.0,Marcus Silveira,0.0,1.0,2.0,11.0,13.0,1.0,1.0,1.0,0.0,0.0
27536,Kazushi Sakuraba,def.,Marcus Silveira,Heavyweight,SUB,Armbar,1,3:44,UFC - Ultimate Japan,1997-12-21,"Yokohama, Kanagawa, Japan",1997-12-21,Kazushi Sakuraba,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,Marcus Silveira,0.0,4.0,10.0,9.0,15.0,1.0,1.0,1.0,0.0,0.0
27556,Kazushi Sakuraba,no contest,Marcus Silveira,Heavyweight,Overturned,,1,1:51,UFC - Ultimate Japan,1997-12-21,"Yokohama, Kanagawa, Japan",1997-12-21,Kazushi Sakuraba,0.0,1.0,2.0,2.0,3.0,1.0,1.0,2.0,1.0,0.0,Marcus Silveira,0.0,1.0,2.0,11.0,13.0,1.0,1.0,1.0,0.0,0.0
27557,Kazushi Sakuraba,no contest,Marcus Silveira,Heavyweight,Overturned,,1,1:51,UFC - Ultimate Japan,1997-12-21,"Yokohama, Kanagawa, Japan",1997-12-21,Kazushi Sakuraba,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,Marcus Silveira,0.0,4.0,10.0,9.0,15.0,1.0,1.0,1.0,0.0,0.0


In [19]:
# Remove the mismatched fight/stats pairings for this fighter: the 'def.' row
# whose SigStrikesLanded1 is 0 and the 'no contest' row whose SigStrikesLanded1
# is 1. Relies on ftr still holding the fighter name assigned earlier.
is_ftr = fx.Winner == ftr
wrong_for_win = is_ftr & (fx.Outcome == 'def.') & (fx.SigStrikesLanded1 == 0)
wrong_for_no_contest = is_ftr & (fx.Outcome == 'no contest') & (fx.SigStrikesLanded1 == 1)
fx = fx.drop(fx[wrong_for_win | wrong_for_no_contest].index, axis=0)
fx.shape

(4042, 34)

In [20]:
# Re-inspect this fighter's rows after the drop.
ftr = 'Kazushi Sakuraba'
involves_ftr = (fx.Winner == ftr) | (fx.Loser == ftr)
fx[involves_ftr]

Unnamed: 0,Winner,Outcome,Loser,WeightClass,Method,MethodNotes,Round,Time,Event,Date_x,Location,Date_y,Fighter1,Knockdowns1,SigStrikesLanded1,SigStrikesAttempted1,TotStrikesLanded1,TotStrikesAttempted1,TakedownLanded1,TakedownAttempted1,SubsAttempted1,Pass1,Reversal1,Fighter2,Knockdowns2,SigStrikesLanded2,SigStrikesAttempted2,TotStrikesLanded2,TotStrikesAttempted2,TakedownLanded2,TakedownAttempted2,SubsAttempted2,Pass2,Reversal2
27535,Kazushi Sakuraba,def.,Marcus Silveira,Heavyweight,SUB,Armbar,1,3:44,UFC - Ultimate Japan,1997-12-21,"Yokohama, Kanagawa, Japan",1997-12-21,Kazushi Sakuraba,0.0,1.0,2.0,2.0,3.0,1.0,1.0,2.0,1.0,0.0,Marcus Silveira,0.0,1.0,2.0,11.0,13.0,1.0,1.0,1.0,0.0,0.0
27557,Kazushi Sakuraba,no contest,Marcus Silveira,Heavyweight,Overturned,,1,1:51,UFC - Ultimate Japan,1997-12-21,"Yokohama, Kanagawa, Japan",1997-12-21,Kazushi Sakuraba,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,Marcus Silveira,0.0,4.0,10.0,9.0,15.0,1.0,1.0,1.0,0.0,0.0


In [21]:
# Drop the join helper columns: the fighter names duplicate Winner/Loser and
# Date_y duplicates Date_x after the filtering done above.
fx = fx.drop(['Fighter1', 'Fighter2', 'Date_y'], axis=1)


def _winner_loser_label(column):
    """Map a merged column label to its final name.

    'Date_x' becomes 'Date'; a trailing '1' is removed; a trailing '2' is
    replaced by '_L'; every other label is returned unchanged. Only the final
    character is examined, so a '1' or '2' elsewhere in a label is left alone
    (a plain str.replace would rewrite those as well).
    """
    if column == 'Date_x':
        return 'Date'
    if column.endswith('1'):
        return column[:-1]
    if column.endswith('2'):
        return column[:-1] + '_L'
    return column


fx = fx.rename(columns=_winner_loser_label)
fx.head(3)

Unnamed: 0,Winner,Outcome,Loser,WeightClass,Method,MethodNotes,Round,Time,Event,Date,Location,Knockdowns,SigStrikesLanded,SigStrikesAttempted,TotStrikesLanded,TotStrikesAttempted,TakedownLanded,TakedownAttempted,SubsAttempted,Pass,Reversal,Knockdowns_L,SigStrikesLanded_L,SigStrikesAttempted_L,TotStrikesLanded_L,TotStrikesAttempted_L,TakedownLanded_L,TakedownAttempted_L,SubsAttempted_L,Pass_L,Reversal_L
0,Germaine de Randamie,def.,Holly Holm,Women's Featherweight,U-DEC,,5,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,"New York, New York, USA",0.0,80.0,150.0,144.0,223.0,0.0,1.0,0.0,0.0,0.0,0.0,77.0,182.0,122.0,235.0,0.0,9.0,0.0,0.0,0.0
4,Anderson Silva,def.,Derek Brunson,Middleweight,U-DEC,,3,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,"New York, New York, USA",0.0,43.0,78.0,54.0,89.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,101.0,118.0,184.0,2.0,10.0,0.0,0.0,0.0
22,Jacare Souza,def.,Tim Boetsch,Middleweight,SUB,Kimura,1,3:41,UFC 208: Holm vs. De Randamie,2017-02-11,"New York, New York, USA",0.0,7.0,18.0,15.0,30.0,1.0,2.0,1.0,2.0,0.0,0.0,6.0,10.0,8.0,12.0,0.0,0.0,0.0,0.0,0.0


Now everything is good so write to file:

In [22]:
# Persist the winner/loser-ordered stats without the DataFrame index.
outfile = 'data/fightmetric_individual_fights/detailed_stats_individual_fights_FINAL.csv'
fx.to_csv(outfile, index=False)

Nothing is null in the stats columns:

In [23]:
# Rows whose Knockdowns value is missing (an empty frame means none are).
fx[fx.Knockdowns.isnull()]

Unnamed: 0,Winner,Outcome,Loser,WeightClass,Method,MethodNotes,Round,Time,Event,Date,Location,Knockdowns,SigStrikesLanded,SigStrikesAttempted,TotStrikesLanded,TotStrikesAttempted,TakedownLanded,TakedownAttempted,SubsAttempted,Pass,Reversal,Knockdowns_L,SigStrikesLanded_L,SigStrikesAttempted_L,TotStrikesLanded_L,TotStrikesAttempted_L,TakedownLanded_L,TakedownAttempted_L,SubsAttempted_L,Pass_L,Reversal_L
