# Jonathan Halverson
# Monday, March 27, 2017
# Part 13: Simple predictive models based on rules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('halverson')
from scipy.stats import binom

In [2]:
iofile = 'data/fightmetric_cards/fightmetric_fights_CLEAN_3-6-2017.csv'
fights = pd.read_csv(iofile, header=0, parse_dates=['Date'])
fights.head(3)

Unnamed: 0,Winner,Outcome,Loser,WeightClass,Method,MethodNotes,Round,Time,Event,Date,Location
0,Germaine de Randamie,def.,Holly Holm,Women's Featherweight,U-DEC,,5,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,"New York, New York, USA"
1,Anderson Silva,def.,Derek Brunson,Middleweight,U-DEC,,3,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,"New York, New York, USA"
2,Jacare Souza,def.,Tim Boetsch,Middleweight,SUB,Kimura,1,3:41,UFC 208: Holm vs. De Randamie,2017-02-11,"New York, New York, USA"


In [3]:
iofile = 'data/ufc_name_education.csv'
ufc = pd.read_csv(iofile, header=0)
ufc.head(3)

Unnamed: 0,Name,Education
0,Jim Alers,1
1,Corey Anderson,1
2,Rich Attonito,1


In [4]:
iofile = 'data/fightmetric_fighters_with_corrections_from_UFC_Wikipedia_CLEAN.csv'
fighters = pd.read_csv(iofile, header=0, parse_dates=['Dob'])
cols = ['Name', 'Height', 'Reach', 'LegReach', 'Stance', 'Dob']
df = fights.merge(fighters[cols], how='left', left_on='Winner', right_on='Name')
df = df.merge(fighters[cols], how='left', left_on='Loser', right_on='Name', suffixes=('', '_L'))
df = df.drop(['Name', 'Name_L'], axis=1)
df = df.merge(ufc, left_on='Winner', right_on='Name', how='left')
df = df.merge(ufc, left_on='Loser', right_on='Name', how='left', suffixes=('', '_L'))
df.Education = df.Education.fillna(0.0)
df.Education_L = df.Education_L.fillna(0.0)
df = df.drop(['Name', 'Name_L'], axis=1)
df.head(3)

Unnamed: 0,Winner,Outcome,Loser,WeightClass,Method,MethodNotes,Round,Time,Event,Date,...,LegReach,Stance,Dob,Height_L,Reach_L,LegReach_L,Stance_L,Dob_L,Education,Education_L
0,Germaine de Randamie,def.,Holly Holm,Women's Featherweight,U-DEC,,5,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,...,41.0,Orthodox,1984-04-24,68.0,69.0,38.0,Southpaw,1981-10-17,0.0,0.0
1,Anderson Silva,def.,Derek Brunson,Middleweight,U-DEC,,3,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,...,42.0,Southpaw,1975-04-14,73.0,77.0,43.0,Southpaw,1984-01-04,0.0,1.0
2,Jacare Souza,def.,Tim Boetsch,Middleweight,SUB,Kimura,1,3:41,UFC 208: Holm vs. De Randamie,2017-02-11,...,41.0,Orthodox,1979-12-07,72.0,74.0,42.0,Orthodox,1981-01-28,0.0,1.0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4068 entries, 0 to 4067
Data columns (total 23 columns):
Winner         4068 non-null object
Outcome        4068 non-null object
Loser          4068 non-null object
WeightClass    4068 non-null object
Method         4068 non-null object
MethodNotes    2159 non-null object
Round          4068 non-null int64
Time           4068 non-null object
Event          4068 non-null object
Date           4068 non-null datetime64[ns]
Location       4068 non-null object
Height         4064 non-null float64
Reach          3801 non-null float64
LegReach       2315 non-null float64
Stance         3947 non-null object
Dob            4012 non-null datetime64[ns]
Height_L       4048 non-null float64
Reach_L        3473 non-null float64
LegReach_L     1565 non-null float64
Stance_L       3894 non-null object
Dob_L          3913 non-null datetime64[ns]
Education      4068 non-null float64
Education_L    4068 non-null float64
dtypes: datetime64[ns](3), float64

Filter fights to defeats since January 1, 2005:

In [5]:
fs = df[(df.Date > pd.to_datetime('2005-01-01')) & (df.Outcome == 'def.')]
fs.shape[0]

3561

### Younger wins

In [6]:
yw = fs[pd.notnull(fs.Dob) & pd.notnull(fs.Dob_L) & (fs.Dob != fs.Dob_L)]
wins = yw[yw.Dob > yw.Dob_L].shape[0]
total = yw.shape[0]
wins, total, wins / float(total), fs.shape[0] - total

(1964, 3529, 0.5565315953527912, 32)

In [7]:
2 * binom.cdf(p=0.5, k=min(wins, total - wins), n=total)

1.9906913544237038e-11

### Longer reach wins

In [8]:
lr = fs[pd.notnull(fs.Reach) & pd.notnull(fs.Reach_L) & (fs.Reach != fs.Reach_L)]
wins = lr[lr.Reach > lr.Reach_L].shape[0]
total = lr.shape[0]
wins, total, wins / float(total), fs.shape[0] - total

(1499, 2863, 0.5235766678309466, 698)

In [9]:
2 * binom.cdf(p=0.5, k=min(wins, total - wins), n=total)

0.012254511782245889

What if we don't exclude nulls?

In [10]:
lr = fs[fs.Reach != fs.Reach_L]
wins = lr[lr.Reach > lr.Reach_L].shape[0]
total = lr.shape[0]
wins, total, wins / float(total), fs.shape[0] - total

(1499, 3138, 0.47769279796048436, 423)

In [11]:
71.0 > np.nan

False

In [12]:
71.0 == np.nan

False

In [13]:
lr[['Reach', 'Reach_L']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3138 entries, 0 to 3619
Data columns (total 2 columns):
Reach      3115 non-null float64
Reach_L    2877 non-null float64
dtypes: float64(2)
memory usage: 73.5 KB


We see that nulls in Reach_L lead to Falses which increased the total which gave a win ratio less than 1/2.

### Taller wins

In [14]:
ht = fs[pd.notnull(fs.Height) & pd.notnull(fs.Height_L) & (fs.Height != fs.Height_L)]
wins = ht[ht.Height > ht.Height_L].shape[0]
total = ht.shape[0]
wins, total, wins / float(total), fs.shape[0] - total

(1486, 2903, 0.511884257664485, 658)

In [15]:
2 * binom.cdf(p=0.5, k=min(wins, total - wins), n=total)

0.20691383522689766

### Stance

In [16]:
st = fs[pd.notnull(fs.Stance) & pd.notnull(fs.Stance_L) & (((fs.Stance == 'Orthodox') & (fs.Stance_L == 'Southpaw')) | ((fs.Stance == 'Southpaw') & (fs.Stance_L == 'Orthodox')))]
wins = st[st.Stance == 'Southpaw'].shape[0]
total = st.shape[0]
wins, total, wins / float(total), fs.shape[0] - total

(561, 1010, 0.5554455445544555, 2551)

In [17]:
2 * binom.cdf(p=0.5, k=min(wins, total - wins), n=total)

0.00047237811629372023

### Education

In [18]:
ed = fs[pd.notnull(fs.Education) & pd.notnull(fs.Education_L) & (fs.Education != fs.Education_L)]
wins = ed[ed.Education == 1].shape[0]
total = ed.shape[0]
wins, total, wins / float(total), fs.shape[0] - total

(594, 1051, 0.5651760228353948, 2510)

In [19]:
2 * binom.cdf(p=0.5, k=min(wins, total - wins), n=total)

2.6619836969235326e-05

### Leg reach

In [21]:
lg = fs[pd.notnull(fs.LegReach) & pd.notnull(fs.LegReach_L) & (fs.LegReach != fs.LegReach_L)]
wins = lg[lg.LegReach > lg.LegReach_L].shape[0]
total = lg.shape[0]
wins, total, wins / float(total), fs.shape[0] - total

(487, 993, 0.4904330312185297, 2568)

In [22]:
2 * binom.cdf(p=0.5, k=min(wins, total - wins), n=total)

0.56787738449981351

### Overall win ratio

In [45]:
fs_fighters = fs.Winner.append(fs.Loser).value_counts().to_frame()
wins = fs.Winner.value_counts().to_frame()
loses = fs.Loser.value_counts().to_frame()
fs_fighters = fs_fighters.merge(wins, left_index=True, right_index=True, how='left')
fs_fighters = fs_fighters.merge(loses, left_index=True, right_index=True, how='left')
fs_fighters = fs_fighters.fillna(0.0)
fs_fighters.columns = ['Total', 'Wins', 'Losses']
fs_fighters['WinRatio'] = fs_fighters.Wins / (fs_fighters.Wins + fs_fighters.Losses)

In [51]:
ws = fs.merge(fs_fighters[['Total', 'WinRatio']], left_on='Winner', right_index=True, how='left')
ws = ws.merge(fs_fighters[['Total', 'WinRatio']], left_on='Loser', right_index=True, how='left', suffixes=('', '_L'))
ws.head(3)

Unnamed: 0,Winner,Outcome,Loser,WeightClass,Method,MethodNotes,Round,Time,Event,Date,...,Reach_L,LegReach_L,Stance_L,Dob_L,Education,Education_L,Total,WinRatio,Total_L,WinRatio_L
0,Germaine de Randamie,def.,Holly Holm,Women's Featherweight,U-DEC,,5,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,...,69.0,38.0,Southpaw,1981-10-17,0.0,0.0,5,0.8,6,0.5
1,Anderson Silva,def.,Derek Brunson,Middleweight,U-DEC,,3,5:00,UFC 208: Holm vs. De Randamie,2017-02-11,...,77.0,43.0,Southpaw,1984-01-04,0.0,1.0,21,0.809524,10,0.7
2,Jacare Souza,def.,Tim Boetsch,Middleweight,SUB,Kimura,1,3:41,UFC 208: Holm vs. De Randamie,2017-02-11,...,74.0,42.0,Orthodox,1981-01-28,0.0,1.0,8,0.875,21,0.52381


In [52]:
ws = ws[pd.notnull(ws.WinRatio) & pd.notnull(ws.WinRatio_L) & (ws.Total > 5) & (ws.Total_L > 5)]
wins = ws[ws.WinRatio > ws.WinRatio_L].shape[0]
total = ws.shape[0]
wins, total, wins / float(total), ws.shape[0] - total

(1198, 1693, 0.7076196101594802, 0)

### Experience