In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import percentileofscore

## Dixon-Coles-style Poisson regression to estimate team offensive and defensive strengths

In [2]:
def parse_team(string):
    """Extract the team name from a dummy-variable column label.

    The label is expected to consist of two underscore-separated prefix tokens
    followed by the team name, e.g. ``"team_name_Akron"`` or
    ``"opponent_name_Akron"``.

    Parameters
    ----------
    string : str
        Column label to parse.

    Returns
    -------
    str
        Everything after the second underscore, including any underscores
        that are part of the team name itself.

    Raises
    ------
    IndexError
        If ``string`` contains fewer than two underscores.
    """
    # maxsplit=2 keeps a team name that itself contains "_" in one piece;
    # an unbounded split would silently truncate it at the next underscore.
    return string.split("_", 2)[2]

In [3]:
def results_to_strengths(results, num_teams=None):
    """Turn fitted model coefficients into a table of team strengths.

    Coefficients are picked by their label prefix instead of by position, so
    the result does not depend on how many team, opponent or other (location,
    intercept, ...) parameters the model contains, nor on their order.

    Parameters
    ----------
    results : object
        Fitted model results whose ``params`` attribute is a pandas Series
        indexed by design-matrix column names. Labels starting with
        ``"team_name_"`` are treated as attack coefficients and labels starting
        with ``"opponent_name_"`` as defence coefficients.
    num_teams : int, optional
        Not used any more; accepted only so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``team``, ``att_strength`` and ``def_strength``. The two
        coefficient sets are inner-joined on ``team``, so a team that lacks
        either coefficient (for instance a reference level that was dropped
        when the dummies were built) does not appear.
    """
    params = results.params
    labels = [str(label) for label in params.index]

    def _coefficients(prefix, value_column):
        # One row per parameter whose label starts with `prefix`; the team name
        # is the remainder of the label after the prefix.
        keep = [label.startswith(prefix) for label in labels]
        picked = params.loc[keep]
        teams = [label[len(prefix):] for label, wanted in zip(labels, keep) if wanted]
        return pd.DataFrame(
            {'team': teams, value_column: picked.values},
            columns=['team', value_column],
        )

    offense = _coefficients('team_name_', 'att_strength')
    defense = _coefficients('opponent_name_', 'def_strength')
    return pd.merge(offense, defense, on='team')

### 2017

In [4]:
# Game results from the 2017 file of the octonion/soccer-m repository
games17 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa/csv/ncaa_games_2017.csv"
)

In [5]:
# Preview the first five rows of the raw game table
games17.head(n=5)

Unnamed: 0,year,team_name,team_id,opponent_name,opponent_id,game_date,team_score,opponent_score,location,neutral_site_location,game_length,attendance
0,2017,Academy of Art,30123,Cal St. East Bay,98.0,09/02/2016,0,3,Away,,-,116
1,2017,Academy of Art,30123,Stanislaus St.,103.0,09/08/2016,2,2,Home,,2 OT,81
2,2017,Academy of Art,30123,San Fran. St.,628.0,09/10/2016,0,5,Away,,-,248
3,2017,Academy of Art,30123,Cal St. Monterey Bay,30055.0,09/12/2016,0,1,Home,,-,58
4,2017,Academy of Art,30123,Concordia Portland,30200.0,09/16/2016,1,6,Home,,-,34


In [6]:
# Rename a handful of schools (presumably so that team_name and opponent_name
# spell each school the same way -- verify against the raw file).
games17['team_name'] = games17['team_name'].replace({
    'NC State': 'North Carolina St.',
    'Detroit Mercy': 'Detroit',
    "Saint Mary's (CA)": "St. Mary's (CA)",
})
games17['opponent_name'] = games17['opponent_name'].replace({
    'Seattle': 'Seattle U',
})

In [7]:
# Team list for 2017; the file is tab-delimited (read_table's default
# separator), so the delimiter is spelled out explicitly here.
d1_teams17 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa_pbp/csv/ncaa_teams_2017_1.csv",
    sep="\t",
)

In [8]:
# Preview the first five rows of the team list
d1_teams17.head(n=5)

Unnamed: 0,sport_code,year,year_id,division_id,team_id,team_name,team_url
0,MSO,2017,12440,1,721,Air Force,http://stats.ncaa.org/team/721/12440
1,MSO,2017,12440,1,5,Akron,http://stats.ncaa.org/team/5/12440
2,MSO,2017,12440,1,14,Albany (NY),http://stats.ncaa.org/team/14/12440
3,MSO,2017,12440,1,23,American,http://stats.ncaa.org/team/23/12440
4,MSO,2017,12440,1,27,Appalachian St.,http://stats.ncaa.org/team/27/12440


In [9]:
# Keep only games in which both the team and its opponent appear in the team list
d1_games17 = games17.loc[
    games17['team_id'].isin(d1_teams17['team_id'])
    & games17['opponent_id'].isin(d1_teams17['team_id'])
]

In [10]:
# Response: goals scored by `team_name` in each row.
y = d1_games17['team_score']
# Predictors: dummy-coded team, opponent and location (first level of each
# dropped as the reference). Newer pandas returns boolean dummies; mixed with
# the numeric intercept they become an object array that statsmodels rejects,
# so cast everything to float up front.
X = pd.get_dummies(
    d1_games17[['team_name', 'opponent_name', 'location']],
    drop_first=True,
).astype(float)
X['intercept'] = 1.0

In [11]:
# Poisson GLM of goals scored on the dummy-coded design matrix
poisson_model = sm.GLM(endog=y, exog=X, family=sm.families.Poisson())

In [12]:
# Fit the Poisson GLM and print the full regression summary
# (fit statistics followed by one coefficient row per design-matrix column).
poiss_results = poisson_model.fit()
print(poiss_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             team_score   No. Observations:                 3854
Model:                            GLM   Df Residuals:                     3441
Model Family:                 Poisson   Df Model:                          412
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -5136.6
Date:                Wed, 10 Jan 2018   Deviance:                       3523.4
Time:                        22:32:01   Pearson chi2:                 3.08e+03
No. Iterations:                     8                                         
                                         coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
team_name_Akron                        0.2789      0.226      1.236      0.216     

In [13]:
# The team count should describe the team_name column, because that is the
# column the attack dummies were built from; counting team_id instead gives
# the wrong number whenever ids and names are not one-to-one.
ts = results_to_strengths(poiss_results, d1_games17['team_name'].nunique())
# Persist this season's strengths for the combination step
ts.to_csv("ncaa_2017_team_strengths.csv", index=False)

### 2016

In [14]:
# Game results from the 2016 file of the octonion/soccer-m repository
games16 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa/csv/ncaa_games_2016.csv"
)

In [15]:
# Rename a couple of opponents (presumably so that opponent_name matches the
# spelling used in team_name -- verify against the raw file).
games16['opponent_name'] = games16['opponent_name'].replace({
    'UTRGV': 'Tex.-Pan American',
    'ETSU': 'East Tenn. St.',
})

In [16]:
# Team list for 2016; read with a tab delimiter, which is read_table's default
d1_teams16 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa_pbp/csv/ncaa_teams_2016_1.csv",
    sep="\t",
)

In [17]:
# Keep only games in which both the team and its opponent appear in the team list
d1_games16 = games16.loc[
    games16['team_id'].isin(d1_teams16['team_id'])
    & games16['opponent_id'].isin(d1_teams16['team_id'])
]

In [18]:
# Response: goals scored by `team_name` in each row.
y = d1_games16['team_score']
# Predictors: dummy-coded team, opponent and location (first level of each
# dropped as the reference). Newer pandas returns boolean dummies; mixed with
# the numeric intercept they become an object array that statsmodels rejects,
# so cast everything to float up front.
X = pd.get_dummies(
    d1_games16[['team_name', 'opponent_name', 'location']],
    drop_first=True,
).astype(float)
X['intercept'] = 1.0

In [19]:
# Poisson GLM of goals scored on the dummy-coded design matrix
poisson_model = sm.GLM(endog=y, exog=X, family=sm.families.Poisson())
# Fit, then print the full regression summary
poiss_results = poisson_model.fit()
print(poiss_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             team_score   No. Observations:                 3860
Model:                            GLM   Df Residuals:                     3447
Model Family:                 Poisson   Df Model:                          412
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -5237.4
Date:                Wed, 10 Jan 2018   Deviance:                       3692.9
Time:                        22:32:12   Pearson chi2:                 3.21e+03
No. Iterations:                     8                                         
                                         coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
team_name_Akron                        1.0126      0.278      3.640      0.000     

In [20]:
# The team count should describe the team_name column, because that is the
# column the attack dummies were built from; counting team_id instead gives
# the wrong number whenever ids and names are not one-to-one.
ts = results_to_strengths(poiss_results, d1_games16['team_name'].nunique())
# Persist this season's strengths for the combination step
ts.to_csv("ncaa_2016_team_strengths.csv", index=False)

### 2015

In [21]:
# Game results from the 2015 file of the octonion/soccer-m repository
games15 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa/csv/ncaa_games_2015.csv"
)

In [22]:
# Rename a handful of schools (presumably so that team_name and opponent_name
# spell each school the same way -- verify against the raw file).
games15['opponent_name'] = games15['opponent_name'].replace({
    'UNC Greensboro': 'UNCG',
    'Bakersfield': 'CSU Bakersfield',
})
games15['team_name'] = games15['team_name'].replace({
    'Army West Point': 'Army',
})

In [23]:
# Team list for 2015; read with a tab delimiter, which is read_table's default
d1_teams15 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa_pbp/csv/ncaa_teams_2015_1.csv",
    sep="\t",
)

In [24]:
# Keep only games in which both the team and its opponent appear in the team list
d1_games15 = games15.loc[
    games15['team_id'].isin(d1_teams15['team_id'])
    & games15['opponent_id'].isin(d1_teams15['team_id'])
]

In [25]:
# Response: goals scored by `team_name` in each row.
y = d1_games15['team_score']
# Predictors: dummy-coded team, opponent and location (first level of each
# dropped as the reference). Newer pandas returns boolean dummies; mixed with
# the numeric intercept they become an object array that statsmodels rejects,
# so cast everything to float up front.
X = pd.get_dummies(
    d1_games15[['team_name', 'opponent_name', 'location']],
    drop_first=True,
).astype(float)
X['intercept'] = 1.0

In [26]:
# Poisson GLM of goals scored on the dummy-coded design matrix
poisson_model = sm.GLM(endog=y, exog=X, family=sm.families.Poisson())
# Fit, then print the full regression summary
poiss_results = poisson_model.fit()
print(poiss_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             team_score   No. Observations:                 3884
Model:                            GLM   Df Residuals:                     3472
Model Family:                 Poisson   Df Model:                          411
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -5170.5
Date:                Wed, 10 Jan 2018   Deviance:                       3725.2
Time:                        22:32:18   Pearson chi2:                 3.24e+03
No. Iterations:                     8                                         
                                         coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
team_name_Akron                        0.5394      0.291      1.851      0.064     

In [27]:
# The team count should describe the team_name column, because that is the
# column the attack dummies were built from; counting team_id instead gives
# the wrong number whenever ids and names are not one-to-one.
ts = results_to_strengths(poiss_results, d1_games15['team_name'].nunique())
# Persist this season's strengths for the combination step
ts.to_csv("ncaa_2015_team_strengths.csv", index=False)

### 2014

In [28]:
# Game results from the 2014 file of the octonion/soccer-m repository
games14 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa/csv/ncaa_games_2014.csv"
)

In [29]:
# Rename a handful of schools (presumably so that team_name and opponent_name
# spell each school the same way -- verify against the raw file).
games14['opponent_name'] = games14['opponent_name'].replace({
    'UNC Greensboro': 'UNCG',
    'Bakersfield': 'CSU Bakersfield',
    'UNC Wilmington': 'UNCW',
    'Neb. Omaha': 'Omaha',
    'SIU Edwardsville': 'SIUE',
    'Cal St. Northridge': 'CSUN',
})
games14['team_name'] = games14['team_name'].replace({
    'Army West Point': 'Army',
})

In [30]:
# Team list for 2014; read with a tab delimiter, which is read_table's default
d1_teams14 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa_pbp/csv/ncaa_teams_2014_1.csv",
    sep="\t",
)

In [31]:
# Keep only games in which both the team and its opponent appear in the team list
d1_games14 = games14.loc[
    games14['team_id'].isin(d1_teams14['team_id'])
    & games14['opponent_id'].isin(d1_teams14['team_id'])
]

In [32]:
# Response: goals scored by `team_name` in each row.
y = d1_games14['team_score']
# Predictors: dummy-coded team, opponent and location (first level of each
# dropped as the reference). Newer pandas returns boolean dummies; mixed with
# the numeric intercept they become an object array that statsmodels rejects,
# so cast everything to float up front.
X = pd.get_dummies(
    d1_games14[['team_name', 'opponent_name', 'location']],
    drop_first=True,
).astype(float)
X['intercept'] = 1.0

In [33]:
# Poisson GLM of goals scored on the dummy-coded design matrix
poisson_model = sm.GLM(endog=y, exog=X, family=sm.families.Poisson())
# Fit, then print the full regression summary
poiss_results = poisson_model.fit()
print(poiss_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             team_score   No. Observations:                 3836
Model:                            GLM   Df Residuals:                     3430
Model Family:                 Poisson   Df Model:                          405
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -5119.0
Date:                Wed, 10 Jan 2018   Deviance:                       3578.4
Time:                        22:32:23   Pearson chi2:                 3.07e+03
No. Iterations:                     8                                         
                                         coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
team_name_Akron                        0.8615      0.308      2.799      0.005     

In [34]:
# The team count should describe the team_name column, because that is the
# column the attack dummies were built from; counting team_id instead gives
# the wrong number whenever ids and names are not one-to-one.
ts = results_to_strengths(poiss_results, d1_games14['team_name'].nunique())
# Persist this season's strengths for the combination step
ts.to_csv("ncaa_2014_team_strengths.csv", index=False)

### 2012

In [35]:
# Game results from the 2012 file of the octonion/soccer-m repository
games12 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa/csv/ncaa_games_2012.csv"
)

In [36]:
# Rename a handful of schools (presumably so that team_name and opponent_name
# spell each school the same way -- verify against the raw file).
games12['opponent_name'] = games12['opponent_name'].replace({
    'UNC Greensboro': 'UNCG',
    'Bakersfield': 'CSU Bakersfield',
    'UNC Wilmington': 'UNCW',
    'Neb. Omaha': 'Omaha',
    'SIU Edwardsville': 'SIUE',
    'Cal St. Northridge': 'CSUN',
    'Fla. Gulf Coast': 'FGCU',
    'Connecticut': 'UConn',
    'S.C. Upstate': 'USC Upstate',
})
games12['team_name'] = games12['team_name'].replace({
    'Army West Point': 'Army',
    'St. Francis Brooklyn': 'St. Francis (NY)',
})

In [37]:
# Team list for 2012; read with a tab delimiter, which is read_table's default
d1_teams12 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa_pbp/csv/ncaa_teams_2012_1.csv",
    sep="\t",
)

In [38]:
# Keep only games in which both the team and its opponent appear in the team list
d1_games12 = games12.loc[
    games12['team_id'].isin(d1_teams12['team_id'])
    & games12['opponent_id'].isin(d1_teams12['team_id'])
]

In [39]:
# Response: goals scored by `team_name` in each row.
y = d1_games12['team_score']
# Predictors: dummy-coded team, opponent and location (first level of each
# dropped as the reference). Newer pandas returns boolean dummies; mixed with
# the numeric intercept they become an object array that statsmodels rejects,
# so cast everything to float up front.
X = pd.get_dummies(
    d1_games12[['team_name', 'opponent_name', 'location']],
    drop_first=True,
).astype(float)
X['intercept'] = 1.0

In [40]:
# Poisson GLM of goals scored on the dummy-coded design matrix
poisson_model = sm.GLM(endog=y, exog=X, family=sm.families.Poisson())
# Fit, then print the full regression summary
poiss_results = poisson_model.fit()
print(poiss_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             team_score   No. Observations:                 3876
Model:                            GLM   Df Residuals:                     3469
Model Family:                 Poisson   Df Model:                          406
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -5236.1
Date:                Wed, 10 Jan 2018   Deviance:                       3679.6
Time:                        22:32:29   Pearson chi2:                 3.14e+03
No. Iterations:                     9                                         
                                        coef    std err          z      P>|z|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------------------
team_name_Air Force                   0.5355      0.281      1.909      0.056        

In [41]:
# The team count should describe the team_name column, because that is the
# column the attack dummies were built from; counting team_id instead gives
# the wrong number whenever ids and names are not one-to-one.
ts = results_to_strengths(poiss_results, d1_games12['team_name'].nunique())
# Persist this season's strengths for the combination step
ts.to_csv("ncaa_2012_team_strengths.csv", index=False)

### 2018

In [42]:
# Game results from the 2018 file of the octonion/soccer-m repository
games18 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa/csv/ncaa_games_2018.csv"
)

In [43]:
# Team list for 2018; read with a tab delimiter, which is read_table's default
d1_teams18 = pd.read_csv(
    "https://raw.githubusercontent.com/octonion/soccer-m/master/ncaa_pbp/csv/ncaa_teams_2018_1.csv",
    sep="\t",
)

In [44]:
# Keep only games in which both the team and its opponent appear in the team list
d1_games18 = games18.loc[
    games18['team_id'].isin(d1_teams18['team_id'])
    & games18['opponent_id'].isin(d1_teams18['team_id'])
]

In [45]:
# Response: goals scored by `team_name` in each row.
y = d1_games18['team_score']
# Predictors: dummy-coded team, opponent and location (first level of each
# dropped as the reference). Newer pandas returns boolean dummies; mixed with
# the numeric intercept they become an object array that statsmodels rejects,
# so cast everything to float up front.
X = pd.get_dummies(
    d1_games18[['team_name', 'opponent_name', 'location']],
    drop_first=True,
).astype(float)
X['intercept'] = 1.0

In [46]:
# Poisson GLM of goals scored on the dummy-coded design matrix
poisson_model = sm.GLM(endog=y, exog=X, family=sm.families.Poisson())
# Fit, then print the full regression summary
poiss_results = poisson_model.fit()
print(poiss_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:             team_score   No. Observations:                 3790
Model:                            GLM   Df Residuals:                     3379
Model Family:                 Poisson   Df Model:                          410
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:                -5129.0
Date:                Wed, 10 Jan 2018   Deviance:                       3640.1
Time:                        22:32:35   Pearson chi2:                 3.15e+03
No. Iterations:                     8                                         
                                         coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------
team_name_Akron                        0.5253      0.236      2.225      0.026     

In [47]:
# The team count should describe the team_name column, because that is the
# column the attack dummies were built from; counting team_id instead gives
# the wrong number whenever ids and names are not one-to-one.
ts = results_to_strengths(poiss_results, d1_games18['team_name'].nunique())
# Persist this season's strengths for the combination step
ts.to_csv("ncaa_2018_team_strengths.csv", index=False)

### Combine team strengths into one file

In [48]:
# Reload the per-season strength tables, oldest season first
strengths = [
    pd.read_csv("ncaa_" + str(season) + "_team_strengths.csv")
    for season in (2012, 2014, 2015, 2016, 2017, 2018)
]
# Row count of each table, in the same order as `strengths`
team_nums = [len(table) for table in strengths]

Since the base level (the reference team dropped by `drop_first=True`) changes from year to year, these strengths aren't on the same scale across seasons, which will lead to biased coefficient estimates when seasons are combined. So let's transform those values into within-season percentiles:

In [49]:
# Convert each season's strengths to within-season percentiles.
# Series.rank(pct=True) is the average-rank fraction, i.e. the same quantity
# percentileofscore(..., kind='rank') returns once scaled by 100 (up to
# floating-point rounding), computed in one vectorised pass instead of one
# full scan of the column per team.
for df in strengths:
    df['att_perc'] = df['att_strength'].rank(pct=True) * 100
    # def_strength is the opponent coefficient, so a lower value means fewer
    # goals conceded; ranking in descending order is equivalent to ranking the
    # negated values and gives the best defence the highest percentile.
    df['def_perc'] = df['def_strength'].rank(ascending=False, pct=True) * 100

In [50]:
# Stack the per-season tables row-wise, keeping the order of `strengths`
combined_strengths = pd.concat(strengths, axis='index')

In [51]:
# Label each row with its season: every year is repeated once per row of that
# year's table, matching the row order produced by the concatenation.
combined_strengths['season'] = np.repeat(
    a=[2012, 2014, 2015, 2016, 2017, 2018],
    repeats=team_nums,
)

In [52]:
# Preview the first five rows of the combined table
combined_strengths.head(n=5)

Unnamed: 0,team,att_strength,def_strength,att_perc,def_perc,season
0,Air Force,0.535512,-0.400902,83.663366,83.168317,2012
1,Akron,0.835316,-0.69873,97.029703,95.544554,2012
2,Albany (NY),0.196379,0.298272,43.069307,17.326733,2012
3,American,-0.02925,0.058263,24.752475,41.584158,2012
4,Appalachian St.,-0.050153,-0.690867,22.277228,95.049505,2012


In [53]:
# Write the combined multi-season table to disk without the index column
combined_strengths.to_csv(path_or_buf="ncaa_team_strengths.csv", index=False)