# Gleeful Consulting

In [1]:
import pandas as pd
import statsmodels.formula.api as sm

# Workbook path is relative to the directory the notebook is run from.
location = "Datasets/GleeClubData.xlsx"

# Read the workbook into a DataFrame and preview the first five rows.
glee = pd.read_excel(io=location)
glee.head(n=5)

Unnamed: 0,team size,male/female ratio of team,enrollment in school,male/female ratio of School,population of district served by school,# of practices per month,# of competitions before regionals,Regionals Score
0,9,1.1,469,0.5,11432,1,3,3.160421
1,12,0.9,2028,1.1,64874,12,7,4.78919
2,23,1.4,1393,1.4,13843,10,3,3.676977
3,17,0.9,425,0.8,40959,9,0,3.641177
4,37,0.5,2111,1.0,26203,15,4,4.164019


In [2]:
## rename variables
# Each original column is copied under a short, formula-friendly name.
# The long-named originals are kept, so the frame ends up holding both sets.

short_names = [
    ('team size', 'tm_size'),
    ('male/female ratio of team', 'mf_ratio_team'),
    ('enrollment in school', 'enrolled'),
    ('male/female ratio of School', 'mf_ratio_school'),
    ('population of district served by school', 'pop_district'),
    ('# of practices per month', 'prac_per_mnth'),
    ('# of competitions before regionals', 'num_comps'),
    ('Regionals Score', 'r_score'),
]

for long_name, short_name in short_names:
    glee[short_name] = glee[long_name]

glee.head()

Unnamed: 0,team size,male/female ratio of team,enrollment in school,male/female ratio of School,population of district served by school,# of practices per month,# of competitions before regionals,Regionals Score,tm_size,mf_ratio_team,enrolled,mf_ratio_school,pop_district,prac_per_mnth,num_comps,r_score
0,9,1.1,469,0.5,11432,1,3,3.160421,9,1.1,469,0.5,11432,1,3,3.160421
1,12,0.9,2028,1.1,64874,12,7,4.78919,12,0.9,2028,1.1,64874,12,7,4.78919
2,23,1.4,1393,1.4,13843,10,3,3.676977,23,1.4,1393,1.4,13843,10,3,3.676977
3,17,0.9,425,0.8,40959,9,0,3.641177,17,0.9,425,0.8,40959,9,0,3.641177
4,37,0.5,2111,1.0,26203,15,4,4.164019,37,0.5,2111,1.0,26203,15,4,4.164019


In [3]:
## create new dataset with renamed variables
# Keep only the short-named columns; the long-named originals are left behind in `glee`.

renamed_cols = [
    'tm_size',
    'mf_ratio_team',
    'enrolled',
    'mf_ratio_school',
    'pop_district',
    'prac_per_mnth',
    'num_comps',
    'r_score',
]
newglee = glee[renamed_cols]
newglee.head()

Unnamed: 0,tm_size,mf_ratio_team,enrolled,mf_ratio_school,pop_district,prac_per_mnth,num_comps,r_score
0,9,1.1,469,0.5,11432,1,3,3.160421
1,12,0.9,2028,1.1,64874,12,7,4.78919
2,23,1.4,1393,1.4,13843,10,3,3.676977
3,17,0.9,425,0.8,40959,9,0,3.641177
4,37,0.5,2111,1.0,26203,15,4,4.164019


In [4]:
## run correlation matrix
# Pairwise correlation of every column against every other column.
# 'pearson' is the pandas default, spelled out here for the reader.

newglee.corr(method='pearson')

Unnamed: 0,tm_size,mf_ratio_team,enrolled,mf_ratio_school,pop_district,prac_per_mnth,num_comps,r_score
tm_size,1.0,0.039261,-0.002032,-0.027581,-0.027813,0.01757,-0.017728,0.037383
mf_ratio_team,0.039261,1.0,0.014302,-0.022603,-0.043413,-0.039693,0.06083,-0.019025
enrolled,-0.002032,0.014302,1.0,-0.007546,0.063731,-0.043782,-0.076423,0.031495
mf_ratio_school,-0.027581,-0.022603,-0.007546,1.0,0.030516,0.002079,-0.048233,0.004502
pop_district,-0.027813,-0.043413,0.063731,0.030516,1.0,0.018413,-0.083559,0.437973
prac_per_mnth,0.01757,-0.039693,-0.043782,0.002079,0.018413,1.0,0.028975,0.634882
num_comps,-0.017728,0.06083,-0.076423,-0.048233,-0.083559,0.028975,1.0,0.474863
r_score,0.037383,-0.019025,0.031495,0.004502,0.437973,0.634882,0.474863,1.0


# Relevant Factors

The correlation matrix shows that the variables most strongly correlated with Regionals Score are: Population of District Served by School (r = .44), Number of Practices per Month (r = .63), and Number of Competitions before Regionals (r = .47).

In [5]:
## Running Linear Regression Model with Intercept (AdjR2 = .83)
# All seven predictors are regressed on r_score; no '- 1' term is given,
# so the fitted model includes an Intercept row (see the summary below).

formula_with_intercept = (
    'r_score ~ '
    'tm_size + mf_ratio_team + enrolled + mf_ratio_school + '
    'pop_district + prac_per_mnth + num_comps'
)
model1 = sm.ols(formula=formula_with_intercept, data=newglee).fit()
model1.summary()

0,1,2,3
Dep. Variable:,r_score,R-squared:,0.836
Model:,OLS,Adj. R-squared:,0.832
Method:,Least Squares,F-statistic:,213.1
Date:,"Fri, 05 Oct 2018",Prob (F-statistic):,9.38e-111
Time:,22:56:51,Log-Likelihood:,145.71
No. Observations:,300,AIC:,-275.4
Df Residuals:,292,BIC:,-245.8
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9498,0.055,53.518,0.000,2.841,3.058
tm_size,0.0026,0.001,2.078,0.039,0.000,0.005
mf_ratio_team,-0.0091,0.028,-0.322,0.748,-0.065,0.047
enrolled,3.99e-05,1.41e-05,2.834,0.005,1.22e-05,6.76e-05
mf_ratio_school,0.0181,0.029,0.632,0.528,-0.038,0.074
pop_district,4.285e-06,2.2e-07,19.502,0.000,3.85e-06,4.72e-06
prac_per_mnth,0.0379,0.001,25.842,0.000,0.035,0.041
num_comps,0.0613,0.003,21.046,0.000,0.056,0.067

0,1,2,3
Omnibus:,26.857,Durbin-Watson:,1.808
Prob(Omnibus):,0.0,Jarque-Bera (JB):,105.904
Skew:,0.17,Prob(JB):,1.0099999999999999e-23
Kurtosis:,5.891,Cond. No.,448000.0


In [6]:
## Running Linear Regression Model without Intercept (AdjR2 = .98)
# The trailing '- 1' removes the intercept from the formula.
# NOTE(review): per the statsmodels docs, R-squared is computed against the
# uncentered total sum of squares when the constant is omitted, so the .98
# reported here is not directly comparable with the .83 of the intercept model.
# Compare Log-Likelihood / AIC in the two summaries before preferring this fit.

formula_no_intercept = (
    'r_score ~ '
    'tm_size + mf_ratio_team + enrolled + mf_ratio_school + '
    'pop_district + prac_per_mnth + num_comps - 1'
)
model2 = sm.ols(formula=formula_no_intercept, data=newglee).fit()
model2.summary()

0,1,2,3
Dep. Variable:,r_score,R-squared:,0.984
Model:,OLS,Adj. R-squared:,0.984
Method:,Least Squares,F-statistic:,2632.0
Date:,"Fri, 05 Oct 2018",Prob (F-statistic):,2.56e-260
Time:,22:57:08,Log-Likelihood:,-211.34
No. Observations:,300,AIC:,436.7
Df Residuals:,293,BIC:,462.6
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
tm_size,0.0309,0.004,8.341,0.000,0.024,0.038
mf_ratio_team,0.7463,0.081,9.263,0.000,0.588,0.905
enrolled,0.0003,4.38e-05,6.429,0.000,0.000,0.000
mf_ratio_school,0.8419,0.079,10.646,0.000,0.686,0.997
pop_district,6.846e-06,7.04e-07,9.728,0.000,5.46e-06,8.23e-06
prac_per_mnth,0.0588,0.005,12.673,0.000,0.050,0.068
num_comps,0.1007,0.009,10.893,0.000,0.083,0.119

0,1,2,3
Omnibus:,1.749,Durbin-Watson:,1.878
Prob(Omnibus):,0.417,Jarque-Bera (JB):,1.604
Skew:,-0.073,Prob(JB):,0.448
Kurtosis:,2.673,Cond. No.,213000.0


# Model 2 - Regression Equation

Regionals Score = (0.03 x Team Size) + (0.75 x MF Ratio Team) + (0.0003 x Enrollment in School) + (0.84 x MF Ratio School) + (0.00000685 x District Population) + (0.06 x Practices per Month) + (0.10 x Competitions before Regionals)