# Project: Levels of Football

Kaleb Sailer

This is the hypothesis testing file for all three levels of football.

In [2]:
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [3]:
# reading in dataframes 
hs_cfb_df = pd.read_csv('hs_cfb_df.csv', encoding='utf-8')
cfb_nfl_df = pd.read_csv('cfb_nfl_df.csv', encoding='utf-8')
hs_nfl_df = pd.read_csv('hs_nfl_df.csv', encoding='utf-8')
all_fb_df = pd.read_csv('all_fb.csv', encoding='utf-8')
all_fb_region_df = pd.read_csv('all_fb_region.csv', encoding='utf-8')

### Question 1: How do average (scoring margins, ppg, ypg, etc.) differ between each level of football?

In [5]:
# Group data by team level and calculate mean for ppg and ppg_a
level_ppg = all_fb_df.groupby('team_level')['ppg'].mean()
level_ppga = all_fb_df.groupby('team_level')['ppg_a'].mean()

# Create a DataFrame for the grouped data
ppg_df = pd.DataFrame({
    'team_level': level_ppg.index,
    'ppg': level_ppg.values,
    'ppg_a': level_ppga.values
})

# correlation test
pearson_corr, pearson_p = stats.pearsonr(ppg_df['ppg'], ppg_df['ppg_a'])

print(f"Pearson Correlation Coefficient: {pearson_corr:.4f}")

# Reshape for plotting
melted_ppg_df = ppg_df.melt(id_vars='team_level', var_name='Metric', value_name='Value')

# Perform ANOVA for ppg based on team_level
fit = ols("ppg ~ team_level", data=all_fb_df).fit()
anova_table = sm.stats.anova_lm(fit, typ=2)

# Extract p-value from the ANOVA table
p_value = anova_table["PR(>F)"].iloc[0]
print(anova_table)
print(f"P-value: {p_value:.8f}")

# Interpretation of ANOVA result
alpha = 0.05
if p_value < alpha:
    print("Reject H0: The average points per game are significantly different across team levels.")
else:
    print("Fail to reject H0: No significant difference in average points per game across team levels.")


# Perform ANOVA for ppg_a based on team_level
fit = ols("ppg_a ~ team_level", data=all_fb_df).fit()
anova_table = sm.stats.anova_lm(fit, typ=2)

# Extract p-value from the ANOVA table
p_value = anova_table["PR(>F)"].iloc[0]
print(anova_table)
print(f"P-value: {p_value:.8f}")

# Interpretation of ANOVA result
alpha = 0.05
if p_value < alpha:
    print("Reject H0: The average points per game allowed are significantly different across team levels.")
else:
    print("Fail to reject H0: No significant difference in average points per game allowed across team levels.")

Pearson Correlation Coefficient: 0.8475
                 sum_sq     df          F    PR(>F)
team_level   964.798335    2.0  11.400366  0.000021
Residual    8378.242660  198.0        NaN       NaN
P-value: 0.00002060
Reject H0: The average points per game are significantly different across team levels.
                 sum_sq     df         F    PR(>F)
team_level   468.507971    2.0  6.621026  0.001647
Residual    7005.302576  198.0       NaN       NaN
P-value: 0.00164663
Reject H0: The average points per game allowed are significantly different across team levels.


### Question 2: Win % (Geographic, Conference, etc.)

In [7]:
# Perform ANOVA for win_pct based on state
fit_state = ols("win_pct ~ state", data=hs_nfl_df).fit()
anova_table_state = sm.stats.anova_lm(fit_state, typ=2)

# Extract p-value for state
p_value_state = anova_table_state["PR(>F)"].iloc[0]
print("ANOVA for Win Percentage by State")
print(anova_table_state)
print(f"P-value: {p_value_state:.8f}")

# Interpretation of ANOVA result for state
alpha = 0.05
if p_value_state < alpha:
    print("Reject H0: The win percentages are significantly different across states.")
else:
    print("Fail to reject H0: No significant difference in win percentages across states.")

# Perform ANOVA for win_pct based on team_level
fit_level = ols("win_pct ~ team_level", data=hs_nfl_df).fit()
anova_table_level = sm.stats.anova_lm(fit_level, typ=2)

# Extract p-value for team_level
p_value_level = anova_table_level["PR(>F)"].iloc[0]
print("\nANOVA for Win Percentage by Team Level")
print(anova_table_level)
print(f"P-value: {p_value_level:.8f}")

# Interpretation of ANOVA result for team_level
if p_value_level < alpha:
    print("Reject H0: The win percentages are significantly different across team levels.")
else:
    print("Fail to reject H0: No significant difference in win percentages across team levels.")

ANOVA for Win Percentage by State
            sum_sq    df         F    PR(>F)
state     0.570506  23.0  0.581671  0.917978
Residual  1.876322  44.0       NaN       NaN
P-value: 0.91797807
Fail to reject H0: No significant difference in win percentages across states.

ANOVA for Win Percentage by Team Level
              sum_sq    df         F    PR(>F)
team_level  0.013434   1.0  0.364361  0.548163
Residual    2.433394  66.0       NaN       NaN
P-value: 0.54816327
Fail to reject H0: No significant difference in win percentages across team levels.


In [8]:
# Perform ANOVA for win_pct based on conference
fit = ols("win_pct ~ C(conference)", data=cfb_nfl_df).fit()
anova_table = sm.stats.anova_lm(fit, typ=2)

print("One-Way ANOVA for Win Percentage by Conference")
print(anova_table)

# Extract p-value from the ANOVA table
p_value = anova_table["PR(>F)"].iloc[0]
print(f"P-value: {p_value:.8f}")

# Interpretation of ANOVA result
alpha = 0.05
if p_value < alpha:
    print("Reject H0: The win percentages are significantly different across conferences.")
else:
    print("Fail to reject H0: No significant difference in win percentages across conferences.")

One-Way ANOVA for Win Percentage by Conference
                 sum_sq     df         F    PR(>F)
C(conference)  0.296639   12.0  0.595238  0.843627
Residual       6.312482  152.0       NaN       NaN
P-value: 0.84362710
Fail to reject H0: No significant difference in win percentages across conferences.


### Question 3: Run vs. Pass by Level of Football, Geographic Regions

In [10]:
# Perform ANOVA for rush_ypg based on team_level
fit_rush = ols("rush_ypg ~ C(team_level)", data=all_fb_df).fit()
anova_table_rush = sm.stats.anova_lm(fit_rush, typ=2)

# Output the ANOVA table and p-value
print("One-Way ANOVA for Rushing Yards Per Game by Team Level")
print(anova_table_rush)

# Extract p-value
p_value_rush = anova_table_rush["PR(>F)"].iloc[0]
print(f"P-value for rush_ypg: {p_value_rush:.8f}")

# Interpretation of ANOVA result
alpha = 0.05
if p_value_rush < alpha:
    print("Reject H0: The mean rushing yards per game are significantly different across team levels.")
else:
    print("Fail to reject H0: No significant difference in rushing yards per game across team levels.")

One-Way ANOVA for Rushing Yards Per Game by Team Level
                      sum_sq     df          F        PR(>F)
C(team_level)   54238.827768    2.0  15.574992  5.223848e-07
Residual       344760.620989  198.0        NaN           NaN
P-value for rush_ypg: 0.00000052
Reject H0: The mean rushing yards per game are significantly different across team levels.


In [11]:
# Perform ANOVA for pass_ypg based on team_level
fit_pass = ols("pass_ypg ~ C(team_level)", data=all_fb_df).fit()
anova_table_pass = sm.stats.anova_lm(fit_pass, typ=2)

# Output the ANOVA table and p-value
print("One-Way ANOVA for Passing Yards Per Game by Team Level")
print(anova_table_pass)

# Extract p-value
p_value_pass = anova_table_pass["PR(>F)"].iloc[0]
print(f"P-value for pass_ypg: {p_value_pass:.8f}")

# Interpretation of ANOVA result
alpha = 0.05
if p_value_pass < alpha:
    print("Reject H0: The mean passing yards per game are significantly different across team levels.")
else:
    print("Fail to reject H0: No significant difference in passing yards per game across team levels.")


One-Way ANOVA for Passing Yards Per Game by Team Level
                      sum_sq     df          F        PR(>F)
C(team_level)  206073.442183    2.0  40.449896  1.864160e-15
Residual       504359.046175  198.0        NaN           NaN
P-value for pass_ypg: 0.00000000
Reject H0: The mean passing yards per game are significantly different across team levels.


In [12]:
# Perform One-Way ANOVA for rush_ypg based on region
fit_rush_region = ols("rush_ypg ~ C(region)", data=all_fb_region_df).fit()
anova_table_rush_region = sm.stats.anova_lm(fit_rush_region, typ=2)

# Output the ANOVA table and p-value
print("One-Way ANOVA for Rushing Yards Per Game by Region")
print(anova_table_rush_region)

# Extract p-value
p_value_rush_region = anova_table_rush_region["PR(>F)"].iloc[0]
print(f"P-value for region (rush_ypg): {p_value_rush_region:.8f}")

# Interpretation of ANOVA result
alpha = 0.05
if p_value_rush_region < alpha:
    print("Reject H0: The mean rushing yards per game are significantly different across regions.")
else:
    print("Fail to reject H0: No significant difference in rushing yards per game across regions.")

One-Way ANOVA for Rushing Yards Per Game by Region
                  sum_sq    df         F    PR(>F)
C(region)   27686.019994   4.0  2.940078  0.027132
Residual   148314.012359  63.0       NaN       NaN
P-value for region (rush_ypg): 0.02713196
Reject H0: The mean rushing yards per game are significantly different across regions.


In [13]:
# Perform One-Way ANOVA for pass_ypg based on region
fit_pass_region = ols("pass_ypg ~ C(region)", data=all_fb_region_df).fit()
anova_table_pass_region = sm.stats.anova_lm(fit_pass_region, typ=2)

# Output the ANOVA table and p-value
print("One-Way ANOVA for Passing Yards Per Game by Region")
print(anova_table_pass_region)

# Extract p-value
p_value_pass_region = anova_table_pass_region["PR(>F)"].iloc[0]
print(f"P-value for region (pass_ypg): {p_value_pass_region:.8f}")

# Interpretation of ANOVA result
alpha = 0.05
if p_value_pass_region < alpha:
    print("Reject H0: The mean passing yards per game are significantly different across regions.")
else:
    print("Fail to reject H0: No significant difference in passing yards per game across regions.")


One-Way ANOVA for Passing Yards Per Game by Region
                  sum_sq    df         F   PR(>F)
C(region)   50761.883234   4.0  4.299351  0.00387
Residual   185958.214854  63.0       NaN      NaN
P-value for region (pass_ypg): 0.00387034
Reject H0: The mean passing yards per game are significantly different across regions.


In [14]:
# Perform ANOVA for rush_ypg across conferences
fit_rush_conf = ols("rush_ypg ~ C(conference)", data=cfb_nfl_df).fit()
anova_table_rush_conf = sm.stats.anova_lm(fit_rush_conf, typ=2)

# Output the ANOVA table and p-value
print("One-Way ANOVA for Rushing Yards Per Game by Conference")
print(anova_table_rush_conf)

# Extract p-value
p_value_rush_conf = anova_table_rush_conf["PR(>F)"].iloc[0]
print(f"P-value for rush_ypg (conference): {p_value_rush_conf:.8f}")

# Interpretation of ANOVA result
alpha = 0.05
if p_value_rush_conf < alpha:
    print("Reject H0: The mean rushing yards per game are significantly different across conferences.")
else:
    print("Fail to reject H0: No significant difference in rushing yards per game across conferences.")

One-Way ANOVA for Rushing Yards Per Game by Conference
                      sum_sq     df         F  PR(>F)
C(conference)   59357.571989   12.0  3.588641  0.0001
Residual       209511.764375  152.0       NaN     NaN
P-value for rush_ypg (conference): 0.00010025
Reject H0: The mean rushing yards per game are significantly different across conferences.


In [15]:
# Perform ANOVA for pass_ypg across conferences
fit_pass_conf = ols("pass_ypg ~ C(conference)", data=cfb_nfl_df).fit()
anova_table_pass_conf = sm.stats.anova_lm(fit_pass_conf, typ=2)

# Output the ANOVA table and p-value
print("One-Way ANOVA for Passing Yards Per Game by Conference")
print(anova_table_pass_conf)

# Extract p-value
p_value_pass_conf = anova_table_pass_conf["PR(>F)"].iloc[0]
print(f"P-value for pass_ypg (conference): {p_value_pass_conf:.8f}")

# Interpretation of ANOVA result
alpha = 0.05
if p_value_pass_conf < alpha:
    print("Reject H0: The mean passing yards per game are significantly different across conferences.")
else:
    print("Fail to reject H0: No significant difference in passing yards per game across conferences.")


One-Way ANOVA for Passing Yards Per Game by Conference
                      sum_sq     df        F    PR(>F)
C(conference)   83182.190336   12.0  3.39731  0.000202
Residual       310139.792937  152.0      NaN       NaN
P-value for pass_ypg (conference): 0.00020183
Reject H0: The mean passing yards per game are significantly different across conferences.
