# Exploratory Data Analysis
- Goal: investigate how well NFL win totals describe a season's performance and how well they predict future performance.

### Recreate Football Outsiders 2011 Week 13 Pythagorean Calc

In [1]:
# ref: https://www.footballoutsiders.com/dvoa-ratings/2011/week-13-dvoa-ratings

In [2]:
import os
import numpy as np
import pandas as pd
import nfl_data_py as nfl

In [3]:
# Load the schedule data for the 2011 season through nfl_data_py.
seasons = [2011]
season_df = nfl.import_schedules(years=seasons)
# List the available columns so the ones needed below can be selected.
season_df.columns

Index(['game_id', 'season', 'game_type', 'week', 'gameday', 'weekday',
       'gametime', 'away_team', 'away_score', 'home_team', 'home_score',
       'location', 'result', 'total', 'overtime', 'old_game_id', 'gsis',
       'nfl_detail_id', 'pfr', 'pff', 'espn', 'away_rest', 'home_rest',
       'away_moneyline', 'home_moneyline', 'spread_line', 'away_spread_odds',
       'home_spread_odds', 'total_line', 'under_odds', 'over_odds', 'div_game',
       'roof', 'surface', 'temp', 'wind', 'away_qb_id', 'home_qb_id',
       'away_qb_name', 'home_qb_name', 'away_coach', 'home_coach', 'referee',
       'stadium_id', 'stadium'],
      dtype='object')

In [4]:
# Restrict to regular-season games with week <= 13 and keep only the
# identifier, team and score columns used by the later cells.
regular_season = 'REG'
season_cols = [
    'game_id', 'season', 'game_type', 'week',
    'home_team', 'away_team', 'home_score', 'away_score',
]
is_regular_season = season_df['game_type'].eq(regular_season)
through_week_13 = season_df['week'].le(13)
season_df = season_df.loc[is_regular_season & through_week_13, season_cols]

In [5]:
# Reshape from one row per game to one row per team per game: the home_team
# and away_team columns are stacked into a single team_name column, and
# home_away records which of the two source columns each row came from.
# var_name is given as a scalar string, which is the documented type for
# pd.melt; newer pandas releases raise ValueError for a list-like var_name
# when the frame's columns are not a MultiIndex.
season_df = pd.melt(
    season_df,
    id_vars=['game_id', 'season', 'game_type', 'home_score', 'away_score'],
    value_vars=['home_team', 'away_team'],
    var_name='home_away',
    value_name='team_name',
)

In [6]:
# Display the melted frame (home_team/away_team stacked into team_name).
season_df

Unnamed: 0,game_id,season,game_type,home_score,away_score,home_away,team_name
0,2011_01_NO_GB,2011,REG,42.0,34.0,home_team,GB
1,2011_01_PIT_BAL,2011,REG,35.0,7.0,home_team,BAL
2,2011_01_ATL_CHI,2011,REG,30.0,12.0,home_team,CHI
3,2011_01_CIN_CLE,2011,REG,17.0,27.0,home_team,CLE
4,2011_01_IND_HOU,2011,REG,34.0,7.0,home_team,HOU
...,...,...,...,...,...,...,...
379,2011_13_DAL_ARI,2011,REG,19.0,13.0,away_team,DAL
380,2011_13_GB_NYG,2011,REG,35.0,38.0,away_team,GB
381,2011_13_STL_SF,2011,REG,26.0,0.0,away_team,STL
382,2011_13_DET_NO,2011,REG,31.0,17.0,away_team,DET


In [7]:
# Points scored by the team on each row: the home score when the row came
# from the home_team column, the away score when it came from away_team.
# Rows matching neither label fall back to 0.
home_team = season_df['home_away'].eq('home_team')
away_team = season_df['home_away'].eq('away_team')
home_points_for = season_df['home_score']
away_points_for = season_df['away_score']
conditions = [home_team, away_team]
results = [home_points_for, away_points_for]
season_df['points_for'] = np.select(
    condlist=conditions,
    choicelist=results,
    default=0,
)

In [8]:
# Points conceded by the team on each row: the mirror image of points_for,
# so a home row takes the away score and an away row takes the home score.
# Rows matching neither label fall back to 0.
home_team = season_df['home_away'].eq('home_team')
away_team = season_df['home_away'].eq('away_team')
home_points_against = season_df['away_score']
away_points_against = season_df['home_score']
conditions = [home_team, away_team]
results = [home_points_against, away_points_against]
season_df['points_against'] = np.select(
    condlist=conditions,
    choicelist=results,
    default=0,
)

In [9]:
# Display the frame again to check the new points_for / points_against columns.
season_df

Unnamed: 0,game_id,season,game_type,home_score,away_score,home_away,team_name,points_for,points_against
0,2011_01_NO_GB,2011,REG,42.0,34.0,home_team,GB,42.0,34.0
1,2011_01_PIT_BAL,2011,REG,35.0,7.0,home_team,BAL,35.0,7.0
2,2011_01_ATL_CHI,2011,REG,30.0,12.0,home_team,CHI,30.0,12.0
3,2011_01_CIN_CLE,2011,REG,17.0,27.0,home_team,CLE,17.0,27.0
4,2011_01_IND_HOU,2011,REG,34.0,7.0,home_team,HOU,34.0,7.0
...,...,...,...,...,...,...,...,...,...
379,2011_13_DAL_ARI,2011,REG,19.0,13.0,away_team,DAL,13.0,19.0
380,2011_13_GB_NYG,2011,REG,35.0,38.0,away_team,GB,38.0,35.0
381,2011_13_STL_SF,2011,REG,26.0,0.0,away_team,STL,0.0,26.0
382,2011_13_DET_NO,2011,REG,31.0,17.0,away_team,DET,17.0,31.0


In [10]:
# Aggregate to one row per team and season: total points scored, total points
# conceded, and the number of rows per group (non-null game_type count), which
# the later cells use as the number of games.
points_df = (
    season_df
    .groupby(['team_name', 'season'])
    .agg(
        points_for=('points_for', 'sum'),
        points_against=('points_against', 'sum'),
        game_type=('game_type', 'count'),
    )
    .reset_index()
)

In [11]:
# Pythagorean expectation with a fixed exponent:
#   PF^k / (PF^k + PA^k), with k = 2.37
pythag_exponent = 2.37
pf_powered = points_df['points_for'] ** pythag_exponent
pa_powered = points_df['points_against'] ** pythag_exponent
points_df['pythagorean'] = pf_powered / (pf_powered + pa_powered)
# Scale the expected win share by the per-team game_type count.
points_df['pythagorean_wins'] = points_df['game_type'] * points_df['pythagorean']

In [12]:
# Average combined points (scored + conceded) per counted game for each team;
# this feeds the variable exponent computed in the next cell.
games_played = points_df['game_type']
points_total = points_df['points_for'].add(points_df['points_against'])
avg_tot_pts = points_total.div(games_played)

In [13]:
# Per-team exponent that varies with scoring level: 1.5 * log10(average total points per game).
# ref: https://legacy.baseballprospectus.com/glossary/index.php?mode=viewstat&stat=559
points_df['fo_pythagenport_exp'] = 1.5 * np.log10(avg_tot_pts)

In [14]:
# Same Pythagorean ratio as before, but each team uses its own exponent from
# the fo_pythagenport_exp column instead of a fixed constant.
fo_exponent = points_df['fo_pythagenport_exp']
pf_fo_powered = points_df['points_for'] ** fo_exponent
pa_fo_powered = points_df['points_against'] ** fo_exponent
points_df['fo_pythagorean'] = pf_fo_powered / (pf_fo_powered + pa_fo_powered)
# Scale the expected win share by the per-team game_type count.
points_df['fo_pythagorean_wins'] = points_df['game_type'] * points_df['fo_pythagorean']

In [15]:
# Display the per-team totals alongside both Pythagorean estimates.
points_df

Unnamed: 0,team_name,season,points_for,points_against,game_type,pythagorean,pythagorean_wins,fo_pythagenport_exp,fo_pythagorean,fo_pythagorean_wins
0,ARI,2011,232.0,269.0,12,0.413213,4.958557,2.430985,0.411027,4.932321
1,ATL,2011,269.0,244.0,12,0.557538,6.69046,2.446404,0.559376,6.712512
2,BAL,2011,296.0,192.0,12,0.736118,8.833415,2.413858,0.739789,8.877468
3,BUF,2011,278.0,304.0,12,0.447224,5.366687,2.528613,0.443721,5.32465
4,CAR,2011,290.0,324.0,12,0.434689,5.21627,2.563481,0.429426,5.15311
5,CHI,2011,291.0,242.0,12,0.607542,7.29051,2.471319,0.611988,7.343853
6,CIN,2011,266.0,250.0,12,0.53669,6.440279,2.450203,0.537927,6.455122
7,CLE,2011,175.0,240.0,12,0.321133,3.853592,2.3083,0.325396,3.904751
8,DAL,2011,283.0,244.0,12,0.586962,7.043544,2.463944,0.590335,7.08402
9,DEN,2011,256.0,292.0,12,0.422667,5.072,2.489399,0.418838,5.026054
