#### Merging games dataframe and season stats dataframe

> Run all cells in sequence<br>
> Assumes season stats & game stats files are in '..\data' directory

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
full_path = os.getcwd()
base_path = str(Path(full_path).parents[0]) + '\data'  # one up

team_stats_file = base_path + r'\1970-2020.csv'
team_schedules_file = base_path + r'\1970-2020_team_schedule.csv'

---

In [3]:
df_stats = pd.read_csv(team_stats_file, sep=';')
df_schedules = pd.read_csv(team_schedules_file, sep=';')

In [4]:
unnamed_cols = {
    'Unnamed: 0': 'year',
    'Unnamed: 1': 'team',
    'Unnamed: 2': 'week_number'
}
df_stats.drop('Unnamed: 1', axis=1, inplace=True)
df_stats.rename(columns=unnamed_cols, inplace=True)
df_schedules.rename(columns=unnamed_cols, inplace=True)

#### add 'is_playoff' col

In [5]:
df_schedules['is_playoff'] = False

for season in df_schedules['year'].unique():
    df_season = df_schedules.query('year == @season')  # only current season
    print(season, end=', ', flush=False)
    for team in df_season['team'].unique():
        df_season_team = df_season.query('team == @team')  # only current team
        
        try:  # try to find a row with 'playoff' value
            playoff_index = df_season_team[df_season_team['game_date'] == 'Playoffs'].index[0]
        except IndexError:
            playoff_index = False

        max_index = df_season_team.index[-1]
        if playoff_index:  # if there is a playoff row
            for row in range(playoff_index+1, max_index+1):  # for games after playoff row and before last row of current team and current season
                playoff_game = df_season_team.loc[row].to_dict()
                
                df_schedules.loc[row, 'is_playoff'] = True

print('Done')

2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982, 1981, 1980, 1979, 1978, 1977, 1976, 1975, 1974, 1973, 1972, 1971, 1970, Done


In [6]:
df_schedules[df_schedules['is_playoff'] == True][['game_date', 'year', 'team', 'opp', 'game_date', 'game_outcome']]

Unnamed: 0,game_date,year,team,opp,game_date.1,game_outcome
18,January 9,2020,Buffalo Bills,Indianapolis Colts,January 9,W
19,January 16,2020,Buffalo Bills,Baltimore Ravens,January 16,W
20,January 24,2020,Buffalo Bills,Kansas City Chiefs,January 24,L
90,January 10,2020,Pittsburgh Steelers,Cleveland Browns,January 10,L
109,January 10,2020,Baltimore Ravens,Tennessee Titans,January 10,W
...,...,...,...,...,...,...
25886,January 17,1970,Dallas Cowboys,Baltimore Colts,January 17,L
25958,December 27,1970,Minnesota Vikings,San Francisco 49ers,December 27,L
25974,December 26,1970,Detroit Lions,Dallas Cowboys,December 26,L
26018,December 27,1970,San Francisco 49ers,Minnesota Vikings,December 27,W


#### remove 'bye week' & 'playoff' rows

In [7]:
df_bye_week = df_schedules[(df_schedules['opp'] == 'Bye Week') | (df_schedules['game_date'] == 'Playoffs')]
df_schedules = df_schedules.drop(df_bye_week.index, axis=0)

---

#### column cleaning

In [8]:
home_games = df_schedules[df_schedules['game_location'] == '@'].index
away_games = df_schedules[pd.isna(df_schedules['game_location'])].index

df_schedules.loc[home_games, 'game_location'] = 'home'
df_schedules.loc[away_games, 'game_location'] = 'away'

df_schedules['game_location'].head()

0    away
1    home
2    away
3    home
4    home
Name: game_location, dtype: object

In [9]:
ot_games = df_schedules[df_schedules['overtime'] == 'OT'].index
non_ot_games = df_schedules[pd.isna(df_schedules['overtime'])].index

df_schedules.loc[ot_games, 'overtime'] = True
df_schedules.loc[non_ot_games, 'overtime'] = False

df_schedules['overtime'].head()

0    False
1    False
2    False
3    False
4    False
Name: overtime, dtype: bool

#### further column cleaning

In [10]:
columns = df_schedules.columns
df_schedules = df_schedules.drop('boxscore_word', axis=1)

In [11]:
df_schedules.head()

Unnamed: 0,year,team,week_number,game_day_of_week,game_date,game_time,game_outcome,overtime,team_record,game_location,...,to_off,first_down_def,yards_def,pass_yds_def,rush_yds_def,to_def,exp_pts_off,exp_pts_def,exp_pts_st,is_playoff
0,2020,Buffalo Bills,0,Sun,September 13,1:00PM ET,W,False,1-0,away,...,2.0,15.0,254.0,202.0,52.0,2.0,14.47,0.12,-3.63,False
1,2020,Buffalo Bills,1,Sun,September 20,1:00PM ET,W,False,2-0,home,...,1.0,28.0,410.0,311.0,99.0,,21.16,-12.7,-2.3,False
2,2020,Buffalo Bills,2,Sun,September 27,1:00PM ET,W,False,3-0,away,...,2.0,28.0,478.0,311.0,167.0,2.0,17.83,-19.86,6.79,False
3,2020,Buffalo Bills,3,Sun,October 4,4:25PM ET,W,False,4-0,home,...,,24.0,383.0,297.0,86.0,2.0,16.49,-7.2,-1.37,False
4,2020,Buffalo Bills,4,Tue,October 13,7:00PM ET,L,False,4-1,home,...,3.0,25.0,334.0,195.0,139.0,,1.55,-24.29,-0.82,False


#### the actual merge

##### Column prefixes
    - gs_  : game_stat_
    - hts_ : home_team_stat_
    - ats_ : away_team_stat_

In [12]:
df_schedules.columns = 'gs_' + df_schedules.columns

df_home_team_stats = df_stats.copy()
df_home_team_stats.columns =  'hts_' + df_home_team_stats.columns

df_away_team_stats = df_stats.copy()
df_away_team_stats.columns =  'ats_' + df_away_team_stats.columns

In [13]:
df_games = pd.merge(
    df_schedules, 
    df_home_team_stats, 
    how='inner', 
    left_on=['gs_team', 'gs_year'], 
    right_on=['hts_team', 'hts_year'],
)
print('With home team stats:', df_games.shape)

df_games = pd.merge(
    df_games, 
    df_away_team_stats, 
    how='inner', 
    left_on=['gs_opp', 'gs_year'], 
    right_on=['ats_team', 'ats_year'],
)
print('With thome and away team stats:', df_games.shape)

With home team stats: (24522, 154)
With thome and away team stats: (24522, 281)


In [14]:
df_games[['gs_team', 'gs_opp', 'gs_year', 'gs_week_number', 'hts_wins', 'ats_wins']]

Unnamed: 0,gs_team,gs_opp,gs_year,gs_week_number,hts_wins,ats_wins
0,Buffalo Bills,New York Jets,2020,0,13,2
1,Buffalo Bills,New York Jets,2020,6,13,2
2,Miami Dolphins,New York Jets,2020,5,10,2
3,Miami Dolphins,New York Jets,2020,11,10,2
4,New England Patriots,New York Jets,2020,8,7,2
...,...,...,...,...,...,...
24517,Los Angeles Rams,San Francisco 49ers,1970,10,9,10
24518,Atlanta Falcons,San Francisco 49ers,1970,2,4,10
24519,Atlanta Falcons,San Francisco 49ers,1970,11,4,10
24520,New Orleans Saints,San Francisco 49ers,1970,4,2,10


---

#### playoffs and regular season

In [15]:
df_regular_season = df_games.query('gs_is_playoff == False')
df_playoffs = df_games.query('gs_is_playoff == True')

In [16]:
df_regular_season.head()

Unnamed: 0,gs_year,gs_team,gs_week_number,gs_game_day_of_week,gs_game_date,gs_game_time,gs_game_outcome,gs_overtime,gs_team_record,gs_game_location,...,ats_red_zone_att,ats_red_zone_scores,ats_red_zone_pct,ats_drives,ats_play_count_tip,ats_plays_per_drive,ats_yds_per_drive,ats_start_avg,ats_time_avg,ats_points_avg
0,2020,Buffalo Bills,0,Sun,September 13,1:00PM ET,W,False,1-0,away,...,38.0,16.0,42.1%,175.0,976.0,5.6,25.6,Own 26.4,2:33,1.36
1,2020,Buffalo Bills,6,Sun,October 25,1:00PM ET,W,False,5-2,home,...,38.0,16.0,42.1%,175.0,976.0,5.6,25.6,Own 26.4,2:33,1.36
2,2020,Miami Dolphins,5,Sun,October 18,4:05PM ET,W,False,3-3,away,...,38.0,16.0,42.1%,175.0,976.0,5.6,25.6,Own 26.4,2:33,1.36
3,2020,Miami Dolphins,11,Sun,November 29,1:00PM ET,W,False,7-4,home,...,38.0,16.0,42.1%,175.0,976.0,5.6,25.6,Own 26.4,2:33,1.36
4,2020,New England Patriots,8,Mon,November 9,8:15PM ET,W,False,3-5,home,...,38.0,16.0,42.1%,175.0,976.0,5.6,25.6,Own 26.4,2:33,1.36


In [17]:
df_playoffs.head()

Unnamed: 0,gs_year,gs_team,gs_week_number,gs_game_day_of_week,gs_game_date,gs_game_time,gs_game_outcome,gs_overtime,gs_team_record,gs_game_location,...,ats_red_zone_att,ats_red_zone_scores,ats_red_zone_pct,ats_drives,ats_play_count_tip,ats_plays_per_drive,ats_yds_per_drive,ats_start_avg,ats_time_avg,ats_points_avg
40,2020,Green Bay Packers,18,Sat,January 16,4:35PM ET,W,False,14-3,away,...,57.0,33.0,57.9%,184.0,1118.0,6.1,32.7,Own 27.9,2:47,1.88
45,2020,Seattle Seahawks,18,Sat,January 9,4:40PM ET,L,False,12-5,away,...,57.0,33.0,57.9%,184.0,1118.0,6.1,32.7,Own 27.9,2:47,1.88
69,2020,Baltimore Ravens,18,Sun,January 10,1:05PM ET,W,False,12-5,home,...,64.0,48.0,75.0%,167.0,1059.0,6.3,37.7,Own 28.7,2:45,2.87
84,2020,Buffalo Bills,20,Sun,January 24,6:40PM ET,L,False,15-4,home,...,59.0,36.0,61.0%,163.0,1084.0,6.7,40.7,Own 28.2,3:00,2.74
89,2020,Cleveland Browns,19,Sun,January 17,3:05PM ET,L,False,12-6,home,...,59.0,36.0,61.0%,163.0,1084.0,6.7,40.7,Own 28.2,3:00,2.74


In [18]:
game_stats_cols = df_playoffs.columns[pd.Series(df_playoffs.columns.str.startswith('gs_'))]
home_team_stats_cols = df_playoffs.columns[pd.Series(df_playoffs.columns.str.startswith('hts_'))]
away_team_stats_cols = df_playoffs.columns[pd.Series(df_playoffs.columns.str.startswith('ats_'))]

col_groups = [('game_stats', game_stats_cols), ('home_team_stats', home_team_stats_cols), ('away_team_stats', away_team_stats_cols)]

In [19]:
for name, group in col_groups:
    c = 0
    print(f'{name}:', end=' ')
    for col in group:
        print(col, end=', ')
        c += 1
        if c == 5: break
    print('')

game_stats: gs_year, gs_team, gs_week_number, gs_game_day_of_week, gs_game_date, 
home_team_stats: hts_year, hts_wins, hts_losses, hts_ties, hts_win_loss_perc, 
away_team_stats: ats_year, ats_wins, ats_losses, ats_ties, ats_win_loss_perc, 


#### export to csv

In [20]:
df_games.to_csv('all_games.csv', sep=';', encoding='utf-8', index=False)
df_playoffs.to_csv('playoffs.csv', sep=';', encoding='utf-8', index=False)
df_regular_season.to_csv('regular_season.csv', sep=';', encoding='utf-8', index=False)

---

#### exploration

##### 2020 & Kansas City Chiefs

In [None]:
df_stats[(df_stats['year'] == 2020) & (df_stats['team'] == 'Kansas City Chiefs')]

In [None]:
df_playoffs[(df_playoffs['gs_year'] == 2020) & (df_playoffs['gs_team'] == 'Kansas City Chiefs')][home_team_stats_cols]

##### 1970 & Dallas Cowboys

In [None]:
df_stats[(df_stats['year'] == 1970) & (df_stats['team'] == 'Dallas Cowboys')]

In [None]:
df_playoffs[(df_playoffs['gs_year'] == 1970) & (df_playoffs['gs_team'] == 'Dallas Cowboys')][home_team_stats_cols]