# Data Collection

### Obtain and Clean the Data

Kaggle datasets:
- https://www.kaggle.com/datasets/zynicide/nfl-football-player-stats?select=games_1512362753.8735218.json
- https://www.kaggle.com/datasets/kendallgillies/nflstatistics
- https://www.kaggle.com/datasets/tobycrabtree/nfl-scores-and-betting-data?select=spreadspoke_scores.csv  !!!
- https://www.kaggle.com/datasets/grayengineering425/nfl-box-scores  !!!
- https://www.kaggle.com/datasets/shanyachaubey/nfl20102021offdefplayoffteamstats  !!!

Kaggle notebooks:
- https://www.kaggle.com/code/sanjayv007/nfl-big-data-bowl-beginner-s-complete-eda#Data-Cleaning
- https://www.kaggle.com/code/zwartfreak/nfl-games-players-analysis-and-visualization#Data-visualization

scrape:
- pro-football-reference.com

In [16]:
import pandas as pd
import numpy as np

### NFL 2010 to 2021 Stats Kaggle Dataset

In [17]:
nfl_game_data = pd.read_csv('data/NFL_data.csv')
nfl_game_data.head()

Unnamed: 0,Team,WinLoss perc,PD,Year,FGM,FG_perc,RedZone_perc,playoff_win_perc,Score_perc,Turnover_perc,RushYperG,PassYperG,PointperG,possperG,YallowedperG,PointallowedperG,perc_punt_20,Result
0,New England Patriots,87.5,205,2010,22,88.0,62.7,87.5,47.0,5.4,123.3,240.4,32.4,0.49,366.9,19.6,30.6,Loss
1,New York Jets,68.8,63,2010,30,76.9,40.0,68.75,32.2,10.6,148.4,203.2,22.9,0.543611,291.4,19.0,45.5,Loss
2,Miami Dolphins,43.8,-60,2010,30,73.2,52.9,0.0,29.7,15.7,102.7,220.4,17.1,0.514722,309.3,20.8,42.5,Loss
3,Buffalo Bills,25.0,-142,2010,16,76.2,51.4,0.0,25.3,19.2,107.5,197.8,17.7,0.476389,361.6,26.6,22.7,Loss
4,Pittsburgh Steelers,75.0,143,2010,29,78.4,48.0,75.0,36.9,9.5,120.2,225.0,23.4,0.54,276.8,14.5,31.6,Loss


In [18]:
# remove the team-stat columns that are not used further on
nfl_game_data = nfl_game_data.drop(
    columns=[
        'playoff_win_perc',
        'Score_perc',
        'Turnover_perc',
        'perc_punt_20',
        'Result',
    ]
)

In [19]:
nfl_game_data.head()

Unnamed: 0,Team,WinLoss perc,PD,Year,FGM,FG_perc,RedZone_perc,RushYperG,PassYperG,PointperG,possperG,YallowedperG,PointallowedperG
0,New England Patriots,87.5,205,2010,22,88.0,62.7,123.3,240.4,32.4,0.49,366.9,19.6
1,New York Jets,68.8,63,2010,30,76.9,40.0,148.4,203.2,22.9,0.543611,291.4,19.0
2,Miami Dolphins,43.8,-60,2010,30,73.2,52.9,102.7,220.4,17.1,0.514722,309.3,20.8
3,Buffalo Bills,25.0,-142,2010,16,76.2,51.4,107.5,197.8,17.7,0.476389,361.6,26.6
4,Pittsburgh Steelers,75.0,143,2010,29,78.4,48.0,120.2,225.0,23.4,0.54,276.8,14.5


In [20]:
nfl_game_data.isna().sum()

Team                0
WinLoss perc        0
PD                  0
Year                0
FGM                 0
FG_perc             0
RedZone_perc        0
RushYperG           0
PassYperG           0
PointperG           0
possperG            0
YallowedperG        0
PointallowedperG    0
dtype: int64

In [21]:
nfl_game_data['Year'].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021])

In [22]:
nfl_game_data.to_csv('cleaned-data/nfl_team_stats_2010_2021.csv')

### NFL Scores and Betting Data Kaggle Dataset

#### NFL Game Data from 2010 to 2023 CSV

In [23]:
nfl_scores = pd.read_csv('data/spreadspoke_scores.csv')
nfl_scores.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
0,9/2/1966,1966,1,False,Miami Dolphins,14.0,23.0,Oakland Raiders,,,,Orange Bowl,False,83.0,6.0,71.0,
1,9/3/1966,1966,1,False,Houston Oilers,45.0,7.0,Denver Broncos,,,,Rice Stadium,False,81.0,7.0,70.0,
2,9/4/1966,1966,1,False,San Diego Chargers,27.0,7.0,Buffalo Bills,,,,Balboa Stadium,False,70.0,7.0,82.0,
3,9/9/1966,1966,2,False,Miami Dolphins,14.0,19.0,New York Jets,,,,Orange Bowl,False,82.0,11.0,78.0,
4,9/10/1966,1966,1,False,Green Bay Packers,24.0,3.0,Baltimore Colts,,,,Lambeau Field,False,64.0,8.0,62.0,


In [24]:
nfl_scores.columns

Index(['schedule_date', 'schedule_season', 'schedule_week', 'schedule_playoff',
       'team_home', 'score_home', 'score_away', 'team_away',
       'team_favorite_id', 'spread_favorite', 'over_under_line', 'stadium',
       'stadium_neutral', 'weather_temperature', 'weather_wind_mph',
       'weather_humidity', 'weather_detail'],
      dtype='object')

In [25]:
# remove the schedule/betting/weather columns that are not used further on
nfl_scores = nfl_scores.drop(
    columns=[
        'schedule_playoff',
        'over_under_line',
        'stadium_neutral',
        'weather_humidity',
        'weather_detail',
    ]
)
nfl_scores.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph
0,9/2/1966,1966,1,Miami Dolphins,14.0,23.0,Oakland Raiders,,,Orange Bowl,83.0,6.0
1,9/3/1966,1966,1,Houston Oilers,45.0,7.0,Denver Broncos,,,Rice Stadium,81.0,7.0
2,9/4/1966,1966,1,San Diego Chargers,27.0,7.0,Buffalo Bills,,,Balboa Stadium,70.0,7.0
3,9/9/1966,1966,2,Miami Dolphins,14.0,19.0,New York Jets,,,Orange Bowl,82.0,11.0
4,9/10/1966,1966,1,Green Bay Packers,24.0,3.0,Baltimore Colts,,,Lambeau Field,64.0,8.0


In [26]:
# get rid of the years before 2010
nfl_scores = nfl_scores[nfl_scores['schedule_season'] >= 2010]
nfl_scores.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph
10008,9/9/2010,2010,1,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0
10009,9/12/2010,2010,1,Buffalo Bills,10.0,15.0,Miami Dolphins,MIA,-3.0,Ralph Wilson Stadium,64.0,7.0
10010,9/12/2010,2010,1,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0
10011,9/12/2010,2010,1,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0
10012,9/12/2010,2010,1,Jacksonville Jaguars,24.0,17.0,Denver Broncos,JAX,-3.0,EverBank Field,91.0,1.0


In [27]:
nfl_scores['schedule_season'].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021, 2022, 2023])

In [28]:
# keep only games that have a recorded score for both the home and the away team
nfl_scores = nfl_scores.dropna(subset=['score_home', 'score_away'])

In [29]:
nfl_scores.isna().sum()

schedule_date            0
schedule_season          0
schedule_week            0
team_home                0
score_home               0
score_away               0
team_away                0
team_favorite_id         0
spread_favorite          0
stadium                  0
weather_temperature    765
weather_wind_mph       766
dtype: int64

In [30]:
# change date to datetime object
nfl_scores['schedule_date'] = pd.to_datetime(nfl_scores['schedule_date'])
nfl_scores.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph
10008,2010-09-09,2010,1,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0
10009,2010-09-12,2010,1,Buffalo Bills,10.0,15.0,Miami Dolphins,MIA,-3.0,Ralph Wilson Stadium,64.0,7.0
10010,2010-09-12,2010,1,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0
10011,2010-09-12,2010,1,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0
10012,2010-09-12,2010,1,Jacksonville Jaguars,24.0,17.0,Denver Broncos,JAX,-3.0,EverBank Field,91.0,1.0


In [31]:
nfl_scores.to_csv('cleaned-data/nfl_game_data_2010_2023.csv')

#### NFL Teams CSV

In [32]:
nfl_teams = pd.read_csv('data/nfl_teams.csv')
nfl_teams.head()

Unnamed: 0,team_name,team_name_short,team_id,team_id_pfr,team_conference,team_division,team_conference_pre2002,team_division_pre2002
0,Arizona Cardinals,Cardinals,ARI,CRD,NFC,NFC West,NFC,NFC West
1,Atlanta Falcons,Falcons,ATL,ATL,NFC,NFC South,NFC,NFC West
2,Baltimore Colts,Colts,IND,CLT,AFC,,AFC,AFC East
3,Baltimore Ravens,Ravens,BAL,RAV,AFC,AFC North,AFC,AFC Central
4,Boston Patriots,Patriots,NE,NWE,AFC,,AFC,


In [33]:
# remove the pro-football-reference id and the pre-2002 conference/division columns
nfl_teams = nfl_teams.drop(
    columns=[
        'team_id_pfr',
        'team_conference_pre2002',
        'team_division_pre2002',
    ]
)
nfl_teams.head()

Unnamed: 0,team_name,team_name_short,team_id,team_conference,team_division
0,Arizona Cardinals,Cardinals,ARI,NFC,NFC West
1,Atlanta Falcons,Falcons,ATL,NFC,NFC South
2,Baltimore Colts,Colts,IND,AFC,
3,Baltimore Ravens,Ravens,BAL,AFC,AFC North
4,Boston Patriots,Patriots,NE,AFC,


In [34]:
# drop rows where the team division is na
nfl_teams = nfl_teams[nfl_teams['team_division'].notna()]
nfl_teams.head()

Unnamed: 0,team_name,team_name_short,team_id,team_conference,team_division
0,Arizona Cardinals,Cardinals,ARI,NFC,NFC West
1,Atlanta Falcons,Falcons,ATL,NFC,NFC South
3,Baltimore Ravens,Ravens,BAL,AFC,AFC North
5,Buffalo Bills,Bills,BUF,AFC,AFC East
6,Carolina Panthers,Panthers,CAR,NFC,NFC South


In [35]:
nfl_teams = nfl_teams.sort_values(by=['team_division']).reset_index(drop=True)
nfl_teams.head()

Unnamed: 0,team_name,team_name_short,team_id,team_conference,team_division
0,New England Patriots,Patriots,NE,AFC,AFC East
1,Buffalo Bills,Bills,BUF,AFC,AFC East
2,Miami Dolphins,Dolphins,MIA,AFC,AFC East
3,New York Jets,Jets,NYJ,NFC,AFC East
4,Baltimore Ravens,Ravens,BAL,AFC,AFC North


In [36]:
# The source data lists the New York Jets (AFC East) under the NFC; correct it to AFC.
# Select the row by team name rather than by position, so the fix does not
# depend on where the Jets happen to land after sorting by division.
nfl_teams.loc[nfl_teams['team_name'] == 'New York Jets', 'team_conference'] = 'AFC'
nfl_teams.head()

Unnamed: 0,team_name,team_name_short,team_id,team_conference,team_division
0,New England Patriots,Patriots,NE,AFC,AFC East
1,Buffalo Bills,Bills,BUF,AFC,AFC East
2,Miami Dolphins,Dolphins,MIA,AFC,AFC East
3,New York Jets,Jets,NYJ,AFC,AFC East
4,Baltimore Ravens,Ravens,BAL,AFC,AFC North


In [37]:
nfl_teams.to_csv('cleaned-data/nfl_teams_info.csv')

### NFL Box Scores Kaggle Dataset

#### Box Scores CSV

In [38]:
box_score = pd.read_csv('data/box_scores.csv')
box_score.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_rushing_first_downs,visitor_passing_first_downs,visitor_penalties,visitor_net_yards,...,home_kick_return_splits,home_int_return_splits,home_penalty_splits,home_fumble_splits,home_field_goals,home_third_down_splits,home_fourth_down_splits,home_total_plays,home_avg_gain,home_time_of_possession
0,"September 7, 2014",Cleveland,Pittsburgh,27,30,23,9,11,3,389,...,2-29,0-0,11-96,1-0,3-3,4-12-33%,1-1-100%,67,7.5,32:27
1,"September 7, 2014",Jacksonville,Philadelphia,17,34,18,2,14,2,306,...,1-24,0-0,6-50,3-2,2-2,8-19-42%,1-1-100%,82,5.1,30:46
2,"September 4, 2014",Green Bay,Seattle,16,36,19,4,13,2,255,...,3-60,1-21,4-69,2-1,2-2,4-11-36%,1-1-100%,66,6.0,33:20
3,"September 7, 2014",Minnesota,St. Louis,34,6,18,6,10,2,355,...,1-26,0-0,13-121,4-0,2-3,4-14-28%,0-0-0%,63,5.0,31:43
4,"September 7, 2014",Cincinnati,Baltimore,23,16,16,4,11,1,380,...,4-109,0-0,3-29,2-1,1-2,8-17-47%,1-2-50%,85,5.0,29:30


In [39]:
box_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4328 entries, 0 to 4327
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   date                         4328 non-null   object 
 1   visitor                      4328 non-null   object 
 2   home                         4328 non-null   object 
 3   visitor_score                4328 non-null   int64  
 4   home_score                   4328 non-null   int64  
 5   visitor_first_downs          4328 non-null   int64  
 6   visitor_rushing_first_downs  4328 non-null   int64  
 7   visitor_passing_first_downs  4328 non-null   int64  
 8   visitor_penalties            4328 non-null   int64  
 9   visitor_net_yards            4328 non-null   int64  
 10  visitor_net_yards_rushing    4328 non-null   int64  
 11  visitor_rushing_plays        4328 non-null   int64  
 12  visitor_avg_rush             4328 non-null   float64
 13  visitor_net_yards_

In [40]:
box_score.columns

Index(['date', 'visitor', 'home', 'visitor_score', 'home_score',
       'visitor_first_downs', 'visitor_rushing_first_downs',
       'visitor_passing_first_downs', 'visitor_penalties', 'visitor_net_yards',
       'visitor_net_yards_rushing', 'visitor_rushing_plays',
       'visitor_avg_rush', 'visitor_net_yards_passing',
       'visitor_passing_splits', 'visitor_sack_splits',
       'visitor_gross_passing', 'visitor_yards_per_pass',
       'visitor_punt_splits_avg', 'visitor_punts_blocked',
       'visitor_punt_return_splits', 'visitor_kick_return_splits',
       'visitor_int_return_splits', 'visitor_penalty_splits',
       'visitor_fumble_splits', 'visitor_field_goals',
       'visitor_third_down_splits', 'visitor_fourth_down_splits',
       'visitor_total_plays', 'visitor_avg_gain', 'visitor_time_of_possession',
       'home_first_downs', 'home_rushing_first_downs',
       'home_passing_first_downs', 'home_penalties', 'home_net_yards',
       'home_net_yards_rushing', 'home_rushing_p

In [41]:
# Keep only the headline per-team totals. .copy() makes this an independent
# DataFrame, so assigning to its columns later (e.g. parsing 'date') does not
# raise SettingWithCopyWarning.
clean_box_score = box_score[[
    'date', 'visitor', 'home', 'visitor_score', 'home_score',
    'visitor_first_downs', 'visitor_net_yards', 'visitor_total_plays', 'visitor_avg_gain', 'visitor_time_of_possession',
    'home_first_downs', 'home_net_yards', 'home_total_plays', 'home_avg_gain', 'home_time_of_possession',
]].copy()
clean_box_score.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,home_first_downs,home_net_yards,home_total_plays,home_avg_gain,home_time_of_possession
0,"September 7, 2014",Cleveland,Pittsburgh,27,30,23,389,64,6.1,27:33,24,503,67,7.5,32:27
1,"September 7, 2014",Jacksonville,Philadelphia,17,34,18,306,70,4.4,29:14,24,420,82,5.1,30:46
2,"September 4, 2014",Green Bay,Seattle,16,36,19,255,57,4.5,26:40,25,398,66,6.0,33:20
3,"September 7, 2014",Minnesota,St. Louis,34,6,18,355,57,6.2,28:17,15,318,63,5.0,31:43
4,"September 7, 2014",Cincinnati,Baltimore,23,16,16,380,64,5.9,30:30,26,423,85,5.0,29:30


In [42]:
clean_box_score['date'] = pd.to_datetime(clean_box_score['date'])
clean_box_score.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_box_score['date'] = pd.to_datetime(clean_box_score['date'])


Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,home_first_downs,home_net_yards,home_total_plays,home_avg_gain,home_time_of_possession
0,2014-09-07,Cleveland,Pittsburgh,27,30,23,389,64,6.1,27:33,24,503,67,7.5,32:27
1,2014-09-07,Jacksonville,Philadelphia,17,34,18,306,70,4.4,29:14,24,420,82,5.1,30:46
2,2014-09-04,Green Bay,Seattle,16,36,19,255,57,4.5,26:40,25,398,66,6.0,33:20
3,2014-09-07,Minnesota,St. Louis,34,6,18,355,57,6.2,28:17,15,318,63,5.0,31:43
4,2014-09-07,Cincinnati,Baltimore,23,16,16,380,64,5.9,30:30,26,423,85,5.0,29:30


In [43]:
clean_box_score = clean_box_score[clean_box_score['date'].dt.year >= 2010]
clean_box_score.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,home_first_downs,home_net_yards,home_total_plays,home_avg_gain,home_time_of_possession
0,2014-09-07,Cleveland,Pittsburgh,27,30,23,389,64,6.1,27:33,24,503,67,7.5,32:27
1,2014-09-07,Jacksonville,Philadelphia,17,34,18,306,70,4.4,29:14,24,420,82,5.1,30:46
2,2014-09-04,Green Bay,Seattle,16,36,19,255,57,4.5,26:40,25,398,66,6.0,33:20
3,2014-09-07,Minnesota,St. Louis,34,6,18,355,57,6.2,28:17,15,318,63,5.0,31:43
4,2014-09-07,Cincinnati,Baltimore,23,16,16,380,64,5.9,30:30,26,423,85,5.0,29:30


In [44]:
clean_box_score.to_csv('cleaned-data/box_scores_2010_2017.csv')

### Setting Up Data

In [45]:
# read all cleaned tables
# Each CSV was saved together with its index, which comes back as an
# "Unnamed: 0" column on read; drop it from every table.
# NOTE(review): the cleaning cells write to 'cleaned-data/' while these reads
# use '../phase2/cleaned-data/' - confirm both resolve to the same directory
# from the notebook's working directory.
# NOTE(review): no parse_dates is passed, so the date columns presumably load
# as plain strings - confirm before doing datetime operations on them.
box_scores = pd.read_csv("../phase2/cleaned-data/box_scores_2010_2017.csv").drop(columns=["Unnamed: 0"])
game_data = pd.read_csv("../phase2/cleaned-data/nfl_game_data_2010_2023.csv").drop(columns=["Unnamed: 0"])
team_stats = pd.read_csv("../phase2/cleaned-data/nfl_team_stats_2010_2021.csv").drop(columns=["Unnamed: 0"])
nfl_teams = pd.read_csv("../phase2/cleaned-data/nfl_teams_info.csv").drop(columns=["Unnamed: 0"])

In [46]:
# The St. Louis Rams and Las Vegas Raiders show up in the other datasets,
# so append one row for each and re-sort the table by division.
rams = pd.DataFrame(
    [{
        "team_name": "St. Louis Rams",
        "team_name_short": "Rams",
        "team_id": "LAR",
        "team_conference": "NFC",
        "team_division": "NFC West",
    }],
    index=[0],
)
raiders = pd.DataFrame(
    [{
        "team_name": "Las Vegas Raiders",
        "team_name_short": "Raiders",
        "team_id": "LVR",
        "team_conference": "AFC",
        "team_division": "AFC West",
    }],
    index=[0],
)
nfl_teams = (
    pd.concat([nfl_teams, rams, raiders], ignore_index=True)
    .sort_values(by=["team_division"])
    .reset_index(drop=True)
)
nfl_teams.head()

Unnamed: 0,team_name,team_name_short,team_id,team_conference,team_division
0,New England Patriots,Patriots,NE,AFC,AFC East
1,Buffalo Bills,Bills,BUF,AFC,AFC East
2,Miami Dolphins,Dolphins,MIA,AFC,AFC East
3,New York Jets,Jets,NYJ,AFC,AFC East
4,Baltimore Ravens,Ravens,BAL,AFC,AFC North


In [47]:
# function to get team id from city/team name
def get_team_id(city, teams=None):
    """Return the team id for a city or team name, or None if nothing matches.

    Parameters
    ----------
    city : str
        A city name (e.g. "Pittsburgh"), a full team name
        (e.g. "Pittsburgh Steelers"), or one of the abbreviated names
        handled explicitly below (e.g. "NY Giants").
    teams : pandas.DataFrame, optional
        Lookup table with "team_name" and "team_id" columns. Defaults to the
        notebook-level ``nfl_teams`` table.

    Returns
    -------
    The "team_id" of the first row whose "team_name" contains ``city`` as a
    literal substring, the hard-coded id for an abbreviated name, or None
    when ``city`` is empty, not a string, or matches nothing.
    """
    # Abbreviated names that are not substrings of any full team name and
    # therefore need an explicit mapping.
    abbreviated = {
        "NY Giants": "NYG",
        "NY Jets": "NYJ",
        "LA Rams": "LAR",
        "LA Chargers": "LAC",
    }

    # An empty string is a substring of every name (it would match the first
    # team), and a non-string such as NaN cannot be substring-matched at all.
    if not isinstance(city, str) or not city:
        return None

    # Check abbreviations once, up front, so they resolve even if the lookup
    # table is empty.
    if city in abbreviated:
        return abbreviated[city]

    if teams is None:
        teams = nfl_teams

    # Literal (non-regex) substring match, so names like "St. Louis" are not
    # interpreted as patterns; rows with a missing team name never match.
    is_match = teams["team_name"].str.contains(city, regex=False, na=False)
    matched_ids = teams.loc[is_match, "team_id"]
    return matched_ids.iloc[0] if len(matched_ids) else None

In [48]:
# adding team ids to the box scores dataset
box_scores["home_id"] = box_scores["home"].apply(get_team_id)
box_scores["away_id"] = box_scores["visitor"].apply(get_team_id)

box_scores.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,home_first_downs,home_net_yards,home_total_plays,home_avg_gain,home_time_of_possession,home_id,away_id
0,2014-09-07,Cleveland,Pittsburgh,27,30,23,389,64,6.1,27:33,24,503,67,7.5,32:27,PIT,CLE
1,2014-09-07,Jacksonville,Philadelphia,17,34,18,306,70,4.4,29:14,24,420,82,5.1,30:46,PHI,JAX
2,2014-09-04,Green Bay,Seattle,16,36,19,255,57,4.5,26:40,25,398,66,6.0,33:20,SEA,GB
3,2014-09-07,Minnesota,St. Louis,34,6,18,355,57,6.2,28:17,15,318,63,5.0,31:43,LAR,MIN
4,2014-09-07,Cincinnati,Baltimore,23,16,16,380,64,5.9,30:30,26,423,85,5.0,29:30,BAL,CIN


In [49]:
# adding team ids to the game data dataset
game_data["home_id"] = game_data["team_home"].apply(get_team_id)
game_data["away_id"] = game_data["team_away"].apply(get_team_id)

game_data.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph,home_id,away_id
0,2010-09-09,2010,1,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0,NO,MIN
1,2010-09-12,2010,1,Buffalo Bills,10.0,15.0,Miami Dolphins,MIA,-3.0,Ralph Wilson Stadium,64.0,7.0,BUF,MIA
2,2010-09-12,2010,1,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0,CHI,DET
3,2010-09-12,2010,1,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0,HOU,IND
4,2010-09-12,2010,1,Jacksonville Jaguars,24.0,17.0,Denver Broncos,JAX,-3.0,EverBank Field,91.0,1.0,JAX,DEN


### Joining Data

In [50]:
# inner join: a box score is paired with a game-data row only when the date
# and both the home and away team ids agree
box_game_data_merged = box_scores.merge(
    game_data,
    how="inner",
    left_on=["date", "home_id", "away_id"],
    right_on=["schedule_date", "home_id", "away_id"],
)
box_game_data_merged.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,...,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph
0,2014-09-07,Cleveland,Pittsburgh,27,30,23,389,64,6.1,27:33,...,1,Pittsburgh Steelers,30.0,27.0,Cleveland Browns,PIT,-5.5,Heinz Field,72.0,6.0
1,2014-09-07,Jacksonville,Philadelphia,17,34,18,306,70,4.4,29:14,...,1,Philadelphia Eagles,34.0,17.0,Jacksonville Jaguars,PHI,-10.0,Lincoln Financial Field,80.0,6.0
2,2014-09-04,Green Bay,Seattle,16,36,19,255,57,4.5,26:40,...,1,Seattle Seahawks,36.0,16.0,Green Bay Packers,SEA,-4.5,CenturyLink Field,70.0,5.0
3,2014-09-07,Minnesota,St. Louis,34,6,18,355,57,6.2,28:17,...,1,St. Louis Rams,6.0,34.0,Minnesota Vikings,LAR,-3.0,Edward Jones Dome,72.0,0.0
4,2014-09-07,Cincinnati,Baltimore,23,16,16,380,64,5.9,30:30,...,1,Baltimore Ravens,16.0,23.0,Cincinnati Bengals,BAL,-1.0,M&T Bank Stadium,78.0,0.0


In [51]:
# sort the merged dataset by date
box_game_data_merged = box_game_data_merged.sort_values(by=["date"]).reset_index(drop=True)
box_game_data_merged.head()

Unnamed: 0,date,visitor,home,visitor_score,home_score,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,...,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph
0,2010-09-09,Minnesota,New Orleans,9,14,12,253,51,5.0,26:17,...,1,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0
1,2010-09-12,Indianapolis,Houston,24,34,25,463,69,6.7,29:07,...,1,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0
2,2010-09-12,Detroit,Chicago,14,19,13,168,57,2.9,25:18,...,1,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0
3,2010-09-12,Arizona,St. Louis,17,13,21,378,64,5.9,27:09,...,1,St. Louis Rams,13.0,17.0,Arizona Cardinals,ARI,-3.0,Edward Jones Dome,72.0,0.0
4,2010-09-12,Carolina,NY Giants,18,31,14,237,63,3.8,25:21,...,1,New York Giants,31.0,18.0,Carolina Panthers,NYG,-6.0,MetLife Stadium,65.0,1.0


In [52]:
# drop unnecessary columns
box_game_data_merged = box_game_data_merged.drop(columns=["schedule_date", "visitor", "home", "visitor_score", "home_score"])
box_game_data_merged.head()

Unnamed: 0,date,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,home_first_downs,home_net_yards,home_total_plays,home_avg_gain,...,schedule_week,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph
0,2010-09-09,12,253,51,5.0,26:17,18,308,62,5.0,...,1,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0
1,2010-09-12,25,463,69,6.7,29:07,23,355,61,5.8,...,1,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0
2,2010-09-12,13,168,57,2.9,25:18,23,463,70,6.6,...,1,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0
3,2010-09-12,21,378,64,5.9,27:09,20,325,81,4.0,...,1,St. Louis Rams,13.0,17.0,Arizona Cardinals,ARI,-3.0,Edward Jones Dome,72.0,0.0
4,2010-09-12,14,237,63,3.8,25:21,21,376,67,5.6,...,1,New York Giants,31.0,18.0,Carolina Panthers,NYG,-6.0,MetLife Stadium,65.0,1.0


In [53]:
# convert time of possession to a float for minutes
def convert_time_to_float(time):
    """Convert an "MM:SS" time-of-possession string to minutes as a float.

    Parameters
    ----------
    time : str or missing value
        Text of the form "MM:SS" (e.g. "26:17"). The literal text "None",
        a real ``None`` and NaN are all treated as missing.

    Returns
    -------
    float
        Minutes plus seconds / 60, or 0.0 when the value is missing.
    """
    # A missing value may arrive as the text "None" or, after a CSV round
    # trip, as NaN/None; calling .split on the latter would raise.
    if pd.isna(time) or time == "None":
        return 0.0
    parts = time.split(":")
    return float(parts[0]) + float(parts[1]) / 60

In [54]:
box_game_data_merged["visitor_time_of_possession"] = box_game_data_merged["visitor_time_of_possession"].apply(convert_time_to_float)
box_game_data_merged["home_time_of_possession"] = box_game_data_merged["home_time_of_possession"].apply(convert_time_to_float)

In [55]:
def get_winner_id(row):
    """Encode the outcome of one game from its "score_home" / "score_away" values.

    Returns 0 when the home score is higher, 1 when the away score is higher,
    and 2 otherwise (equal scores, or scores that compare neither way, such
    as NaN).
    """
    home_points = row["score_home"]
    away_points = row["score_away"]
    if home_points > away_points:
        return 0
    return 1 if home_points < away_points else 2

In [56]:
# apply the get_winner_id function to the merged dataset
box_game_data_merged["winner_id"] = box_game_data_merged.apply(get_winner_id, axis=1)
box_game_data_merged.head()

Unnamed: 0,date,visitor_first_downs,visitor_net_yards,visitor_total_plays,visitor_avg_gain,visitor_time_of_possession,home_first_downs,home_net_yards,home_total_plays,home_avg_gain,...,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,stadium,weather_temperature,weather_wind_mph,winner_id
0,2010-09-09,12,253,51,5.0,26.283333,18,308,62,5.0,...,New Orleans Saints,14.0,9.0,Minnesota Vikings,NO,-5.0,Louisiana Superdome,72.0,0.0,0
1,2010-09-12,25,463,69,6.7,29.116667,23,355,61,5.8,...,Houston Texans,34.0,24.0,Indianapolis Colts,IND,-1.0,Reliant Stadium,89.0,5.0,0
2,2010-09-12,13,168,57,2.9,25.3,23,463,70,6.6,...,Chicago Bears,19.0,14.0,Detroit Lions,CHI,-6.5,Soldier Field,75.0,1.0,0
3,2010-09-12,21,378,64,5.9,27.15,20,325,81,4.0,...,St. Louis Rams,13.0,17.0,Arizona Cardinals,ARI,-3.0,Edward Jones Dome,72.0,0.0,1
4,2010-09-12,14,237,63,3.8,25.35,21,376,67,5.6,...,New York Giants,31.0,18.0,Carolina Panthers,NYG,-6.0,MetLife Stadium,65.0,1.0,0


In [57]:
# cast the score and count columns to int and the per-play averages to float
box_game_data_merged = box_game_data_merged.astype(
    {
        "score_home": int,
        "score_away": int,
        "home_first_downs": int,
        "home_net_yards": int,
        "home_total_plays": int,
        "home_avg_gain": float,
        "visitor_first_downs": int,
        "visitor_net_yards": int,
        "visitor_total_plays": int,
        "visitor_avg_gain": float,
    }
)

In [58]:
box_game_data_merged.columns

Index(['date', 'visitor_first_downs', 'visitor_net_yards',
       'visitor_total_plays', 'visitor_avg_gain', 'visitor_time_of_possession',
       'home_first_downs', 'home_net_yards', 'home_total_plays',
       'home_avg_gain', 'home_time_of_possession', 'home_id', 'away_id',
       'schedule_season', 'schedule_week', 'team_home', 'score_home',
       'score_away', 'team_away', 'team_favorite_id', 'spread_favorite',
       'stadium', 'weather_temperature', 'weather_wind_mph', 'winner_id'],
      dtype='object')

Most Important Columns: 'home_id', 'away_id', 'visitor_net_yards', 'visitor_time_of_possession', 'home_net_yards', 'home_time_of_possession', 'score_home', 'score_away', 'stadium'

In [59]:
# convert box_game_data_merged to csv
box_game_data_merged.to_csv('cleaned-data/box_game_data_merged.csv')