# Collecting Data

In [8]:
import pandas as pd
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import OutputType

# if a variable does not seem to be defined it is in here 
# this is because it is also needed in the analysis and visualization
%run 'variables.ipynb'  

In [2]:
# Fetch the schedule for every season-end year in [START_YEAR, END_YEAR)
# and have the scraper client write each one to its own CSV under ./Games.
for season_end_year in range(START_YEAR, END_YEAR):
    schedule_csv = './Games/allgames%s.csv' % str(season_end_year)
    client.season_schedule(
        season_end_year=season_end_year,
        output_type=OutputType.CSV,
        output_file_path=schedule_csv,
    )

In [3]:
# Build one DataFrame per season, keeping only the regular-season games.

# Seasons whose schedule files need a different cut-off than the usual 1230
# rows, mapped to the number of leading rows to keep for that season.
shortened_years = {
    2012: 990,
    2013: 1229,
    2020: 971
}

# Read each season's schedule file and keep only its leading rows (1230 unless
# overridden above), which cuts off the post season.
allgames = [
    pd.read_csv('./Games/allgames%s.csv' % (season)).iloc[range(0, shortened_years.get(season, 1230)), :]
    for season in range(START_YEAR, END_YEAR)
]

# Flag, for every game, whether the home team outscored the away team.
for season_games in allgames:
    season_games['home_team_win?'] = season_games['home_team_score'].gt(season_games['away_team_score'])

# Overwrite each season's CSV so that it holds only the regular season plus
# the new home_team_win? column.
for season, season_games in zip(range(START_YEAR, END_YEAR), allgames):
    season_games.to_csv('./Games/allgames%s.csv' % str(season), index=False)

# Finding Travel Distance Between Each Road Game

# Some Definitions: 
###### road trip: Any continuous stretch of away games directly following a home game (the length is the # of games played)
###### cumulative distance: Refers to the total distance traveled from game to game during a road trip 
###### n game road trip: A road trip of length n, where length is measured in the number of games played


#### Some Notes: 
###### An n game road trip may be a fraction of a larger road trip. For example, a 5 game road trip consists of a 4 game road trip, a 3 game road trip, etc...
###### Therefore, when I refer to an n game road trip, it does not imply that game n + 1 is a home game. 
###### However, all road trips start from the first away game following a home game. For example, the last 3 games of a 5 game road trip are not a 3 game road trip. However, the first 3 games of a 5 game road trip are a 3 game road trip. 
###### All travel distances are approximated as the distance between the home cities of each team. For example, if the 76ers played an away game against the Celtics, the distance traveled would be estimated as the distance between Philadelphia and Boston, according to this API: https://www.distance24.org/api.xhtml
###### I did not consider games played between the Clippers and Lakers in my calculations, as they share a stadium, which pretty much defeats the purpose of home v. away. 


In [4]:
# Lets find out if winning an away game is correlated with distance traveled 
import json
import requests

class TeamTravelingVsWinning:
    """Game-by-game travel and result log for one NBA team in one season.

    Each instance owns a ``table`` dict of three equally long lists, with one
    slot per scheduled game:

    * ``'cumulative_distance'`` - distance accumulated since the team's last
      home game (0 for home games). Units are whatever the distance24 API
      reports - presumably kilometres; TODO confirm.
    * ``'n_game_road_trip'`` - number of consecutive away games played so far
      in the current road trip (0 for home games).
    * ``'win?'`` - 1 if this team won the game, 0 if it lost, ``None`` for
      slots that have not been filled by ``add_game``.

    Relies on the module-level ``team_cities`` mapping (team name -> city) and
    on the ``requests`` package being in scope.
    """

    # (origin city, destination city) -> distance returned by the API.
    # Shared by every instance so each ordered city pair is fetched over the
    # network only once instead of once per away game.
    _distance_cache = {}

    def __init__(self, team, year):
        """Create an empty log for ``team`` in the season identified by ``year``.

        Args:
            team: Team name; must be a key of ``team_cities``.
            year: Season identifier, as used for the keys of ``teams`` and the
                schedule file names.
        """
        # each object of this class will be associated with a unique NBA team and year
        self.year = year
        self.team = team

        self.home_city = team_cities[team]  # city of the team
        self.curr_city = team_cities[team]  # current location of the team (initialized to the home city)
        self.games_played = 0  # keeps track of the current game #

        num_games = 82 if year != 2012 else 66  # only 66 games were played in the 2011-12 season

        if self.year == 2013:
            if self.team == 'BOSTON CELTICS' or self.team == 'INDIANA PACERS':  # 1 game cancelled bc of boston marathon bombing
                num_games = 81

        # As we iterate through the season's games, we will maintain this table for each team
        self.table = {
            'cumulative_distance': [0 for i in range(num_games)],
            'n_game_road_trip': [0 for i in range(num_games)],
            'win?': [None for i in range(num_games)]
        }

        # The teams dictionary will store a team for every (year, team) combination
        # Some combinations will be invalid, for instance the Charlotte Bobcats were not a team after 2013-14
        self.valid = False

    @classmethod
    def _distance_between(cls, origin, destination):
        """Return the API distance from ``origin`` to ``destination``, memoised per ordered pair."""
        key = (origin, destination)
        if key not in cls._distance_cache:
            cities = 'stops=%s|%s' % (origin, destination)
            ep = 'https://www.distance24.org/route.json?%s' % cities
            # A timeout keeps one stalled request from hanging the whole run;
            # raise_for_status surfaces HTTP errors before the JSON is parsed.
            response = requests.get(ep, timeout=30)
            response.raise_for_status()
            cls._distance_cache[key] = response.json()['distance']
        return cls._distance_cache[key]

    def add_game(self, home_team, h_pnts, v_pnts):
        """Record this team's next game in ``self.table``.

        Games are expected to be added in the order they were played
        (presumably schedule order - verify against the caller).

        A game hosted in a city other than this team's home city is treated as
        an away game: the road-trip counter grows by one and the distance from
        the team's current city to the host city is added to the running
        total. Any game hosted in the team's own home city resets both to 0,
        which includes games where this team is nominally the visitor but the
        host shares its city.

        Args:
            home_team: Name of the hosting team; must be a key of ``team_cities``.
            h_pnts: Points scored by the home team.
            v_pnts: Points scored by the visiting team.
        """
        idx = self.games_played
        host_city = team_cities[home_team]

        if self.home_city != host_city:
            dist = self._distance_between(self.curr_city, host_city)
            # The first game of the season has no previous row to extend.
            if idx > 0:
                prev_road_count = self.table['n_game_road_trip'][idx - 1]
                prev_cumulative_dist = self.table['cumulative_distance'][idx - 1]
            else:
                prev_road_count = 0
                prev_cumulative_dist = 0
            self.table['n_game_road_trip'][idx] = prev_road_count + 1
            self.table['cumulative_distance'][idx] = prev_cumulative_dist + dist
        else:
            self.table['n_game_road_trip'][idx] = 0
            self.table['cumulative_distance'][idx] = 0

        if self.team == home_team:
            self.table['win?'][idx] = 1 if h_pnts > v_pnts else 0
        else:
            self.table['win?'][idx] = 0 if h_pnts > v_pnts else 1

        self.valid = True  # if this method is accessed, it means that this team is "valid"
        self.games_played += 1
        self.curr_city = host_city

# One tracker per (team, season) pair, built season by season; a tracker's
# `valid` flag stays False until a game is added to it.
teams = {
    (team_name, season): TeamTravelingVsWinning(team_name, season)
    for season in range(START_YEAR, END_YEAR)
    for team_name in team_cities
}


In [5]:
# Walk through every season's games in file order and register each game with
# both the visiting and the home team's tracker.
for season_idx, season in enumerate(range(START_YEAR, END_YEAR)):
    percent_done = season_idx / (END_YEAR - START_YEAR) * 100
    print(str(percent_done) + '% done')
    for _, game_row in allgames[season_idx].iterrows():
        home_team = game_row['home_team']
        visitor = game_row['away_team']
        h_pnts = game_row['home_team_score']
        v_pnts = game_row['away_team_score']
        for participant in (visitor, home_team):
            teams[(participant, season)].add_game(home_team, h_pnts, v_pnts)

print('100% done')
        
        

0.0% done
100% done


In [6]:
# Write each valid team's table to its own CSV so the slow computation above
# does not have to be repeated.
dfs = {}
for team_season, tracker in teams.items():
    if tracker.valid:
        dfs[team_season] = pd.DataFrame.from_dict(tracker.table)

for (team_name, season), team_table in dfs.items():
    csv_name = '%s%s.csv' % (team_name, season)
    team_table.to_csv('TeamTravelDistVsWinning/%s' % csv_name, index=False)


In [9]:
# Gather every saved per-team table and merge them into one DataFrame.

team_travel_data = []
for season in range(START_YEAR, END_YEAR):
    for team_name in team_cities:
        csv_name = '%s%s.csv' % (team_name, season)
        try:
            team_table = pd.read_csv('TeamTravelDistVsWinning/%s' % csv_name)
        except FileNotFoundError:
            # not every (team, season) combination has a saved table
            continue
        team_travel_data.append(team_table)

# Keep road games only (home games have a road-trip length of 0), order the
# rows by distance and then trip length, and renumber the index from 0.
complete_data = (
    pd.concat(team_travel_data)
    .loc[lambda frame: frame['n_game_road_trip'] > 0]
    .sort_values(by=['cumulative_distance', 'n_game_road_trip'])
    .reset_index(drop=True)
)

# complete_data['cumulative_distance'].describe()
complete_data['n_game_road_trip'].describe()


count    14214.000000
mean         1.948572
std          1.261698
min          1.000000
25%          1.000000
50%          2.000000
75%          2.000000
max         10.000000
Name: n_game_road_trip, dtype: float64

In [10]:
# Persist the combined road-game table as a CSV (the row index is written as well).

output_csv = 'CompleteData/completedata.csv'
complete_data.to_csv(output_csv)