## This program gathers game data for various sports, which is used to measure the effect that fans in the stands have on winning

#### Global Imports

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Basketball

## Imports

In [None]:
from basketball_reference_scraper.teams import get_roster, get_team_stats, get_opp_stats, get_roster_stats, get_team_misc
from basketball_reference_scraper.seasons import get_schedule, get_standings

## Gathering Data

In [None]:
# Dicts that will be used to go between team names and acronyms. Useful for the API being used

# Full team name -> acronym. Franchises that changed name or city keep one
# entry per historical name.
# NOTE(review): the 'Charlotte Hornets (2014-present)' key only helps if the
# schedule data uses that exact spelling -- confirm against the scraper output.
team_to_acronym = {'Atlanta Hawks': 'ATL',
 'Boston Celtics': 'BOS',
 'Brooklyn Nets': 'BRK',   # was 'NJN', which duplicated the New Jersey entry
 'New Jersey Nets': 'NJN',
 'Chicago Bulls': 'CHI',
 'Charlotte Hornets (2014-present)': 'CHO',
 'Charlotte Bobcats': 'CHA',
 'Cleveland Cavaliers': 'CLE',
 'Dallas Mavericks': 'DAL',
 'Denver Nuggets': 'DEN',
 'Detroit Pistons': 'DET',
 'Golden State Warriors': 'GSW',
 'Houston Rockets': 'HOU',
 'Indiana Pacers': 'IND',
 'Los Angeles Clippers': 'LAC',
 'Los Angeles Lakers': 'LAL',
 'Memphis Grizzlies': 'MEM',
 'Miami Heat': 'MIA',
 'Milwaukee Bucks': 'MIL',
 'Minnesota Timberwolves': 'MIN',
 'New Orleans Pelicans': 'NOP',
 'New Orleans Hornets': 'NOH',
 'New York Knicks': 'NYK',
 'Oklahoma City Thunder': 'OKC',
 'Orlando Magic': 'ORL',
 'Philadelphia 76ers': 'PHI',
 'Phoenix Suns': 'PHO',
 'Portland Trail Blazers': 'POR',
 'Sacramento Kings': 'SAC',
 'San Antonio Spurs': 'SAS',
 'Toronto Raptors': 'TOR',
 'Utah Jazz': 'UTA',
 'Washington Wizards': 'WAS'}

# Acronym -> full team name. The keys are the acronyms passed to the scraping
# API; the values are compared against the team names in the schedule data,
# so spelling and capitalisation must match exactly.
# NOTE(review): 'Charlotte Hornets (2014-present)' only matches if the schedule
# data uses that exact spelling -- confirm against the scraper output.
acronym_to_team = {'ATL': 'Atlanta Hawks',
 'BOS': 'Boston Celtics',
 'BRK': 'Brooklyn Nets',
 'NJN': 'New Jersey Nets',   # was 'New jersey Nets', which never matched
 'CHI': 'Chicago Bulls',
 'CHO': 'Charlotte Hornets (2014-present)',
 'CHA': 'Charlotte Bobcats',
 'CLE': 'Cleveland Cavaliers',
 'DAL': 'Dallas Mavericks',
 'DEN': 'Denver Nuggets',
 'DET': 'Detroit Pistons',
 'GSW': 'Golden State Warriors',
 'HOU': 'Houston Rockets',
 'IND': 'Indiana Pacers',
 'LAC': 'Los Angeles Clippers',
 'LAL': 'Los Angeles Lakers',
 'MEM': 'Memphis Grizzlies',
 'MIA': 'Miami Heat',
 'MIL': 'Milwaukee Bucks',
 'MIN': 'Minnesota Timberwolves',
 'NOP': 'New Orleans Pelicans',
 'NOH': 'New Orleans Hornets',   # was 'NOJ' -> 'New Orleans Pelicans', a duplicate of 'NOP'
 'NYK': 'New York Knicks',
 'OKC': 'Oklahoma City Thunder',
 'ORL': 'Orlando Magic',
 'PHI': 'Philadelphia 76ers',
 'PHO': 'Phoenix Suns',
 'POR': 'Portland Trail Blazers',
 'SAC': 'Sacramento Kings',
 'SAS': 'San Antonio Spurs',
 'TOR': 'Toronto Raptors',
 'UTA': 'Utah Jazz',
 'WAS': 'Washington Wizards'}

In [None]:
def basketball_data_generator():
    """Build a game-by-game DataFrame of NBA matchups, scores and attendance.

    The schedule for each season comes from ``get_schedule`` and the attendance
    from ``get_team_misc`` (basketball_reference_scraper). Only a per-game
    average ('ATTENDANCE/G') is available per team and season, so every home
    game of a team in a season is given that same average.

    Output: basketball_games_df: DataFrame with the columns Sport, League,
            Date, Season, Home_Team, Away_Team, Home_Team_Score,
            Away_Team_Score, Winner, Home_Team_Won and Attendance. Missing
            attendance is stored as 0 and historical franchise names are
            mapped to the current name.
    """

    start_year = 2010
    end_year = 2021
    output_columns = ['Sport', 'League', 'Date', 'Season', 'Home_Team', 'Away_Team', 'Home_Team_Score',
                      'Away_Team_Score', 'Winner', 'Home_Team_Won', 'Attendance']
    # Historical franchise name -> current name. Matched literally, so the
    # parentheses below are not interpreted as a regular expression group.
    franchise_renames = {'New Jersey Nets': 'Brooklyn Nets',
                         'Charlotte Bobcats': 'Charlotte Hornets',
                         'Charlotte Hornets (2014-present)': 'Charlotte Hornets',
                         'New Orleans Hornets': 'New Orleans Pelicans'}

    season_frames = []   # One DataFrame per season, concatenated after the loop

    for year in range(start_year, end_year + 1):   # Iterating through desired years

        # The schedule must be fetched first: the generic columns below are
        # added to the DataFrame it returns.
        game_data = get_schedule(year)
        game_data = game_data.rename(columns={'DATE': 'Date', 'HOME': 'Home_Team', 'VISITOR': 'Away_Team',
                                              'HOME_PTS': 'Home_Team_Score', 'VISITOR_PTS': 'Away_Team_Score'})

        # Adding generic information
        game_data['Sport'] = 'Basketball'
        game_data['League'] = 'NBA'
        game_data['Season'] = year

        # Adding game-by-game information. A row where neither score is
        # greater (including missing scores) is labelled 'Tie'.
        home_score = game_data['Home_Team_Score']
        away_score = game_data['Away_Team_Score']
        game_data['Winner'] = np.where(home_score > away_score, 'Home',
                                       np.where(home_score < away_score, 'Away', 'Tie'))
        game_data['Home_Team_Won'] = (game_data['Winner'] == 'Home').astype(int)

        for team, team_name in acronym_to_team.items():   # Gathering attendance for every team
            try:
                attendance_pg = get_team_misc(team, year)['ATTENDANCE/G']
                game_data.loc[game_data['Home_Team'] == team_name, 'Attendance'] = attendance_pg
            except Exception:
                # Best effort: not every acronym exists in every season, so a
                # failed lookup leaves that team's attendance missing.
                continue

        if 'Attendance' not in game_data.columns:
            # Every lookup failed; keep the schema stable for the selection below
            game_data['Attendance'] = np.nan

        season_frames.append(game_data[output_columns])

    basketball_games_df = pd.concat(season_frames).drop_duplicates()

    # Accounting for nulls in attendance (no fans) and teams changing names over the years
    basketball_games_df['Attendance'] = basketball_games_df['Attendance'].fillna(0)
    for team_column in ('Home_Team', 'Away_Team'):
        basketball_games_df[team_column] = basketball_games_df[team_column].replace(franchise_renames)

    return basketball_games_df.dropna()

## Writing Data to CSV

In [None]:
# The DataFrame only exists as the generator's return value, so build it here
# before writing it out.
basketball_games_df = basketball_data_generator()
basketball_games_df.to_csv("C:\\Users\\Jack\\OneDrive\\Documents\\basketball_data.csv", index=False)

# Baseball

## Imports

In [None]:
from baseball_scraper import baseball_reference

## Gathering Data

In [None]:
# Dicts that will be used to go between team names and acronyms. Useful for the API being used

# Team acronym -> city label used as the team name in the output. Cities with
# two teams carry the nickname so the two stay distinct.
acronym_to_city_dict = {
    'ATL': 'Atlanta',
    'WSN': 'Washington',
    'NYM': 'New York Mets',
    'PHI': 'Philadelphia',
    'MIA': 'Miami',
    'FLA': 'Miami',  # Acronym used before the 2012 name change
    'STL': 'St. Louis',
    'MIL': 'Milwaukee',
    'CHC': 'Chicago Cubs',
    'CIN': 'Cincinnati',
    'PIT': 'Pittsburgh',
    'LAD': 'Los Angeles Dodgers',
    'ARI': 'Arizona',
    'SFG': 'San Francisco',
    'COL': 'Colorado',
    'SDP': 'San Diego',
    'NYY': 'New York Yankees',
    'TBR': 'Tampa Bay',
    'BOS': 'Boston',
    'TOR': 'Toronto',
    'BAL': 'Baltimore',
    'MIN': 'Minnesota',
    'CLE': 'Cleveland',
    'CHW': 'Chicago White Sox',
    'KCR': 'Kansas City',
    'DET': 'Detroit',
    'HOU': 'Houston',
    'OAK': 'Oakland',
    'TEX': 'Texas',
    'LAA': 'Los Angeles Angels',
    'SEA': 'Seattle',
}
# Lower-case team nickname -> the same city labels used by acronym_to_city_dict.
nickname_to_city_dict = {
    'braves': 'Atlanta',
    'nationals': 'Washington',
    'mets': 'New York Mets',
    'phillies': 'Philadelphia',
    'marlins': 'Miami',
    'cardinals': 'St. Louis',
    'brewers': 'Milwaukee',
    'cubs': 'Chicago Cubs',
    'reds': 'Cincinnati',
    'pirates': 'Pittsburgh',
    'dodgers': 'Los Angeles Dodgers',
    'diamondbacks': 'Arizona',
    'd-backs': 'Arizona',
    'giants': 'San Francisco',
    'rockies': 'Colorado',
    'padres': 'San Diego',
    'yankees': 'New York Yankees',
    'rays': 'Tampa Bay',
    'red sox': 'Boston',
    'blue jays': 'Toronto',
    'orioles': 'Baltimore',
    'twins': 'Minnesota',
    'indians': 'Cleveland',
    'white sox': 'Chicago White Sox',
    'royals': 'Kansas City',
    'tigers': 'Detroit',
    'astros': 'Houston',
    'athletics': 'Oakland',
    'rangers': 'Texas',
    'angels': 'Los Angeles Angels',
    'mariners': 'Seattle',
}

In [None]:
def baseball_data_generator():
    """Build a game-by-game DataFrame of MLB matchups, scores and attendance.

    Each team's schedule is scraped per season with
    ``baseball_reference.TeamScraper``. Every game shows up in the schedule of
    both teams involved, so rows are normalised to a home/away layout and the
    duplicates are dropped.

    Output: baseball_games_df: DataFrame with the columns Sport, League, Date,
            Season, Home_Team, Away_Team, Home_Team_Score, Away_Team_Score,
            Winner, Home_Team_Won and Attendance. Missing attendance is stored
            as 0. Team names are the city labels from ``acronym_to_city_dict``,
            which already maps both 'FLA' and 'MIA' to 'Miami'.
    """

    s = baseball_reference.TeamScraper()  # calling the API's scraping tool
    start_year = 2010
    end_year = 2021
    # 'Season' is included so the schema matches the basketball data; the
    # combined frame is later passed through dropna, which would otherwise
    # discard every baseball row.
    output_columns = ['Sport', 'League', 'Date', 'Season', 'Home_Team', 'Away_Team', 'Home_Team_Score',
                      'Away_Team_Score', 'Winner', 'Home_Team_Won', 'Attendance']

    team_frames = []   # One DataFrame per team and season, concatenated after the loops

    for year in range(start_year, end_year + 1):  # Iterating through desired years

        # Accessing API through the baseball_scraper library
        s.set_season(year)  # Calling API for a given year

        for team in acronym_to_city_dict:  # Scraping data for all teams

            try:
                game_data = s.scrape(team)

                # Adding generic information (the frame has to exist first)
                game_data['Sport'] = 'Baseball'
                game_data['League'] = 'MLB'
                game_data['Season'] = year

                # 'Home_Away' is '@' when the scraped team played away and
                # 'Home' when it played at home.
                scraped_team_away = game_data['Home_Away'] == '@'
                scraped_team_home = game_data['Home_Away'] == 'Home'
                # An unknown acronym raises KeyError, so this team's season is
                # skipped as a whole.
                team_city = game_data['Tm'].map(acronym_to_city_dict.__getitem__)
                opponent_city = game_data['Opp'].map(acronym_to_city_dict.__getitem__)

                # 'R' is the runs of the scraped team, 'RA' the runs of its opponent
                game_data['Home_Team'] = np.where(scraped_team_away, opponent_city, team_city)
                game_data['Home_Team_Score'] = np.where(scraped_team_away, game_data['RA'], game_data['R'])
                game_data['Away_Team'] = np.where(scraped_team_home, opponent_city, team_city)
                game_data['Away_Team_Score'] = np.where(scraped_team_home, game_data['RA'], game_data['R'])

                home_score = game_data['Home_Team_Score']
                away_score = game_data['Away_Team_Score']
                game_data['Winner'] = np.where(home_score > away_score, 'Home',
                                               np.where(home_score < away_score, 'Away', 'Tie'))
                game_data['Home_Team_Won'] = (game_data['Winner'] == 'Home').astype(int)

                team_frames.append(game_data[output_columns])
            except Exception:
                # Best effort: skip teams/seasons that cannot be scraped or processed
                continue

    if not team_frames:
        # Nothing could be scraped; return an empty frame with the expected schema
        return pd.DataFrame(columns=output_columns)

    baseball_games_df = pd.concat(team_frames).drop_duplicates()

    # Accounting for nulls in attendance (no fans)
    baseball_games_df['Attendance'] = baseball_games_df['Attendance'].fillna(0)

    return baseball_games_df.dropna()

## Writing Data to CSV

In [None]:
# The DataFrame only exists as the generator's return value, so build it here
# before writing it out.
baseball_games_df = baseball_data_generator()
baseball_games_df.to_csv("C:\\Users\\Jack\\OneDrive\\Documents\\baseball_data.csv", index=False)

# American Football. Data is loaded here and Joined in SQL

## Loading in files from GITHUB with Data

In [None]:
# Read the three NFL tables (attendance, standings, games) from GitHub
attendance, standings, games = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04/attendance.csv',
        'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04/standings.csv',
        'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-04/games.csv',
    )
)

games.head()

## Formatting dates to traditional mm/dd/yyyy format

In [None]:
# Join the first two words of `date` with the row's `year` as
# "<month name>/<day>/<year>" and parse that into a datetime.
# NOTE(review): `year` is used as the calendar year for every row -- confirm
# this is right for games played after New Year.
games['date'] = games.apply(
    lambda row: datetime.strptime(
        '/'.join((row['date'].split()[0], str(row['date'].split()[1]), str(row['year']))),
        '%B/%d/%Y',
    ),
    axis=1,
)

## Updating home teams in attendance data to match format from games DataFrame

In [None]:
# Turn the `home` column into `home_team` and fill it with "<team> <team_name>".
# NOTE(review): the previous contents of `home` are overwritten here -- confirm
# they are not needed downstream.
attendance.rename(columns={"home": 'home_team'}, inplace=True)
attendance['home_team'] = attendance.apply(
    lambda row: ' '.join((row['team'], row['team_name'])),
    axis=1,
)
attendance.head()

## Writing Data to CSV

In [None]:
# Write both NFL tables without the index column.
# NOTE(review): absolute, user-specific Windows paths -- adjust before running elsewhere.
attendance.to_csv("C:\\Users\\Jack\\OneDrive\\Documents\\nfl_attendance.csv", index=False)
games.to_csv("C:\\Users\\Jack\\OneDrive\\Documents\\nfl_games.csv", index=False)

# Combining Sports

## Gathering CSV's

In [None]:
# Load one CSV per sport. The soccer and hockey files were pulled directly
# from SportsReference.com.
baseball, basketball, soccer, football, hockey = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:\\Users\\Jack\\OneDrive\\Documents\\baseball_data.csv",
        "C:\\Users\\Jack\\OneDrive\\Documents\\basketball_data.csv",
        "C:\\Users\\Jack\\OneDrive\\Documents\\soccer_data.csv",
        "C:\\Users\\Jack\\OneDrive\\Documents\\american_football_data.csv",
        "C:\\Users\\Jack\\OneDrive\\Documents\\hockey_data.csv",
    )
)

## Removing NBA games that were played in a neutral court "bubble"

In [None]:
# Parse the dates so they can be compared against the bubble window
basketball['Date'] = pd.to_datetime(basketball['Date'])
# Keep games strictly before 2020-04-01 or strictly after 2020-11-01. The
# explicit copy keeps the assignment below from writing into a slice of the
# original frame (SettingWithCopyWarning).
basketball = basketball.loc[(basketball['Date'] < '2020-4-1') | (basketball['Date'] > '2020-11-1')].copy()
# Back to 'YYYY-MM-DD' strings
basketball['Date'] = basketball['Date'].dt.strftime('%Y-%m-%d')

basketball.shape

## Combining DataFrames

In [None]:
# Stack all sports into one frame with a fresh 0..n-1 index. pd.concat replaces
# the chained DataFrame.append calls, which pandas 2.0 removed.
sports_df = pd.concat([baseball, basketball, soccer, football, hockey], ignore_index=True)

sports_df.dropna(inplace=True)  # One hockey game was rescheduled but is still in the original hockey DataFrame

sports_df.head()

## Writing Data to CSV

In [None]:
# Write the combined data without the index column.
# NOTE(review): absolute, user-specific Windows path -- adjust before running elsewhere.
sports_df.to_csv("C:\\Users\\Jack\\OneDrive\\Documents\\sports_data.csv", index=False)

# Statistical Tests - Proportion Test

In [26]:
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
# Load the combined data from the working directory. The tests below read the
# 'Sport', 'During_COVID' and 'Winner' columns.
# NOTE(review): no code in this file creates 'During_COVID' -- presumably it is added outside this notebook; confirm.
sports_df = pd.read_csv("sports_data.csv")

## Baseball

In [27]:

# Home-team wins in Baseball where During_COVID is 0 (pre) and 1 (post)
Baseball_pre_wins, Baseball_post_wins = (
    int(((sports_df['Sport'] == 'Baseball')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] == 'Home')).sum())
    for covid_flag in (0, 1)
)

# Baseball games that did not end in a tie, for the same two periods
Baseball_pre_games, Baseball_post_games = (
    int(((sports_df['Sport'] == 'Baseball')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] != 'Tie')).sum())
    for covid_flag in (0, 1)
)

# Proportion z-test comparing the pre and post home-win rates
count = np.array([Baseball_pre_wins, Baseball_post_wins])
nobs = np.array([Baseball_pre_games, Baseball_post_games])
stat, pval = proportions_ztest(count, nobs)
if not pval < 0.05:
    print("The result is not significant with a p-value of: ", pval, "So, there is no evidence to suggest that the home teams perform differently pre and post COVID in Baseball.")
else:
    print("The result is significant with a p-value of: ", pval, "So, there is evidence to suggest that the home teams perform differently pre and post COVID in Baseball.")

The result is not significant with a p-value of:  0.40254184434893026 So, there is no evidence to suggest that the home teams perform differently pre and post COVID in Baseball.


## Basketball

In [28]:
# Home-team wins in Basketball where During_COVID is 0 (pre) and 1 (post)
Basketball_pre_wins, Basketball_post_wins = (
    int(((sports_df['Sport'] == 'Basketball')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] == 'Home')).sum())
    for covid_flag in (0, 1)
)

# Basketball games that did not end in a tie, for the same two periods
Basketball_pre_games, Basketball_post_games = (
    int(((sports_df['Sport'] == 'Basketball')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] != 'Tie')).sum())
    for covid_flag in (0, 1)
)

# Proportion z-test comparing the pre and post home-win rates
count = np.array([Basketball_pre_wins, Basketball_post_wins])
nobs = np.array([Basketball_pre_games, Basketball_post_games])
stat, pval = proportions_ztest(count, nobs)
if not pval < 0.05:
    print("The result is not significant with a p-value of: ", pval, "So, there is no evidence to suggest that the home teams perform differently pre and post COVID in Basketball.")
else:
    print("The result is significant with a p-value of: ", pval, "So, there is evidence to suggest that the home teams perform differently pre and post COVID in Basketball.")

The result is significant with a p-value of:  0.005624721621969695 So, there is evidence to suggest that the home teams perform differently pre and post COVID in Basketball.


## Soccer

In [29]:
# Home-team wins in Soccer where During_COVID is 0 (pre) and 1 (post)
Soccer_pre_wins, Soccer_post_wins = (
    int(((sports_df['Sport'] == 'Soccer')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] == 'Home')).sum())
    for covid_flag in (0, 1)
)

# Soccer games that did not end in a tie, for the same two periods
Soccer_pre_games, Soccer_post_games = (
    int(((sports_df['Sport'] == 'Soccer')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] != 'Tie')).sum())
    for covid_flag in (0, 1)
)

# Proportion z-test comparing the pre and post home-win rates
count = np.array([Soccer_pre_wins, Soccer_post_wins])
nobs = np.array([Soccer_pre_games, Soccer_post_games])
stat, pval = proportions_ztest(count, nobs)
if not pval < 0.05:
    print("The result is not significant with a p-value of: ", pval, "So, there is no evidence to suggest that the home teams perform differently pre and post COVID in Soccer.")
else:
    print("The result is significant with a p-value of: ", pval, "So, there is evidence to suggest that the home teams perform differently pre and post COVID in Soccer.")

The result is significant with a p-value of:  0.00029807234424245705 So, there is evidence to suggest that the home teams perform differently pre and post COVID in Soccer.


## American Football

In [30]:
# Home-team wins in American Football where During_COVID is 0 (pre) and 1 (post)
American_Football_pre_wins, American_Football_post_wins = (
    int(((sports_df['Sport'] == 'American Football')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] == 'Home')).sum())
    for covid_flag in (0, 1)
)

# American Football games that did not end in a tie, for the same two periods
American_Football_pre_games, American_Football_post_games = (
    int(((sports_df['Sport'] == 'American Football')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] != 'Tie')).sum())
    for covid_flag in (0, 1)
)

# Proportion z-test comparing the pre and post home-win rates
count = np.array([American_Football_pre_wins, American_Football_post_wins])
nobs = np.array([American_Football_pre_games, American_Football_post_games])
stat, pval = proportions_ztest(count, nobs)
if not pval < 0.05:
    print("The result is not significant with a p-value of: ", pval, "So, there is no evidence to suggest that the home teams perform differently pre and post COVID in American_Football.")
else:
    print("The result is significant with a p-value of: ", pval, "So, there is evidence to suggest that the home teams perform differently pre and post COVID in American_Football.")

The result is significant with a p-value of:  0.026311392086841585 So, there is evidence to suggest that the home teams perform differently pre and post COVID in American_Football.


## Hockey

In [31]:
# Home-team wins in Hockey where During_COVID is 0 (pre) and 1 (post)
Hockey_pre_wins, Hockey_post_wins = (
    int(((sports_df['Sport'] == 'Hockey')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] == 'Home')).sum())
    for covid_flag in (0, 1)
)

# Hockey games that did not end in a tie, for the same two periods
Hockey_pre_games, Hockey_post_games = (
    int(((sports_df['Sport'] == 'Hockey')
         & (sports_df['During_COVID'] == covid_flag)
         & (sports_df['Winner'] != 'Tie')).sum())
    for covid_flag in (0, 1)
)

# Proportion z-test comparing the pre and post home-win rates
count = np.array([Hockey_pre_wins, Hockey_post_wins])
nobs = np.array([Hockey_pre_games, Hockey_post_games])
stat, pval = proportions_ztest(count, nobs)
if not pval < 0.05:
    print("The result is not significant with a p-value of: ", pval, "So, there is no evidence to suggest that the home teams perform differently pre and post COVID in Hockey.")
else:
    print("The result is significant with a p-value of: ", pval, "So, there is evidence to suggest that the home teams perform differently pre and post COVID in Hockey.")

The result is not significant with a p-value of:  0.1821231074123455 So, there is no evidence to suggest that the home teams perform differently pre and post COVID in Hockey.
