In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import re
import warnings
warnings.filterwarnings("ignore")

# Question 1

In [6]:
def nhl_correlation():
    # Let's bring in the Dataset to this exercise
    nhl_df = pd.read_csv("Datasets/nhl.csv")
    cities = pd.read_html("Datasets/wikipedia_data.html")[1]
    cities = cities.iloc[:-1,[0,3,5,6,7,8]]

    # Cleaning Wikipadia Data by using regex
    for x in ['NFL', 'MLB', 'NBA', 'NHL']:
        # There's team names that have character whithin of []
        # Also, we replace void values with NaN values and in the team column we see there are multiple teams in one rows, 
        # so lets collapse them into a single string
        cities[x] = cities[x].replace('\[.*\]', '', regex=True).replace('', np.nan).replace('[\—*]', np.nan, regex=True)

    # Let's change the name of the column 'Population (2016 est.)[8]' by 'Population', for convenience
    cities = cities.rename(columns={'Population (2016 est.)[8]': 'Population', 'NHL': 'team'})
    # and we goint to use only the columns relation to the Big 4 Sports, population and its Metropolitan area
    # let's drop NaN values as well
    cities = cities[['Metropolitan area', 'Population', 'team']].dropna()
    # In the team column we see there are multiple teams in one rows, so lets collapse them into a single array
    cities['team'].iloc[0] = 'RangersIslandersDevils'
    cities['team'].iloc[1] = 'KingsDucks'
    cities['team'] = cities['team'].str.split(' ').str[-1].astype(str) # let's take into account just the last word of the each team

    # Cleaning NHL Data
    nhl_df['team'] = nhl_df['team'].replace('[\*]', '', regex=True) # take away useless characters
    nhl_df = nhl_df[nhl_df['year']==2018] # we're interested on data of 2018
    nhl_df = nhl_df[['team', 'W', 'L', 'year']].drop([0, 9, 18, 26]) # only these columns are essential
    # now we compute the win/loss ratio
    nhl_df['win/loss'] = nhl_df['W'].astype(float)/(nhl_df['W'].astype(float) + nhl_df['L'].astype(float))
    nhl_df['team'] = nhl_df['team'].str.split(' ').str[-1].astype(str) 
    nhl_df = nhl_df.set_index('team')

    # now let's take the average on the team which belong of the seam metropolitan area
        # new york --> 'Rangers', 'Islanders', 'Devils'
        # los angeles --> 'Kings', 'Ducks'
    W = int(nhl_df[nhl_df.index == 'Rangers'].values[0][0]) + int(nhl_df[nhl_df.index == 'Islanders'].values[0][0]) \
        + int(nhl_df[nhl_df.index == 'Devils'].values[0][0])
    L = int(nhl_df[nhl_df.index == 'Rangers'].values[0][1]) + int(nhl_df[nhl_df.index == 'Islanders'].values[0][1]) \
        + int(nhl_df[nhl_df.index == 'Devils'].values[0][1])
    mean = (nhl_df[nhl_df.index == 'Rangers'].values[0][-1] + nhl_df[nhl_df.index == 'Islanders'].values[0][-1] \
            + nhl_df[nhl_df.index == 'Devils'].values[0][-1])/3
    new_york = {'team': 'RangersIslandersDevils',
                'W': W, 'L': L, 'year': 2018, 
                'win/loss': W/(W+L)}
    W = int(nhl_df[nhl_df.index == 'Kings'].values[0][0]) + int(nhl_df[nhl_df.index == 'Ducks'].values[0][0])
    L = int(nhl_df[nhl_df.index == 'Kings'].values[0][1]) + int(nhl_df[nhl_df.index == 'Ducks'].values[0][1])
    mean = (nhl_df[nhl_df.index == 'Kings'].values[0][-1] + nhl_df[nhl_df.index == 'Ducks'].values[0][-1])/2
    los_angeles = {'team': 'KingsDucks',
                   'W': W, 'L': L, 'year': 2018, 
                   'win/loss': W/(W+L)}
    # let's drop these columns
    nhl_df = nhl_df.reset_index()
    nhl_df = nhl_df.drop([12,14,15,24,26])
    # and let's add the rows related to new york and los angeles
    nhl_df = nhl_df.append([new_york, los_angeles], ignore_index=True)

    # finally, let's merge both nhl_df and cities data
    merge = pd.merge(nhl_df, cities, how='outer', on='team')
    merge['Population'] = merge['Population'].astype(float)
    merge = merge.groupby('Metropolitan area').agg({'Population': np.mean, 'win/loss': np.mean})

    population_by_region = merge['Population'] # pass in metropolitan area population from cities
    win_loss_by_region = merge['win/loss'] # pass in win/loss ratio from nhl_df in the same order as cities["Metropolitan area"]

    assert len(population_by_region) == len(win_loss_by_region), "Q1: Your lists must be the same length"
    assert len(population_by_region) == 28, "Q1: There should be 28 teams being analysed for NHL"

    corr, pval = stats.pearsonr(population_by_region, win_loss_by_region)
    return corr
nhl_correlation()

0.012308996455744264

# Question 2

In [7]:
def nba_correlation():
    # Let's bring in the Dataset to this exercise
    nba_df = pd.read_csv("Datasets/nba.csv")
    cities = pd.read_html("Datasets/wikipedia_data.html")[1]
    cities = cities.iloc[:-1,[0,3,5,6,7,8]]

    # Cleaning Wikipadia Data by using regex
    for x in ['NFL', 'MLB', 'NBA', 'NHL']:
        # There's team names that have character whithin of []
        # Also, we replace void values with NaN values and in the team column we see there are multiple teams in one rows, 
        # so lets collapse them into a single string
        cities[x] = cities[x].replace('\[.*\]', '', regex=True).replace('', np.nan).replace('[\—*]', np.nan, regex=True)

    # Let's change the name of the column 'Population (2016 est.)[8]' by 'Population', for convenience
    cities = cities.rename(columns={'Population (2016 est.)[8]': 'Population', 'NBA': 'team'})
    # and we goint to use only the columns relation to the Big 4 Sports, population and its Metropolitan area
    # let's drop NaN values as well
    cities = cities[['Metropolitan area', 'Population', 'team']].dropna()
    # In the team column we see there are multiple teams in one rows, so lets collapse them into a single array
    cities['team'].iloc[0] = 'KnicksNets'
    cities['team'].iloc[1] = 'LakersClippers'
    cities['team'] = cities['team'].str.split(' ').str[-1].astype(str) # let's take into account just the last word of the each team

    # Cleaning NBA Data
    nba_df['team'] = nba_df['team'].replace(['[\*]','\(.*\)','\s*$'], '', regex=True) # take away useless characters
    nba_df = nba_df[nba_df['year']==2018] # we're interested on data of 2018
    nba_df = nba_df[['team', 'W', 'L', 'year']] # only these columns are essential
    # now we compute the win/loss ratio
    nba_df['win/loss'] = nba_df['W'].astype(float)/(nba_df['W'].astype(float) + nba_df['L'].astype(float))
    nba_df['team'] = nba_df['team'].str.split(' ').str[-1].astype(str) 
    nba_df = nba_df.set_index('team')

    # now let's take the average on the team which belong of the seam metropolitan area
            # new york --> 'Knicks', 'Nets'
            # los angeles --> 'Lakers', 'Clippers'
    W = int(nba_df[nba_df.index == 'Knicks'].values[0][0]) + int(nba_df[nba_df.index == 'Nets'].values[0][0])
    L = int(nba_df[nba_df.index == 'Knicks'].values[0][1]) + int(nba_df[nba_df.index == 'Nets'].values[0][1])
    mean = (nba_df[nba_df.index == 'Knicks'].values[0][-1] + nba_df[nba_df.index == 'Nets'].values[0][-1])/2
    new_york = {'team': 'KnicksNets',
                'W': W, 'L': L, 'year': 2018, 
                'win/loss': W/(W+L)}
    W = int(nba_df[nba_df.index == 'Lakers'].values[0][0]) + int(nba_df[nba_df.index == 'Clippers'].values[0][0])
    L = int(nba_df[nba_df.index == 'Lakers'].values[0][1]) + int(nba_df[nba_df.index == 'Clippers'].values[0][1])
    mean = (nba_df[nba_df.index == 'Lakers'].values[0][-1] + nba_df[nba_df.index == 'Clippers'].values[0][-1])/2
    los_angeles = {'team': 'LakersClippers',
                   'W': W, 'L': L, 'year': 2018, 
                   'win/loss': W/(W+L)}
    # let's drop these columns
    nba_df = nba_df.reset_index()
    nba_df = nba_df.drop([10,11,25,24])
    # and let's add the rows related to new york and los angeles
    nba_df = nba_df.append([new_york, los_angeles], ignore_index=True)

    # finally, let's merge both nba_df and cities data
    merge = pd.merge(nba_df, cities, how='outer', on='team')
    merge['Population'] = merge['Population'].astype(float)
    merge = merge.groupby('Metropolitan area').agg({'Population': np.mean, 'win/loss': np.mean})

    population_by_region = merge['Population'] # pass in metropolitan area population from cities
    win_loss_by_region = merge['win/loss'] # pass in win/loss ratio from nhl_df in the same order as cities["Metropolitan a

    assert len(population_by_region) == len(win_loss_by_region), "Q2: Your lists must be the same length"
    assert len(population_by_region) == 28, "Q2: There should be 28 teams being analysed for NBA"

    corr, pval = stats.pearsonr(population_by_region, win_loss_by_region)
    return corr
nba_correlation()

-0.17657160252844614

# Question 3

In [8]:
def mlb_correlation():    
    # Let's bring in the Dataset to this exercise
    mlb_df = pd.read_csv("Datasets/mlb.csv")
    cities = pd.read_html("Datasets/wikipedia_data.html")[1]
    cities = cities.iloc[:-1,[0,3,5,6,7,8]]

    # Cleaning Wikipadia Data by using regex
    for x in ['NFL', 'MLB', 'NBA', 'NHL']:
        # There's team names that have character whithin of []
        # Also, we replace void values with NaN values and in the team column we see there are multiple teams in one rows, 
        # so lets collapse them into a single string
        cities[x] = cities[x].replace('\[.*\]', '', regex=True).replace('', np.nan).replace('[\—*]', np.nan, regex=True)

    # Let's change the name of the column 'Population (2016 est.)[8]' by 'Population', for convenience
    cities = cities.rename(columns={'Population (2016 est.)[8]': 'Population', 'MLB': 'team'})
    # and we goint to use only the columns relation to the Big 4 Sports, population and its Metropolitan area
    # let's drop NaN values as well
    cities = cities[['Metropolitan area', 'Population', 'team']].dropna()
    # In the team column we see there are multiple teams in one rows, so lets collapse them into a single array
    cities['team'].iloc[0] = 'YankeesMets'
    cities['team'].iloc[1] = 'DodgersAngels'
    cities['team'].iloc[2] = 'GiantsAthletics'
    cities['team'].iloc[3] = 'CubsWhiteSox'
    cities['team'] = cities['team'].str.split(' ').str[-1].astype(str) # let's take into account just the last word of the each team

    # Cleaning NBA Data
    mlb_df['team'] = mlb_df['team'].replace(['[\*]','\(.*\)','\s*$'], '', regex=True) # take away useless characters
    mlb_df = mlb_df[mlb_df['year']==2018] # we're interested on data of 2018
    mlb_df = mlb_df[['team', 'W', 'L', 'year']] # only these columns are essential
    mlb_df['team'].values[8] = 'WhiteSox'
    # # now we compute the win/loss ratio
    mlb_df['win/loss'] = mlb_df['W'].astype(float)/(mlb_df['W'].astype(float) + mlb_df['L'].astype(float))
    mlb_df['team'] = mlb_df['team'].str.split(' ').str[-1].astype(str) 
    mlb_df = mlb_df.set_index('team')

    # now let's take the average on the team which belong of the seam metropolitan area
            # new york --> 'Yankees', 'Mets'
            # los angeles --> 'Dodgers', 'Angels'
            # san francisco --> 'Giants', 'Athletics'
            # chicago --> 'Cubs' 'WhiteSox'
    W = int(mlb_df[mlb_df.index == 'Yankees'].values[0][0]) + int(mlb_df[mlb_df.index == 'Mets'].values[0][0])
    L = int(mlb_df[mlb_df.index == 'Yankees'].values[0][1]) + int(mlb_df[mlb_df.index == 'Mets'].values[0][1])
    mean = (mlb_df[mlb_df.index == 'Yankees'].values[0][-1] + mlb_df[mlb_df.index == 'Mets'].values[0][-1])/2
    new_york = {'team': 'YankeesMets',
                'W': W, 'L': L, 'year': 2018, 
                'win/loss': W/(W+L)}
    W = int(mlb_df[mlb_df.index == 'Dodgers'].values[0][0]) + int(mlb_df[mlb_df.index == 'Angels'].values[0][0])
    L = int(mlb_df[mlb_df.index == 'Dodgers'].values[0][1]) + int(mlb_df[mlb_df.index == 'Angels'].values[0][1])
    mean = (mlb_df[mlb_df.index == 'Dodgers'].values[0][-1] + mlb_df[mlb_df.index == 'Angels'].values[0][-1])/2
    los_angeles = {'team': 'DodgersAngels',
                'W': W, 'L': L, 'year': 2018, 
                'win/loss': W/(W+L)}
    W = int(mlb_df[mlb_df.index == 'Giants'].values[0][0]) + int(mlb_df[mlb_df.index == 'Athletics'].values[0][0])
    L = int(mlb_df[mlb_df.index == 'Giants'].values[0][1]) + int(mlb_df[mlb_df.index == 'Athletics'].values[0][1])
    mean = (mlb_df[mlb_df.index == 'Giants'].values[0][-1] + mlb_df[mlb_df.index == 'Athletics'].values[0][-1])/2
    san_francisco = {'team': 'GiantsAthletics',
                'W': W, 'L': L, 'year': 2018, 
                'win/loss': W/(W+L)}
    W = int(mlb_df[mlb_df.index == 'Cubs'].values[0][0]) + int(mlb_df[mlb_df.index == 'WhiteSox'].values[0][0])
    L = int(mlb_df[mlb_df.index == 'Cubs'].values[0][1]) + int(mlb_df[mlb_df.index == 'WhiteSox'].values[0][1])
    mean = (mlb_df[mlb_df.index == 'Cubs'].values[0][-1] + mlb_df[mlb_df.index == 'WhiteSox'].values[0][-1])/2
    chicago = {'team': 'CubsWhiteSox',
                'W': W, 'L': L, 'year': 2018, 
                'win/loss': W/(W+L)}
    # let's drop these columns
    mlb_df = mlb_df.reset_index()
    mlb_df = mlb_df.drop([1,8,11,13,18,21,25,28])
    # and let's add the rows related to new york, los angeles, san_francisco and chicago
    mlb_df = mlb_df.append([new_york, los_angeles, san_francisco, chicago], ignore_index=True)

    # finally, let's merge both mlb_df and cities data
    merge = pd.merge(mlb_df, cities, how='outer', on='team')
    merge['Population'] = merge['Population'].astype(float)
    merge = merge.groupby('Metropolitan area').agg({'Population': np.mean, 'win/loss': np.mean})

    population_by_region = merge['Population'] # pass in metropolitan area population from cities
    win_loss_by_region = merge['win/loss'] # pass in win/loss ratio from mlb_df in the same order as cities["Metropolitan area"]

    assert len(population_by_region) == len(win_loss_by_region), "Q3: Your lists must be the same length"
    assert len(population_by_region) == 26, "Q3: There should be 26 teams being analysed for MLB"

    corr, pval = stats.pearsonr(population_by_region, win_loss_by_region)
    return corr
mlb_correlation()

0.1505230448710485

# Question 4

In [9]:
def nfl_correlation():    
    # Let's bring in the Dataset to this exercise
    nfl_df = pd.read_csv("Datasets/nfl.csv")
    cities = pd.read_html("Datasets/wikipedia_data.html")[1]
    cities = cities.iloc[:-1,[0,3,5,6,7,8]]

    # Cleaning Wikipadia Data by using regex
    for x in ['NFL', 'MLB', 'NBA', 'NHL']:
        # There's team names that have character whithin of []
        # Also, we replace void values with NaN values and in the team column we see there are multiple teams in one rows, 
        # so lets collapse them into a single string
        cities[x] = cities[x].replace('\[.*\]', '', regex=True).replace('', np.nan).replace('[\—*]', np.nan, regex=True)

    # Let's change the name of the column 'Population (2016 est.)[8]' by 'Population', for convenience
    cities = cities.rename(columns={'Population (2016 est.)[8]': 'Population', 'NFL': 'team'})
    # and we goint to use only the columns relation to the Big 4 Sports, population and its Metropolitan area
    # let's drop NaN values as well
    cities = cities[['Metropolitan area', 'Population', 'team']].dropna()
    # In the team column we see there are multiple teams in one rows, so lets collapse them into a single array
    cities['team'].iloc[0] = 'GiantsJets'
    cities['team'].iloc[1] = 'RamsChargers'
    cities['team'].iloc[2] = '49ersRaiders'
    cities['team'] = cities['team'].str.split(' ').str[-1].astype(str) # let's take into account just the last word of the each team
    cities

    # Cleaning NFL Data
    nfl_df['team'] = nfl_df['team'].replace(['[\*]','\(.*\)','\s*$','[\+]'], '', regex=True) # take away useless characters
    nfl_df = nfl_df[nfl_df['year']==2018] # we're interested on data of 2018
    nfl_df = nfl_df[['team', 'W', 'L', 'year']].drop([0,5,10,15,20,25,30,35]) # only these columns are essential
    # now we compute the win/loss ratio
    nfl_df['win/loss'] = nfl_df['W'].astype(float)/(nfl_df['W'].astype(float) + nfl_df['L'].astype(float))
    nfl_df['team'] = nfl_df['team'].str.split(' ').str[-1].astype(str) 
    nfl_df = nfl_df.set_index('team')

    # now let's take the average on the team which belong of the seam metropolitan area
            # new york --> 'Giants', 'Jets'
            # los angeles --> 'Rams', 'Chargers'
            # san francisco --> '49ers', 'Raiders'
    W = int(nfl_df[nfl_df.index == 'Giants'].values[0][0]) + int(nfl_df[nfl_df.index == 'Jets'].values[0][0])
    L = int(nfl_df[nfl_df.index == 'Giants'].values[0][1]) + int(nfl_df[nfl_df.index == 'Jets'].values[0][1])
    mean = (nfl_df[nfl_df.index == 'Giants'].values[0][-1] + nfl_df[nfl_df.index == 'Jets'].values[0][-1])/2
    new_york = {'team': 'GiantsJets',
                'W': W, 'L': L, 'year': 2018, 
                'win/loss': W/(W+L)}
    W = int(nfl_df[nfl_df.index == 'Rams'].values[0][0]) + int(nfl_df[nfl_df.index == 'Chargers'].values[0][0])
    L = int(nfl_df[nfl_df.index == 'Rams'].values[0][1]) + int(nfl_df[nfl_df.index == 'Chargers'].values[0][1])
    mean = (nfl_df[nfl_df.index == 'Rams'].values[0][-1] + nfl_df[nfl_df.index == 'Chargers'].values[0][-1])/2
    los_angeles = {'team': 'RamsChargers',
                'W': W, 'L': L, 'year': 2018, 
                'win/loss': W/(W+L)}
    W = int(nfl_df[nfl_df.index == '49ers'].values[0][0]) + int(nfl_df[nfl_df.index == 'Raiders'].values[0][0])
    L = int(nfl_df[nfl_df.index == '49ers'].values[0][1]) + int(nfl_df[nfl_df.index == 'Raiders'].values[0][1])
    mean = (nfl_df[nfl_df.index == '49ers'].values[0][-1] + nfl_df[nfl_df.index == 'Raiders'].values[0][-1])/2
    san_francisco = {'team': '49ersRaiders',
                'W': W, 'L': L, 'year': 2018, 
                'win/loss': W/(W+L)}
    # let's drop these columns
    nfl_df = nfl_df.reset_index()
    nfl_df = nfl_df.drop([3,19,15,13,28,30])
    # and let's add the rows related to new york, los angeles and san francisco 
    nfl_df = nfl_df.append([new_york, los_angeles, san_francisco], ignore_index=True)

    # finally, let's merge both mlb_df and cities data
    merge = pd.merge(nfl_df, cities, how='outer', on='team')
    merge['Population'] = merge['Population'].astype(float)
    merge = merge.groupby('Metropolitan area').agg({'Population': np.mean, 'win/loss': np.mean})

    population_by_region = merge['Population'] # pass in metropolitan area population from cities
    win_loss_by_region = merge['win/loss'] # pass in win/loss ratio from nfl_df in the same order as cities["Metropolitan area"]

    assert len(population_by_region) == len(win_loss_by_region), "Q4: Your lists must be the same length"
    assert len(population_by_region) == 29, "Q4: There should be 29 teams being analysed for NFL"

    corr, pval = stats.pearsonr(population_by_region, win_loss_by_region)
    return corr
nfl_correlation()

0.004922112149349409

# Question 5