In [14]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import re

# Notebook-level copies of the raw inputs: the NHL results CSV and the second
# table parsed from the saved Wikipedia page, without its last row and limited
# to columns 0, 3, 5, 6, 7 and 8.
nhl_df = pd.read_csv("assets/nhl.csv")
cities = pd.read_html("assets/wikipedia_data.html")[1].iloc[:-1, [0, 3, 5, 6, 7, 8]]


def clean_nhl_df():
    """Build one row per NHL metropolitan area with its 2018 win/loss ratio.

    Reads ``assets/nhl.csv`` and the second table of
    ``assets/wikipedia_data.html``, matches teams to metropolitan areas on the
    last word of the team name, and averages ``W-L%`` over areas that have
    more than one matched team.

    Returns:
        pandas.DataFrame: columns ``Metropolitan area``,
        ``Population (2016 est.)[8]`` (float), ``NHL``, ``team``, ``W``, ``L``
        and ``W-L%`` (float). For an area with several teams the
        ``NHL``/``team``/``W``/``L`` values are those of the first matched
        team, while ``W-L%`` is the mean over all of that area's teams.
    """
    # load data; .copy() gives an independent frame so the column assignments
    # below do not write into a slice (avoids SettingWithCopyWarning)
    nhl_df = pd.read_csv("assets/nhl.csv")
    cities = pd.read_html("assets/wikipedia_data.html")[1]
    cities = cities.iloc[:-1, [0, 3, 5, 6, 7, 8]].copy()

    # cleaning the cities dataframe
    # drop bracketed annotations (everything from "[" through "]")
    cities["NHL"] = cities["NHL"].str.replace(r"\[.+\]", "", regex=True)
    # Rewrite cells as comma-separated tokens. Concatenated team names are
    # separated; two-word names are split as well so that their last word is a
    # token of its own, which is what the last-word team key below matches on.
    cities["NHL"] = cities["NHL"].replace({"RangersIslandersDevils": "Rangers,Islanders,Devils",
                                           "KingsDucks": "Kings,Ducks",
                                           "Red Wings": "Red,Wings",
                                           "Maple Leafs": "Maple,Leafs",
                                           "Blue Jackets": "Blue,Jackets",
                                           "Golden Knights": "Golden,Knights"})
    cities["NHL"] = cities["NHL"].str.split(",")
    cities = cities.explode("NHL")

    # cleaning the nhl_df dataframe: 2018 rows only, "*" removed, and the team
    # reduced to the last word of its name
    nhl_df = nhl_df[nhl_df["year"] == 2018].copy()
    nhl_df["team"] = (nhl_df["team"]
                      .str.replace("*", "", regex=False)
                      .str.split(" ").str[-1])

    # merge the dataframes (inner join: rows without a counterpart on the
    # other side are dropped)
    df = pd.merge(cities, nhl_df, left_on="NHL", right_on="team")
    df = df[["Metropolitan area",
             "Population (2016 est.)[8]", "NHL", "team", "W", "L"]].copy()
    wins = df["W"].astype("int")
    losses = df["L"].astype("int")
    df["W-L%"] = (wins / (wins + losses)).astype("float")
    df["Population (2016 est.)[8]"] = df["Population (2016 est.)[8]"].astype(
        "float")

    # Collapse to one row per metropolitan area. The mean ratio is computed
    # from the data for every area instead of being pasted in as literals for
    # New York City and Los Angeles, so it stays correct if the inputs change.
    df["W-L%"] = df.groupby("Metropolitan area")["W-L%"].transform("mean")
    df = df.drop_duplicates(subset="Metropolitan area").reset_index(drop=True)

    return df


def nhl_correlation():
    """Return Pearson's r between metro-area population and NHL win/loss ratio.

    Takes both columns from the frame returned by ``clean_nhl_df`` and checks
    that 28 rows are present before computing the statistic.

    Returns:
        The correlation coefficient (first element of ``stats.pearsonr``).
    """
    cleaned = clean_nhl_df()

    # metropolitan area population, one value per row of the cleaned frame
    population_by_region = cleaned["Population (2016 est.)[8]"]
    # win/loss ratio, taken from the same rows in the same order
    win_loss_by_region = cleaned["W-L%"]

    assert len(population_by_region) == len(win_loss_by_region), \
        "Q1: Your lists must be the same length"
    assert len(population_by_region) == 28, \
        "Q1: There should be 28 teams being analysed for NHL"

    # pearsonr yields (statistic, p-value); only the statistic is returned
    r_value, _p_value = stats.pearsonr(population_by_region, win_loss_by_region)
    return r_value


In [2]:
# Display the cleaned NHL frame (one row per metropolitan area).
clean_nhl_df()


Unnamed: 0,Metropolitan area,Population (2016 est.)[8],NHL,team,W,L,W-L%
0,New York City,20153634.0,Rangers,Rangers,34,39,0.518201
1,Los Angeles,13310447.0,Kings,Kings,45,29,0.622895
2,San Francisco Bay Area,6657982.0,Sharks,Sharks,45,27,0.625
3,Chicago,9512999.0,Blackhawks,Blackhawks,33,39,0.458333
4,Dallas–Fort Worth,7233323.0,Stars,Stars,42,32,0.567568
5,"Washington, D.C.",6131977.0,Capitals,Capitals,49,26,0.653333
6,Philadelphia,6070500.0,Flyers,Flyers,42,26,0.617647
7,Boston,4794447.0,Bruins,Bruins,50,20,0.714286
8,Minneapolis–Saint Paul,3551036.0,Wild,Wild,45,26,0.633803
9,Denver,2853077.0,Avalanche,Avalanche,43,30,0.589041


In [3]:
# Pearson r between metropolitan population and NHL win/loss ratio.
nhl_correlation()


0.012485959345532895

In [2]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import re

# Notebook-level copies of the raw inputs: the MLB results CSV and the second
# table parsed from the saved Wikipedia page, without its last row and limited
# to columns 0, 3, 5, 6, 7 and 8.
mlb_df = pd.read_csv("assets/mlb.csv")
cities = pd.read_html("assets/wikipedia_data.html")[1].iloc[:-1, [0, 3, 5, 6, 7, 8]]


def mlb_correlation():
    """Return Pearson's r between metro-area population and MLB win/loss ratio.

    Reads ``assets/mlb.csv`` and the second table of
    ``assets/wikipedia_data.html``, matches 2018 teams to metropolitan areas
    on the last word of the team name, averages ``W-L%`` over areas with more
    than one matched team, and correlates the result with population.

    Returns:
        The correlation coefficient (first element of ``stats.pearsonr``).

    Raises:
        AssertionError: if the merged data does not contain 26 regions.
    """
    mlb_df = pd.read_csv("assets/mlb.csv")
    cities = pd.read_html("assets/wikipedia_data.html")[1]
    # .copy() so the MLB column can be reassigned without writing into a slice
    cities = cities.iloc[:-1, [0, 3, 5, 6, 7, 8]].copy()

    # mlb
    mlb_df = mlb_df.query('year == 2018')
    # Both "... Sox" clubs end in the same word, so give each a one-word name
    # before the team is reduced to the last word of its name.
    mlb_df = mlb_df.replace(
        {'Boston Red Sox': 'RedSox', 'Chicago White Sox': 'WhiteSox'})
    mlb_df['team'] = mlb_df['team'].str.split(' ').str[-1]

    # cities
    # map concatenated / two-word cell values to space-separated one-word names
    teams = {'YankeesMets': 'Yankees Mets', 'DodgersAngels': 'Dodgers Angels',
             'GiantsAthletics': 'Giants Athletics', 'CubsWhite Sox': 'Cubs WhiteSox', 'Red Sox': 'RedSox'}
    # drop bracketed annotations (everything from "[" through "]")
    cities['MLB'] = cities['MLB'].str.replace(r'\[.*\]', '', regex=True)
    cities['MLB'] = cities['MLB'].replace(teams).str.split(' ')
    cities = cities.explode('MLB')

    # merge (inner join: rows without a counterpart on the other side drop out)
    df = pd.merge(mlb_df, cities, left_on='team', right_on='MLB')
    df = df[['Metropolitan area', 'Population (2016 est.)[8]', 'MLB', 'W-L%']].astype(
        {'Population (2016 est.)[8]': 'int64', 'W-L%': 'float'})

    # mean per metropolitan area, then keep a single row for each area
    df['W-L%'] = df.groupby('Metropolitan area')['W-L%'].transform('mean')
    df = df.drop_duplicates('Metropolitan area').reset_index(drop=True)

    # metropolitan area population, one value per row of the merged frame
    population_by_region = df['Population (2016 est.)[8]']
    # win/loss ratio, taken from the same rows in the same order
    win_loss_by_region = df['W-L%']

    assert len(population_by_region) == len(
        win_loss_by_region), "Q3: Your lists must be the same length"
    assert len(
        population_by_region) == 26, "Q3: There should be 26 teams being analysed for MLB"
    corr = stats.pearsonr(population_by_region, win_loss_by_region)

    return corr[0]


In [4]:
# Pearson r between metropolitan population and MLB win/loss ratio.
mlb_correlation()


0.15003737475409495

In [3]:
import numpy as np

# Basic slicing returns a view, so writing through `b` also changes `a`.
a = np.arange(0, 8)
b = a[slice(4, 6)]
b[...] = 40            # in-place fill: a[4] and a[5] are now 40
c = a[6] + a[4]        # 6 + 40
c

46

In [7]:
import re
s = 'ABCAC'

# re.match only tries to match at the beginning of the string, so this is
# truthy exactly when s starts with 'A'. bool(...) already yields True/False;
# the trailing "== True" does not change the result.
bool(re.match('A', s)) == True

True

In [49]:
def result(s='ACAABAACAAABACDBADDDFSDDDFFSSSASDAFAAACBAAAFASD'):
    """Return every non-'A' character that is immediately followed by 'AAA'.

    Args:
        s: Text to scan. Defaults to the sample string of the exercise.

    Returns:
        list[str]: The matched single characters, in order of appearance.
    """
    # complete the pattern below
    # [^A] is any one character except 'A'; the lookahead (?=A{3}) requires
    # 'AAA' directly after it without consuming it. Inside a character class
    # "{3}" is not a quantifier, so "[^A{3}]" would additionally reject the
    # literal characters '{', '3' and '}'.
    pattern = '([^A])(?=A{3})'
    # identify the group number below: group() is group 0, the whole match,
    # which equals group 1 here because the lookahead is zero-width.
    return [item.group() for item in re.finditer(pattern, s)]

In [50]:
# Run the lookahead search on the sample string.
result()

['C', 'F', 'B']

In [58]:
import pandas as pd

# NOTE(review): this raises "TypeError: unhashable type: 'list'" because the
# lists are used as dict keys, and dict keys must be hashable.
# Presumably the intent was values plus an index, e.g.
# pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c']) -- TODO confirm.
df = pd.Series({[4,7,-5,3]:['d','b','a','c'],[1,2,3]:['f','g','h']})
df

TypeError: unhashable type: 'list'

In [41]:
# NOTE(review): the right-hand side was missing, which is a SyntaxError. The
# all-NaN result displayed for df2 over labels a-h is what an unfilled,
# label-aligned addition of two Series with disjoint indexes produces, so
# df.add(df1) is restored here -- TODO confirm against the original exercise.
df2 = df.add(df1)

In [44]:
# NOTE(review): the comparison below is evaluated but its result is discarded;
# a notebook cell only echoes its last expression, which here is df2.
df2['d'] == df.add(df1, fill_value = 0)['d']
df2

a   NaN
b   NaN
c   NaN
d   NaN
f   NaN
g   NaN
h   NaN
dtype: float64

In [52]:
# Five consecutive integers, 0 through 4, labelled 'a' through 'e'.
S = pd.Series(
    data=np.arange(5),
    index=list('abcde'),
)
S

a    0
b    1
c    2
d    3
e    4
dtype: int32

In [53]:
# Label-based selection with a list of index labels.
S[['b', 'c', 'd']]

b    1
c    2
d    3
dtype: int32

In [54]:
# Chained boolean filters: keep values <= 3, then values > 0. The second mask
# is computed from the full S (not from the filtered result) and is matched to
# the filtered rows by index label.
# NOTE(review): a single combined mask, S[(S <= 3) & (S > 0)], selects the
# same rows in one step.
S[S <= 3][S > 0]

b    1
c    2
d    3
dtype: int32

In [55]:
# Integer slice: positions 1, 2 and 3 (the end position 4 is excluded).
S[1:4]

b    1
c    2
d    3
dtype: int32

In [None]:
# NOTE(review): unexecuted copy of the apply() example; the same two lines
# appear again further down, after df has been defined as a DataFrame.
# f adds the maximum and the minimum of whatever it is given.
f = lambda x: x.max() + x.min()
df_new = df.apply(f)

In [None]:
# NOTE(review): scratch list of standalone expressions and values, apparently
# the answers collected from the experiments in this notebook -- TODO confirm.
# Several names used here (s1, s2, s3, new_df) are not defined in any visible
# cell, so this cell would fail if executed as-is.
46

bool(re.match('A', s)) == True

'([^A])(?=A{3})'

df.index[0]

s3['Mango'] >=  s1.add(s2, fill_value = 0)['Mango']

S['b':'e']

new_df.unstack().unstack()


30.0

In [59]:
# Small 4x3 integer frame with columns a, b and c.
df = pd.DataFrame(
    data=dict(
        a=[5, 5, 71, 67],
        b=[6, 82, 31, 37],
        c=[20, 28, 92, 49],
    )
)
df

Unnamed: 0,a,b,c
0,5,6,20
1,5,82,28
2,71,31,92
3,67,37,49


In [60]:
def f(x):
    """Return the sum of the largest and the smallest value in ``x``."""
    return x.max() + x.min()


# DataFrame.apply calls f once per column by default (axis=0), so df_new holds
# one max + min value for each column of df.
df_new = df.apply(f)

In [63]:
# Per-column max + min of df: a = 71 + 5, b = 82 + 6, c = 92 + 20.
df_new

a     76
b     88
c    112
dtype: int64