# Merge & Convert Data

In [1]:
import os
import pandas as pd
import numpy as np

## Game scores (regular season and playoffs):
source: https://www.hockey-reference.com/leagues/NHL_2019_games.html

In [2]:
dfs = []
for f in os.listdir():
    if f.endswith('.csv') and f.startswith('scores'):
        name = f[:-4].split('_')
        season = '20' + name[1] + '-20' + name[2]
        category = name[3] + ' ' + name[4] if len(name) > 4 else name[3]
        tmp_df = pd.read_csv(f)
        tmp_df['Season'] = season
        tmp_df['Type'] = category
        dfs.append(tmp_df)
scores = pd.concat(dfs)

In [3]:
print(scores['Season'].unique())

['2018-2019' '2020-2021' '2019-2020']


In [4]:
scores.head()

Unnamed: 0,Date,Visitor,G,Home,G.1,Unnamed: 5,Att.,LOG,Notes,Season,Type
0,2019-04-10,Pittsburgh Penguins,3.0,New York Islanders,4.0,OT,13917.0,3:06,at Nassau Veterans Memorial Coliseum (Uniondal...,2018-2019,playoffs
1,2019-04-10,Dallas Stars,3.0,Nashville Predators,2.0,,17458.0,2:30,,2018-2019,playoffs
2,2019-04-10,St. Louis Blues,2.0,Winnipeg Jets,1.0,,15321.0,2:31,,2018-2019,playoffs
3,2019-04-10,Vegas Golden Knights,2.0,San Jose Sharks,5.0,,17562.0,2:39,,2018-2019,playoffs
4,2019-04-10,Columbus Blue Jackets,4.0,Tampa Bay Lightning,3.0,,19092.0,2:28,,2018-2019,playoffs


Let's change some of the column names.<br>
  * Goals should be clear
  * Unnamed shows Overtime
  * Att. stands for Attendence
  * LOG stands for Length of Game

In [5]:
scores = scores.rename(columns={'G':'Visitor Goals', 'G.1':'Home Goals', 'Unnamed: 5':'Overtime', 'Att.':'Attendance', 'LOG':'Length'}, inplace=False)

Let's remove the data for the games that haven't been played yet.

In [6]:
scores.loc[scores['Visitor Goals'].isnull()]

Unnamed: 0,Date,Visitor,Visitor Goals,Home,Home Goals,Overtime,Attendance,Length,Notes,Season,Type
692,2021-04-20,Boston Bruins,,Buffalo Sabres,,,,,,2020-2021,regular season
693,2021-04-20,Detroit Red Wings,,Dallas Stars,,,,,,2020-2021,regular season
694,2021-04-20,Columbus Blue Jackets,,Florida Panthers,,,,,,2020-2021,regular season
695,2021-04-20,Anaheim Ducks,,Los Angeles Kings,,,,,,2020-2021,regular season
696,2021-04-20,New York Rangers,,New York Islanders,,,,,at Nassau Veterans Memorial Coliseum (Uniondal...,2020-2021,regular season
...,...,...,...,...,...,...,...,...,...,...,...
860,2021-05-14,Toronto Maple Leafs,,Winnipeg Jets,,,,,,2020-2021,regular season
861,2021-05-15,Vancouver Canucks,,Edmonton Oilers,,,,,,2020-2021,regular season
862,2021-05-16,Calgary Flames,,Vancouver Canucks,,,,,,2020-2021,regular season
863,2021-05-18,Calgary Flames,,Vancouver Canucks,,,,,,2020-2021,regular season


In [7]:
scores = scores.dropna(subset=['Visitor Goals'])
scores['Attendance'] = scores['Attendance'].fillna(0)
scores['Length'] = scores['Length'].fillna('0:00')

Any convert the data types, just to clean things up a bit.

In [8]:
scores.dtypes

Date              object
Visitor           object
Visitor Goals    float64
Home              object
Home Goals       float64
Overtime          object
Attendance       float64
Length            object
Notes             object
Season            object
Type              object
dtype: object

In [9]:
scores.memory_usage(index=False)

Date             26096
Visitor          26096
Visitor Goals    26096
Home             26096
Home Goals       26096
Overtime         26096
Attendance       26096
Length           26096
Notes            26096
Season           26096
Type             26096
dtype: int64

In [10]:
scores['Date'] = scores['Date'].astype('datetime64')
scores['Visitor'] = scores['Visitor'].astype('category')
scores['Visitor Goals'] = scores['Visitor Goals'].astype('int64')
scores['Home'] = scores['Home'].astype('category')
scores['Home Goals'] = scores['Home Goals'].astype('int64')
scores['Overtime'] = scores['Overtime'].astype('category')
scores['Attendance'] = scores['Attendance'].astype('int64')
scores['Length'] = pd.to_timedelta((scores['Length'].str.split(':', expand=True).astype(int) * (60, 1)).sum(axis=1), unit='min')
scores['Season'] = scores['Season'].astype('category')
scores['Type'] = scores['Type'].astype('category')

In [11]:
scores.head()

Unnamed: 0,Date,Visitor,Visitor Goals,Home,Home Goals,Overtime,Attendance,Length,Notes,Season,Type
0,2019-04-10,Pittsburgh Penguins,3,New York Islanders,4,OT,13917,0 days 03:06:00,at Nassau Veterans Memorial Coliseum (Uniondal...,2018-2019,playoffs
1,2019-04-10,Dallas Stars,3,Nashville Predators,2,,17458,0 days 02:30:00,,2018-2019,playoffs
2,2019-04-10,St. Louis Blues,2,Winnipeg Jets,1,,15321,0 days 02:31:00,,2018-2019,playoffs
3,2019-04-10,Vegas Golden Knights,2,San Jose Sharks,5,,17562,0 days 02:39:00,,2018-2019,playoffs
4,2019-04-10,Columbus Blue Jackets,4,Tampa Bay Lightning,3,,19092,0 days 02:28:00,,2018-2019,playoffs


Let's save the data for now.

In [12]:
scores.to_pickle('scores.pkl')

## Convert to Standings
Let's get a feel for the data by creating Standings

In [13]:
for name, games in scores.loc[scores['Type'] == 'regular season'].groupby('Season'):
    # create empty list
    df = pd.DataFrame({
        'Team': pd.Series([], dtype='category'),
        'Games': pd.Series([], dtype='int'),
        'Wins': pd.Series([], dtype='int'),
        'Losses': pd.Series([], dtype='int'),
        'OT Losses': pd.Series([], dtype='int'),
        'Points': pd.Series([], dtype='int'),
        'P%': pd.Series([], dtype='float'),
        'OT Wins': pd.Series([], dtype='int'),
        'Reg Win': pd.Series([], dtype='int')
    })
    # add teams
    df['Team'] = games['Visitor'].unique()
    # replace nans with zeros
    df = df.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' else x)
    # count wins, losses and OT
    for _, row in games.iterrows():
        home_idx = df.index[df['Team'] == row['Home']].tolist()[0]
        visit_idx = df.index[df['Team'] == row['Visitor']].tolist()[0]
        if row['Home Goals'] > row['Visitor Goals']:    # Home team wins
            df.at[home_idx, 'Wins'] += 1
            if pd.notnull(row['Overtime']):   # SO, 2OT, etc possible
                df.at[home_idx, 'OT Wins'] += 1
                df.at[visit_idx, 'OT Losses'] += 1
            else:
                df.at[visit_idx, 'Losses'] += 1
        elif row['Home Goals'] < row['Visitor Goals']:  # Visitor team wins
            df.at[visit_idx, 'Wins'] += 1
            if pd.notnull(row['Overtime']):   # SO, 2OT, etc possible
                df.at[visit_idx, 'OT Wins'] += 1
                df.at[home_idx, 'OT Losses'] += 1
            else:
                df.at[home_idx, 'Losses'] += 1
    # add metrics
    df['Games'] = df['Wins'] + df['Losses'] + df['OT Losses']
    df['Points'] = 2 * df['Wins'] + 1 * df['OT Losses']
    df['Reg Win'] = df['Wins'] - df['OT Wins']
    df['P%'] = df['Points'] / (2 * df['Games'])
    # print table
    print('Season ' + name)
    display(df.sort_values('P%', ascending=False)) # since number of games are not always the same
    print()

Season 2018-2019


Unnamed: 0,Team,Games,Wins,Losses,OT Losses,Points,P%,OT Wins,Reg Win
29,Tampa Bay Lightning,82.0,62.0,16.0,4.0,128.0,0.780488,13.0,49.0
3,Boston Bruins,82.0,49.0,24.0,9.0,107.0,0.652439,11.0,38.0
2,Calgary Flames,82.0,50.0,25.0,7.0,107.0,0.652439,5.0,45.0
10,Washington Capitals,82.0,48.0,26.0,8.0,104.0,0.634146,9.0,39.0
4,New York Islanders,82.0,48.0,27.0,7.0,103.0,0.628049,11.0,37.0
14,San Jose Sharks,82.0,46.0,27.0,9.0,101.0,0.615854,8.0,38.0
8,Nashville Predators,82.0,47.0,29.0,6.0,100.0,0.609756,9.0,38.0
21,Toronto Maple Leafs,82.0,46.0,28.0,8.0,100.0,0.609756,6.0,40.0
27,Pittsburgh Penguins,82.0,44.0,26.0,12.0,100.0,0.609756,7.0,37.0
11,Winnipeg Jets,82.0,47.0,30.0,5.0,99.0,0.603659,9.0,38.0



Season 2019-2020


Unnamed: 0,Team,Games,Wins,Losses,OT Losses,Points,P%,OT Wins,Reg Win
7,Boston Bruins,70.0,44.0,14.0,12.0,100.0,0.714286,6.0,38.0
23,St. Louis Blues,71.0,42.0,19.0,10.0,94.0,0.661972,9.0,33.0
30,Colorado Avalanche,70.0,42.0,20.0,8.0,92.0,0.657143,5.0,37.0
17,Tampa Bay Lightning,70.0,43.0,21.0,6.0,92.0,0.657143,8.0,35.0
1,Washington Capitals,69.0,41.0,20.0,8.0,90.0,0.652174,10.0,31.0
29,Philadelphia Flyers,69.0,41.0,21.0,7.0,89.0,0.644928,10.0,31.0
28,Pittsburgh Penguins,69.0,40.0,23.0,6.0,86.0,0.623188,11.0,29.0
14,Vegas Golden Knights,71.0,39.0,24.0,8.0,86.0,0.605634,9.0,30.0
22,Carolina Hurricanes,68.0,38.0,25.0,5.0,81.0,0.595588,11.0,27.0
21,Dallas Stars,69.0,37.0,24.0,8.0,82.0,0.594203,11.0,26.0



Season 2020-2021


Unnamed: 0,Team,Games,Wins,Losses,OT Losses,Points,P%,OT Wins,Reg Win
18,Colorado Avalanche,43.0,30.0,9.0,4.0,64.0,0.744186,3.0,27.0
23,Vegas Golden Knights,45.0,32.0,11.0,2.0,66.0,0.733333,7.0,25.0
7,Carolina Hurricanes,44.0,29.0,10.0,5.0,63.0,0.715909,8.0,21.0
22,Tampa Bay Lightning,45.0,30.0,13.0,2.0,62.0,0.688889,6.0,24.0
29,Florida Panthers,46.0,29.0,12.0,5.0,63.0,0.684783,8.0,21.0
14,Toronto Maple Leafs,45.0,28.0,12.0,5.0,61.0,0.677778,6.0,22.0
6,Washington Capitals,46.0,29.0,13.0,4.0,62.0,0.673913,5.0,24.0
8,Minnesota Wild,44.0,28.0,13.0,3.0,59.0,0.670455,5.0,23.0
11,New York Islanders,45.0,28.0,13.0,4.0,60.0,0.666667,8.0,20.0
2,Pittsburgh Penguins,45.0,28.0,14.0,3.0,59.0,0.655556,7.0,21.0





## Matchups
FIXME: Unnecessary<br>
Analyse the performance of each team against the others

In [14]:
matches = pd.DataFrame({
    'Team': pd.Series([], dtype='category'),
    'Opponent': pd.Series([], dtype='category'),
    'Goals': pd.Series([], dtype='int'),    # Goals scored
    'GA': pd.Series([], dtype='int'),       # Goals against
    'Home': pd.Series([], dtype='bool'),
    'Overtime': pd.Series([], dtype='bool'),     # FIXME: SO, 2OT, etc.
    'Date': pd.Series([], dtype='datetime64[ns]'),
    'Season': pd.Series([], dtype='category'),
    'Type': pd.Series([], dtype='category')
})

In [15]:
home = scores.drop(columns=['Attendance', 'Length', 'Notes'])
away = home.copy()

In [16]:
home = home.rename(columns={'Home':'Team', 'Visitor':'Opponent', 'Home Goals':'Goals', 'Visitor Goals':'GA'}, inplace=False)
home['Home'] = True
home['Overtime'] = home['Overtime'].notnull()

In [17]:
away = away.rename(columns={'Visitor':'Team', 'Home':'Opponent', 'Home Goals':'GA', 'Visitor Goals':'Goals'}, inplace=False)
away['Home'] = False
away['Overtime'] = away['Overtime'].notnull()

In [18]:
matches = pd.concat([matches, home, away], ignore_index=True)
matches.head()

Unnamed: 0,Team,Opponent,Goals,GA,Home,Overtime,Date,Season,Type
0,New York Islanders,Pittsburgh Penguins,4,3,True,True,2019-04-10,2018-2019,playoffs
1,Nashville Predators,Dallas Stars,2,3,True,False,2019-04-10,2018-2019,playoffs
2,Winnipeg Jets,St. Louis Blues,1,2,True,False,2019-04-10,2018-2019,playoffs
3,San Jose Sharks,Vegas Golden Knights,5,2,True,False,2019-04-10,2018-2019,playoffs
4,Tampa Bay Lightning,Columbus Blue Jackets,3,4,True,False,2019-04-10,2018-2019,playoffs


## Team analysis

In [19]:
teams = sorted(matches['Team'].unique())

def record_per_team(data, team):
    record = pd.DataFrame({
        'Opponent': pd.Series([], dtype='category'),
        'Self': pd.Series([], dtype='bool'),
        'Games': pd.Series([], dtype='int'),
        'Wins': pd.Series([], dtype='int'),
        'Losses': pd.Series([], dtype='int'),
        'OT Wins': pd.Series([], dtype='int'),
        'OT Losses': pd.Series([], dtype='int'),
        'Reg Win': pd.Series([], dtype='int'),
        'Goals': pd.Series([], dtype='int'),     # Goals scored
        'GA': pd.Series([], dtype='int'),        # Goals against
        'Season': pd.Series([], dtype='category')
    })
    # Add Teams
    record['Opponent'] = teams  #data['Team'].unique()
    record['Self'] = record['Opponent'] == team
    record['Season'] = (data['Season'].unique()).to_list()[0]
    # Fill zeros
    record = record.apply(lambda x: x.fillna(0) if x.dtype.kind in 'biufc' else x)
    # Go through all of the games (FIXME: could be sped up)
    for _, row in data.loc[data['Team'] == team].iterrows():
        # get oppenent's position in the df
        idx = record.index[record['Opponent'] == row['Opponent']].tolist()[0]
        # count number of games
        record.at[idx, 'Games'] += 1
        # count wins, losses and OT
        if row['Goals'] > row['GA']:
            record.at[idx, 'Wins'] += 1
            if row['Overtime']:
                record.at[idx, 'OT Wins'] += 1
        elif row['Overtime']:
            record.at[idx, 'OT Losses'] += 1
        else:
            record.at[idx, 'Losses'] += 1
        # count goals and goals against
        record.at[idx, 'Goals'] += row['Goals']
        record.at[idx, 'GA'] += row['GA']
    # compute goal difference
    record['Diff'] = record['Goals'] - record['GA']
    # compute regulation wins
    record['Reg Win'] = record['Wins'] - record['OT Wins']
    # display(record.sort_values('Opponent', ascending=False))
    return (record.sort_values('Opponent', ascending=False)).to_numpy()

In [20]:
stats = []
teams = sorted(matches['Team'].unique())
seasons = sorted(matches['Season'].unique())
for season in seasons:
    print(season)
    for team in teams:
        # compute data for regular season matches
        season_matches = matches.loc[(matches['Type'] == 'regular season') & (matches['Season'] == season)].copy()
        stats.append(record_per_team(season_matches, team))
        # compute for playoffs
        playoff_matches = matches.loc[(matches['Type'] == 'playoffs') & (matches['Season'] == season)].copy()
        if len(playoff_matches) > 0:
            stats.append(record_per_team(playoff_matches, team))

2018-2019
2019-2020
2020-2021


In [21]:
stats = np.stack(stats)     # 155 x 31 x 12 matrix of all of the data
stats.shape

(155, 31, 12)

## Betting Odds
source: https://www.sportsoddshistory.com/nhl-odds/

In [22]:
# TODO: forecast points for the end of the season 2020-21