In [251]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [287]:
scores_dict = pd.read_csv('C:/Users/Gabriel/Documents/MIE368/Data/season_data.csv')
playoff_scores_dict = pd.read_csv('C:/Users/Gabriel/Documents/MIE368/Data/playoff_data.csv')

In [288]:
scores_dict.head()

Unnamed: 0,Date,Visitor,G_Vis,Home,G_Home,OT Ind,Season
0,2000-10-04,Colorado Avalanche,2.0,Dallas Stars,2.0,1,2001
1,2000-10-05,Ottawa Senators,4.0,Boston Bruins,4.0,1,2001
2,2000-10-05,Chicago Blackhawks,2.0,Buffalo Sabres,4.0,0,2001
3,2000-10-05,Detroit Red Wings,4.0,Calgary Flames,3.0,0,2001
4,2000-10-05,Vancouver Canucks,3.0,Philadelphia Flyers,6.0,0,2001


## Game Result Elo Adjustment

In [275]:
first_yr = scores_dict[scores_dict['Season'] == 2001]
Teams = first_yr['Home'].unique().tolist()

team_changes = {
    'Atlanta Thrashers': 'Winnipeg Jets',
    'Phoenix Coyotes': 'Arizona Coyotes',
    'Mighty Ducks of Anaheim': 'Anaheim Ducks'
    # Add more mappings as needed
}

elo_ratings = {}

K = 6

In [267]:
def calculate_prob_winning(home_team, away_team):
    
    Elo_diff_home = elo_ratings[home_team]-elo_ratings[away_team] + 50  #add extra for home-ice advantage   
    #Elo_diff_away = elo_ratings[away_team] - elo_ratings[home_team]
    prob_win_home = 1/(10**(-1*Elo_diff_home/400)+1)
    #prob_win_away = 1/(10**(-1*Elo_diff_away/400)+1)
    prob_win_away = 1 - prob_win_home
    
    return prob_win_home, prob_win_away, Elo_diff_home

In [268]:
def margin_of_victory(home_goals, away_goals):
    
    mov = abs(home_goals - away_goals)
    mov_mult = 0.6686*np.log(mov)+0.8048
    
    return mov_mult

In [269]:
# Function to update ELO ratings after a game -- Could add input for elo_ratings dictionary 
def update_elo(home_team, away_team, home_goals, away_goals, season, df, idx):
    
    # Map old team names to new ones
    home_team = team_changes.get(home_team, home_team)
    away_team = team_changes.get(away_team, away_team)
    
    #print('Home team is ' + str(home_team))
    #print('Away team is ' + str(away_team))

    if home_team not in elo_ratings:
        # New teams added past a certain year start with a different ELO rating
        elo_ratings[home_team] = 1490 if season >= 2005 else 1380
    if away_team not in elo_ratings:
        elo_ratings[away_team] = 1490 if season >= 2005 else 1380

     # Calculate the probability of winning the game for each team
    home_prob, away_prob, Elo_diff_home = calculate_prob_winning(home_team, away_team)
    
    # Update DataFrame values
    df.at[idx, 'EloDiffHome'] = Elo_diff_home
    df.at[idx, 'EloDiffAway'] = -1 * Elo_diff_home
    
     # Update Pregame Favorite Multiplier
    if home_goals > away_goals:
        home_win = 1
        #away_win = 0
        winner_elo_diff = Elo_diff_home
    elif home_goals < away_goals:
        home_win = 0
        #away_win = 1
        winner_elo_diff = -1*Elo_diff_home
    else:
        return
    
    pre_g_fav_h = home_win - home_prob  
    #pre_g_fav_a = away_win - away_prob  
    
    auto_corr = 2.05/(winner_elo_diff*0.001 + 2.05)
    
     # Adjust ELO shift for margin of victory
    mov_multiplier = auto_corr * margin_of_victory(home_goals, away_goals)
    #print('home goals:' + str(home_goals))
    #print('away goals:' + str(away_goals))
    #print('mult is: ' + str(mov_multiplier))
    
     # Calculate ELO shift based on game result
    elo_shift_h = K * mov_multiplier * pre_g_fav_h 
    #elo_shift_a = K * mov_multiplier * pre_g_fav_a
    #print('elo shift is: ' + str(elo_shift_h))
    
    elo_ratings[home_team] += elo_shift_h
    elo_ratings[away_team] -= elo_shift_h

    #if (home_team == 'Nashville Predators') or (away_team == 'Nashville Predators'):
     #   print(elo_shift_h)
      #  print(elo_ratings['Nashville Predators'])
    #print("Home team elo is: " + str(elo_ratings[home_team]))
    #print("Away team elo is: " + str(elo_ratings[away_team]))

### Testing Game Result Elo Adjustments

In [261]:
'''
elo_ratings = {}
# Iterate over seasons and games
for idx, row in scores_dict.iterrows():
    update_elo(row['Home'], row['Visitor'], row['G_Home'], row['G_Vis'], row['Season'], scores_dict, idx)

truncated_dict = {k: round(v, 2) for k, v in elo_ratings.items()}
print(truncated_dict)
'''

{'Dallas Stars': 1439.13, 'Colorado Avalanche': 1482.44, 'Boston Bruins': 1544.44, 'Ottawa Senators': 1353.63, 'Buffalo Sabres': 1360.53, 'Chicago Blackhawks': 1268.95, 'Calgary Flames': 1409.08, 'Detroit Red Wings': 1313.12, 'Philadelphia Flyers': 1294.58, 'Vancouver Canucks': 1367.49, 'Arizona Coyotes': 1276.48, 'St. Louis Blues': 1379.45, 'Edmonton Oilers': 1474.04, 'Florida Panthers': 1437.49, 'Anaheim Ducks': 1240.46, 'Minnesota Wild': 1442.06, 'New Jersey Devils': 1405.28, 'Montreal Canadiens': 1279.67, 'Pittsburgh Penguins': 1400.75, 'Nashville Predators': 1396.28, 'San Jose Sharks': 1259.65, 'Tampa Bay Lightning': 1440.6, 'New York Islanders': 1401.21, 'Washington Capitals': 1369.91, 'Los Angeles Kings': 1418.89, 'Winnipeg Jets': 1400.91, 'New York Rangers': 1459.63, 'Carolina Hurricanes': 1475.18, 'Columbus Blue Jackets': 1268.28, 'Toronto Maple Leafs': 1466.79, 'Vegas Golden Knights': 1451.57, 'Seattle Kraken': 1402.05}


In [272]:
'''
elo_ratings = {}
# For testing on one season
for idx, row in scores_dict.iterrows():
    if row['Season'] == 2001:
        update_elo(row['Home'], row['Visitor'], row['G_Home'], row['G_Vis'], row['Season'], scores_dict, idx)

print(elo_ratings)
'''

"\nelo_ratings = {}\n# For testing on one season\nfor idx, row in scores_dict.iterrows():\n    if row['Season'] == 2001:\n        update_elo(row['Home'], row['Visitor'], row['G_Home'], row['G_Vis'], row['Season'], scores_dict, idx)\n\nprint(elo_ratings)\n"

## End of Season Elo Adjustments

In [279]:
# Initialize variables to store the final Elo ratings for the season
final_elo_ratings = {}
elo_ratings = {}
season_weight = 0.7
average_weight = 0.3
current_season = None

# Iterate over seasons and games
for idx, row in scores_dict.iterrows():
    if current_season is None:
        current_season = row['Season']
    
    # Check if the season in the current row is different from the current season
    if row['Season'] != current_season:
        # Save the previous season's Elo ratings and calculate the starting Elo for the new season
        final_elo_ratings[current_season] = dict(elo_ratings)
        elo_ratings = {}  # Reset Elo ratings for the new season

        for team in final_elo_ratings[current_season]:
            prev_season_elo = final_elo_ratings[current_season][team]
            league_avg_elo = sum(final_elo_ratings[current_season].values()) / len(final_elo_ratings[current_season])
            starting_elo = (season_weight * prev_season_elo) + (average_weight * league_avg_elo)
            elo_ratings[team] = round(starting_elo, 2)

        current_season = row['Season']


    # Update Elo ratings for the game
    update_elo(row['Home'], row['Visitor'], row['G_Home'], row['G_Vis'], row['Season'], scores_dict, idx)

# Calculate the final Elo ratings for the last season
final_elo_ratings[current_season] = dict(elo_ratings)

In [283]:
elo_ratings

{'Dallas Stars': 1438.449568876547,
 'Colorado Avalanche': 1467.0041326779394,
 'Boston Bruins': 1530.397215501669,
 'Ottawa Senators': 1365.1050381874425,
 'Buffalo Sabres': 1373.1864106169082,
 'Chicago Blackhawks': 1280.4201757826659,
 'Calgary Flames': 1402.1947282646518,
 'Detroit Red Wings': 1329.8577019700033,
 'Philadelphia Flyers': 1307.0972745553063,
 'Vancouver Canucks': 1370.2816315577088,
 'Arizona Coyotes': 1289.5863386862557,
 'St. Louis Blues': 1371.011391435606,
 'Edmonton Oilers': 1468.3836695037394,
 'Florida Panthers': 1425.486802121703,
 'Anaheim Ducks': 1255.6070111408524,
 'Minnesota Wild': 1431.4151312241875,
 'New Jersey Devils': 1419.86898941051,
 'Montreal Canadiens': 1294.8165146511856,
 'Pittsburgh Penguins': 1392.0652070159683,
 'Nashville Predators': 1392.4566649394321,
 'San Jose Sharks': 1272.6965553339462,
 'Tampa Bay Lightning': 1427.353379517724,
 'New York Islanders': 1399.2857789281184,
 'Washington Capitals': 1362.2730179203713,
 'Los Angeles King

### Testing End of Season Adjustments

In [303]:
first_2_yrs = scores_dict[scores_dict['Season'] < 2003]

In [304]:
elo_ratings = {}
final_elo_ratings = {}
season_weight = 0.7
average_weight = 0.3
current_season = None

In [305]:
# Iterate over seasons and games
for idx, row in first_2_yrs.iterrows():
    if current_season is None:
        current_season = row['Season']
        print('initialize season ' + str(current_season))
    
    # Check if the season in the current row is different from the current season
    if row['Season'] != current_season:
        print('new season: ' + str(row['Season']))
        # Save the previous season's Elo ratings and calculate the starting Elo for the new season
        final_elo_ratings[current_season] = dict(elo_ratings)
        print(f'season end game elo\n{final_elo_ratings[current_season]}')
        elo_ratings = {}  # Reset Elo ratings for the new season

        for team in final_elo_ratings[current_season]:
            prev_season_elo = final_elo_ratings[current_season][team]
            print(prev_season_elo)
            league_avg_elo = sum(final_elo_ratings[current_season].values()) / len(final_elo_ratings[current_season])
            starting_elo = (season_weight * prev_season_elo) + (average_weight * league_avg_elo)
            elo_ratings[team] = round(starting_elo, 2)
            print(f'next season start elo\n{elo_ratings}')
            
        current_season = row['Season']
        print('update current season: ' + str(current_season))
    
    # Update Elo ratings for the game
    update_elo(row['Home'], row['Visitor'], row['G_Home'], row['G_Vis'], row['Season'], first_2_yrs, idx)
              
# Calculate the final Elo ratings for the last season
final_elo_ratings[current_season] = dict(elo_ratings)

initialize season 2001
new season: 2002
season end game elo
{'Dallas Stars': 1442.1871487872013, 'Colorado Avalanche': 1463.6007576504824, 'Boston Bruins': 1370.8443121575954, 'Ottawa Senators': 1440.9596278618217, 'Buffalo Sabres': 1418.5185026724023, 'Chicago Blackhawks': 1331.7657522078846, 'Calgary Flames': 1349.5232215229676, 'Detroit Red Wings': 1447.4927286851896, 'Philadelphia Flyers': 1421.2560312360063, 'Vancouver Canucks': 1376.122568833142, 'Arizona Coyotes': 1382.564178166439, 'St. Louis Blues': 1413.213590659353, 'Edmonton Oilers': 1411.1910748869525, 'Florida Panthers': 1327.4000769284844, 'Anaheim Ducks': 1324.3538979678942, 'Minnesota Wild': 1334.8669848138652, 'New Jersey Devils': 1473.0956689217276, 'Montreal Canadiens': 1351.2170413504818, 'Pittsburgh Penguins': 1408.5171668318396, 'Nashville Predators': 1373.2966700415834, 'San Jose Sharks': 1395.3632673671054, 'Tampa Bay Lightning': 1300.8204113464956, 'New York Islanders': 1285.6877671604716, 'Washington Capitals

In [306]:
first_2_yrs

Unnamed: 0,Date,Visitor,G_Vis,Home,G_Home,OT Ind,Season,EloDiffHome,EloDiffAway
0,2000-10-04,Colorado Avalanche,2.0,Dallas Stars,2.0,1,2001,50.000000,-50.000000
1,2000-10-05,Ottawa Senators,4.0,Boston Bruins,4.0,1,2001,50.000000,-50.000000
2,2000-10-05,Chicago Blackhawks,2.0,Buffalo Sabres,4.0,0,2001,50.000000,-50.000000
3,2000-10-05,Detroit Red Wings,4.0,Calgary Flames,3.0,0,2001,50.000000,-50.000000
4,2000-10-05,Vancouver Canucks,3.0,Philadelphia Flyers,6.0,0,2001,50.000000,-50.000000
...,...,...,...,...,...,...,...,...,...
2455,2002-04-14,Tampa Bay Lightning,3.0,Florida Panthers,2.0,1,2002,38.106987,-38.106987
2456,2002-04-14,Mighty Ducks of Anaheim,0.0,Los Angeles Kings,1.0,0,2002,134.775792,-134.775792
2457,2002-04-14,Edmonton Oilers,4.0,Minnesota Wild,2.0,0,2002,-31.236527,31.236527
2458,2002-04-14,Philadelphia Flyers,1.0,New York Islanders,3.0,0,2002,-8.688937,8.688937


## Linear Regression for OT Games

In [187]:
#def df_elodiff_add(row):
#    home_prob, away_prob, Elo_diff_home = calculate_prob_winning(row['Home'], row['Visitor'], row['G_Home'], row['G_Vis'])
#    row['EloDiffHome'] = Elo_diff_home
#    row['EloDiffAway'] = -1*Elo_diff_home

In [213]:
df_list = []

for season, df in scores_dict.items():
    df_list.append(df)

combined_df = pd.concat(df_list, ignore_index = True)
   
#for idx, row in df.iterrows():

In [216]:
combined_df.head()

Unnamed: 0,Date,Visitor,G_Vis,Home,G_Home,OT Ind,EloDiffHome,EloDiffAway
0,2000-10-04,Colorado Avalanche,2.0,Dallas Stars,2.0,1,50.0,-50.0
1,2000-10-05,Ottawa Senators,4.0,Boston Bruins,4.0,1,50.0,-50.0
2,2000-10-05,Chicago Blackhawks,2.0,Buffalo Sabres,4.0,0,50.0,-50.0
3,2000-10-05,Detroit Red Wings,4.0,Calgary Flames,3.0,0,50.0,-50.0
4,2000-10-05,Vancouver Canucks,3.0,Philadelphia Flyers,6.0,0,50.0,-50.0


In [226]:
combined_df.iloc[15017]

Date                    2014-01-24
Visitor            Ottawa Senators
G_Vis                          NaN
Home           Carolina Hurricanes
G_Home                         NaN
OT Ind                           0
EloDiffHome               22.54011
EloDiffAway              -22.54011
Name: 15017, dtype: object

In [225]:
y_train[y_train.isnull() == True]

15017   NaN
Name: G_Home, dtype: float64

In [220]:
# Define features to remove from the feature data (i.e., X)
drop_for_X_H = ['Date','Visitor','Home','OT Ind', 'G_Home', 'G_Vis', 'EloDiffAway']
drop_for_X_V = ['Date','Visitor','Home','OT Ind', 'G_Home', 'G_Vis', 'EloDiffHome']

X_train, X_test, y_train, y_test = train_test_split(combined_df.drop(columns=drop_for_X_H, axis=1), combined_df['G_Home'], test_size=0.30, random_state=5)
X_trainV, X_testV, y_trainV, y_testV = train_test_split(combined_df.drop(columns=drop_for_X_V, axis=1), combined_df['G_Vis'], test_size=0.30, random_state=5)


# Fit the model for Home
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Fit the model for Visitor
linregV = LinearRegression()
linregV.fit(X_trainV, y_trainV)


# Predict the scores of Home
y_test_predictions = linreg.predict(X_test)

# Predict the scores of Visitor
y_test_predictions_V = linreg.predict(X_testV)

# Home scoring
train_score = linreg.score(X_train, y_train)
test_score = linreg.score(X_test, y_test)

# Visitor scoring
train_score_V = linreg.score(X_trainV, y_trainV)
test_score_V = linreg.score(X_testV, y_testV)

print(f'The train score for Home games is {train_score:.3f} and the test score is {test_score:.3f}')
print(f'The train score for Away games is {train_score_V:.3f} and the test score is {test_score_V:.3f}')

ValueError: Input y contains NaN.