In [81]:
import pandas as pd


stats = pd.read_csv("NHL_HR_Stats_Cleaned_2016_to_2024.csv")

def next_year(player):
    player = player.sort_values("Year")
    player["Next_PTS"] = player["Scoring PTS"].shift(-1)
    return player


stats = stats.groupby("ID", group_keys=False).apply(next_year)


stats.drop(columns=['Ice Time ATOI'], inplace=True)


def convert_to_minutes(time_str):
    if pd.isna(time_str):
        return None
    minutes, seconds = map(int, time_str.split(':'))
    return minutes + seconds / 60.0


stats['Ice Time TOI'] = stats['Ice Time TOI'].apply(convert_to_minutes)


stats["team_code"] = stats["Team"].astype("category").cat.codes
stats["pos_code"] = stats["Pos"].astype("category").cat.codes

def filter_teams(group):
    if any(group['Team'].isin(['2TM', '3TM', '4TM'])):
        return group[group['Team'].isin(['2TM', '3TM', '4TM'])]
    return group

stats = stats.groupby(['ID', 'Year'], group_keys=False).apply(filter_teams)

stats = stats.dropna().copy()

from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler

rr = Ridge(alpha=1)
split = TimeSeriesSplit(n_splits=3)
sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction="forward", cv=split, n_jobs=4)
removed_columns = ["Next_PTS", "Player", "Year", "ID", "Pos", "Team"]
selected_columns = stats.columns[~stats.columns.isin(removed_columns)]

scaler = MinMaxScaler()
stats.loc[:, selected_columns] = scaler.fit_transform(stats[selected_columns])

sfs.fit(stats[selected_columns], stats["Next_PTS"])









  stats = stats.groupby("ID", group_keys=False).apply(next_year)
  stats = stats.groupby(['ID', 'Year'], group_keys=False).apply(filter_teams)
  stats.loc[:, selected_columns] = scaler.fit_transform(stats[selected_columns])
  stats.loc[:, selected_columns] = scaler.fit_transform(stats[selected_columns])


Index(['Player', 'Age', 'Team', 'Pos', 'GP', 'Scoring G', 'Scoring A',
       'Scoring PTS', '+/-', 'PIM', 'Goals EVG', 'Goals PPG', 'Goals SHG',
       'Goals GWG', 'Assists EV', 'Assists PP', 'Assists SH', 'Shots SOG',
       'Shots SPCT', 'Shots TSA', 'Ice Time TOI', 'Faceoffs FOW',
       'Faceoffs FOL', 'BLK', 'HIT', 'TAKE', 'GIVE', 'Year', 'ID', 'Next_PTS',
       'team_code', 'pos_code'],
      dtype='object')


In [82]:
predictors = list(selected_columns[sfs.get_support()])

def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    yearss = sorted(data["Year"].unique())

    for i in range (start, len(yearss), step):
        current_year = yearss[i]

        train = data[data["Year"] < current_year]
        test = data[data["Year"] == current_year]

        model.fit(train[predictors], train["Next_PTS"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_PTS"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)
    return pd.concat(all_predictions)

predictions = backtest(stats, rr, predictors)

predictions



Index(['Player', 'Age', 'Team', 'Pos', 'GP', 'Scoring G', 'Scoring A',
       'Scoring PTS', '+/-', 'PIM', 'Goals EVG', 'Goals PPG', 'Goals SHG',
       'Goals GWG', 'Assists EV', 'Assists PP', 'Assists SH', 'Shots SOG',
       'Shots SPCT', 'Shots TSA', 'Ice Time TOI', 'Faceoffs FOW',
       'Faceoffs FOL', 'BLK', 'HIT', 'TAKE', 'GIVE', 'Year', 'ID', 'Next_PTS',
       'team_code', 'pos_code'],
      dtype='object')


In [83]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])



Index(['Player', 'Age', 'Team', 'Pos', 'GP', 'Scoring G', 'Scoring A',
       'Scoring PTS', '+/-', 'PIM', 'Goals EVG', 'Goals PPG', 'Goals SHG',
       'Goals GWG', 'Assists EV', 'Assists PP', 'Assists SH', 'Shots SOG',
       'Shots SPCT', 'Shots TSA', 'Ice Time TOI', 'Faceoffs FOW',
       'Faceoffs FOL', 'BLK', 'HIT', 'TAKE', 'GIVE', 'Year', 'ID', 'Next_PTS',
       'team_code', 'pos_code'],
      dtype='object')


In [104]:
stats["Next_PTS"].describe()



count    6227.000000
mean       23.738879
std        22.158380
min         0.000000
25%         6.000000
50%        18.000000
75%        35.000000
max       153.000000
Name: Next_PTS, dtype: float64

In [135]:
240.3478380459831 ** .5



15.503155744750265

In [121]:
def player_history(df):
    df = df.sort_values("Year")

    df["player_year"] = range(0, df.shape[0])
    df["pts_corr"] = list(df[["player_year", "Scoring PTS"]].expanding().corr().loc[(slice(None), "player_year"), "Scoring PTS"])
    df["pts_corr"].fillna(1, inplace=True)

    # Add a small constant to avoid division by zero
    df["pts_diff"] = df["Scoring PTS"] / (df["Scoring PTS"].shift(1) + 0.01)
    df["pts_diff"].replace([np.inf, -np.inf], np.nan, inplace=True)
    df["pts_diff"].fillna(1, inplace=True)  # Fill initial NaN and any remaining after replacement

    return df

stats = stats.groupby("ID", group_keys=False).apply(player_history)

# After computing all features
if np.isinf(stats["pts_diff"]).any() or np.isnan(stats["pts_diff"]).any():
    print("Infinities or NaNs present in 'pts_diff'.")
else:
    print("No infinities or NaNs in 'pts_diff'.")

# Check for large values before fitting model
if (stats.select_dtypes(include=[np.number]) > 1e9).any().any():
    print("Extremely large values found.")
else:
    print("No large values found.")


# Add 1 to avoid log(0) which is undefined
stats['pts_diff_log'] = np.log(stats['pts_diff'] + 1)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["pts_corr"].fillna(1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["pts_diff"].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

No infinities or NaNs in 'pts_diff'.
No large values found.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["pts_corr"].fillna(1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["pts_diff"].replace([np.inf, -np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

In [122]:
def group_averages(df):
    return df["Scoring PTS"] / df["Scoring PTS"].mean()

In [123]:
stats["pts_year"] = stats.groupby("Year", group_keys=False).apply(group_averages)

  stats["pts_year"] = stats.groupby("Year", group_keys=False).apply(group_averages)


In [210]:
new_predictors = predictors + ["pts_corr", "pts_diff_log"]

In [211]:

print(stats.dtypes)


print("Infinities in actual:", np.isinf(predictions['actual']).any())
print("Infinities in prediction:", np.isinf(predictions['prediction']).any())
print("NaNs in actual:", predictions['actual'].isna().any())
print("NaNs in prediction:", predictions['prediction'].isna().any())


print(predictions[['actual', 'prediction']].describe())


try:
    mse = mean_squared_error(predictions['actual'], predictions['prediction'])
    print("Mean Squared Error:", mse)
except Exception as e:
    print("Error calculating MSE:", str(e))



Player           object
Age             float64
Team             object
Pos              object
GP              float64
Scoring G       float64
Scoring A       float64
Scoring PTS     float64
+/-             float64
PIM             float64
Goals EVG       float64
Goals PPG       float64
Goals SHG       float64
Goals GWG       float64
Assists EV      float64
Assists PP      float64
Assists SH      float64
Shots SOG       float64
Shots SPCT      float64
Shots TSA       float64
Ice Time TOI    float64
Faceoffs FOW    float64
Faceoffs FOL    float64
BLK             float64
HIT             float64
TAKE            float64
GIVE            float64
Year              int64
ID                int64
Next_PTS        float64
team_code       float64
pos_code        float64
player_year       int64
pts_corr        float64
pts_diff        float64
pts_season      float64
pts_year        float64
pts_diff_log    float64
dtype: object
Infinities in actual: False
Infinities in prediction: False
NaNs in actual

In [212]:

check_columns = ['pts_corr', 'pts_year', 'pts_diff_log', 'pts_season']
for col in check_columns:
    print(f"--- Checking {col} ---")
    print("Infinities:", np.isinf(stats[col]).any())
    print("NaN values:", stats[col].isna().any())
    print("Statistics:\n", stats[col].describe())


--- Checking pts_corr ---
Infinities: False
NaN values: False
Statistics:
 count    6227.000000
mean        0.361191
std         0.734365
min        -1.000000
25%        -0.277350
50%         0.737852
75%         1.000000
max         1.000000
Name: pts_corr, dtype: float64
--- Checking pts_year ---
Infinities: False
NaN values: False
Statistics:
 count    6227.000000
mean        1.000000
std         0.906505
min         0.000000
25%         0.263505
50%         0.754023
75%         1.508046
max         6.375455
Name: pts_year, dtype: float64
--- Checking pts_diff_log ---
Infinities: False
NaN values: False
Statistics:
 count    6227.000000
mean        0.726092
std         0.410913
min         0.000000
25%         0.542596
50%         0.693147
75%         0.802308
max         3.456914
Name: pts_diff_log, dtype: float64
--- Checking pts_season ---
Infinities: False
NaN values: False
Statistics:
 count    6227.000000
mean        1.000000
std         0.906505
min         0.000000
25%      

In [213]:

stats['pts_corr'].replace([-np.inf, np.inf], np.nan, inplace=True)


median_pts_corr = stats['pts_corr'].median()


stats['pts_corr'].fillna(median_pts_corr, inplace=True)



print("--- Updated pts_corr ---")
print("NaN values:", stats['pts_corr'].isna().any())
print("Statistics:\n", stats['pts_corr'].describe())


--- Updated pts_corr ---
NaN values: False
Statistics:
 count    6227.000000
mean        0.361191
std         0.734365
min        -1.000000
25%        -0.277350
50%         0.737852
75%         1.000000
max         1.000000
Name: pts_corr, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stats['pts_corr'].replace([-np.inf, np.inf], np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  stats['pts_corr'].fillna(median_pts_corr, inplace=True)


In [214]:
predictions = backtest(stats, rr, new_predictors)

In [215]:
mean_squared_error(predictions["actual"], predictions["prediction"])

np.float64(236.1478787912999)

In [216]:
236.1478787912999 ** 0.5

15.367103786702941

In [217]:
pd.Series(rr.coef_, index=new_predictors)

Age            -14.926942
GP             -17.832888
Scoring G       28.576298
Scoring A       17.983922
Scoring PTS     22.414720
+/-              2.657752
PIM             -0.702238
Goals PPG        1.720454
Assists EV       8.244079
Assists PP      21.642543
Shots SOG       13.138165
Shots SPCT      10.143064
Shots TSA       10.963978
Ice Time TOI    24.442918
Faceoffs FOW    -0.264494
Faceoffs FOL     5.739224
BLK            -14.773664
TAKE            10.480481
team_code        2.502978
pos_code         0.326996
pts_corr        -1.967072
pts_diff_log    -1.566998
dtype: float64

In [222]:
diff = predictions["actual"] - predictions["prediction"]
merged = predictions.merge(stats, left_index=True, right_index=True)
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()
merged

Unnamed: 0,actual,prediction,Player,Age,Team,Pos,GP,Scoring G,Scoring A,Scoring PTS,...,Next_PTS,team_code,pos_code,player_year,pts_corr,pts_diff,pts_season,pts_year,pts_diff_log,diff
4980,22.0,8.673053,Peyton Krebs,0.076923,VEG,C,0.035714,0.000000,0.011236,0.006536,...,22.0,0.941176,0.0,0,1.000000,1.000000,0.060719,0.060719,0.693147,13.326947
4555,35.0,25.923703,Erik Karlsson,0.461538,SJS,D,0.607143,0.125000,0.157303,0.143791,...,35.0,0.794118,0.2,5,-0.992607,0.529738,1.335810,1.335810,0.425096,9.076297
4854,1.0,4.783901,Matthew Peca,0.346154,OTT,C,0.047619,0.000000,0.011236,0.006536,...,1.0,0.676471,0.0,4,-0.215166,0.283286,0.060719,0.060719,0.249424,3.783901
4333,8.0,12.908019,Anders Bjork,0.230769,2TM,LW,0.523810,0.078125,0.067416,0.071895,...,8.0,0.000000,0.6,3,0.256186,0.535801,0.667905,0.667905,0.429052,4.908019
4779,93.0,45.466483,Kyle Connor,0.230769,WPG,LW,0.654762,0.406250,0.269663,0.326797,...,93.0,0.970588,0.6,4,0.626923,0.670871,3.035931,3.035931,0.513345,47.533517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6278,4.0,6.246969,Brendan Gaunce,0.384615,CBJ,C,0.047619,0.000000,0.011236,0.006536,...,4.0,0.235294,0.0,5,0.062577,0.117233,0.036359,0.036359,0.110855,2.246969
6510,22.0,25.626852,Jacob Trouba,0.384615,NYR,D,0.964286,0.125000,0.247191,0.196078,...,22.0,0.647059,0.2,7,0.085649,0.740192,1.090781,1.090781,0.553996,3.626852
6995,53.0,35.017227,Teuvo TerÃ¤vÃ¤inen,0.384615,CAR,F,0.797619,0.187500,0.280899,0.241830,...,53.0,0.205882,0.4,7,-0.044051,0.556140,1.345297,1.345297,0.442208,17.982773
7828,0.0,10.349570,Tobias BjÃ¶rnfot,0.153846,3TM,D,0.035714,0.000000,0.000000,0.000000,...,0.0,0.029412,0.2,3,-0.835658,0.000000,0.000000,0.000000,0.000000,10.349570


In [223]:
merged.to_csv('HR_PPM_2016_to_2024.csv', index=False)