In [11]:
import pandas as pd

# Raw roster columns that are discarded as soon as the CSV is loaded.
# ('id' is dropped here and re-created later from 'fullName'.)
dropped_columns = [
    'birthDate', 'nationality', 'id', 'jerseyNumber',
    'code', 'type', 'abbreviation', 'birthStateProvince',
    'Team_Number', 'Unnamed: 0', 'alternateCaptain', 'captain',
    'active', 'rookie', 'rosterStatus', 'birthCity',
    'link',
]

# Column selection for the "offense" category: the player/salary fields
# first, then the '22'-suffixed scoring statistics.
offense_base = [
    'fullName', 'salary_rank', 'salary_2021-22', 'name',
    'assists22', 'goals22', 'shots22',
    'faceOffPct22', 'shotPct22',
    'gameWinningGoals22', 'overTimeGoals22',
    'points22', 'plusMinus22',
]
# Column selection for the "special_teams" category: the player/salary
# fields first, then the power-play and short-handed statistics.
special_teams_base = [
    'fullName', 'salary_2021-22', 'salary_rank', 'name',
    'powerPlayGoals22', 'powerPlayPoints22', 'powerPlayTimeOnIce22',
    'shortHandedGoals22', 'shortHandedPoints22', 'shortHandedTimeOnIce22',
]
# Column selection for the "enforcer" category: the player/salary fields
# first, then hits and penalty minutes.
enforcer_base = [
    'fullName', 'salary_2021-22', 'salary_rank', 'name',
    'hits22', 'penaltyMinutes22',
]
# Column selection for the "endurance" category: the player/salary fields
# first, then the ice-time and workload statistics.
# NOTE(review): unlike the other three category lists, this one has no
# 'salary_rank' entry -- confirm that the omission is intentional.
endurance_base = [
    'fullName', 'salary_2021-22', 'name',
    'timeOnIce22', 'games22', 'shifts22', 'blocked22',
    'timeOnIcePerGame22', 'evenTimeOnIcePerGame22',
    'shortHandedTimeOnIcePerGame22', 'powerPlayTimeOnIcePerGame22',
]

# --------------------------- DataFrame Formatting ------------------------------------------------------

# Parallel lists: filter_str[i] is the category label that belongs to the
# column selection filter_list[i].
filter_list = [offense_base, special_teams_base, enforcer_base, endurance_base]
filter_str = ['offense', 'special_teams', 'enforcer', 'endurance']

# Load the roster CSV, discard the unused columns and keep only the rows
# whose 'Salary_2021-22' value is not zero.
roster_url = 'https://raw.githubusercontent.com/kyledufrane/NHL-Salary-Predictions/main/data/forward_roster_df.csv'
df = pd.read_csv(roster_url).drop(columns=dropped_columns)
df = df.loc[df['Salary_2021-22'].ne(0.0)]

# Lower-case the salary column name, expand the 'L'/'R' codes in
# 'shootsCatches', and rank salaries so the highest salary gets rank 1
# (ties are broken by row order).
df = df.rename(columns={'Salary_2021-22': 'salary_2021-22'})
df['shootsCatches'] = df['shootsCatches'].replace({'L': 'Left', 'R': 'Right'})
df['salary_rank'] = (
    df['salary_2021-22']
    .rank(method='first', ascending=False)
    .astype('int64')
)

# Turn every time-on-ice column from text into a float by swapping the ':'
# separator for a decimal point (for example '12:30' becomes 12.3).
# NOTE(review): the digits after the colon end up as the decimal part
# verbatim, not as a fraction of the leading unit -- confirm that consumers
# of these columns expect this encoding.
time_on_ice_cols = [
    name for name in df.columns
    if 'timeOnIce' in name or 'TimeOnIce' in name
]
for toi_col in time_on_ice_cols:
    df[toi_col] = (
        df[toi_col]
        .astype(str)
        .str.replace(':', '.', regex=False)
        .astype(float)
    )

# Convert heights written as <feet>' <inches>" (for example 6' 2") into a
# total number of inches stored in 'height_inches'.
height_parts = df['height'].astype(str).str.rstrip('"').str.split("'")
df['height_inches'] = [
    int(parts[0]) * 12 + int(parts[1])
    for parts in height_parts
]

# Divide every column whose name contains "Pct" by 100.
for pct_col in [name for name in df.columns if "Pct" in name]:
    df[pct_col] = df[pct_col].div(100)

# A column is kept only when it has at least `min_count` non-missing values;
# with percentage_nan = 50 that means more than half of the rows.
percentage_nan = 50.0
min_count = int(((100 - percentage_nan) / 100) * len(df) + 1)
df = df.dropna(axis='columns', thresh=min_count)

# Remove every remaining column whose name contains '14', '15' or '16'.
# NOTE(review): presumably these tag statistics from earlier seasons -- confirm.
excluded_cols = [
    name for name in df.columns
    if any(tag in name for tag in ('14', '15', '16'))
]
df = df.drop(columns=excluded_cols)

# Index the frame by player name; drop=False keeps 'id' as a regular column too.
df = df.assign(id=df['fullName']).set_index('id', drop=False)

# Ranking players based on category
#
# Step 1: for every category, each numeric column of its base list is split
#         into five rank-based bins (0 = lowest values, 4 = highest) and
#         stored on `df` as '<column>_quantile'.
# Step 2: once every quantile column exists, the bins of each category are
#         summed per player ('<category>_quantiles_total') and the totals are
#         ranked so that the largest total receives rank 1
#         ('<category>_overall_rank').
#
# NOTE(review): the 'Rank' test below is case-sensitive, so 'salary_rank' is
# still binned, and 'salary_2021-22' is binned as well; both therefore feed
# the category totals. Confirm that salary is meant to influence these ranks.

# Quantile column names per category, recorded as the columns are created
# instead of being rediscovered afterwards by substring matching.
category_quantile_columns = {category: [] for category in filter_str}

for category, filter_ in zip(filter_str, filter_list):
    dff = df[filter_].copy()
    for col in dff.columns:
        # Only numeric statistics can be binned (names, positions, ... are skipped).
        if not pd.api.types.is_numeric_dtype(dff[col]) or 'Rank' in col:
            continue
        # Sorting first means rank(method='first') breaks ties in sorted row order.
        dff.sort_values(col, ascending=False, inplace=True)
        # labels=False yields plain integer bin codes (NaN where the value is
        # missing), which keeps the column numeric so it can be summed row-wise;
        # Categorical labels do not support `sum`.
        df[f'{col}_quantile'] = pd.qcut(
            dff[col].rank(method='first'), 5, labels=False, duplicates='drop'
        )
        category_quantile_columns[category].append(f'{col}_quantile')

# Per-category name lists kept under their established variable names.
offense_columns_ = category_quantile_columns['offense']
special_teams_columns_ = category_quantile_columns['special_teams']
enforcer_columns_ = category_quantile_columns['enforcer']
endurance_columns_ = category_quantile_columns['endurance']

# Totals are computed exactly once, after all quantile columns are in place.
for category, quantile_columns in category_quantile_columns.items():
    df[f"{category}_quantiles_total"] = df[quantile_columns].sum(axis=1)
    df[f"{category}_overall_rank"] = (
        df[f"{category}_quantiles_total"]
        .rank(method='first', ascending=False)
        .astype('int64')
    )

# Combine the per-category ranks: sum them per player and rank the sums in
# ascending order, so the smallest rank sum receives overall rank 1.
overall_rank = [name for name in df.columns if 'overall_rank' in name]

df['overall_rank_sum'] = df[overall_rank].sum(axis=1)
df['overall_rank'] = df['overall_rank_sum'].rank(method='first').astype('int64')

# Human-readable labels applied to the columns before export, grouped by
# theme and merged into `rename_columns` in their original key order.
_profile_labels = {
    'overall_rank': 'Overall Rank',
    'salary_rank': 'Salary Rank',
    'fullName': 'Player Name',
    'salary_2021-22': 'Salary',
    'currentAge': 'Age',
    'height': 'Height',
    'height_inches': 'Height (Inches)',
    'weight': 'Weight',
    'name': 'Position',
    'shootsCatches': 'Shoots',
    'birthCountry': 'Nationality',
}
_category_rank_labels = {
    'offense_overall_rank': 'Offensive Overall Rank',
    'special_teams_overall_rank': 'Special Teams Overall Rank',
    'enforcer_overall_rank': 'Enforcer Overall Rank',
    'endurance_overall_rank': 'Endurance Overall Rank',
}
_offense_labels = {
    'assists22': 'Total Assists',
    'goals22': 'Total Goals',
    'shots22': 'Total Shots',
    'faceOffPct22': 'Face Off Percentage',
    'shotPct22': 'Shot Percentage',
    'gameWinningGoals22': 'Game Winning Goals',
    'overTimeGoals22': 'Over Time Goals',
    'points22': 'Points',
    'plusMinus22': 'Plus Minus',
}
_special_teams_labels = {
    'powerPlayGoals22': 'Power Play Goals',
    'powerPlayPoints22': 'Power Play Points',
    'powerPlayTimeOnIce22': 'Power Play Time On Ice',
    'shortHandedGoals22': 'Short Handed Goals',
    'shortHandedPoints22': 'Short Handed Points',
    'shortHandedTimeOnIce22': 'Short Handed Time On Ice',
}
_endurance_labels = {
    'timeOnIce22': 'Time On Ice',
    'games22': 'Total Games',
    'shifts22': 'Total Shifts',
    'blocked22': 'Blocked Shots',
    'timeOnIcePerGame22': 'Time On Ice Per Game',
    'evenTimeOnIcePerGame22': 'Even Time On Ice Per Game',
    'shortHandedTimeOnIcePerGame22': 'Short Handed Time On Ice Per Game',
    'powerPlayTimeOnIcePerGame22': 'Power Play Time On Ice Per Game',
}
_enforcer_labels = {
    'hits22': 'Total Hits',
    'penaltyMinutes22': 'Total Penalty Minutes',
}

rename_columns = {
    **_profile_labels,
    **_category_rank_labels,
    **_offense_labels,
    **_special_teams_labels,
    **_endurance_labels,
    **_enforcer_labels,
}

# Apply the display labels, then write the table (index included) to disk.
df = df.rename(rename_columns, axis='columns')

df.to_csv('data/dash_cleaned_player_data.csv')


In [12]:
# Rows whose 'Face Off Percentage' is exactly zero.
df.loc[df['Face Off Percentage'].eq(0)]

Unnamed: 0_level_0,Player Name,Position,Age,Nationality,Height,Weight,Shoots,Time On Ice,Total Assists,Total Goals,...,timeOnIce22_quantile,games22_quantile,shifts22_quantile,blocked22_quantile,timeOnIcePerGame22_quantile,evenTimeOnIcePerGame22_quantile,shortHandedTimeOnIcePerGame22_quantile,powerPlayTimeOnIcePerGame22_quantile,overall_rank_sum,Overall Rank
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P.K. Subban,P.K. Subban,Defenseman,32,CAN,"6' 0""",210,Right,1408.48,17.0,5.0,...,3,3,3,4,3,4,2,1,709,229
Ryan Murray,Ryan Murray,Defenseman,27,CAN,"6' 1""",206,Left,521.14,4.0,0.0,...,0,0,0,2,1,1,2,0,1380,33
Damon Severson,Damon Severson,Defenseman,26,CAN,"6' 2""",205,Right,1888.23,35.0,11.0,...,4,4,4,4,4,4,4,3,337,368
Will Butcher,Will Butcher,Defenseman,26,USA,"5' 10""",190,Left,609.53,6.0,2.0,...,0,0,0,2,2,3,1,2,1335,47
Miles Wood,Miles Wood,Left Wing,25,USA,"6' 2""",195,Left,44.17,0.0,0.0,...,0,0,0,0,1,1,1,2,1421,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ilya Lyubushkin,Ilya Lyubushkin,Defenseman,27,RUS,"6' 2""",201,Right,1344.14,13.0,2.0,...,3,3,3,4,2,3,4,0,889,149
Alec Martinez,Alec Martinez,Defenseman,33,USA,"6' 1""",209,Left,508.03,5.0,3.0,...,0,0,0,3,3,4,2,1,1364,37
Alex Pietrangelo,Alex Pietrangelo,Defenseman,31,CAN,"6' 3""",210,Right,1971.54,31.0,13.0,...,4,4,4,4,4,4,4,3,397,348
Shea Theodore,Shea Theodore,Defenseman,25,CAN,"6' 2""",195,Left,1804.27,38.0,14.0,...,4,3,4,4,4,4,1,3,621,273
