In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show every column when a frame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
# Load the two inputs used throughout the notebook:
#   df  - one row per game, with home/away team names and schedule flags
#   df1 - one row per player per game
df = pd.read_csv(filepath_or_buffer="B2B_updated_final2.csv")
df1 = pd.read_csv(filepath_or_buffer="player_performance_final.csv")

In [4]:
#Cleaning the player performance data
# Drop rows whose Player is the literal 'TOTAL'. .copy() makes the result an
# independent frame, so the column assignments below neither raise
# SettingWithCopyWarning nor depend on view/copy semantics.
df1 = df1[df1['Player'] != 'TOTAL'].copy()
# Leftover index columns; errors='ignore' keeps the cell usable if they are absent.
df1 = df1.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

# The raw date strings come in two layouts, '%Y-%m-%d' and '%d/%m/%y'.
# Parse each layout explicitly: a format-less pd.to_datetime can read an
# ambiguous value such as '03/04/21' month-first, which would corrupt both
# `season` below and the (date, team) keys used for merging later on.
raw_dates = df1['date']
parsed_dates = pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce').fillna(
    pd.to_datetime(raw_dates, format='%d/%m/%y', errors='coerce')
)
# Values matching neither layout go through pandas' generic parser, which
# raises on unparseable input instead of silently leaving NaT behind.
leftover = parsed_dates.isna() & raw_dates.notna()
if leftover.any():
    parsed_dates = parsed_dates.fillna(pd.to_datetime(raw_dates[leftover]))
df1['date'] = parsed_dates

# Season label: games from July onwards belong to the season starting that
# calendar year, earlier months to the one that started the year before.
# The label is the two years concatenated, e.g. '20212022'.
season_start = df1['date'].dt.year.where(df1['date'].dt.month > 6, df1['date'].dt.year - 1)
df1['season'] = season_start.astype(str) + (season_start + 1).astype(str)

# TOI arrives as a colon-separated string; convert first part + second part / 60
# into a single float (minutes, assuming the string is 'MM:SS' - TODO confirm).
df1['TOI'] = df1['TOI'].str.split(':').apply(lambda x: int(x[0]) + int(x[1])/60)

In [5]:
#The date columns in the scraped dataframes were not consistent
# Parse with the ISO layout first, then day/month/two-digit-year; rows that
# fit neither layout become NaT because both parses use errors='coerce'.
# NOTE(review): the cleaning cell above already runs pd.to_datetime on this
# column, so if it is datetime64 by now these format-specific parses cannot
# re-interpret anything - confirm whether this cell is still needed.
dc1 = pd.to_datetime(df1['date'], errors='coerce', format='%Y-%m-%d')
dc2 = pd.to_datetime(df1['date'], errors='coerce', format='%d/%m/%y')
# Keep the ISO result where it parsed, otherwise take the day-first result.
df1['date'] = dc1.where(dc1.notna(), dc2)

In [6]:
#Adding back to back games column based on team and date.
# Collect every (date, team) pair flagged as a back-to-back, from both the
# home side and the away side of the game-level frame.
b2b_home_games = df.loc[df['home_b2b'] == 1, ['date', 'homename']].rename(columns={'homename': 'team'})
b2b_away_games = df.loc[df['away_b2b'] == 1, ['date', 'awayname']].rename(columns={'awayname': 'team'})

# One lookup row per (date, team). De-duplicating the keys guarantees the
# left merge below cannot multiply player rows if a pair appears twice.
b2b_final = (
    pd.concat([b2b_home_games, b2b_away_games], ignore_index=True)
    .assign(date=lambda games: pd.to_datetime(games['date']), is_b2b_game=1)
    .drop_duplicates(subset=['date', 'team'])
)

# Drop any flag left over from a previous run of this cell so that re-running
# it does not produce is_b2b_game_x / is_b2b_game_y columns.
df1 = df1.drop(columns='is_b2b_game', errors='ignore').merge(b2b_final, on=['date', 'team'], how='left')
# Player rows without a match are not back-to-backs. Assign the result back
# instead of fillna(inplace=True) on a column selection, which is chained
# assignment and does not update df1 under pandas Copy-on-Write.
df1['is_b2b_game'] = df1['is_b2b_game'].fillna(0).astype(int)

In [7]:
#Adding 3in4
# Collect every (date, team) pair flagged home_3in4 / away_3in4 in the
# game-level frame (presumably "three games in four days" - confirm).
cg1_home_games = df.loc[df['home_3in4'] == 1, ['date', 'homename']].rename(columns={'homename': 'team'})
cg1_away_games = df.loc[df['away_3in4'] == 1, ['date', 'awayname']].rename(columns={'awayname': 'team'})

# Build the lookup from the 3in4 frames. Concatenating the back-to-back
# frames here would make is_3in4_game a copy of is_b2b_game.
# De-duplicating the keys keeps the left merge from multiplying player rows.
cg1_final = (
    pd.concat([cg1_home_games, cg1_away_games], ignore_index=True)
    .assign(date=lambda games: pd.to_datetime(games['date']), is_3in4_game=1)
    .drop_duplicates(subset=['date', 'team'])
)

# Drop any flag left over from a previous run so the cell can be re-executed.
df1 = df1.drop(columns='is_3in4_game', errors='ignore').merge(cg1_final, on=['date', 'team'], how='left')
# Unmatched player rows get 0. Assign back rather than fillna(inplace=True)
# on a column selection, which does not update df1 under Copy-on-Write.
df1['is_3in4_game'] = df1['is_3in4_game'].fillna(0).astype(int)

In [8]:
#Adding 5in8
# Collect every (date, team) pair flagged home_5in8 / away_5in8 in the
# game-level frame (presumably "five games in eight days" - confirm).
cg2_home_games = df.loc[df['home_5in8'] == 1, ['date', 'homename']].rename(columns={'homename': 'team'})
cg2_away_games = df.loc[df['away_5in8'] == 1, ['date', 'awayname']].rename(columns={'awayname': 'team'})

# Build the lookup from the 5in8 frames. Concatenating the back-to-back
# frames here would make is_5in8_game a copy of is_b2b_game.
# De-duplicating the keys keeps the left merge from multiplying player rows.
cg2_final = (
    pd.concat([cg2_home_games, cg2_away_games], ignore_index=True)
    .assign(date=lambda games: pd.to_datetime(games['date']), is_5in8_game=1)
    .drop_duplicates(subset=['date', 'team'])
)

# Drop any flag left over from a previous run so the cell can be re-executed.
df1 = df1.drop(columns='is_5in8_game', errors='ignore').merge(cg2_final, on=['date', 'team'], how='left')
# Unmatched player rows get 0. Assign back rather than fillna(inplace=True)
# on a column selection, which does not update df1 under Copy-on-Write.
df1['is_5in8_game'] = df1['is_5in8_game'].fillna(0).astype(int)

In [39]:
# Per-column count of missing values in the player-level frame.
print(df1.isna().sum())

Rk                   0
Player               0
G                    0
A                    0
PTS                  0
+/-                  0
PIM                  0
EV                   0
EV.1            146672
PP                   0
PP.1            146672
SH                   0
SH.1            146672
GW                   0
EV.2            146672
EV.3            146672
PP.2            146672
PP.3            146672
SH.2            146672
SH.3            146672
S                    0
S%              129011
SHFT                 0
TOI                  0
iCF                  0
SAT‑F                0
SAT‑A                0
CF%                  0
CRel%                0
ZSO                132
ZSD                132
oZS%              2076
HIT                  0
BLK               1046
team                 0
date                 0
season               0
is_b2b_game          0
is_3in4_game         0
is_5in8_game         0
dtype: int64


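### No significant features from this dataset have missing values: the nulls are confined to the duplicated `EV.n`/`PP.n`/`SH.n` columns, `S%`, `ZSO`, `ZSD`, `oZS%` and `BLK`.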

In [9]:
# Keep only game rows whose season code is strictly greater than 20092010.
testdf = df.loc[df['season'] > 20092010]

In [10]:
# Frequency of the back-to-back flag at game level (testdf) and at
# player level (df1), printed one after the other for comparison.
counts = testdf['is_b2b_game'].value_counts()
counts1 = df1['is_b2b_game'].value_counts()

print(counts, counts1, sep='\n')

0    9888
1    3325
Name: is_b2b_game, dtype: int64
0    478947
1     68711
Name: is_b2b_game, dtype: int64


##### 3325 × 19 = 63,175, and 68,711 / 3325 ≈ 20.7, i.e. roughly 19–21 player rows per back-to-back game. So the flag appears to have been merged correctly.

In [50]:
# Preview the first five player rows (head() defaults to n=5).
print(df1.head())

    Rk           Player    G    A  PTS  +/-  PIM   EV  EV.1   PP  PP.1   SH  \
0  1.0     Matt Benning  0.0  0.0    0 -2.0    2  0.0   0.0  0.0   0.0  0.0   
1  2.0      Nick Bonino  0.0  0.0    0 -1.0    0  0.0   0.0  0.0   0.0  0.0   
2  3.0    Logan Couture  0.0  0.0    0 -2.0    0  0.0   0.0  0.0   0.0  0.0   
3  4.0    Mario Ferraro  0.0  0.0    0 -1.0    0  0.0   0.0  0.0   0.0  0.0   
4  5.0  Jonah Gadjovich  0.0  0.0    0  0.0    7  0.0   0.0  0.0   0.0  0.0   

   SH.1   GW  EV.2  EV.3  PP.2  PP.3  SH.2  SH.3    S   S%  SHFT        TOI  \
0   0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0  2.0  0.0  26.0  14.933333   
1   0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0  4.0  0.0  23.0  16.450000   
2   0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0  2.0  0.0  25.0  17.716667   
3   0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0  2.0  0.0  29.0  24.250000   
4   0.0  0.0   0.0   0.0   0.0   0.0   0.0   0.0  0.0  NaN  15.0   8.816667   

   iCF  SAT‑F  SAT‑A   CF%  CRel%   ZSO   ZSD  oZS

In [11]:
# Persist the enriched player-level frame. The row index is written as the
# first, unnamed column (pandas' default, made explicit here).
# NOTE(review): that column comes back as 'Unnamed: 0' when the file is
# re-read; consider index=False once downstream readers are confirmed.
df1.to_csv("player_performance_data.csv", index=True)