In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

## Functions
def get_shift_data(game_id,player_id):
    return shifts_agg_df[(shifts_agg_df.game_id == game_id) & 
                        (shifts_agg_df.player_id == player_id)].agg('mean')

def avg_icetime(game_id,team_id):
    player_df = player_game_df[(player_game_df.game_id == game_id) &\
                              (player_game_df.team_id == team_id)]
    return np.round(player_df.agg({'timeOnIce':{'mean'}}).values[0][0],0)   

def avg_shift_len(game_id,team_id):
    player_df = player_game_df[(player_game_df.game_id == game_id) &\
                              (player_game_df.team_id == team_id)]
    return np.round(player_df.agg({'mean_shift_length':{'mean'}}).values[0][0],0)

def avg_num_shift(game_id,team_id):
    player_df = player_game_df[(player_game_df.game_id == game_id) &\
                              (player_game_df.team_id == team_id)]
    return np.round(player_df.agg({'number_shifts':{'mean'}}).values[0][0],0)

def num_players(game_id,team_id):
    player_df = player_game_df[(player_game_df.game_id == game_id) &\
                              (player_game_df.team_id == team_id)]
    return np.round(player_df.agg({'number_shifts':{'count'}}).values[0][0],0)

def get_winning_stats(row):
    if row['outcome'] in ['home win OT','home win REG','home win SO']:
        winner = row['home_team_id']
        winner_name = row['home_team_name']
        loser = row['away_team_id']
        loser_name = row['away_team_name']
        winner_loc = 'home'
        winner_avgShiftLen = row['home_avg_shiftLen']
        loser_avgShiftLen = row['away_avg_shiftLen']
        winner_avgNumShift = row['home_avg_numShift']
        loser_avgNumShift = row['away_avg_numShift']
        winner_avgIceTime = row['home_avg_icetime']
        loser_avgIceTime  = row['away_avg_icetime']
    else:
        winner = row['away_team_id']
        winner_name = row['away_team_name']
        loser = row['home_team_id']
        loser_name = row['home_team_name']
        winner_loc = 'away'
        winner_avgShiftLen = row['away_avg_shiftLen']
        loser_avgShiftLen = row['home_avg_shiftLen']
        winner_avgNumShift = row['home_avg_numShift']
        loser_avgNumShift = row['away_avg_numShift']
        winner_avgIceTime = row['away_avg_icetime']
        loser_avgIceTime  = row['home_avg_icetime']
        
    return pd.Series({'winner':winner,
                      'winner_name':winner_name,
                      'loser':loser,
                      'loser_name':loser_name,
                      'winner_loc':winner_loc,
                      'winner_avgShiftLen':winner_avgShiftLen,
                      'loser_avgShiftLen':loser_avgShiftLen,
                      'winner_avgNumShift':winner_avgNumShift,
                      'loser_avgNumShift':loser_avgNumShift,
                      'winner_avgIceTime':winner_avgIceTime,
                      'loser_avgIceTime':loser_avgIceTime})

## Load and Prepare Data

#### 1. Prepare Data for Shifts for Each Player-Game Pair

In [None]:
## Get data  on shifts for each player/game
shifts_df = pd.read_csv('data/game_shifts.csv')
shifts_df['shift_duration'] = shifts_df['shift_end'] - shifts_df['shift_start']
shifts_df = shifts_df[shifts_df.shift_duration > 10]
shifts_df = shifts_df[shifts_df.shift_duration < 200]

#Aggregate data by game and player
shifts_agg_df = shifts_df.groupby(['game_id','player_id'])\
.agg({'shift_duration':{'count','sum','mean'}})
shifts_agg_df.columns = shifts_agg_df.columns.droplevel()
shifts_agg_df.reset_index(inplace=True)

## Merge Player Game data With 
player_game_df = pd.read_csv('data/game_skater_stats.csv')
player_game_df = pd.concat([player_game_df,
                            player_game_df.apply(lambda x:
                                                 get_shift_data(x['game_id'],
                                                                x['player_id']),
                                                 axis= 1).iloc[:,2:]],
                           axis = 1)

player_game_df.rename(columns = {'sum':'total_timeOnIce',
                                 'count':'number_shifts',
                                 'mean':'mean_shift_length'},
                      inplace = True)

player_game_df.to_csv('data/upd_game_skater_stats.csv')

#### 2. Aggregate Average Player Data for Each Game

In [None]:
#Load Games Dataset
games_df = pd.read_csv('data/game.csv')
games_df.drop(['venue_link','venue_time_zone_id'], axis= 1, inplace= True)

#Load and Drop Unused Columns
team_info = pd.read_csv('data/team_info.csv')
team_info['combined_name'] = team_info.shortName + ' ' + team_info.teamName
team_info.drop(['franchiseId','shortName','teamName','abbreviation','link'],
               axis = 1,inplace=True)

#Create home and away datasets for joining
away_info = team_info.copy()
away_info.columns = ['away_team_id','away_team_name']
home_info = team_info.copy()
home_info.columns = ['home_team_id','home_team_name']

#Merge Columns
games_df = games_df.merge(away_info)
games_df = games_df.merge(home_info)

#Run functions to extract winning team data
games_df['home_avg_icetime'] =\
games_df.apply(lambda x: avg_icetime(x['game_id'],x['home_team_id']),axis = 1)
games_df['away_avg_icetime'] =\
games_df.apply(lambda x: avg_icetime(x['game_id'],x['away_team_id']),axis = 1)

games_df['home_avg_shiftLen'] =\
games_df.apply(lambda x: avg_shift_len(x['game_id'],x['home_team_id']),axis = 1)
games_df['away_avg_shiftLen'] =\
games_df.apply(lambda x: avg_shift_len(x['game_id'],x['away_team_id']),axis = 1)

games_df['home_avg_numShift'] =\
games_df.apply(lambda x: avg_num_shift(x['game_id'],x['home_team_id']),axis = 1)
games_df['away_avg_numShift'] =\
games_df.apply(lambda x: avg_num_shift(x['game_id'],x['away_team_id']),axis = 1)

games_df['home_cnt_players'] =\
games_df.apply(lambda x: num_players(x['game_id'],x['home_team_id']),axis = 1)
games_df['away_cnt_players'] =\
games_df.apply(lambda x: num_players(x['game_id'],x['away_team_id']),axis = 1)

games_df.dropna(inplace = True)
games_df.to_csv('data/upd_game.csv',index=False)

## Get Stats For Winning Teams
winning_df = pd.concat([games_df,
                        games_df.apply(lambda x: get_winning_stats(x),axis = 1)],
                       axis = 1 )
winning_df.to_csv('shiny_app/app_data/upd_game_df.csv')

In [2]:
plays_df = pd.read_csv('data/game_plays.csv')

In [3]:
shooting_df = plays_df.loc[plays_df.event.isin(['Goal','Shot']),:]

#Format Shot by and Goalie
#     shooting_df.loc[:,'shot_by'] = shooting_df['description']\
#     .apply(lambda x: ' '.join(x.split()[0:2])).values
#     shooting_df.loc[:,'goalie'] = shooting_df['description']\
#     .apply(lambda x: ' '.join(x.split()[-2:])).values

# ## X and Y are inverted in the dataset vs our chart
# shooting_df.loc[:,'X'] = shooting_df['st_y'].apply(lambda x: x*500/85).values
# shooting_df.loc[:,'Y'] = shooting_df['st_x'].apply(lambda y: y*500/85).values

# shooting_df = shooting_df.loc[shooting_df['Y']>0,:] #ignore shots below center ice
# shooting_df = shooting_df.loc[shooting_df['Y']<516.5,:] #ignore shots below center ice

In [4]:
#Format Shot by and Goalie


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
shooting_df.loc[:,'shot_by'] = shooting_df['description']\
.apply(lambda x: ' '.join(x.split()[0:2]))
shooting_df.loc[:,'goalie'] = shooting_df['description']\
.apply(lambda x: ' '.join(x.split()[-2:]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
vals = shooting_df['description'].apply(lambda x: ' '.join(x.split()[0:2])).values

In [23]:
shooting_df.loc[:,'vals'] = vals

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [28]:
shooting_df.

TypeError: insert() missing 1 required positional argument: 'item'