In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import datetime
from collections import deque

# Feature Engineering
features:
FG per game by shot

In [2]:
df = pd.read_csv('../data/data1.csv')

In [3]:
def last_5_average(df):
    """
    Create a "moving average" for the points scored in the last 5 games.
    :param df:
    :return:
    """
    df = df.sort_values(by=['game_date'])
    df['last_5_games_avg'] = ""

    average_per_game = df.groupby(['game_id'], sort=False)['shot_made_flag'].mean().reset_index(name="avg")
    avg_pts = deque(maxlen=5) # using a queue with a max number of elements

    prev_res = 0
    for i, game, avg in average_per_game.itertuples():
        avg_pts.append(avg)
        res = round(sum(avg_pts) / len(avg_pts),2) # max size will be 5. for the first 5 we divide by the length of the array.
        if i == 0:
            # we skip the first
            df.loc[df['game_id'] == game, ['last_5_games_avg']] = 0
        else:
            df.loc[df['game_id'] == game, ['last_5_games_avg']] = prev_res

        prev_res = res

    return df


In [4]:
# shot id is needed. we can delete it later
def before_current_shot_features(df):
    """
    Adding the statistics for the shots taken before the current shot
    :param df:
    :return:
    """
    df['streak_before_shot'] = ""
    df['points_before_shot'] = ""
    df['fgp_before_shot'] = ""

    for game in df.game_id.unique():
        fgp = [0,0]
        streak = 0
        shots = df[df['game_id'] == game]['shot_id'].sort_values()
        prev_streak = 0
        points = 0
        prev_points = 0
        for num, shot in enumerate(shots):

            row = df[(df['game_id'] == game) & (df['shot_id'] == shot)]

            flag = row['shot_made_flag'].item()
            shot_type = row['shot_type'].item()
            period = row['period'].item()

            # TODO divide per period somehow and multiply by the time remaining
            if num == 0:
                df.loc[(df['game_id']==game) & (df['shot_id'] == shot), ['fgp_before_shot'] ] = 0.00
            else:
                df.loc[(df['game_id']==game) & (df['shot_id'] == shot), ['fgp_before_shot'] ] = round(fgp[0]/fgp[1],2)

            fgp[1] += 1
            if flag == 1.0:
                fgp[0] += 1
                prev_streak = streak
                prev_points = points
                streak += 1

                if shot_type == '2PT Field Goal':
                    points += 2
                else:
                    points += 3
            else:
                prev_streak = streak
                prev_points = points
                streak = 0

            # df.loc[(df['game_id']==game) & (df['shot_id'] == shot), ['current_streak'] ] = streak
            df.loc[(df['game_id']==game) & (df['shot_id'] == shot), ['streak_before_shot'] ] = prev_streak
            df.loc[(df['game_id']==game) & (df['shot_id'] == shot), ['points_before_shot'] ] = prev_points

    return df

Extracting the month, players shoot differently depending on the month

In [5]:
def extract_from_date(df):
    """
    Extract the month from the datetime format.
    :param df: dataframe
    :return: modified df with the month feature
    """
    df['month'] = df.game_date.apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").month)
    df['weekday'] = df.game_date.apply(lambda x: datetime.datetime.strptime(x, "%Y-%m-%d").weekday())
    return df


In [6]:
df.head()

Unnamed: 0,action_type,combined_shot_type,game_event_id,game_id,lat,loc_x,loc_y,lon,period,playoffs,...,shot_zone_area,shot_zone_basic,shot_zone_range,team_id,team_name,game_date,matchup,opponent,shot_id,time_remaining
0,Jump Shot,Jump Shot,12,20000012,34.0443,-157,0,-118.4268,1,0,...,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,2,622
1,Jump Shot,Jump Shot,35,20000012,33.9093,-101,135,-118.3708,1,0,...,Left Side Center(LC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,3,465
2,Jump Shot,Jump Shot,43,20000012,33.8693,138,175,-118.1318,1,0,...,Right Side Center(RC),Mid-Range,16-24 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,4,412
3,Driving Dunk Shot,Dunk,155,20000012,34.0443,0,0,-118.2698,2,0,...,Center(C),Restricted Area,Less Than 8 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,5,379
4,Jump Shot,Jump Shot,244,20000012,34.0553,-145,-11,-118.4148,3,0,...,Left Side(L),Mid-Range,8-16 ft.,1610612747,Los Angeles Lakers,2000-10-31,away,POR,6,572


In [7]:
plt.figure(figsize=(25,10))
df_fgp = df.groupby(['season', 'month'])['shot_made_flag'].mean().reset_index(name='FGP')
sns.barplot(data=df_fgp, x='season', y='FGP', hue='month',ci=None, hue_order=[10,11,12,1,2,3,4,5,6])

KeyError: 'month'

<Figure size 1800x720 with 0 Axes>

FGP per game in a season

In [None]:
# we don't use this statistics anymore. Maybe come up with something different?
# df.to_csv('../data/data2.csv', index=False)# df_pm = df[['season', 'plus_minus', 'game_date']].drop_duplicates()
# g = sns.FacetGrid(df_pm, col="season", col_wrap=2, size=4, aspect=2)
# g.map(sns.barplot,'game_date', 'plus_minus', data=df_pm)

In [44]:
df.to_csv('../data/data2.csv', index=False)