In [2]:
import pandas as pd
import numpy as np
import math

In [1]:
# Directory where the per-game merged CSVs are stored; files are looked up as
# DATAPATH + 'merged_<gameid>.csv', so keep the trailing slash.
# Change this for your system.
DATAPATH = '../../Volumes/KevSSDRive/NBA/data/csv/merged/'

In [184]:
# Court geometry and detection thresholds used by find_shot_starts.
_HALF_COURT_X = 47          # x_loc below this is treated as the left half of the court
_LEFT_HOOP_X = 5.25         # x coordinate of the left-half basket
_RIGHT_HOOP_X = 88.75       # x coordinate of the right-half basket
_HOOP_Y = 25                # y coordinate used for both baskets
_CYLINDER_MAX_DIST = 2      # max XY distance (feet) from the hoop center to be "near the cylinder"
_CYLINDER_MIN_HEIGHT = 9    # min ball height ('radius', feet) to be "near the cylinder"
_SHOOTING_MIN_HEIGHT = 10   # a ball at/above this height always counts as "shooting"
                            # NOTE(review): presumably rim height -- confirm
_RELEASE_MIN_HEIGHT = 7     # first frame of a rising run at/above this height is the shot start
                            # NOTE(review): presumably an approximate release height -- confirm
# Directory the per-game result is written to unless output_dir is given.
_SHOT_START_DIR = '../../Volumes/KevSSDrive/NBA/data/csv/shot_start/'


def _hoop_distance(x_loc, y_loc):
    """Return the XY distance from each ball position to the basket in its half of the court."""
    hoop_x = np.where(x_loc < _HALF_COURT_X, _LEFT_HOOP_X, _RIGHT_HOOP_X)
    return np.sqrt((x_loc - hoop_x) ** 2 + (y_loc - _HOOP_Y) ** 2)


def _mark_shot_starts(radius, cylinder):
    """Return a boolean array flagging the frame at which each detected shot begins."""
    n_frames = len(radius)
    # A frame is "shooting" when the ball is not descending relative to the previous
    # frame, or is already at/above _SHOOTING_MIN_HEIGHT. Frame 0 has no predecessor
    # and is never marked.
    shooting = np.zeros(n_frames, dtype=bool)
    shooting[1:] = (radius[1:] >= radius[:-1]) | (radius[1:] >= _SHOOTING_MIN_HEIGHT)

    shot_start = np.zeros(n_frames, dtype=bool)
    # shooting[0] is always False, so the first shooting frame always resets
    # run_start before it is read.
    run_start = 0
    for row in range(1, n_frames):
        if not shooting[row]:
            continue
        if not shooting[row - 1]:
            run_start = row
        # The ball has just entered the cylinder: walk back through the current
        # rising run and flag the first frame that is high enough to be the release.
        if cylinder[row] and not cylinder[row - 1]:
            for backtrack in range(run_start, row):
                if radius[backtrack] >= _RELEASE_MIN_HEIGHT:
                    shot_start[backtrack] = True
                    break
    return shot_start


def find_shot_starts(gameid, data_dir=None, output_dir=None):
    """Find the start of every shot in one game and save all positions at those moments.

    Reads ``merged_<gameid>.csv``, keeps the ball rows (``player_id == -1``) that
    belong to made or missed shots (``EVENTMSGTYPE`` 1 or 2), detects the frame at
    which each shot starts, and writes every row of the game that shares that
    frame's ``quarter`` and ``game_clock`` to ``<gameid>.csv``. Two columns are
    added to the output: ``play_id`` (1-based index of the detected shot) and
    ``shotdist`` (XY distance of the ball from the basket at the shot start).

    Parameters
    ----------
    gameid : str
        Game identifier used to build the input and output file names.
    data_dir : str, optional
        Directory prefix (including trailing separator) holding the merged CSVs.
        Defaults to the module-level ``DATAPATH``.
    output_dir : str, optional
        Directory prefix (including trailing separator) for the result CSV.
        Defaults to the original hard-coded shot_start directory.

    Returns
    -------
    0 if the merged file cannot be opened (IOError); otherwise None.

    Notes
    -----
    The ball rows are scanned in file order, so the merged CSV is assumed to be
    chronological -- TODO confirm. When no shot start is detected a header-only
    CSV is written instead of failing.
    """
    # Parenthesized form prints the same thing on Python 2 and Python 3.
    print(gameid)
    if data_dir is None:
        data_dir = DATAPATH
    if output_dir is None:
        output_dir = _SHOT_START_DIR

    # Load the game; a missing file is reported to the caller as 0.
    try:
        df = pd.read_csv(data_dir + 'merged_{}.csv'.format(gameid), low_memory=False)
    except IOError:
        return 0

    # Ball rows only (player_id == -1), restricted to shot events:
    # EVENTMSGTYPE 1 is a made shot, 2 is a missed shot.
    is_ball_shot = (df['player_id'] == -1) & df['EVENTMSGTYPE'].isin([1, 2])
    df_ball = df[is_ball_shot].reset_index(drop=True)

    x_loc = np.asarray(df_ball['x_loc'], dtype=float)
    y_loc = np.asarray(df_ball['y_loc'], dtype=float)
    radius = np.asarray(df_ball['radius'], dtype=float)

    # Distance of the ball from the basket, and whether it is near the cylinder
    # (close to the hoop center and high enough to go in).
    shotdist = _hoop_distance(x_loc, y_loc)
    cylinder = (shotdist <= _CYLINDER_MAX_DIST) & (radius >= _CYLINDER_MIN_HEIGHT)
    shot_start = _mark_shot_starts(radius, cylinder)

    # For each shot start, pull every row of the game (ball and players) recorded
    # at the same quarter and game clock.
    plays = []
    for play_id, row in enumerate(np.flatnonzero(shot_start), 1):
        same_moment = ((df['quarter'] == df_ball['quarter'].iloc[row]) &
                       (df['game_clock'] == df_ball['game_clock'].iloc[row]))
        df_play = df[same_moment].copy()
        df_play['play_id'] = play_id
        df_play['shotdist'] = shotdist[row]
        plays.append(df_play)

    if plays:
        # Concatenate once, then de-duplicate; keeping the first occurrence gives
        # the same rows as de-duplicating after every append.
        df_total = pd.concat(plays, ignore_index=True)
        df_total = df_total.drop_duplicates(subset=['player_id', 'quarter', 'game_clock'])
        df_total = df_total[df_total['EVENTMSGTYPE'].isin([1, 2])]
        df_total = df_total.reset_index(drop=True)
    else:
        # No shot detected: emit a header-only file with the usual schema.
        df_total = pd.DataFrame(columns=list(df.columns) + ['play_id', 'shotdist'])

    df_total.to_csv(output_dir + '{}.csv'.format(gameid), index=False)

In [187]:
# Run shot-start detection for every game number in the range (upper bound is
# exclusive). Game ids are the number prefixed with '00'; find_shot_starts
# returns 0 for ids that have no merged file.
for game_number in range(21500001, 21500664):
    game = '00{}'.format(game_number)
    find_shot_starts(game)

0021500660
0021500661
0021500662
0021500663
