## Part 1 - Data Preparation and Exploration 

In [111]:
# %%capture
# # Due to the configuration of the base Jupter image, the following imports are required for the regressions in the assignment to report the correct metrics

# import sys 
# !{sys.executable} -m pip uninstall statsmodels --yes 
# !{sys.executable} -m pip uninstall numpy --yes
# !{sys.executable} -m pip uninstall pandas --yes 
# !{sys.executable} -m pip uninstall patsy --yes 
# !{sys.executable} -m pip install numpy==1.17
# !{sys.executable} -m pip install pandas==1.0
# !{sys.executable} -m pip install patsy==0.5.2
# !{sys.executable} -m pip install statsmodels==0.11.1

In [112]:
#Import Libraries

import pandas as pd
import datetime as dt
import scipy.stats as sp
import numpy as np
import statsmodels.formula.api as sm 

## Step 1
Import the "Shotlog_14_15.csv" data file as "Shotlog_1415" into Jupyter Notebook. Import "Player_Stats_1415.csv" data file as "Player_Stats" into Jupyter Notebook.

Descriptions of the datasets and selected variables:
- In the dataset “Shotlog_14_15,” each observation represents an attempt of a shot. In the dataset Player_Stats_14_15", each observation represents a player.
- The "average_hit" variable in both dataframes indicate the average success rate of a player making a shot over the season. It is defined and calculated the same way in both dataframes.
- The variable "home_away" indicates whether the team that the player belongs to played at home or away.
- The variable "result" indicates whether the team that the player belongs to won or lost the game. The variable "final_margin" represents the difference in final score between the team the player belongs to and their opponent’s.
- The variable "shot_number" is the order of the shot the given player attempted at the given game.
- "game_clock" is the countdown clock for each quarter. The game clock starts at 12 minutes. "shot_clock" refers to the display of a countdown clock of the time within which the team possessing the ball must attempt a field goal. The shot clock starts at 24 seconds.

In [113]:
# Import Shotlog_14_15 and Player_Stats Datasets

shotlog_1415=pd.read_csv("Assignment Data/Shotlog_14_15.csv")
player_stats=pd.read_csv("Assignment Data/Player_Stats_14_15.csv")


## Step 2
Convert the "date" variable to a date type variable and calculate summary statistics for the "shot_clock" variable

In [114]:
shotlog_1415.iloc[0,:]

game_id                     21400280
date                        5-Dec-14
match                      ATL @ BKN
home_team                        BKN
away_team                        ATL
home_away                          A
result                             W
final_margin                      23
shot_number                        1
quarter                            1
game_clock                     11:23
shot_clock                       5.6
dribbles                           0
touch_time                       1.2
shot_dist                       19.6
points                             2
current_shot_outcome            made
closest_defender        Lopez, Brook
closest_defender_id           201572
closest_def_dist                 6.6
current_shot_hit                   1
points_earned                      2
shoot_player              al horford
player_id                     201143
average_hit                 0.541259
shot_count                       715
shot_per_game                     10
N

In [115]:
shotlog_1415['date'] = pd.to_datetime(shotlog_1415['date'])
shotlog_1415['shot_clock'] = pd.to_timedelta(shotlog_1415['shot_clock'], unit='S')
shotlog_1415['game_clock'] = pd.to_timedelta('00:'+shotlog_1415['game_clock'])
shotlog_1415.shot_clock.describe()

count                       122502
mean     0 days 00:00:12.453343618
std      0 days 00:00:05.763265011
min                0 days 00:00:00
25%         0 days 00:00:08.200000
50%         0 days 00:00:12.300000
75%         0 days 00:00:16.675000
max                0 days 00:00:24
Name: shot_clock, dtype: object

## Step 3
Create a lagged variable "lag_shot_hit" to indicate the result of the previous shot by the same player in the same game. *Hint:* In this dataset, the variable "match" may not be able to uniquely identify each game; you can use "game_id" instead. You can sort the data by shot number for each player to create the lagged variable.

In [116]:
shotlog_1415['lag_shot_hit'] = (shotlog_1415.sort_values(['quarter','game_clock'], ascending=[True,False])
                                            .groupby(['shoot_player','game_id'])
                                            ['current_shot_hit'].shift(1))

## Step 4
Create a variable "error" to indicate the prediction error for each shot and a variable "lagerror" for the prediction error for the previous shot. The "error" variable is defined as the difference between the outcome of the current shot and the average success rate, "average_hit", and the "lagerror" variable is defined as the difference between the outcome of the previous shot and the average success rate.

In [117]:
shotlog_1415['error'] = shotlog_1415['current_shot_hit'] - shotlog_1415['average_hit']
shotlog_1415['lagerror'] = shotlog_1415['lag_shot_hit'] - shotlog_1415['average_hit']

## Step 5
Calculate summary statistics for the "error" and "lagerror" variables.

In [118]:
display(shotlog_1415['error'].describe())
print('\n')
display(shotlog_1415['lagerror'].describe())

count    1.280690e+05
mean     1.454042e-17
std      4.949640e-01
min     -7.124682e-01
25%     -4.491979e-01
50%     -3.850837e-01
75%      5.395973e-01
max      6.914894e-01
Name: error, dtype: float64





count    113726.000000
mean          0.006303
std           0.496035
min          -0.712468
25%          -0.449198
50%          -0.382143
75%           0.542254
max           0.691489
Name: lagerror, dtype: float64

In [119]:
shotlog_1415['lagerror'].count()

113726

## Part 2 - Conditional Probability and Autocorrelation

## Step 1
Create a dummy variable "conse_shot" that indicates a player made consecutive shots.

In [120]:
shotlog_1415['conse_shot'] = ((shotlog_1415['lag_shot_hit']==1)&(shotlog_1415['current_shot_hit']==1)).astype('int')

## Step 2
Create a dataframe "player_prob" for the probability of making the previous shot and the joint probability for making both the previous and current shots. Name the probability of making the previous shot "average_lag_hit" and the probability of making both shots "conse_shot_hit".

In [121]:
player_prob = (shotlog_1415.groupby('shoot_player')
                           .mean()
                           .loc[:,['conse_shot','lag_shot_hit']]
                           .reset_index()
                           .rename({'lag_shot_hit':'average_lag_hit'}, axis=1))
player_prob.head(15)

Unnamed: 0,shoot_player,conse_shot,average_lag_hit
0,aaron brooks,0.153298,0.418
1,aaron gordon,0.201923,0.532468
2,al farouq aminu,0.162791,0.465686
3,al horford,0.262937,0.537994
4,al jefferson,0.2075,0.48
5,alan anderson,0.15727,0.462366
6,alan crabbe,0.138298,0.52381
7,alex len,0.247492,0.539419
8,alexis ajinca,0.28436,0.598802
9,alonzo gee,0.137681,0.460784


## Step 3
In the "player_prob" dataframe, calculate the conditional probability "conditional_prob" for a player to make a shot given that he made the previous shot.

In [122]:
player_prob['conditional_prob'] = player_prob['conse_shot'] / player_prob['average_lag_hit']
player_prob.head(8)

Unnamed: 0,shoot_player,conse_shot,average_lag_hit,conditional_prob
0,aaron brooks,0.153298,0.418,0.366741
1,aaron gordon,0.201923,0.532468,0.379221
2,al farouq aminu,0.162791,0.465686,0.349572
3,al horford,0.262937,0.537994,0.488736
4,al jefferson,0.2075,0.48,0.432292
5,alan anderson,0.15727,0.462366,0.340142
6,alan crabbe,0.138298,0.52381,0.264023
7,alex len,0.247492,0.539419,0.458811


## Step 4
Merge the "player_prob" dataframe into the "player_stats" dataframe.

In [123]:
player_stats = player_stats.merge(player_prob, on='shoot_player')
player_stats.head(8)

Unnamed: 0,shoot_player,average_hit,conse_shot,average_lag_hit,conditional_prob
0,aaron brooks,0.41533,0.153298,0.418,0.366741
1,aaron gordon,0.528846,0.201923,0.532468,0.379221
2,al farouq aminu,0.430233,0.162791,0.465686,0.349572
3,al horford,0.541259,0.262937,0.537994,0.488736
4,al jefferson,0.4775,0.2075,0.48,0.432292
5,alan anderson,0.433234,0.15727,0.462366,0.340142
6,alan crabbe,0.425532,0.138298,0.52381,0.264023
7,alex len,0.528428,0.247492,0.539419,0.458811


## Step 5
Calculate summary statistics for the probability for a player to make a shot ("average_hit") and the conditional probability for a player to make a shot given that he made the previous one ("conditional_prob") and the probability of players making consecutive shots ("conse_shot_hit").

In [124]:
player_stats.describe()

Unnamed: 0,average_hit,conse_shot,average_lag_hit,conditional_prob
count,281.0,281.0,281.0,281.0
mean,0.451545,0.176987,0.459167,0.380233
std,0.059392,0.047943,0.0563,0.06232
min,0.308511,0.07619,0.322581,0.225801
25%,0.413223,0.144543,0.424012,0.336689
50%,0.446078,0.171625,0.454418,0.38157
75%,0.48048,0.203512,0.486772,0.422801
max,0.712468,0.422392,0.688822,0.613209


## Step 6
Perform a t-test for the statistical significance on the difference between conditional probability and unconditional probability of making a shot.

In [125]:
sp.stats.ttest_ind(player_stats['conditional_prob'], player_stats['average_hit'])

Ttest_indResult(statistic=-13.885932802814914, pvalue=6.925846314604593e-38)

## Step 7
Calculate the first order autocorrelation coefficient on making a shot (correlation coefficient between making the current shot and the previous shot) for the entire shotlog dataset.

In [126]:
shotlog_1415['current_shot_hit'].corr(shotlog_1415['lag_shot_hit'])

-0.010502388301693153

# Step 8
Calculate the first order autocorrelation coefficient on making a shot for each player. Display the top ten players with the highest first order autocorrelation coefficient.

In [127]:
(shotlog_1415.groupby('shoot_player')[['current_shot_hit','lag_shot_hit']]
             .corr()
             .unstack()
             .iloc[:,1]
             .reset_index()
             .droplevel(1, axis=1)
             .rename({'current_shot_hit':'autocorr'},axis=1))

Unnamed: 0,shoot_player,autocorr
0,aaron brooks,0.002556
1,aaron gordon,-0.071283
2,al farouq aminu,0.029513
3,al horford,-0.008396
4,al jefferson,-0.038885
...,...,...
276,wesley matthews,-0.071950
277,wilson chandler,-0.076534
278,zach lavine,-0.027201
279,zach randolph,0.017574


## Part 3 - Regression Analyses

In this section, you will run several regressions to investigate the "hot hand." In all the regressions, the dependent variable is "error" and the independent variable of interest is "lagerror".

## reg1
Run a linear least squares regression using the entire shotlog dataframe. Include the following control variables:
- Shot distance
- Number of dribbles
- Touch time
- Type of shot ("points" variable)
- Quarter of the game (as a cat variable)
- Home or away
- Shoot_player
- Closest defender
- Closest defender distance

In [128]:
shotlog_1415.columns

Index(['game_id', 'date', 'match', 'home_team', 'away_team', 'home_away',
       'result', 'final_margin', 'shot_number', 'quarter', 'game_clock',
       'shot_clock', 'dribbles', 'touch_time', 'shot_dist', 'points',
       'current_shot_outcome', 'closest_defender', 'closest_defender_id',
       'closest_def_dist', 'current_shot_hit', 'points_earned', 'shoot_player',
       'player_id', 'average_hit', 'shot_count', 'shot_per_game',
       'lag_shot_hit', 'error', 'lagerror', 'conse_shot'],
      dtype='object')

In [129]:
# fmla=('error ~ lagerror+C(shot_dist)'+
#               '+C(dribbles)'+
#               '+C(touch_time)'+
#               '+C(quarter)'+
#               '+C(points)'+
#               '+C(home_away)'+
#               '+C(shoot_player)'+
#               '+C(closest_defender)'+
#               '+C(closest_def_dist)')
fmla=('error ~ lagerror+shot_dist'+
              '+dribbles'+
              '+touch_time'+
              '+C(quarter)'+
              '+C(points)'+
              '+home_away'+
              '+shoot_player'+
              '+closest_defender'+
              '+closest_def_dist')

sm.ols(formula=fmla, data=shotlog_1415).fit().summary()

0,1,2,3
Dep. Variable:,error,R-squared:,0.057
Model:,OLS,Adj. R-squared:,0.051
Method:,Least Squares,F-statistic:,9.007
Date:,"Fri, 03 Dec 2021",Prob (F-statistic):,0.0
Time:,20:42:34,Log-Likelihood:,-78063.0
No. Observations:,113726,AIC:,157700.0
Df Residuals:,112960,BIC:,165000.0
Df Model:,765,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1697,0.038,4.413,0.000,0.094,0.245
C(quarter)[T.2],-0.0114,0.004,-2.608,0.009,-0.020,-0.003
C(quarter)[T.3],-0.0023,0.004,-0.553,0.580,-0.011,0.006
C(quarter)[T.4],-0.0119,0.004,-2.667,0.008,-0.021,-0.003
C(quarter)[T.5],-0.0446,0.016,-2.716,0.007,-0.077,-0.012
C(quarter)[T.6],0.0078,0.038,0.206,0.836,-0.066,0.081
C(quarter)[T.7],-0.0191,0.074,-0.258,0.797,-0.164,0.126
C(points)[T.3],0.0361,0.005,6.841,0.000,0.026,0.046
home_away[T.H],0.0066,0.003,2.295,0.022,0.001,0.012

0,1,2,3
Omnibus:,484185.604,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14522.409
Skew:,0.188,Prob(JB):,0.0
Kurtosis:,1.29,Cond. No.,8190.0


## reg2
Run a weighted least squares regression using the entire shotlog dataframe. Include the same set of control variables as in **reg1**. The regression should be weighted by the number of shots per game $(weight=1/shot\_per\_game)$.

In [130]:
sm.wls(formula=fmla, data=shotlog_1415, weights=1/shotlog_1415['shot_per_game']).fit().summary()

0,1,2,3
Dep. Variable:,error,R-squared:,0.062
Model:,WLS,Adj. R-squared:,0.056
Method:,Least Squares,F-statistic:,9.742
Date:,"Fri, 03 Dec 2021",Prob (F-statistic):,0.0
Time:,20:42:50,Log-Likelihood:,-86947.0
No. Observations:,113726,AIC:,175400.0
Df Residuals:,112960,BIC:,182800.0
Df Model:,765,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1952,0.036,5.382,0.000,0.124,0.266
C(quarter)[T.2],-0.0058,0.004,-1.295,0.195,-0.014,0.003
C(quarter)[T.3],0.0039,0.004,0.897,0.370,-0.005,0.012
C(quarter)[T.4],-0.0009,0.005,-0.193,0.847,-0.010,0.008
C(quarter)[T.5],-0.0167,0.019,-0.871,0.384,-0.054,0.021
C(quarter)[T.6],0.0328,0.045,0.732,0.464,-0.055,0.121
C(quarter)[T.7],-0.0627,0.087,-0.719,0.472,-0.234,0.108
C(points)[T.3],0.0428,0.005,8.087,0.000,0.032,0.053
home_away[T.H],0.0057,0.003,1.998,0.046,0.000,0.011

0,1,2,3
Omnibus:,59244.808,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6885.793
Skew:,0.207,Prob(JB):,0.0
Kurtosis:,1.868,Cond. No.,7630.0


## reg3_player
Run linear least squares regressions on individual players. Include the following control variables:
- Shot distance
- Number of dribbles
- Touch time
- Type of shot ("points" variable)
- Quarter of the game (as a cat variable)
- Home or away
- Closest defender

In [131]:
fmla=('error ~ lagerror+shot_dist'+
              '+dribbles'+
              '+touch_time'+
              '+C(quarter)'+
              '+C(points)'+
              '+home_away'+
              '+closest_defender')

def reg_player(player):
    shotlog_player=shotlog_1415[shotlog_1415.shoot_player==player]
    return sm.ols(formula=fmla, data=shotlog_player).fit()

def wls_reg_player(player):
    shotlog_player=shotlog_1415[shotlog_1415.shoot_player==player]
    return sm.wls(formula=fmla, data=shotlog_player, weights=1/shotlog_player['shot_per_game']).fit()   

In [132]:
player_list = np.unique(shotlog_1415['shoot_player'])
player_results = {}
for i, player in enumerate(player_list):
    reg_params = pd.DataFrame(reg_player(player).params).reset_index()
    reg_tvals = pd.DataFrame(reg_player(player).tvalues).reset_index()
    reg_pvals = pd.DataFrame(reg_player(player).pvalues).reset_index()
    
    reg_output = pd.merge(reg_params, reg_tvals, on='index')
    reg_output = pd.merge(reg_output, reg_pvals, on='index')
    
    lag_err = (reg_output[reg_output['index'] == 'lagerror']
               .drop(columns='index')
               .rename(columns={"0_x":"Coef", "0_y":"T_Statistics", 0:"P_Value"}))
    lag_err['shoot_player'] = player
    player_results[i] = lag_err[['shoot_player', 'Coef', 'T_Statistics', 'P_Value']]
    
reg_player = player_results[0]
for i, player in enumerate(player_list[1:]):
    reg_player = reg_player.append(player_results[i])

reg_player = (reg_player.reset_index()
                        .drop(columns='index')
                        .merge(player_stats, on='shoot_player'))


NameError: name 'player_shots' is not defined

## reg4_wls_player
Run weighted least squares regressions on individual players. Include the same set of control variables as in **reg3**. The regression should be weighted by the number of shots per game $(weight=1/shot\_per\_game)$.

In [None]:
player_results = {}
for i, player in enumerate(player_list):
    reg_params = pd.DataFrame(wls_reg_player(player).params).reset_index()
    reg_tvals = pd.DataFrame(wls_reg_player(player).tvalues).reset_index()
    reg_pvals = pd.DataFrame(wls_reg_player(player).pvalues).reset_index()
    
    reg_output = pd.merge(reg_params, reg_tvals, on='index')
    reg_output = pd.merge(reg_output, reg_pvals, on='index')
    
    lag_err = (reg_output[reg_output['index'] == 'lagerror']
               .drop(columns='index')
               .rename(columns={"0_x":"Coef", "0_y":"T_Statistics", 0:"P_Value"}))
    lag_err['shoot_player'] = player
    player_results[i] = lag_err[['shoot_player', 'Coef', 'T_Statistics', 'P_Value']]
    
wls_reg_player = player_results[0]
for i, player in enumerate(player_list[1:]):
    wls_reg_player = wls_reg_player.append(player_results[i])

wls_reg_player = (wls_reg_player.reset_index()
                                .drop(columns='index')
                                .merge(player_stats, on='shoot_player'))

In [133]:
player_stats

Unnamed: 0,shoot_player,average_hit,conse_shot,average_lag_hit,conditional_prob
0,aaron brooks,0.415330,0.153298,0.418000,0.366741
1,aaron gordon,0.528846,0.201923,0.532468,0.379221
2,al farouq aminu,0.430233,0.162791,0.465686,0.349572
3,al horford,0.541259,0.262937,0.537994,0.488736
4,al jefferson,0.477500,0.207500,0.480000,0.432292
...,...,...,...,...,...
276,wesley matthews,0.449198,0.172460,0.452830,0.380849
277,wilson chandler,0.414734,0.141883,0.420741,0.337221
278,zach lavine,0.423181,0.156334,0.435737,0.358781
279,zach randolph,0.488823,0.229508,0.491143,0.467294
