# Using Summary Statistics to Examine the "Hot Hand"
#### Import useful libraries and the updated shot log data

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# Directory that holds the Week 6 CSV inputs. Change this one constant to switch
# locations, e.g. to "../../Data/Week 6" when running from the course repository.
DATA_DIR = "/content"

Shotlog = pd.read_csv(f"{DATA_DIR}/Shotlog1.csv")
Player_Stats = pd.read_csv(f"{DATA_DIR}/Player_Stats1.csv")
Player_Shots = pd.read_csv(f"{DATA_DIR}/Player_Shots1.csv")
Player_Game = pd.read_csv(f"{DATA_DIR}/Player_Game1.csv")

Shotlog.head()

Unnamed: 0,team_previous_shot,player_position,home_game,location_x,opponent_previous_shot,home_team,shot_type,points,away_team,location_y,...,date,shoot_player,time_from_last_shot,quarter,current_shot_outcome,current_shot_hit,lag_shot_hit,average_hit,shot_count,shot_per_game
0,MISSED,PG,Yes,210.0,SCORED,ATL,Pullup Jump Shot,2.0,WAS,267.0,...,2016-10-27,Dennis Schroder,27.0,1.0,MISSED,0.0,1.0,0.451029,1215.0,12.0
1,MISSED,SF,Yes,279.0,SCORED,ATL,Jump Shot,3.0,WAS,130.0,...,2016-10-27,Kent Bazemore,4.0,1.0,MISSED,0.0,0.0,0.408587,722.0,7.0
2,MISSED,PG,Yes,58.0,SCORED,ATL,Driving Layup,2.0,WAS,262.0,...,2016-10-27,Dennis Schroder,50.0,1.0,MISSED,0.0,0.0,0.451029,1215.0,12.0
3,MISSED,C,Yes,107.0,SCORED,ATL,Turnaround Jump Shot,2.0,WAS,254.0,...,2016-10-27,Dwight Howard,3.0,1.0,MISSED,0.0,1.0,0.631922,614.0,9.0
4,MISSED,PF,Yes,167.0,MISSED,ATL,Pullup Jump Shot,2.0,WAS,306.0,...,2016-10-27,Paul Millsap,47.0,1.0,SCORED,1.0,0.0,0.442387,972.0,20.0


## Conditional Probability
We can first calculate the conditional probability of making a shot in the current period conditional on making the previous shot.
$$Conditional \ Probability=\frac{Probability \ of \ Making \ Consecutive \ Shots}{Probability \ of \ Making \ Previous \ Shot}$$

We will need to create a variable that indicates a player made consecutive shots.

In [2]:
# Flag shots where both this attempt and the previous attempt were made (1), else 0.
made_current = Shotlog['current_shot_hit'] == 1
made_previous = Shotlog['lag_shot_hit'] == 1
Shotlog['conse_shot_hit'] = (made_current & made_previous).astype(int)
Shotlog.head()

Unnamed: 0,team_previous_shot,player_position,home_game,location_x,opponent_previous_shot,home_team,shot_type,points,away_team,location_y,...,shoot_player,time_from_last_shot,quarter,current_shot_outcome,current_shot_hit,lag_shot_hit,average_hit,shot_count,shot_per_game,conse_shot_hit
0,MISSED,PG,Yes,210.0,SCORED,ATL,Pullup Jump Shot,2.0,WAS,267.0,...,Dennis Schroder,27.0,1.0,MISSED,0.0,1.0,0.451029,1215.0,12.0,0
1,MISSED,SF,Yes,279.0,SCORED,ATL,Jump Shot,3.0,WAS,130.0,...,Kent Bazemore,4.0,1.0,MISSED,0.0,0.0,0.408587,722.0,7.0,0
2,MISSED,PG,Yes,58.0,SCORED,ATL,Driving Layup,2.0,WAS,262.0,...,Dennis Schroder,50.0,1.0,MISSED,0.0,0.0,0.451029,1215.0,12.0,0
3,MISSED,C,Yes,107.0,SCORED,ATL,Turnaround Jump Shot,2.0,WAS,254.0,...,Dwight Howard,3.0,1.0,MISSED,0.0,1.0,0.631922,614.0,9.0,0
4,MISSED,PF,Yes,167.0,MISSED,ATL,Pullup Jump Shot,2.0,WAS,306.0,...,Paul Millsap,47.0,1.0,SCORED,1.0,0.0,0.442387,972.0,20.0,0


We can create a player level dataframe. The average of the variable "conse_shot_hit" would be the joint probability of making current and previous shots. We will also calculate the average of "lag_shot_hit" to indicate the probability of making the previous shot.

In [5]:
# Per-player means of the two indicator columns. The columns are selected with a
# list (double brackets): recent pandas raises ValueError for tuple-style
# multi-column selection on a groupby.
hit_columns = ['conse_shot_hit', 'lag_shot_hit']
Player_Prob = (
    Shotlog.groupby('shoot_player')[hit_columns]
    .mean()
    .reset_index()
    .rename(columns={'lag_shot_hit': 'average_lag_hit'})
)
Player_Prob.head()

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit
0,A.J. Hammons,0.148148,0.407407
1,Al Horford,0.207367,0.469304
2,Alonzo Gee,0.0,0.285714
3,Amir Johnson,0.351724,0.589655
4,Anderson Varejao,0.0,0.25


#### Calculate conditional probability for each player
We can calculate the conditional probability by dividing the joint probability by the probability of making the previous shot.

In [6]:
# Conditional probability = joint probability / probability of making the previous shot.
joint_prob = Player_Prob['conse_shot_hit']
previous_hit_prob = Player_Prob['average_lag_hit']
Player_Prob['conditional_prob'] = joint_prob.div(previous_hit_prob)
Player_Prob.head()

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit,conditional_prob
0,A.J. Hammons,0.148148,0.407407,0.363636
1,Al Horford,0.207367,0.469304,0.44186
2,Alonzo Gee,0.0,0.285714,0.0
3,Amir Johnson,0.351724,0.589655,0.596491
4,Anderson Varejao,0.0,0.25,0.0


We can merge the "Player_Prob" data frame with the "Player_Stats" data frame we created earlier to compare the conditional probability and the unconditional probability. If the two probabilities are the same, or almost the same, then we fail to find evidence that making the current shot depends on making the previous shot.

In [7]:
# Attach each player's existing stats to the probabilities, matching rows on player name.
Player_Stats = Player_Prob.merge(Player_Stats, on='shoot_player')
Player_Stats.head(10)

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit,conditional_prob,average_hit
0,A.J. Hammons,0.148148,0.407407,0.363636,0.404762
1,Al Horford,0.207367,0.469304,0.44186,0.473159
2,Alonzo Gee,0.0,0.285714,0.0,0.214286
3,Amir Johnson,0.351724,0.589655,0.596491,0.577236
4,Anderson Varejao,0.0,0.25,0.0,0.357143
5,Andre Drummond,0.288647,0.521739,0.553241,0.530253
6,Andre Iguodala,0.292035,0.539823,0.540984,0.527711
7,Andrew Bogut,0.224138,0.431034,0.52,0.469136
8,Andrew Nicholson,0.083333,0.333333,0.25,0.387387
9,Anthony Bennett,0.180556,0.402778,0.448276,0.413043


Let's first take a quick look at our "Player_Stats" data frame.

In [8]:
# Show column dtypes and non-null counts, which reveals any missing conditional_prob values.
Player_Stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189 entries, 0 to 188
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   shoot_player      189 non-null    object 
 1   conse_shot_hit    189 non-null    float64
 2   average_lag_hit   189 non-null    float64
 3   conditional_prob  185 non-null    float64
 4   average_hit       189 non-null    float64
dtypes: float64(4), object(1)
memory usage: 7.5+ KB


Note that when we created the "conditional_prob" variable, some observations ended up with missing values because the "average_lag_hit" variable is zero for those players, so the division is 0/0. We will delete the observations with missing values in conditional probability.

In [9]:
# Keep only players whose conditional probability is defined. The explicit copy
# makes the result an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
has_conditional_prob = Player_Stats["conditional_prob"].notna()
Player_Stats = Player_Stats.loc[has_conditional_prob].copy()

We can first check which players have the highest conditional probability, i.e., more likely to have hot hand.

Let's sort the data by conditional probability.

In [10]:
# Ten players with the highest conditional probability.
Player_Stats.sort_values('conditional_prob', ascending=False).head(10)

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit,conditional_prob,average_hit
112,Kyle Wiltjer,0.2,0.2,1.0,0.285714
26,Chinanu Onuaku,1.0,1.0,1.0,0.714286
162,Salah Mejri,0.47561,0.658537,0.722222,0.642336
28,Chris McCullough,0.388889,0.555556,0.7,0.5
72,JaVale McGee,0.441296,0.65587,0.67284,0.652038
183,Walter Tavares,0.666667,1.0,0.666667,0.8
137,Montrezl Harrell,0.431452,0.66129,0.652439,0.652174
30,Clint Capela,0.422018,0.66055,0.638889,0.644128
88,Johnny O'Bryant III,0.35,0.55,0.636364,0.5
51,Dwight Howard,0.398148,0.625926,0.636095,0.631922


Comparing the "conditional_prob" variable and the "average_hit" variable, some players have a slightly higher conditional probability but some also have a lower conditional probability.

We can sort the data by the value of difference between conditional and unconditional probabilities.

In [11]:
# Gap between conditional and unconditional hit probability, then bring in each
# player's shot count. assign() builds a new frame rather than writing a column
# into the filtered Player_Stats, which avoids SettingWithCopyWarning.
Player_Stats = Player_Stats.assign(
    diff_prob=Player_Stats['conditional_prob'] - Player_Stats['average_hit']
).merge(Player_Shots, on='shoot_player')

# Ten players with the largest positive gap.
Player_Stats.sort_values(by='diff_prob', ascending=False).head(10)

Unnamed: 0,shoot_player,conse_shot_hit,average_lag_hit,conditional_prob,average_hit,diff_prob,shot_count
110,Kyle Wiltjer,0.2,0.2,1.0,0.285714,0.714286,14
112,Lamar Patterson,0.090909,0.181818,0.5,0.2,0.3,15
25,Chinanu Onuaku,1.0,1.0,1.0,0.714286,0.285714,7
27,Chris McCullough,0.388889,0.555556,0.7,0.5,0.2,32
128,Mike Miller,0.307692,0.538462,0.571429,0.391304,0.180124,23
78,Jarrod Uthoff,0.2,0.333333,0.6,0.421053,0.178947,38
86,Johnny O'Bryant III,0.35,0.55,0.636364,0.5,0.136364,30
16,Bobby Brown,0.2,0.4,0.5,0.383333,0.116667,60
91,Jordan Mickey,0.294118,0.529412,0.555556,0.441176,0.114379,34
129,Mike Muscala,0.309963,0.512915,0.604317,0.504451,0.099866,337


Comparing the "conditional_prob" variable and the "average_hit" variable, some players have a slightly higher conditional probability but some also have a lower conditional probability. After sorting the data by the difference between conditional and unconditional probabilities, we can see that Kyle Wiltjer has the largest difference between the two probabilities, at about 71 percentage points, followed by Lamar Patterson at 30 percentage points. But we can also see that the sample sizes for these players are very small (14 and 15 shots). Among the top ten, the only player with a sizeable sample is Mike Muscala (337 shots), and his difference is only about 10 percentage points.

### T-test for statistical significance on the difference

More rigorously, we can use a t-test to test if the players’ probability of hitting the goal is statistically significantly different than their conditional probability.

We need to choose a significance level before we perform the test. If the test produces a p-value less than the chosen significance level, then we say that there is a statistically significant difference between the two probabilities; otherwise, we fail to find evidence to support that the two probabilities are statistically significantly different from each other.

The most commonly used significance level is 0.05.

#### To perform a t-test, we need to import a new library, "scipy.stats."

In [12]:
import scipy.stats as sp

#### We can use the ttest_ind() function to calculate the test statistics.


In [13]:
# `sp` is already the scipy.stats module, so call ttest_ind on it directly;
# `sp.stats` resolves to the deprecated scipy.stats.stats namespace and emits a
# DeprecationWarning.
sp.ttest_ind(Player_Stats['conditional_prob'], Player_Stats['average_hit'])

  sp.stats.ttest_ind(Player_Stats['conditional_prob'], Player_Stats['average_hit'])


TtestResult(statistic=-0.8931047640070158, pvalue=0.37238473255837057, df=368.0)

The first number is the t-statistic, the second number is the p-value, and the third (df) is the degrees of freedom.

_Note that the p-value for the t-test is about 0.37, which is higher than the conventional significance level of 0.05. Thus the conditional probability is not statistically significantly different from the average success rate. In other words, in the analysis of conditional probability, we fail to find evidence to support the "hot hand"._

## Autocorrelation Coefficient
We can calculate the autocorrelation coefficient by calculating the correlation coefficient between the “current_shot_hit” variable and the “lag_shot_hit” variable.

**Note: in python, you could use “autocorr(lag=1)” to calculate first order autocorrelation coefficient. This command is not very useful in our case since we want to look at the autocorrelation coefficient within each game. Using the built-in autocorrelation coefficient function in python, we will be treating the last shot from the previous game and the first shot of the subsequent game as a pair.**

In [14]:
# Correlation between each shot's outcome and the previous shot's outcome, over all shots.
current_hits = Shotlog['current_shot_hit']
previous_hits = Shotlog['lag_shot_hit']
current_hits.corr(previous_hits)

-0.0090415464405573

_As we can see, the autocorrelation coefficient is negative and the magnitude is very small and close to zero._

Some players may have a “hot hand”, and hence a strong correlation between the outcomes of adjacent shots, while others may not. We can therefore also calculate the autocorrelation coefficient for each player.

In [15]:
# One 2x2 correlation matrix per player; show the first ten rows (five players).
shots_by_player = Shotlog.groupby('shoot_player')[['current_shot_hit', 'lag_shot_hit']]
shots_by_player.corr().head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,current_shot_hit,lag_shot_hit
shoot_player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A.J. Hammons,current_shot_hit,1.0,-0.011562
A.J. Hammons,lag_shot_hit,-0.011562,1.0
Al Horford,current_shot_hit,1.0,-0.056836
Al Horford,lag_shot_hit,-0.056836,1.0
Alonzo Gee,current_shot_hit,1.0,-0.258199
Alonzo Gee,lag_shot_hit,-0.258199,1.0
Amir Johnson,current_shot_hit,1.0,0.066629
Amir Johnson,lag_shot_hit,0.066629,1.0
Anderson Varejao,current_shot_hit,1.0,-0.57735
Anderson Varejao,lag_shot_hit,-0.57735,1.0


#### We may not want to print out a 2 by 2 matrix for every player. We can use the "unstack()" command to reshape the data.



In [16]:
# Per-player correlation matrices, reshaped so each player occupies a single row.
pair_columns = ['current_shot_hit', 'lag_shot_hit']
player_corr = Shotlog.groupby('shoot_player')[pair_columns].corr()
Autocorr_Hit = player_corr.unstack()
Autocorr_Hit.head()

Unnamed: 0_level_0,current_shot_hit,current_shot_hit,lag_shot_hit,lag_shot_hit
Unnamed: 0_level_1,current_shot_hit,lag_shot_hit,current_shot_hit,lag_shot_hit
shoot_player,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A.J. Hammons,1.0,-0.011562,-0.011562,1.0
Al Horford,1.0,-0.056836,-0.056836,1.0
Alonzo Gee,1.0,-0.258199,-0.258199,1.0
Amir Johnson,1.0,0.066629,0.066629,1.0
Anderson Varejao,1.0,-0.57735,-0.57735,1.0


Note that now each row represents a single player. But we still have duplicate information in the columns.

#### We can use the ".iloc" command to select the columns that we need.
- In the iloc[,] command, we first specify the rows we want to select, then the columns, i.e., [rows, columns]
- We want to select all rows, so we will have iloc[:,]
- We only want to select the second column, which is indexed 1 (first column would be indexed 0, etc.)
- So we will use the command iloc[:,1]

Lastly, we will also reset the index so that the player names would become a variable.

In [17]:
corr_wide = (
    Shotlog.groupby('shoot_player')[['current_shot_hit', 'lag_shot_hit']]
    .corr()
    .unstack()
)
# Column position 1 holds the (current_shot_hit, lag_shot_hit) correlation;
# reset_index turns the player names into a regular column.
Autocorr_Hit = corr_wide.iloc[:, 1].reset_index()
Autocorr_Hit.head()

Unnamed: 0_level_0,shoot_player,current_shot_hit
Unnamed: 0_level_1,Unnamed: 1_level_1,lag_shot_hit
0,A.J. Hammons,-0.011562
1,Al Horford,-0.056836
2,Alonzo Gee,-0.258199
3,Amir Johnson,0.066629
4,Anderson Varejao,-0.57735


Notice that we still have two levels of variable names.

#### We can use the "get_level_values" command to reset the variable name to the first level (index 0).

In [18]:
# Collapse the two-level column labels down to their first level.
top_level_names = Autocorr_Hit.columns.get_level_values(0)
Autocorr_Hit.columns = top_level_names
Autocorr_Hit.head()

Unnamed: 0,shoot_player,current_shot_hit
0,A.J. Hammons,-0.011562
1,Al Horford,-0.056836
2,Alonzo Gee,-0.258199
3,Amir Johnson,0.066629
4,Anderson Varejao,-0.57735


Let's rename the variable capturing autocorrelation coefficient.

In [19]:
# Give the correlation column a descriptive name.
Autocorr_Hit = Autocorr_Hit.rename(columns={'current_shot_hit': 'autocorr'})
Autocorr_Hit.head()

Unnamed: 0,shoot_player,autocorr
0,A.J. Hammons,-0.011562
1,Al Horford,-0.056836
2,Alonzo Gee,-0.258199
3,Amir Johnson,0.066629
4,Anderson Varejao,-0.57735


#### How informative the autocorrelation coefficient is also depends on the number of shots per game for each player. Let's add the number of shots and the number of shots per game to the autocorrelation matrix and sort the data by the size of the autocorrelation coefficient.

In [20]:
# Mean of shot_per_game for each player, stored as avg_shot_game.
Player_Game_Shot = (
    Player_Game.groupby('shoot_player')['shot_per_game']
    .mean()
    .rename('avg_shot_game')
    .reset_index()
)
Player_Game_Shot.head()

Unnamed: 0,shoot_player,avg_shot_game
0,A.J. Hammons,2.8
1,Aaron Brooks,4.83871
2,Aaron Gordon,10.8
3,Aaron Harrison,1.0
4,Adreian Payne,3.6


In [21]:
# Add avg_shot_game to the autocorrelation table, then list the ten highest coefficients.
Autocorr_Hit = Autocorr_Hit.merge(Player_Game_Shot, on='shoot_player')
Autocorr_Hit.sort_values('autocorr', ascending=False).head(10)

Unnamed: 0,shoot_player,autocorr,avg_shot_game
112,Kyle Wiltjer,1.0,1.75
17,Bobby Brown,0.43082,3.0
132,Mike Miller,0.414758,2.3
114,Lamar Patterson,0.388889,3.75
88,Johnny O'Bryant III,0.301511,3.0
136,Miles Plumlee,0.238095,2.555556
80,Jarrod Uthoff,0.237826,4.75
77,James Jones,0.165939,3.125
181,Tyler Ennis,0.154919,4.622222
162,Salah Mejri,0.152158,2.490909


We will merge the Player_Game_Shot dataframe into the Player_Shots dataframe, since both dataframes are measured at the player level and both contain information on the number of shots.

In [22]:
# Combine the shot-count and average-shots-per-game tables, matching on player name.
Player_Shots = Player_Shots.merge(Player_Game_Shot, on='shoot_player')
Player_Shots.head()

Unnamed: 0,shoot_player,shot_count,avg_shot_game
0,A.J. Hammons,42,2.8
1,Aaron Brooks,300,4.83871
2,Aaron Gordon,864,10.8
3,Aaron Harrison,4,1.0
4,Adreian Payne,54,3.6


### Save updated data

In [None]:
# Shotlog.to_csv("../../Data/Week 6/Shotlog2.csv", index=False)
# Player_Stats.to_csv("../../Data/Week 6/Player_Stats2.csv", index=False)
# Player_Shots.to_csv("../../Data/Week 6/Player_Shots2.csv", index=False)

In [23]:
# Write each updated frame to its CSV file, leaving out the row index.
frames_to_save = [
    (Shotlog, "/content/Shotlog2.csv"),
    (Player_Stats, "/content/Player_Stats2.csv"),
    (Player_Shots, "/content/Player_Shots2.csv"),
]
for frame, csv_path in frames_to_save:
    frame.to_csv(csv_path, index=False)