# Statistics fundamentals

## Sign test: Is Son Heung-Min ambipedal?

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import os
import pathlib
import warnings
# turn off pandas' chained-assignment check for this notebook
pd.set_option('mode.chained_assignment', None)
# silence every warning category for the rest of the session
warnings.simplefilter('ignore')

### Opening the dataset

Data: Premier League 2017/18 season event data from Wyscout

In [2]:
# Load the event data.
# The file is looked up at Wyscout/events/events_England.json underneath the
# current working directory.
path = os.path.join(str(pathlib.Path().resolve()), 'Wyscout', 'events', 'events_England.json')
# pass the encoding explicitly so decoding does not depend on the platform's
# locale default (JSON text is UTF-8)
with open(path, encoding='utf-8') as f:
    data = json.load(f)

# turn the parsed JSON into a DataFrame
events = pd.DataFrame(data)

In [3]:
# Load the player data.
# The file is looked up at Wyscout/players.json underneath the current
# working directory.
path = os.path.join(str(pathlib.Path().resolve()), 'Wyscout', 'players.json')
# pass the encoding explicitly so decoding does not depend on the platform's
# locale default (JSON text is UTF-8)
with open(path, encoding='utf-8') as f:
    data = json.load(f)

# turn the parsed JSON into a DataFrame
players = pd.DataFrame(data)

### Preparing the dataset

1. Filter only shot events from all events
2. Find Son's ID in players df
3. Filter only Son's shots from all shots
4. Group Son's shots by the foot he used
    - 401 for left foot & 402 for right foot & 403 for header
    - Look up tag2name.csv for each tag's name


In [7]:
# preview the first rows to see which columns the event data provides
events.head()

Unnamed: 0,eventId,subEventName,tags,playerId,positions,matchId,eventName,teamId,matchPeriod,eventSec,subEventId,id
0,8,Simple pass,[{'id': 1801}],25413,"[{'y': 49, 'x': 49}, {'y': 78, 'x': 31}]",2499719,Pass,1609,1H,2.758649,85,177959171
1,8,High pass,[{'id': 1801}],370224,"[{'y': 78, 'x': 31}, {'y': 75, 'x': 51}]",2499719,Pass,1609,1H,4.94685,83,177959172
2,8,Head pass,[{'id': 1801}],3319,"[{'y': 75, 'x': 51}, {'y': 71, 'x': 35}]",2499719,Pass,1609,1H,6.542188,82,177959173
3,8,Head pass,[{'id': 1801}],120339,"[{'y': 71, 'x': 35}, {'y': 95, 'x': 41}]",2499719,Pass,1609,1H,8.143395,82,177959174
4,8,Simple pass,[{'id': 1801}],167145,"[{'y': 95, 'x': 41}, {'y': 88, 'x': 72}]",2499719,Pass,1609,1H,10.302366,85,177959175


In [6]:
# list the distinct sub-event labels present in the data
events['subEventName'].unique()

array(['Simple pass', 'High pass', 'Head pass', 'Air duel',
       'Ground loose ball duel', 'Smart pass', 'Launch',
       'Ground defending duel', 'Ground attacking duel', 'Foul',
       'Free Kick', 'Cross', 'Shot', 'Reflexes', 'Touch', 'Clearance',
       'Ball out of the field', 'Throw in', 'Goal kick', 'Corner',
       'Goalkeeper leaving line', 'Hand pass', 'Acceleration',
       'Save attempt', '', 'Free kick cross', 'Free kick shot',
       'Hand foul', 'Violent Foul', 'Protest', 'Whistle',
       'Late card foul', 'Out of game foul', 'Penalty', 'Time lost foul',
       'Simulation'], dtype=object)

In [8]:
# list the distinct top-level event labels; 'Shot' is the one filtered on later
events['eventName'].unique()

array(['Pass', 'Duel', 'Foul', 'Free Kick', 'Shot', 'Save attempt',
       'Others on the ball', 'Interruption', 'Goalkeeper leaving line',
       'Offside'], dtype=object)

In [25]:
def _tag_mask(frame, tag_id):
    """Return a boolean mask aligned with ``frame``.

    A row is True when its ``tags`` list contains the entry ``{'id': tag_id}``.
    Only the ``tags`` column is read, so no per-row Series has to be built.
    """
    return frame['tags'].apply(lambda tags: {'id': tag_id} in tags).astype(bool)


# keep only events labelled 'Shot'
# NOTE(review): 'Free kick shot' and 'Penalty' exist as separate sub-events;
# presumably they carry a different eventName and are excluded here - confirm
# that this is intended.
shots = events[events['eventName'] == 'Shot']

# look up Son's player id
# NOTE(review): takes the first player whose lastName is 'Son'; assumes that
# match is unique - confirm against the players table.
son_id = players[players['lastName'] == 'Son']['wyId'].iloc[0]

# shots taken by Son
son_shots = shots[shots['playerId'] == son_id]

# tag 401 marks a left-foot shot
left_foot_shots = son_shots[_tag_mask(son_shots, 401)]

# tag 402 marks a right-foot shot
right_foot_shots = son_shots[_tag_mask(son_shots, 402)]

In [28]:
# encode every shot as a sign: +1 per left-foot shot followed by -1 per right-foot shot
l = [1] * len(left_foot_shots) + [-1] * len(right_foot_shots)

### Testing the hypothesis

In [33]:
from statsmodels.stats.descriptivestats import sign_test

# run the sign test on the +1/-1 sample with mu0 = 0
test = sign_test(l, mu0=0)
# the result is a (statistic, p-value) pair; only the p-value is used below
statistic, pvalue = test

In [34]:
# Report the outcome at the 5% significance level.
# The p-value is formatted numerically (two significant digits) instead of
# slicing its string form: slicing misreports small values, e.g. 3.2e-05 would
# be printed as "3.2e" and 0.0004 as "0.00".
# Not rejecting the null hypothesis only means the shot counts are compatible
# with equal use of both feet; it is not proof of it.
if pvalue < 0.05:
    print("P-value amounts to", format(pvalue, ".2g"), "- We reject null hypothesis - Heung-Min Son is not ambipedal")
else:
    print("P-value amounts to", format(pvalue, ".2g"), "- We do not reject null hypothesis - Heung-Min Son is ambipedal")

P-value amounts to 0.14  - We do not reject null hypothesis - Heung-Min Son is ambipedal


In [32]:
# NOTE(review): interactive IPython help lookup left over from development;
# consider removing it from the finished notebook.
? sign_test