In [1572]:
import xmltodict
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import  train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import os, zipfile
import enum
%matplotlib inline
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from bokeh.charts import Bar, Histogram
from bokeh.io import output_notebook, show
from bokeh.layouts import row
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 9.18 ms


In [1280]:
output_notebook()

In [850]:
class ImportOpta(object):
    def __init__(self, path, run=True):
        self.path = path
        self.file_paths = []
        self.instances = []
        
        if run:
            self.get_file_paths()
            self.create_instances()
            
    def get_file_paths(self):
        for folder_name in os.listdir(self.path)[1:]: # loop through items in dir
            folder_path = self.path + '/' + folder_name
            zip_file_name = os.listdir(folder_path)[-1]
            if zip_file_name.endswith(".zip"): # check for ".zip" extension
                zip_file_path = folder_path + '/' + zip_file_name # get full path of files
                zip_ref = zipfile.ZipFile(zip_file_path, 'r') # create zipfile object
                zip_ref.extractall(folder_path) # extract file to dir
                zip_ref.close() # close file
                os.remove(zip_file_path) # delete zipped file
            folder_items = os.listdir(folder_path)
            for file_name in folder_items:
                if 'f24' in file_name:
                    self.file_paths.append(folder_path + '/' + file_name)
                    break
                elif 'Opta' in file_name or 'opta_import' == file_name:
                    sub_folder_path = folder_path + '/' + file_name
                    sub_folder_items = os.listdir(sub_folder_path)
                    for file_name2 in sub_folder_items:
                        if 'f24' in file_name2:
                            self.file_paths.append(sub_folder_path + '/' + file_name2)
                            
    def create_instances(self):
        for file_path in self.file_paths:
            with open(file_path) as fd:
                [self.instances.append(Shot(item)) for item in xmltodict.parse(fd.read())['Games']['Game']['Event'] 
                 if item['@type_id'] in ['13', '14', '15', '16']];

In [1342]:
class BodyPart(enum.Enum):
    head = 0
    left_foot = 1
    right_foot = 2
    other = 3
    none = 4

In [852]:
 class ShotPitchLocation(enum.Enum):
    small_box_center = 0
    box_center = 1
    out_of_box_center = 2
    center_35_plus = 3
    small_box_right = 4
    small_box_left = 5
    box_deep_right = 6
    box_right = 7
    box_left = 8
    box_deep_left = 9
    out_of_box_deep_right = 10
    out_of_box_right = 11
    out_of_box_left = 12
    out_of_box_deep_left = 13
    right_35_plus = 14
    left_35_plus = 15
    none = 16

In [853]:
class ShotGoalLocation(enum.Enum):
    left = 0
    high = 1
    right = 2
    low_left = 3
    high_left = 4
    low_center = 5
    high_center = 6
    low_right = 7
    high_right = 8
    blocked = 9
    close_left = 10
    close_right = 11
    close_high = 12
    close_left_and_high = 13
    close_right_and_high = 14
    none = 15

In [1367]:
class PatternOfPlay(enum.Enum):
    regular_play = 0
    fast_break = 1
    set_piece = 2
    from_corner = 3
    from_kick = 4
    penalty_kick = 5
    throw_in = 6
    none = 7

In [1368]:
class ShotResult(enum.Enum):
    miss = 0
    post = 1
    saved = 2
    goal = 3
    none = 4

In [1421]:
class Assisted(enum.Enum):
    yes = 1
    no = 0

time: 1.44 ms


In [1422]:
class IndividualPlay(enum.Enum):
    yes = 1
    no = 0

time: 1.28 ms


In [1423]:
class Made(enum.Enum):
    yes = 1
    no = 0

time: 1.35 ms


In [1424]:
class Shot(object):
    def __init__(self, item):
        self.item = item
        self.pitch_length = 105.0
        self.pitch_width = 68.0
        
    @property
    def x_raw(self):
        return float(self.item['@x'])
    
    @property
    def y_raw(self):
        return float(self.item['@y'])
    
    @property
    def x(self):
        return self.x_raw / 100.0 * self.pitch_length
    
    @property
    def y(self):
        return self.y_raw / 100.0 * self.pitch_width
    
    @property
    def minute(self):
        return self.item['@min']
    
    @property
    def second(self):
        return self.item['@sec']
    
    @property
    def team(self):
        return self.item['@team_id']
    
    @property
    def shot_result(self):
        if self.item['@type_id'] == '13':
            return ShotResult.miss
        elif self.item['@type_id'] == '14':
            return ShotResult.post
        elif self.item['@type_id'] == '15':
            return ShotResult.saved
        elif self.item['@type_id'] == '16':
            return ShotResult.goal
        else:
            return ShotResult.none
    
    @property
    def made(self):
        if self.item['@type_id'] == '16':
            return Made.yes
        else:
            return Made.no
    
    @property
    def distance_raw(self):
        return np.sqrt((abs(float(self.y_raw) - 50))**2 + ((100 - float(self.x_raw)))**2)
    
    @property
    def distance(self):
        return np.sqrt((abs(float(self.y) - self.pitch_width * .5))**2 + 
                       ((self.pitch_length - float(self.x)))**2)
    
    @property
    def qualifiers(self):
        return [qualifier['@qualifier_id'] for qualifier in self.item['Q']]
    
    @property
    def body_part(self):
        if '15' in self.qualifiers:
            return BodyPart.head
        elif '72' in self.qualifiers:
            return BodyPart.left_foot
        elif '20' in self.qualifiers:
            return BodyPart.right_foot
        elif '21' in self.qualifiers:
            return BodyPart.other
        else:
            return BodyPart.none
    
    @property
    def assisted(self):
        if '29' in self.qualifiers:
            return Assisted.yes
        else:
            return Assisted.no
    
    @property
    def individual_play(self):
        if '215' in self.qualifiers:
            return IndividualPlay.yes
        else:
            return IndividualPlay.no
    
    @property
    def shot_pitch_location(self):
        if '16' in self.qualifiers:
            return ShotPitchLocation.small_box_center
        elif '17' in self.qualifiers:
            return ShotPitchLocation.box_center
        elif '18' in self.qualifiers:
            return ShotPitchLocation.out_of_box_center
        elif '19' in self.qualifiers:
            return ShotPitchLocation.center_35_plus
        elif '60' in self.qualifiers:
            return ShotPitchLocation.small_box_right
        elif '61' in self.qualifiers:
            return ShotPitchLocation.small_box_left
        elif '62' in self.qualifiers:
            return ShotPitchLocation.box_deep_right
        elif '63' in self.qualifiers:
            return ShotPitchLocation.box_right
        elif '64' in self.qualifiers:
            return ShotPitchLocation.box_left
        elif '65' in self.qualifiers:
            return ShotPitchLocation.box_deep_left
        elif '66' in self.qualifiers:
            return ShotPitchLocation.out_of_box_deep_right
        elif '67' in self.qualifiers:
            return ShotPitchLocation.out_of_box_right
        elif '68' in self.qualifiers:
            return ShotPitchLocation.out_of_box_left
        elif '69' in self.qualifiers:
            return ShotPitchLocation.out_of_box_deep_left
        elif '70' in self.qualifiers:
            return ShotPitchLocation.right_35_plus
        elif '71' in self.qualifiers:
            return ShotPitchLocation.left_35_plus
        else:
            return ShotPitchLocation.none
        
    @property
    def shot_goal_location(self):
        if '73' in self.qualifiers:
            return ShotGoalLocation.left
        elif '74' in self.qualifiers:
            return ShotGoalLocation.high
        elif '75' in self.qualifiers:
            return ShotGoalLocation.right
        elif '76' in self.qualifiers:
            return ShotGoalLocation.low_left
        elif '77' in self.qualifiers:
            return ShotGoalLocation.high_left
        elif '78' in self.qualifiers:
            return ShotGoalLocation.low_center
        elif '79' in self.qualifiers:
            return ShotGoalLocation.high_center
        elif '80' in self.qualifiers:
            return ShotGoalLocation.low_right
        elif '81' in self.qualifiers:
            return ShotGoalLocation.high_right
        elif '82' in self.qualifiers:
            return ShotGoalLocation.blocked
        elif '83' in self.qualifiers:
            return ShotGoalLocation.close_left
        elif '84' in self.qualifiers:
            return ShotGoalLocation.close_right
        elif '85' in self.qualifiers:
            return ShotGoalLocation.close_high
        elif '86' in self.qualifiers:
            return ShotGoalLocation.close_left_and_high
        elif '87' in self.qualifiers:
            return ShotGoalLocation.close_right_and_high
        else:
            return ShotGoalLocation.none
    
    @property
    def pattern_of_play(self):
        if '22' in self.qualifiers:
            return PatternOfPlay.regular_play
        elif '23' in self.qualifiers:
            return PatternOfPlay.fast_break
        elif '24' in self.qualifiers:
            return PatternOfPlay.set_piece
        elif '25' in self.qualifiers:
            return PatternOfPlay.from_corner
        elif '26' in self.qualifiers:
            return PatternOfPlay.from_kick
        elif '9' in self.qualifiers:
            return PatternOfPlay.penalty_kick
        elif '160' in self.qualifiers:
            return PatternOfPlay.throw_in
        else:
            return PatternOfPlay.none

time: 229 ms


In [1413]:
instances = ImportOpta('/Users/jason.katz/Downloads/AllOpta')

time: 1min 21s


In [1391]:
def opta_hists(prop):
    enum = type(getattr(instances.instances[0],prop))
    members = '__members__'
    goal_percentage = [np.mean([result.made for result in instances.instances if getattr(result,prop) == option]) 
                       for option in enum.__members__.values()][:-1]
    categories = [option for option in enum.__members__.keys()][:-1]
    data1 = {'label': categories,'values': goal_percentage}
    plot1 = Bar(data = data1, values = 'values', label = 'label', title = prop + ' goal percentages', legend=False)
    data2 = pd.DataFrame.from_dict(Counter([getattr(result,prop).name for result in instances.instances]), 
                                  orient='index')
    data2['name'] = data2.index
    
    data2 = {'label': data2['name'], 'values': data2[0]}
    
    plot2 = Bar(data = data2, values = 'values', label = 'label', title = prop + ' counts', legend=False)
    try: 
        show(plot1)
    except AttributeError:
        pass
    try: 
        show(plot2)
    except AttributeError:
        pass
    show(plot1)
    show(plot2)

time: 17.2 ms


In [1392]:
opta_hists('pattern_of_play')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 1.04 s


In [1393]:
opta_hists('body_part')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 756 ms


In [1394]:
opta_hists('shot_pitch_location')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 2.57 s


In [1395]:
opta_hists('assisted')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 488 ms


In [1396]:
opta_hists('individual_play')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 482 ms


In [1398]:
data = pd.DataFrame({'c1': [1,2,3]})
data['c1']

0    1
1    2
2    3
Name: c1, dtype: int64

time: 9.16 ms


In [1400]:
pd.DataFrame.from_dict(Counter([getattr(result,'pattern_of_play').name for result in instances.instances]), 
                                  orient='index')

Unnamed: 0,0
set_piece,748
regular_play,8236
from_corner,1796
fast_break,126
from_kick,500
penalty_kick,114
throw_in,64


time: 624 ms


In [1528]:
features = ['made', 'pattern_of_play', 'body_part', 'shot_pitch_location', 'assisted', 'individual_play']
data = pd.DataFrame()
for feature in features:
    data[feature] = [getattr(result,feature).value for result in instances.instances]
locations = ['x', 'y', 'distance']
for location in locations:
    data[location] = [getattr(result,location) for result in instances.instances]
data

Unnamed: 0,made,pattern_of_play,body_part,shot_pitch_location,assisted,individual_play,x,y,distance
0,0,2,0,1,1,0,95.760,26.248,12.061140
1,0,2,0,1,0,0,97.860,35.768,7.355639
2,0,0,2,2,1,1,76.545,35.836,28.514171
3,0,3,1,2,0,1,83.265,42.636,23.387833
4,0,3,1,1,1,0,93.975,37.264,11.498014
5,0,0,1,5,1,0,99.120,39.508,8.056827
6,0,2,2,2,0,1,82.110,30.600,23.141134
7,0,0,1,2,1,1,78.855,42.024,27.348594
8,0,0,2,2,1,1,82.110,49.368,27.570410
9,0,0,1,2,0,1,86.205,23.732,21.416906


time: 650 ms


In [1655]:
enc = OneHotEncoder(categorical_features=[0,1,2,3,4,5,6])
x = enc.fit_transform(data[data.columns[1:]])
y = data['made']
xtrain,xtest,ytrain,ytest  = train_test_split(x,y,random_state=2)
ytrain=np.ravel(ytrain)
#modelling 
clf = LogisticRegression(penalty='l1')
clf.fit(xtrain,ytrain)
ypred_train = clf.predict(xtrain)
ypred_test = clf.predict(xtest)
accuracy_score(ytest,ypred_test)

0.91160220994475138

time: 353 ms


In [1492]:
clf.coef_

array([[-0.59054066,  0.96639931, -0.93927628, -1.24563231, -0.23166984,
         2.29858156, -0.81965108, -0.59526841, -0.17383815, -0.04493604,
         0.25225328,  1.2090153 ,  0.50378674, -0.4666055 , -1.24819076,
        -0.13850803,  0.27015704, -0.59328817, -0.15052683, -0.40372386,
        -0.13017584, -0.42125937, -0.39690668, -0.29106396,  1.78741898,
        -0.09191839, -0.10142453, -0.46036478, -0.24428102, -0.31750829,
         0.53766441,  1.75581093,  1.6103759 ,  1.46772098,  0.84850292,
         0.5365597 ,  1.24188835,  0.55330837,  0.26102671,  0.35620443,
         0.00996503,  0.06887812, -0.45082261,  0.07303282, -0.25495171,
        -0.63470562, -0.56385202, -0.53891611, -0.64481343, -0.38543881,
        -0.5284883 , -0.79869518, -0.59605491, -1.02717677, -1.33013354,
        -0.8330696 , -1.61337309, -1.49298819, -1.25005715, -1.57491011,
        -1.13832942, -1.37167432, -1.49790447, -0.20410155, -0.92462389,
        -0.69133405, -0.17144659, -0.19626233, -0.3

time: 4.51 ms


In [1459]:
accuracy_score(ytrain,ypred_train)

0.90032228360957645

time: 3.62 ms


In [1463]:
1-np.mean(data['made'])

0.8933011049723757

time: 2.92 ms


In [1465]:
from sklearn.metrics import confusion_matrix
confusion_matrix(ytest,ypred_test)

array([[2575,   25],
       [ 252,   44]])

time: 6.19 ms
