In [1857]:
import xmltodict
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import  train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, scale
import os, zipfile
import enum
%matplotlib inline
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from bokeh.charts import Bar, Histogram
from bokeh.io import output_notebook, show
from bokeh.layouts import row
%load_ext autotime
import threading

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 10.6 ms


In [1280]:
output_notebook()

In [2055]:
def find_file_path(folder_path):
    zip_file_name = os.listdir(folder_path)[-1]
    if zip_file_name.endswith(".zip"): # check for ".zip" extension
        zip_file_path = folder_path + '/' + zip_file_name # get full path of files
        zip_ref = zipfile.ZipFile(zip_file_path, 'r') # create zipfile object
        zip_ref.extractall(folder_path) # extract file to dir
        zip_ref.close() # close file
        os.remove(zip_file_path) # delete zipped file
    folder_items = os.listdir(folder_path)
    for file_name in folder_items:
        if 'f24' in file_name:
            return folder_path + '/' + file_name
            break
        elif 'Opta' in file_name or 'opta_import' == file_name:
            sub_folder_path = folder_path + '/' + file_name
            sub_folder_items = os.listdir(sub_folder_path)
            for file_name2 in sub_folder_items:
                if 'f24' in file_name2:
                    return sub_folder_path + '/' + file_name2
                
def create_static_instances(file_path, instances):
    with open(file_path) as fd:
        instances.extend([Shot(item) for item in xmltodict.parse(fd.read())['Games']['Game']['Event'] 
                if item['@type_id'] in ['13', '14', '15', '16'] and '9' not in 
                         [qualifier['@qualifier_id'] for qualifier in item['Q']]])
    
def process_folder(folder_path, instances):
    file_path = find_file_path(folder_path)
    create_static_instances(file_path, instances)

time: 14.5 ms


In [2056]:
class ImportOpta(object):
    def __init__(self, path, run=True):
        self.path = path
        self.file_paths = []
        self.instances = []
        self.features = ['made', 'pattern_of_play', 'body_part', 'shot_pitch_location', 'assisted', 
                         'individual_play']
        self.locations = ['x', 'y', 'distance']
        
        if run:
            self.create_instances_fast()
            self.create_model_data()
            
    def create_instances_fast(self):
        threads = []
        for folder_name in os.listdir(self.path)[1:]: # loop through items in dir
            folder_path = self.path + '/' + folder_name
            args = (folder_path, self.instances)
            thread = threading.Thread(target=process_folder, args=args)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
            
    def create_model_data(self, new_features = None, new_locations = None):
        if new_features != None:
            self.features = new_features
        if new_locations != None:
            self.locations = new_locations
        data = pd.DataFrame()
        for feature in self.features:
            data[feature] = [getattr(result,feature).value for result in self.instances]
        for location in self.locations:
            data[location] = [getattr(result,location) for result in self.instances]
        enc = OneHotEncoder(categorical_features=np.arange(0,len(self.features)-1))
        x_raw = enc.fit_transform(data[data.columns[1:]])
        self.x = scale(x_raw.toarray())
        self.y = data['made']

time: 23.2 ms


In [2057]:
class BodyPart(enum.Enum):
    head = 0
    left_foot = 1
    right_foot = 2
    other = 3
    none = 4

time: 2.32 ms


In [2058]:
 class ShotPitchLocation(enum.Enum):
    small_box_center = 0
    box_center = 1
    out_of_box_center = 2
    center_35_plus = 3
    small_box_right = 4
    small_box_left = 5
    box_deep_right = 6
    box_right = 7
    box_left = 8
    box_deep_left = 9
    out_of_box_deep_right = 10
    out_of_box_right = 11
    out_of_box_left = 12
    out_of_box_deep_left = 13
    right_35_plus = 14
    left_35_plus = 15
    none = 16

time: 6.14 ms


In [2059]:
class ShotGoalLocation(enum.Enum):
    left = 0
    high = 1
    right = 2
    low_left = 3
    high_left = 4
    low_center = 5
    high_center = 6
    low_right = 7
    high_right = 8
    blocked = 9
    close_left = 10
    close_right = 11
    close_high = 12
    close_left_and_high = 13
    close_right_and_high = 14
    none = 15

time: 5.25 ms


In [2070]:
class PatternOfPlay(enum.Enum):
    regular_play = 0
    fast_break = 1
    set_piece = 2
    from_corner = 3
    from_kick = 4
    throw_in = 5
    none = 6

time: 2.82 ms


In [2061]:
class ShotResult(enum.Enum):
    miss = 0
    post = 1
    saved = 2
    goal = 3
    none = 4

time: 1.9 ms


In [2062]:
class Assisted(enum.Enum):
    yes = 1
    no = 0

time: 1.48 ms


In [2063]:
class IndividualPlay(enum.Enum):
    yes = 1
    no = 0

time: 1.31 ms


In [2064]:
class Made(enum.Enum):
    yes = 1
    no = 0

time: 1.22 ms


In [2071]:
class Shot(object):
    def __init__(self, item):
        self.item = item
        self.pitch_length = 105.0
        self.pitch_width = 68.0
        
    @property
    def x_raw(self):
        return float(self.item['@x'])
    
    @property
    def y_raw(self):
        return float(self.item['@y'])
    
    @property
    def x(self):
        return self.x_raw / 100.0 * self.pitch_length
    
    @property
    def y(self):
        return self.y_raw / 100.0 * self.pitch_width
    
    @property
    def minute(self):
        return self.item['@min']
    
    @property
    def second(self):
        return self.item['@sec']
    
    @property
    def team(self):
        return self.item['@team_id']
    
    @property
    def shot_result(self):
        if self.item['@type_id'] == '13':
            return ShotResult.miss
        elif self.item['@type_id'] == '14':
            return ShotResult.post
        elif self.item['@type_id'] == '15':
            return ShotResult.saved
        elif self.item['@type_id'] == '16':
            return ShotResult.goal
        else:
            return ShotResult.none
    
    @property
    def made(self):
        if self.item['@type_id'] == '16':
            return Made.yes
        else:
            return Made.no
    
    @property
    def distance_raw(self):
        return np.sqrt((abs(float(self.y_raw) - 50))**2 + ((100 - float(self.x_raw)))**2)
    
    @property
    def distance(self):
        return np.sqrt((abs(float(self.y) - self.pitch_width * .5))**2 + 
                       ((self.pitch_length - float(self.x)))**2)
    
    @property
    def qualifiers(self):
        return [qualifier['@qualifier_id'] for qualifier in self.item['Q']]
    
    @property
    def body_part(self):
        if '15' in self.qualifiers:
            return BodyPart.head
        elif '72' in self.qualifiers:
            return BodyPart.left_foot
        elif '20' in self.qualifiers:
            return BodyPart.right_foot
        elif '21' in self.qualifiers:
            return BodyPart.other
        else:
            return BodyPart.none
    
    @property
    def assisted(self):
        if '29' in self.qualifiers:
            return Assisted.yes
        else:
            return Assisted.no
    
    @property
    def individual_play(self):
        if '215' in self.qualifiers:
            return IndividualPlay.yes
        else:
            return IndividualPlay.no
    
    @property
    def shot_pitch_location(self):
        if '16' in self.qualifiers:
            return ShotPitchLocation.small_box_center
        elif '17' in self.qualifiers:
            return ShotPitchLocation.box_center
        elif '18' in self.qualifiers:
            return ShotPitchLocation.out_of_box_center
        elif '19' in self.qualifiers:
            return ShotPitchLocation.center_35_plus
        elif '60' in self.qualifiers:
            return ShotPitchLocation.small_box_right
        elif '61' in self.qualifiers:
            return ShotPitchLocation.small_box_left
        elif '62' in self.qualifiers:
            return ShotPitchLocation.box_deep_right
        elif '63' in self.qualifiers:
            return ShotPitchLocation.box_right
        elif '64' in self.qualifiers:
            return ShotPitchLocation.box_left
        elif '65' in self.qualifiers:
            return ShotPitchLocation.box_deep_left
        elif '66' in self.qualifiers:
            return ShotPitchLocation.out_of_box_deep_right
        elif '67' in self.qualifiers:
            return ShotPitchLocation.out_of_box_right
        elif '68' in self.qualifiers:
            return ShotPitchLocation.out_of_box_left
        elif '69' in self.qualifiers:
            return ShotPitchLocation.out_of_box_deep_left
        elif '70' in self.qualifiers:
            return ShotPitchLocation.right_35_plus
        elif '71' in self.qualifiers:
            return ShotPitchLocation.left_35_plus
        else:
            return ShotPitchLocation.none
        
    @property
    def shot_goal_location(self):
        if '73' in self.qualifiers:
            return ShotGoalLocation.left
        elif '74' in self.qualifiers:
            return ShotGoalLocation.high
        elif '75' in self.qualifiers:
            return ShotGoalLocation.right
        elif '76' in self.qualifiers:
            return ShotGoalLocation.low_left
        elif '77' in self.qualifiers:
            return ShotGoalLocation.high_left
        elif '78' in self.qualifiers:
            return ShotGoalLocation.low_center
        elif '79' in self.qualifiers:
            return ShotGoalLocation.high_center
        elif '80' in self.qualifiers:
            return ShotGoalLocation.low_right
        elif '81' in self.qualifiers:
            return ShotGoalLocation.high_right
        elif '82' in self.qualifiers:
            return ShotGoalLocation.blocked
        elif '83' in self.qualifiers:
            return ShotGoalLocation.close_left
        elif '84' in self.qualifiers:
            return ShotGoalLocation.close_right
        elif '85' in self.qualifiers:
            return ShotGoalLocation.close_high
        elif '86' in self.qualifiers:
            return ShotGoalLocation.close_left_and_high
        elif '87' in self.qualifiers:
            return ShotGoalLocation.close_right_and_high
        else:
            return ShotGoalLocation.none
    
    @property
    def pattern_of_play(self):
        if '22' in self.qualifiers:
            return PatternOfPlay.regular_play
        elif '23' in self.qualifiers:
            return PatternOfPlay.fast_break
        elif '24' in self.qualifiers:
            return PatternOfPlay.set_piece
        elif '25' in self.qualifiers:
            return PatternOfPlay.from_corner
        elif '26' in self.qualifiers:
            return PatternOfPlay.from_kick
        elif '160' in self.qualifiers:
            return PatternOfPlay.throw_in
        else:
            return PatternOfPlay.none

time: 228 ms


In [2066]:
def opta_hists(shots, prop):
    enum = type(getattr(shots.instances[0],prop))
    members = '__members__'
    goal_percentage = [np.mean([result.made.value for result in shots.instances if getattr(result,prop) == option]) 
                       for option in enum.__members__.values()][:-1]
    categories = [option for option in enum.__members__.keys()][:-1]
    data1 = {'label': categories,'values': goal_percentage}
    plot1 = Bar(data = data1, values = 'values', label = 'label', title = prop + ' goal percentages', legend=False)
    data2 = pd.DataFrame.from_dict(Counter([getattr(result,prop).name for result in shots.instances]), 
                                  orient='index')
    data2['name'] = data2.index
    
    data2 = {'label': data2['name'], 'values': data2[0]}
    
    plot2 = Bar(data = data2, values = 'values', label = 'label', title = prop + ' counts', legend=False)
    try: 
        show(plot1)
    except AttributeError:
        pass
    try: 
        show(plot2)
    except AttributeError:
        pass
    show(plot1)
    show(plot2)

time: 17.6 ms


In [2073]:
opta_hists(shots, 'pattern_of_play')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 972 ms


In [2045]:
opta_hists(shots, 'body_part')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 829 ms


In [1394]:
opta_hists(shots, 'shot_pitch_location')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 2.57 s


In [1395]:
opta_hists(shots, 'assisted')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 488 ms


In [1396]:
opta_hists(shots, 'individual_play')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 482 ms


In [2050]:
class ModelResults(object):
    def __init__(self, accuracies, model):
        self.accuracies = accuracies
        self.model = model

    @property
    def iterations(self):
        return len(self.accuracies)
    
    @property
    def mean_accuracy(self):
        return np.mean(self.accuracies)
    
    @property
    def variance(self):
        return np.var(self.accuracies)
    
    def hist(self, bins=None):
        if bins == None:
            plot = Histogram(self.accuracies)
        else:
            plot = Histogram(self.accuracies, bins = bins)
        plot.x_range.start,plot.x_range.end = (min(self.accuracies),max(self.accuracies))
        try: 
            show(plot)
        except AttributeError:
            pass
        show(plot)

time: 10.3 ms


In [2051]:
def create_model(shots, model, iters = 50):
    accuracies = []
    model_name = model.__name__
    for i in range(iters):
        xtrain,xtest,ytrain,ytest  = train_test_split(shots.x,shots.y)
        ytrain=np.ravel(ytrain)
        clf = model()
        clf.fit(xtrain,ytrain)
        ypred_test = clf.predict(xtest)
        accuracies.append(accuracy_score(ytest,ypred_test))
    print("Mean Accuracy: {}, Variance: {}".format(np.mean(accuracies), np.var(accuracies)))
    return ModelResults(accuracies, model_name)

time: 5.53 ms


In [2072]:
shots = ImportOpta('/Users/jason.katz/Downloads/AllOpta')

time: 1min 36s


In [2075]:
mod1 = create_model(shots,RandomForestClassifier)

Mean Accuracy: 0.9119804741980475, Variance: 1.634036129930807e-05
time: 4.28 s


In [2076]:
mod1.hist()

time: 165 ms


In [1463]:
1-np.mean(data['made'])

0.8933011049723757

time: 2.92 ms
