In [2]:
import xmltodict
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import  train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, scale
import os, zipfile
import enum
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
from bokeh.charts import Bar, Histogram, Scatter
from bokeh.io import output_notebook, show
from bokeh.layouts import row
import threading
import json
%load_ext autotime



In [3]:
output_notebook()

time: 6.35 ms


In [4]:
def find_file_path(folder_path):
    zip_file_name = os.listdir(folder_path)[-1]
    if zip_file_name.endswith(".zip"): # check for ".zip" extension
        zip_file_path = folder_path + '/' + zip_file_name # get full path of files
        zip_ref = zipfile.ZipFile(zip_file_path, 'r') # create zipfile object
        zip_ref.extractall(folder_path) # extract file to dir
        zip_ref.close() # close file
        os.remove(zip_file_path) # delete zipped file
    folder_items = os.listdir(folder_path)
    for file_name in folder_items:
        if 'f24' in file_name:
            return folder_path + '/' + file_name
            break
        elif 'Opta' in file_name or 'opta_import' == file_name:
            sub_folder_path = folder_path + '/' + file_name
            sub_folder_items = os.listdir(sub_folder_path)
            for file_name2 in sub_folder_items:
                if 'f24' in file_name2:
                    return sub_folder_path + '/' + file_name2
                
def create_static_instances(file_path, instances):
    with open(file_path) as fd:
        items = []
        data = xmltodict.parse(fd.read())
        game = data['Games']['Game']['@id']
        home_team_id = data['Games']['Game']['@home_team_id']
        away_team_id = data['Games']['Game']['@away_team_id']
        home_team_name = data['Games']['Game']['@home_team_name']
        away_team_name = data['Games']['Game']['@away_team_name']
        for item in data['Games']['Game']['Event']:
            if item['@type_id'] in ['13', '14', '15', '16'] and all(x not in [qualifier['@qualifier_id'] for qualifier in item['Q']] for x in ['9','28']):
                item['@game_id'] = game
                item['@home_team_id'] = home_team_id
                item['@away_team_id'] = away_team_id
                item['@home_team_name'] = home_team_name
                item['@away_team_name'] = away_team_name
                items.append(item)
        instances.extend(items)
       
def process_folder(folder_path, instances):
    file_path = find_file_path(folder_path)
    create_static_instances(file_path, instances)

time: 29.6 ms


In [5]:
class ImportOpta(object):
    def __init__(self, path, run=True):
        self.path = path
        self.file_paths = []
        self.instances = []
        
        if run:
            print("Beginning upload of games")
            self.create_instances_fast()
            print("Finished uploading games")
            
    def create_instances_fast(self):
        threads = []
        counter = 0
        for folder_name in os.listdir(self.path)[1:]: # loop through items in dir
            counter += 1
            folder_path = self.path + '/' + folder_name
            args = (folder_path, self.instances)
            thread = threading.Thread(target=process_folder, args=args)
            thread.start()
            threads.append(thread)
            print("Uploaded game {} of {}".format(counter, len(os.listdir(self.path)[1:])))
        for thread in threads:
            thread.join()

time: 9.83 ms


In [6]:
class BodyPart(enum.Enum):
    head = 0
    left_foot = 1
    right_foot = 2
    other = 3
    none = 4

class ShotPitchLocation(enum.Enum):
    small_box_center = 0
    box_center = 1
    out_of_box_center = 2
    center_35_plus = 3
    small_box_right = 4
    small_box_left = 5
    box_deep_right = 6
    box_right = 7
    box_left = 8
    box_deep_left = 9
    out_of_box_deep_right = 10
    out_of_box_right = 11
    out_of_box_left = 12
    out_of_box_deep_left = 13
    right_35_plus = 14
    left_35_plus = 15
    none = 16

class ShotGoalLocation(enum.Enum):
    left = 0
    high = 1
    right = 2
    low_left = 3
    high_left = 4
    low_center = 5
    high_center = 6
    low_right = 7
    high_right = 8
    blocked = 9
    close_left = 10
    close_right = 11
    close_high = 12
    close_left_and_high = 13
    close_right_and_high = 14
    none = 15

class PatternOfPlay(enum.Enum):
    regular_play = 0
    fast_break = 1
    set_piece = 2
    from_corner = 3
    from_kick = 4
    throw_in = 5
    none = 6

class ShotResult(enum.Enum):
    miss = 0
    post = 1
    saved = 2
    goal = 3
    none = 4

class Assisted(enum.Enum):
    yes = 1
    no = 0
    none = 2

class IntentionalAssist(enum.Enum):
    yes = 1
    no = 0
    none = 2

class Strong(enum.Enum):
    yes = 1
    no = 0
    none = 2

class Swerved(enum.Enum):
    yes = 1
    no = 0
    none = 2

class Deflection(enum.Enum):
    yes = 1
    no = 0
    none = 2

class BigChance(enum.Enum):
    yes = 1
    no = 0
    none = 2

class Weak(enum.Enum):
    yes = 1
    no = 0
    none = 2

class IndividualPlay(enum.Enum):
    yes = 1
    no = 0
    none = 2

class RightFoot(enum.Enum):
    yes = 1
    no = 0
    none = 2
    
    
class LeftFoot(enum.Enum):
    yes = 1
    no = 0
    none = 2
    
    
class OtherBodyPart(enum.Enum):
    yes = 1
    no = 0
    none = 2
    
class Header(enum.Enum):
    yes = 1
    no = 0
    none = 2
    
    
class FastBreak(enum.Enum):
    yes = 1
    no = 0
    none = 2
    
    
class SetPiece(enum.Enum):
    yes = 1
    no = 0
    none = 2
    
    
class FromKick(enum.Enum):
    yes = 1
    no = 0
    none = 2

class Made(enum.Enum):
    yes = 1
    no = 0

time: 150 ms


In [7]:
class Shot(object):
    def __init__(self, item):
        self.item = item
        self.pitch_length = 105.0
        self.pitch_width = 68.0
    
    @property
    def game(self):
        return self.item['@game_id']
    
    @property
    def home_team_id(self):
        return self.item['@home_team_id']
    
    @property
    def home_team_name(self):
        return self.item['@home_team_name']
    
    @property
    def away_team_id(self):
        return self.item['@away_team_id']
    
    @property
    def away_team_name(self):
        return self.item['@away_team_name']
    
    @property
    def x_raw(self):
        return float(self.item['@x'])
    
    @property
    def y_raw(self):
        return float(self.item['@y'])
    
    @property
    def x(self):
        return self.x_raw / 100.0 * self.pitch_length
    
    @property
    def y(self):
        return self.y_raw / 100.0 * self.pitch_width
    
    @property
    def angle(self):
        return np.rad2deg(np.arctan(abs(self.y-34)/(105-self.x)))
    
    @property
    def minute(self):
        return int(self.item['@min'])
    
    @property
    def second(self):
        return int(self.item['@sec'])
    
    @property
    def time(self):
        return (self.minute*60 + self.second)/5400
    
    @property
    def team(self):
        return self.item['@team_id']
    
    @property
    def shot_result(self):
        if self.item['@type_id'] == '13':
            return ShotResult.miss
        elif self.item['@type_id'] == '14':
            return ShotResult.post
        elif self.item['@type_id'] == '15':
            return ShotResult.saved
        elif self.item['@type_id'] == '16':
            return ShotResult.goal
        else:
            return ShotResult.none
    
    @property
    def made(self):
        if self.item['@type_id'] == '16':
            return Made.yes
        else:
            return Made.no
    
    @property
    def distance_raw(self):
        return np.sqrt((abs(float(self.y_raw) - 50))**2 + ((100 - float(self.x_raw)))**2)
    
    @property
    def distance(self):
        return np.sqrt((abs(float(self.y) - self.pitch_width * .5))**2 + 
                       ((self.pitch_length - float(self.x)))**2)
    
    @property
    def qualifiers(self):
        return [qualifier['@qualifier_id'] for qualifier in self.item['Q']]
    
    @property
    def body_part(self):
        if '15' in self.qualifiers:
            return BodyPart.head
        elif '72' in self.qualifiers:
            return BodyPart.left_foot
        elif '20' in self.qualifiers:
            return BodyPart.right_foot
        elif '21' in self.qualifiers:
            return BodyPart.other
        else:
            return BodyPart.none
        
    @property
    def header(self):
        if '15' in self.qualifiers:
            return Header.yes
        elif '15' not in self.qualifiers:
            return Header.no
        else:
            return Header.none
        
    @property
    def right_foot(self):
        if '20' in self.qualifiers:
            return RightFoot.yes
        elif '20' not in self.qualifiers:
            return RightFoot.no
        else:
            return RightFoot.none
        
    @property
    def left_foot(self):
        if '72' in self.qualifiers:
            return LeftFoot.yes
        elif '72' not in self.qualifiers:
            return LeftFoot.no
        else:
            return LeftFoot.none
        
    @property
    def other_body_part(self):
        if '21' in self.qualifiers:
            return OtherBodyPart.yes
        elif '21' not in self.qualifiers:
            return OtherBodyPart.no
        else:
            return OtherBodyPart.none
    
    @property
    def assisted(self):
        if '29' in self.qualifiers:
            return Assisted.yes
        elif '29' not in self.qualifiers:
            return Assisted.no
        else:
            return Assisted.none
        
    @property
    def intentional_assist(self):
        if '154' in self.qualifiers:
            return IntentionalAssist.yes
        elif '154' not in self.qualifiers:
            return IntentionalAssist.no
        else:
            return IntentionalAssist.none

    @property
    def individual_play(self):
        if '215' in self.qualifiers:
            return IndividualPlay.yes
        elif '215' not in self.qualifiers:
            return IndividualPlay.no
        else:
            return IndividualPlay.none
        
    @property
    def strong(self):
        if '113' in self.qualifiers:
            return Strong.yes
        elif '113' not in self.qualifiers:
            return Strong.no
        else:
            return Strong.none
        
    @property
    def weak(self):
        if '114' in self.qualifiers:
            return Weak.yes
        elif '114' not in self.qualifiers:
            return Weak.no
        else:
            return Weak.none
        
    @property
    def swerved(self):
        if '120' in self.qualifiers or '121' in self.qualifiers or '122' in self.qualifiers:
            return Swerved.yes
        elif '120' not in self.qualifiers and '121' not in self.qualifiers and '122' not in self.qualifiers:
            return Swerved.no
        else:
            return Swerved.none
        
    @property
    def deflection(self):
        if '133' in self.qualifiers:
            return Deflection.yes
        elif '133' not in self.qualifiers:
            return Deflection.no
        else:
            return Deflection.none
        
    @property
    def big_chance(self):
        if '214' in self.qualifiers:
            return BigChance.yes
        elif '214' not in self.qualifiers:
            return BigChance.no
        else:
            return BigChance.none
    
    @property
    def shot_pitch_location(self):
        if '16' in self.qualifiers:
            return ShotPitchLocation.small_box_center
        elif '17' in self.qualifiers:
            return ShotPitchLocation.box_center
        elif '18' in self.qualifiers:
            return ShotPitchLocation.out_of_box_center
        elif '19' in self.qualifiers:
            return ShotPitchLocation.center_35_plus
        elif '60' in self.qualifiers:
            return ShotPitchLocation.small_box_right
        elif '61' in self.qualifiers:
            return ShotPitchLocation.small_box_left
        elif '62' in self.qualifiers:
            return ShotPitchLocation.box_deep_right
        elif '63' in self.qualifiers:
            return ShotPitchLocation.box_right
        elif '64' in self.qualifiers:
            return ShotPitchLocation.box_left
        elif '65' in self.qualifiers:
            return ShotPitchLocation.box_deep_left
        elif '66' in self.qualifiers:
            return ShotPitchLocation.out_of_box_deep_right
        elif '67' in self.qualifiers:
            return ShotPitchLocation.out_of_box_right
        elif '68' in self.qualifiers:
            return ShotPitchLocation.out_of_box_left
        elif '69' in self.qualifiers:
            return ShotPitchLocation.out_of_box_deep_left
        elif '70' in self.qualifiers:
            return ShotPitchLocation.right_35_plus
        elif '71' in self.qualifiers:
            return ShotPitchLocation.left_35_plus
        else:
            return ShotPitchLocation.none
        
    @property
    def shot_goal_location(self):
        if '73' in self.qualifiers:
            return ShotGoalLocation.left
        elif '74' in self.qualifiers:
            return ShotGoalLocation.high
        elif '75' in self.qualifiers:
            return ShotGoalLocation.right
        elif '76' in self.qualifiers:
            return ShotGoalLocation.low_left
        elif '77' in self.qualifiers:
            return ShotGoalLocation.high_left
        elif '78' in self.qualifiers:
            return ShotGoalLocation.low_center
        elif '79' in self.qualifiers:
            return ShotGoalLocation.high_center
        elif '80' in self.qualifiers:
            return ShotGoalLocation.low_right
        elif '81' in self.qualifiers:
            return ShotGoalLocation.high_right
        elif '82' in self.qualifiers:
            return ShotGoalLocation.blocked
        elif '83' in self.qualifiers:
            return ShotGoalLocation.close_left
        elif '84' in self.qualifiers:
            return ShotGoalLocation.close_right
        elif '85' in self.qualifiers:
            return ShotGoalLocation.close_high
        elif '86' in self.qualifiers:
            return ShotGoalLocation.close_left_and_high
        elif '87' in self.qualifiers:
            return ShotGoalLocation.close_right_and_high
        else:
            return ShotGoalLocation.none
    
    @property
    def pattern_of_play(self):
        if '22' in self.qualifiers:
            return PatternOfPlay.regular_play
        elif '23' in self.qualifiers:
            return PatternOfPlay.fast_break
        elif '24' in self.qualifiers:
            return PatternOfPlay.set_piece
        elif '25' in self.qualifiers:
            return PatternOfPlay.from_corner
        elif '26' in self.qualifiers:
            return PatternOfPlay.from_kick
        elif '160' in self.qualifiers:
            return PatternOfPlay.throw_in
        else:
            return PatternOfPlay.none
        
    @property
    def fast_break(self):
        if '23' in self.qualifiers:
            return FastBreak.yes
        elif '23' not in self.qualifiers:
            return FastBreak.no
        else:
            return FastBreak.none
        
    @property
    def set_piece(self):
        if '24' in self.qualifiers:
            return SetPiece.yes
        elif '24' not in self.qualifiers:
            return SetPiece.no
        else:
            return SetPiece.none
        
    @property
    def from_kick(self):
        if '26' in self.qualifiers:
            return FromKick.yes
        elif '26' not in self.qualifiers:
            return FromKick.no
        else:
            return FromKick.none

time: 613 ms


In [207]:
counter = 0
for i in range(10000):
    if '214' in shots.instances[i].qualifiers and shots.instances[i].made.value==1:
        counter += 1
counter

476

time: 68.5 ms


In [8]:
def opta_hists(shots, prop):
    enum = type(getattr(shots[0],prop))
    goal_percentage = [np.mean([result.made.value for result in shots if getattr(result,prop) == option]) 
                       for option in enum.__members__.values()][:-1]
    categories = [option for option in enum.__members__.keys()][:-1]
    data1 = {'label': categories,'values': goal_percentage}
    plot1 = Bar(data = data1, values = 'values', label = 'label', title = prop + ' goal percentages', legend=False)
    show(plot1)
    data2 = pd.DataFrame.from_dict(Counter([getattr(result,prop).name for result in shots]), 
                                  orient='index')
    data2['name'] = data2.index
    
    data2 = {'label': data2['name'], 'values': data2[0]}
    
    plot2 = Bar(data = data2, values = 'values', label = 'label', title = prop + ' counts', legend=False)
    show(plot2)

time: 12.3 ms


In [985]:
opta_hists(shots, 'pattern_of_play')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 1.21 s


In [1000]:
opta_hists(shots, 'shot_pitch_location')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 3.14 s


In [986]:
opta_hists(shots, 'body_part')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 1.12 s


In [1394]:
opta_hists(shots, 'shot_pitch_location')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 2.57 s


In [959]:
opta_hists(shots, 'assisted')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 683 ms


In [960]:
opta_hists(shots, 'individual_play')

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


time: 708 ms


In [9]:
class ModelResults(object):
    def __init__(self, accuracies, model, model_obj, show_hist=False):
        self.accuracies = accuracies
        self.model = model
        self.model_obj = model_obj

        if show_hist:
            self.hist()

    @property
    def iterations(self):
        return len(self.accuracies)

    @property
    def mean_accuracy(self):
        return np.mean(self.accuracies)

    @property
    def variance(self):
        return np.var(self.accuracies)
    
    def hist(self, bins=None):
        if bins == None:
            plot = Histogram(self.accuracies)
        else:
            plot = Histogram(self.accuracies, bins = bins)
        plot.x_range.start,plot.x_range.end = (min(self.accuracies),max(self.accuracies))
        try: 
            show(plot)
        except AttributeError:
            pass
        show(plot)

time: 13.3 ms


In [10]:
def create_model_data(shots, new_features=None, new_continuous=None):
    features = new_features
    continuous = new_continuous
    data = pd.DataFrame()
    for feature in features:
        data[feature] = [getattr(result, feature).value for result in shots]
    for cont in continuous:
        data[cont] = [getattr(result, cont) for result in shots]
    return scale(data[data.columns[1:]]), data['made']

time: 3.47 ms


In [11]:
def create_model(shots, model, new_features=None, new_continuous=None, show_hist=False, iters=50):
    if new_features is None:
        new_features = ['made', 'fast_break', 'set_piece', 'body_part', 'assisted', 'individual_play']
    if new_continuous is None:
        new_continuous = ['x', 'y', 'distance', 'angle', 'time']
    predictors, response = create_model_data(shots, new_features=new_features, new_continuous=new_continuous)
    accuracies = []
    model_name = model.__name__
    for i in range(iters):
        xtrain, xtest, ytrain, ytest = train_test_split(predictors, response)
        ytrain = np.ravel(ytrain)
        clf = model()
        clf.fit(xtrain, ytrain)
        ypred_test = clf.predict(xtest)
        accuracies.append(accuracy_score(ytest, ypred_test))
    final_model = model()
    final_model.fit(predictors, response)
    print("{}: Mean Accuracy: {}, Variance: {}".format(model_name, np.mean(accuracies), np.var(accuracies)))
    return ModelResults(accuracies, model_name, final_model, show_hist)

time: 10.7 ms


In [12]:
def model_accuracy(shots, model_obj, show_plot = False):
    new_features = ['made', 'fast_break', 'set_piece', 'left_foot', 'right_foot', 'assisted', 'individual_play']
    new_continuous = ['x', 'y', 'distance', 'angle', 'time']
    game_id = shots[0].game
    single_game = []
    expected_goals = []
    actual_goals = []
    for shot in shots:
        if shot.game != game_id:
            predictors, response = create_model_data(single_game, new_features=new_features, 
                                                     new_continuous=new_continuous)
            predictions = model_obj.predict_proba(predictors)
            expected_goals.append(sum(predictions)[1])
            actual_goals.append(sum(response))
            single_game = []
        game_id = shot.game
        single_game.append(shot)  
    if show_plot:
        data = pd.DataFrame({'expected_goals':expected_goals, 'actual_goals':actual_goals})
        show(Scatter(data, 'expected_goals', 'actual_goals'))
    return expected_goals, actual_goals

time: 10.7 ms


In [13]:
instances_list = ImportOpta('/Users/jason.katz/Downloads/AllOpta').instances

Beginning upload of games
Uploaded game 1 of 447
Uploaded game 2 of 447
Uploaded game 3 of 447
Uploaded game 4 of 447
Uploaded game 5 of 447
Uploaded game 6 of 447
Uploaded game 7 of 447
Uploaded game 8 of 447
Uploaded game 9 of 447
Uploaded game 10 of 447
Uploaded game 11 of 447
Uploaded game 12 of 447
Uploaded game 13 of 447
Uploaded game 14 of 447
Uploaded game 15 of 447
Uploaded game 16 of 447
Uploaded game 17 of 447
Uploaded game 18 of 447
Uploaded game 19 of 447
Uploaded game 20 of 447
Uploaded game 21 of 447
Uploaded game 22 of 447
Uploaded game 23 of 447
Uploaded game 24 of 447
Uploaded game 25 of 447
Uploaded game 26 of 447
Uploaded game 27 of 447
Uploaded game 28 of 447
Uploaded game 29 of 447
Uploaded game 30 of 447
Uploaded game 31 of 447
Uploaded game 32 of 447
Uploaded game 33 of 447
Uploaded game 34 of 447
Uploaded game 35 of 447
Uploaded game 36 of 447
Uploaded game 37 of 447
Uploaded game 38 of 447
Uploaded game 39 of 447
Uploaded game 40 of 447
Uploaded game 41 of 447

Uploaded game 335 of 447
Uploaded game 336 of 447
Uploaded game 337 of 447
Uploaded game 338 of 447
Uploaded game 339 of 447
Uploaded game 340 of 447
Uploaded game 341 of 447
Uploaded game 342 of 447
Uploaded game 343 of 447
Uploaded game 344 of 447
Uploaded game 345 of 447
Uploaded game 346 of 447
Uploaded game 347 of 447
Uploaded game 348 of 447
Uploaded game 349 of 447
Uploaded game 350 of 447
Uploaded game 351 of 447
Uploaded game 352 of 447
Uploaded game 353 of 447
Uploaded game 354 of 447
Uploaded game 355 of 447
Uploaded game 356 of 447
Uploaded game 357 of 447
Uploaded game 358 of 447
Uploaded game 359 of 447
Uploaded game 360 of 447
Uploaded game 361 of 447
Uploaded game 362 of 447
Uploaded game 363 of 447
Uploaded game 364 of 447
Uploaded game 365 of 447
Uploaded game 366 of 447
Uploaded game 367 of 447
Uploaded game 368 of 447
Uploaded game 369 of 447
Uploaded game 370 of 447
Uploaded game 371 of 447
Uploaded game 372 of 447
Uploaded game 373 of 447
Uploaded game 374 of 447


In [14]:
shots = [Shot(item) for item in instances_list]

time: 13.3 ms


In [1111]:
test = Model(instances_list)

time: 690 µs


In [15]:
class Model(object):
    def __init__(self, instances_list):
        self.instances_list = instances_list
    
    @property
    def shots(self):
        return [Shot(item) for item in self.instances_list]
    
    def create_model_data2(self, shots, new_features=None, new_continuous=None):
        features = ['made', 'fast_break', 'set_piece', 'left_foot', 'right_foot', 'assisted', 'individual_play']
        continuous = ['x', 'y', 'distance', 'angle', 'time']
        data = pd.DataFrame()
        for feature in features:
            data[feature] = [getattr(result, feature).value for result in shots]
        for cont in continuous:
            data[cont] = [getattr(result, cont) for result in shots]
        return scale(data[data.columns[1:]]), data['made']
    
    def create_model3(self, shots, model):
        predictors, response = self.create_model_data2(shots)
        model_obj = model()
        model_obj.fit(predictors, response)
        return model_obj
    
    
    def model_accuracy2(self, model, show_plot = False):
        new_features = ['made', 'fast_break', 'set_piece', 'left_foot', 'right_foot', 'assisted', 'individual_play']
        new_continuous = ['x', 'y', 'distance', 'angle', 'time']
        model_obj = self.create_model3(self.shots, model)
        game_id = self.shots[0].game
        single_game = []
        residuals = []
        game_ids = []
        counter = 0
        total_iter = len(self.shots)
        for shot in self.shots:
            counter +=1
            if shot.game != game_id or counter == total_iter:
                print(counter)
                game_ids.append(game_id)
                predictors, response = self.create_model_data2(single_game, new_features=new_features, 
                                                         new_continuous=new_continuous)
                predictions = model_obj.predict_proba(predictors)
                residuals.append(sum(predictions)[1] - sum(response))
                single_game = []
            game_id = shot.game
            single_game.append(shot)  
        if show_plot:
            data = pd.DataFrame({'residuals':residuals, 'game_id':game_ids})
            show(Scatter(data, 'game_id', 'residuals'))
        return residuals

time: 48.4 ms


In [1592]:
game_id = test.shots[0].game
home_team_id = test.shots[0].home_team_id
away_team_id = test.shots[0].away_team_id
residuals_home = []
residuals_away = []
game_ids = []
goals_home = 0
goals_away = 0
total_iter = len(test.shots)
prediction_home = 0
prediction_away = 0
for shot in test.shots:
    counter +=1
    if shot.game != game_id or counter == total_iter:
        game_ids.append(game_id)
        residuals_home.append(prediction_home - goals_home)
        residuals_away.append(prediction_away - goals_away)
        home_team_id = shot.home_team_id
        away_team_id = shot.away_team_id
        goals_home = 0
        goals_away = 0
        prediction_home = 0
        prediction_away = 0
    if shot.made.value == 1 and shot.team == home_team_id:
        goals_home += 1
    if shot.made.value == 1 and shot.team == away_team_id:
        goals_away += 1
    if shot.individual_play.value == 1 and shot.shot_pitch_location.name == 'out_of_box_center' and shot.x >= 85 and shot.team == home_team_id:
        prediction_home += 0.05514705882352941
    elif shot.individual_play.value == 1 and shot.shot_pitch_location.name == 'out_of_box_center' and shot.x < 85 and shot.team == home_team_id:
        prediction_home += 0.028037383177570093
    elif shot.individual_play.value == 1 and shot.shot_pitch_location.name == 'out_of_box_center' and shot.x >= 85 and shot.team == away_team_id:
        prediction_away += 0.08163265306122448
    elif shot.individual_play.value == 1 and shot.shot_pitch_location.name == 'out_of_box_center' and shot.x < 85 and shot.team == away_team_id:
        prediction_away += 0.02259475218658892
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'out_of_box_center' and shot.team == home_team_id:
        prediction_home += 0.04891304347826087
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'out_of_box_center' and shot.team == away_team_id:
        prediction_away += 0.03513513513513514
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.body_part.name == 'head' and shot.team == home_team_id:
        prediction_home += 0.09869203329369798
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.body_part.name == 'head' and shot.team == away_team_id:
        prediction_home += 0.11024844720496894
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.x >= 95 and shot.y >= 35 and shots[i].body_part.name != 'head' and shot.team == home_team_id:
        prediction_home += 0.2679738562091503
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.x >= 95 and shot.y < 35 and shots[i].body_part.name != 'head' and shot.team == home_team_id:
        prediction_home += 0.2331288343558282
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.x < 95 and shot.y >= 29 and shots[i].body_part.name != 'head' and shot.team == home_team_id:
        prediction_home += 0.1718146718146718
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.x < 95 and shot.y < 29 and shots[i].body_part.name != 'head' and shot.team == home_team_id:
        prediction_home += 0.058333333333333334
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.x >= 95 and shot.y >= 38 and shots[i].body_part.name != 'head' and shot.team == away_team_id:
        prediction_away += 0.16129032258064516
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.x >= 95 and shot.y < 38 and shots[i].body_part.name != 'head' and shot.team == away_team_id:
        prediction_away += 0.25112107623318386
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.x < 95 and shot.y >= 37 and shots[i].body_part.name != 'head' and shot.team == away_team_id:
        prediction_away += 0.08609271523178808
    elif shot.individual_play.value == 0 and shot.shot_pitch_location.name == 'box_center' and shot.x < 95 and shot.y < 37 and shots[i].body_part.name != 'head' and shot.team == away_team_id:
        prediction_away += 0.17403314917127072
    elif shot.individual_play.value == 1 and shot.shot_pitch_location.name == 'box_center' and shot.team == home_team_id:
        prediction_home += 0.15451895043731778
    elif shot.individual_play.value == 1 and shot.shot_pitch_location.name == 'box_center' and shot.team == away_team_id:
        prediction_away += 0.20491803278688525
    elif shot.shot_pitch_location.name == 'small_box_center' and shot.team == home_team_id:
        prediction_home += 0.4798206278026906
    elif shot.shot_pitch_location.name == 'small_box_center' and shot.team == away_team_id:
        prediction_away += 0.4691358024691358
    elif shot.pattern_of_play.name == 'regular_play' and shot.team == home_team_id:
        prediction_home += 0.07671480144404332
    elif shot.pattern_of_play.name != 'regular_play' and shot.team == home_team_id:
        prediction_home += 0.09968847352024922
    elif shot.pattern_of_play.name == 'regular_play':
        prediction_away += 0.0855614973262032
    else:
        prediction_away += 0.12734082397003746
    game_id = shot.game
print("Home Team -> Mean: {}, Variance: {}".format(np.mean(residuals_home), np.var(residuals_home)))
print("Away Team -> Mean: {}, Variance: {}".format(np.mean(residuals_away), np.var(residuals_away)))

Home Team -> Mean: -0.05845740588342836, Variance: 1.3351209266418558
Away Team -> Mean: -0.3087489158284936, Variance: 1.1481631352674486
time: 1.35 s


In [1566]:
goals = 0
total_shots = len(shots)
specific_shots = 0
for i in range(total_shots):
    if shots[i].team == shots[i].home_team_id:  
        specific_shots += 1
        if shots[i].made.value == 1:
            goals += 1
print("Goal Percentage: {}, Shots Taken: {}".format(goals/specific_shots, specific_shots))

Goal Percentage: 0.0976216384268615, Shots Taken: 6433
time: 28 ms


In [1236]:
goals = 0
total_shots = len(shots)
specific_shots = 0
for i in range(total_shots):
    if shots[i].shot_pitch_location.name != 'out_of_box_center' and shots[i].shot_pitch_location.name != 'small_box_center' and shots[i].shot_pitch_location.name != 'box_center' and shots[i].pattern_of_play.name != 'regular_play' and shots[i].team == shots[i].away_team_id:  
        specific_shots += 1
        if shots[i].made.value == 1:
            goals += 1
print("Goal Percentage: {}, Shots Taken: {}".format(goals/specific_shots, specific_shots))

Goal Percentage: 0.12734082397003746, Shots Taken: 267
time: 246 ms


In [1409]:
goals = 0
total_shots = len(shots)
specific_shots = 0
for i in range(total_shots):
    if shots[i].individual_play.value == 1 and shots[i].shot_pitch_location.name == 'out_of_box_center' and shots[i].x < 85 and shots[i].team == shots[i].home_team_id:  
        specific_shots += 1
        if shots[i].made.value == 1:
            goals += 1
print("Goal Percentage: {}, Shots Taken: {}".format(goals/specific_shots, specific_shots))

Goal Percentage: 0.028037383177570093, Shots Taken: 1819
time: 141 ms


In [1593]:
goals = 0
total_shots = len(shots)
specific_shots = 0
for i in range(total_shots):
    if shots[i].individual_play.value == 0 and shots[i].shot_pitch_location.name == 'out_of_box_center' and shots[i].team == shots[i].away_team_id:  
        specific_shots += 1
        if shots[i].made.value == 1:
            goals += 1
print("Goal Percentage: {}, Shots Taken: {}".format(goals/specific_shots, specific_shots))

Goal Percentage: 0.03513513513513514, Shots Taken: 370
time: 136 ms


In [1579]:
goals = 0
total_shots = len(shots)
specific_shots = 0
for i in range(total_shots):
    if shots[i].individual_play.value == 0 and shots[i].shot_pitch_location.name == 'box_center' and shots[i].body_part.name != 'head' and shots[i].team == shots[i].home_team_id:  
        specific_shots += 1
        if shots[i].made.value == 1:
            goals += 1
print("Goal Percentage: {}, Shots Taken: {}".format(goals/specific_shots, specific_shots))

Goal Percentage: 0.18343815513626835, Shots Taken: 954
time: 147 ms


In [1194]:
goals = 0
total_shots = len(shots)
specific_shots = 0
for i in range(total_shots):
    if shots[i].individual_play.value == 1 and shots[i].shot_pitch_location.name == 'box_center' and shots[i].team == shots[i].away_team_id:  
        specific_shots += 1
        if shots[i].made.value == 1:
            goals += 1
print("Goal Percentage: {}, Shots Taken: {}".format(goals/specific_shots, specific_shots))

Goal Percentage: 0.20491803278688525, Shots Taken: 244
time: 132 ms


In [1188]:
goals = 0
total_shots = len(shots)
specific_shots = 0
for i in range(total_shots):
    if shots[i].shot_pitch_location.name == 'small_box_center' and shots[i].team == shots[i].away_team_id:  
        specific_shots += 1
        if shots[i].made.value == 1:
            goals += 1
print("Goal Percentage: {}, Shots Taken: {}".format(goals/specific_shots, specific_shots))

Goal Percentage: 0.4691358024691358, Shots Taken: 162
time: 129 ms


In [1204]:
goals = 0
total_shots = len(shots)
specific_shots = 0
for i in range(total_shots):
    if (shots[i].shot_pitch_location.name == 'small_box_left' or shots[i].shot_pitch_location.name == 'small_box_right'):  
        specific_shots += 1
        if shots[i].made.value == 1:
            goals += 1
print("Goal Percentage: {}, Shots Taken: {}".format(goals/specific_shots, specific_shots))

Goal Percentage: 0.1918158567774936, Shots Taken: 391
time: 201 ms


In [1026]:
specific_shots

391

time: 2.75 ms


In [967]:
residuals_log = test.model_accuracy2(LogisticRegression)

31
55
79
102
127
142
173
198
224
243
273
288
319
341
381
401
419
440
468
496
511
540
569
592
611
641
666
698
721
741
769
796
827
851
878
903
931
951
959
982
1010
1036
1060
1090
1129
1161
1190
1217
1243
1272
1306
1333
1361
1386
1413
1444
1467
1496
1517
1538
1559
1599
1623
1652
1674
1711
1738
1761
1782
1807
1819
1842
1871
1899
1924
1954
1985
2010
2042
2060
2087
2107
2129
2153
2174
2204
2225
2248
2277
2296
2328
2355
2377
2408
2443
2464
2484
2516
2542
2569
2593
2621
2637
2670
2696
2726
2753
2785
2810
2835
2859
2885
2900
2922
2949
2966
2996
3021
3039
3063
3080
3104
3135
3154
3185
3207
3233
3249
3267
3294
3317
3345
3360
3374
3403
3434
3460
3486
3509
3535
3557
3581
3614
3648
3673
3691
3727
3757
3782
3818
3836
3852
3874
3894
3912
3940
3964
3991
4012
4036
4063
4083
4116
4144
4164
4192
4221
4253
4270
4292
4307
4340
4367
4385
4415
4443
4484
4501
4523
4549
4568
4599
4626
4650
4675
4698
4717
4747
4772
4790
4814
4837
4854
4885
4915
4943
4969
4989
5013
5033
5069
5093
5114
5129
5161
5180
5213
5241
526

In [704]:
from scipy.stats import linregress

time: 809 µs


In [969]:
np.var(residuals_log)

2.8502319941370269

time: 2.53 ms


In [1588]:
data_home = pd.DataFrame({'residuals':residuals_home})
data_away = pd.DataFrame({'residuals':residuals_away})
data_home['index'] = data_home.index
data_away['index'] = data_away.index

time: 3.49 ms


In [1591]:
np.mean(abs(data_away['residuals']) < 1)

0.7020316027088036

time: 2.88 ms


In [1248]:
show(Histogram(data_home, 'residuals'))

time: 1.16 s


In [1564]:
show(Histogram(data_away, 'residuals'))

time: 821 ms


In [707]:
linregress(expected_goals, actual_goals).rvalue

0.15862517663480211

time: 3.2 ms


In [640]:
mod_rf = create_model(shots, RandomForestClassifier, iters = 25, 
                      new_features=['made', 'fast_break', 'set_piece', 'left_foot', 'right_foot', 'assisted', 
                                    'individual_play']).model_obj

RandomForestClassifier: Mean Accuracy: 0.9184797768479777, Variance: 2.666853404760659e-05
time: 3.95 s


In [678]:
create_model(shots,ExtraTreesClassifier, iters = 250)

ExtraTreesClassifier: Mean Accuracy: 0.9185592747559275, Variance: 2.0071040228442917e-05


<__main__.ModelResults at 0x1103f6ac8>

time: 16.1 s


In [679]:
create_model(shots,BaggingClassifier, iters = 10)

BaggingClassifier: Mean Accuracy: 0.915655509065551, Variance: 1.8101194540244985e-05


<__main__.ModelResults at 0x10d88f940>

time: 4.79 s


In [708]:
expected_goals, actual_goals = model_accuracy(shots, mod_rf, show_plot = True)

time: 3.52 s


time: 796 µs


In [16]:
with open('data', 'w') as fd:
    fd.write(json.dumps(instances_list, indent=4))

time: 1.5 s


In [378]:
with open('data', 'r') as fd:
    instances_list = json.loads(fd.read())

time: 392 ms


In [1463]:
1-np.mean(data['made'])

0.8933011049723757

time: 2.92 ms


In [974]:
goals = 0
total_shots = len(shots)
for i in range(total_shots):
    if shots[i].made.value == 1:
        goals += 1
goals/total_shots

0.0998256320836966

time: 24.5 ms


In [17]:
total_shots = len(shots)
distance_list = []
goals_list = []
for i in range(total_shots):
    distance_list.append(shots[i].distance)
    goals_list.append(shots[i].made.value)

time: 72.9 ms


In [19]:
from scipy.optimize import minimize
import numpy as np

x_data = [5.0, 10.0, 2.0, 20.0, 50.0, 0.5, 5.5, 7.0]
y_data = [1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0]


class ExponentialFitter(object):
    def __init__(self):
        self.fitted_exponential = lambda x: 0.5
        self.fitted_likelihood = lambda x: 0.5

    @staticmethod
    def exponential(x, theta):
        return np.exp(-theta*x)

    def likelihood(self, xy, theta):
        x, y = xy
        if y:
            return self.exponential(x, theta)
        return 1.0 - self.exponential(x, theta)

    def fit(self, x_data, y_data, verbose=False):
        def sum_likelihood(theta):
            sum_nllh = 0.0
            for xy in zip(x_data, y_data):
                llh = self.likelihood(xy, theta)
                sum_nllh += -np.log(llh)
            return sum_nllh

        result = minimize(sum_likelihood, 0.1)
        if verbose:
            print(result)
        self.fitted_exponential = lambda x: self.exponential(x, *result.x)
        self.fitted_likelihood = lambda xy: self.likelihood(xy, *result.x)
        return self.fitted_exponential
    
    def total_likelihood(self, x_data, y_data):
        llh = 0.0
        for xy in zip(x_data, y_data):
            llh += -np.log(self.fitted_likelihood(xy))
        return llh
    
    @staticmethod
    def naive_likelihood(made, base_probability):
        return base_probability if made else 1.0 - base_probability
    
    def naive_total_likelihood(self, y_data):
        base_probability = np.mean(y_data)
        llh = 0.0
        for y in y_data:
            llh += -np.log(self.naive_likelihood(y, base_probability))
        return llh
    
fitter = ExponentialFitter()
fit_func = fitter.fit(distance_list, goals_list, verbose=True)

# to get probability for a shot of distance 10.0
print(fit_func(10.0))

# total likelihoods
print("Naive likelihood: {} Fit Likelihood: {}".format(
    fitter.naive_total_likelihood(goals_list),
    fitter.total_likelihood(distance_list, goals_list)
))

      fun: 3258.693799482519
 hess_inv: array([[  1.52815427e-07]])
      jac: array([-0.0005188])
  message: 'Desired error not necessarily achieved due to precision loss.'
     nfev: 161
      nit: 7
     njev: 50
   status: 2
  success: False
        x: array([ 0.15733668])
0.207345911984
Naive likelihood: 3629.1581575705736 Fit Likelihood: 3258.693799482519
time: 14.2 s


In [20]:
data_test = pd.DataFrame({'Distance': distance_list, 'Made': goals_list})
show(Histogram(data_test, 'Distance', 'Made', color='Made'))

time: 1.87 s


In [21]:
from bokeh.plotting import figure

time: 787 µs


In [24]:
from bokeh.models.annotations import Label

time: 793 µs


In [43]:
x_list_home = []
y_list_home = []
x_list_away = []
y_list_away = []
probabilities_home = []
probabilities_away = []
home_team_name = None 
away_team_name = None 
home_team_actual = 0
away_team_actual = 0
for i in range(29):
    probability = fit_func(shots[i].distance)
    if shots[i].team == shots[i].home_team_id:
        x_list_home.append(shots[i].x)
        y_list_home.append(shots[i].y)
        probabilities_home.append(probability*100)
        if shots[i].made.value == 1:
            home_team_actual += 1
    if shots[i].team == shots[i].away_team_id:
        x_list_away.append(105 - shots[i].x)
        y_list_away.append(shots[i].y)
        probabilities_away.append(probability*100)
        if shots[i].made.value == 1:
            away_team_actual += 1
    if i == 28:
        home_team_name = shots[i].home_team_name
        away_team_name = shots[i].away_team_name
home_team_xG = round(sum(probabilities_home)/100,1)
away_team_xG = round(sum(probabilities_away)/100,1)

time: 13.6 ms


In [48]:
p = figure(plot_width=105*7, plot_height=68*7)
p.quad(top=[69], bottom=[-1], left=[-1], right=[106], color="green", alpha=.75)
p.line([18, 18], [12, 56], line_width = 3, color = 'white')
p.line([87, 87], [12, 56], line_width = 3, color = 'white')
p.line([0, 18], [12, 12], line_width = 3, color = 'white')
p.line([87, 105], [12, 12], line_width = 3, color = 'white')
p.line([0, 18], [56, 56], line_width = 3, color = 'white')
p.line([87, 105], [56, 56], line_width = 3, color = 'white')
p.line([6, 6], [24, 44], line_width = 3, color = 'white')
p.line([99, 99], [24, 44], line_width = 3, color = 'white')
p.line([0, 6], [24, 24], line_width = 3, color = 'white')
p.line([99, 105], [24, 24], line_width = 3, color = 'white')
p.line([0, 6], [44, 44], line_width = 3, color = 'white')
p.line([99, 105], [44, 44], line_width = 3, color = 'white')
p.line([52.5, 52.5], [0, 68], line_width = 3, color = 'white')
p.line([0, 0], [0, 68], line_width = 3, color = 'white')
p.line([105, 105], [0, 68], line_width = 3, color = 'white')
p.line([0, 105], [0, 0], line_width = 3, color = 'white')
p.line([0, 105], [68, 68], line_width = 3, color = 'white')
p.circle(12, 34, color="white", size=10)
p.circle(94, 34, color="white", size=10)
p.circle(52.5, 34, color="white", size=10)
p.arc(52.5, 34, radius=10, start_angle=0, end_angle=6.28, color="white", line_width=3)
p.arc(12, 34, radius=10, start_angle=5.34, end_angle=.92, color="white", line_width=3)
p.arc(94, 34, radius=10, start_angle=2.35, end_angle=3.93, color="white", line_width=3)
p.square(x_list_home, y_list_home, size=probabilities_home, color="blue", alpha=0.5)
p.square(x_list_away, y_list_away, size=probabilities_away, color="red", alpha=0.5)
p.x_range.start, p.x_range.end, p.y_range.start, p.y_range.end = [-1, 106, -1, 69]
mytext = Label(x=34, y=64, text=away_team_name + " vs. " + home_team_name, text_color="black")
mytext2 = Label(x=34, y=60, text="Rough xG sum: {} - {}".format(away_team_xG, home_team_xG), text_color="black")
mytext3 = Label(x=34, y=1, text="Score: {} - {}".format(away_team_actual, home_team_actual), text_color="black")
p.add_layout(mytext)
p.add_layout(mytext2)
p.add_layout(mytext3)
show(p)

time: 550 ms
