In [352]:
import os
import json
import numpy as np
from scipy.optimize import minimize
from math import isnan
%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 3.43 ms


In [809]:
class Shot(object):
    """A single shot record whose attributes are attached dynamically.

    Instances start empty; the loader copies every key of a shot record onto
    the object with ``setattr``. The ``change_*`` methods then normalise or
    derive fields in place, and the properties expose boolean views that can be
    used as filter keys.
    """

    # Cross-section thresholds used by the ``cross_section_*`` properties.
    # NOTE(review): presumably the mean and quartiles of the data set these
    # shots come from -- confirm before reusing with other data.
    CROSS_SECTION_AVERAGE = 7.85
    CROSS_SECTION_TOP_QUARTILE = 7.97
    CROSS_SECTION_BOTTOM_QUARTILE = 7.60

    def __init__(self):
        pass

    def change_made(self):
        """Convert ``self.made`` from 'yes'/'no' to 1/0; other values are left untouched."""
        if self.made == 'yes':
            self.made = 1
        elif self.made == 'no':
            self.made = 0

    def has_attribute(self, event_type):
        """Return True when a 'qualifier' tag of the shot has the given value.

        :param event_type: qualifier value to look for, e.g. 'Head'
        :type event_type: str
        :rtype: bool
        """
        return any(descriptor['key'] == 'qualifier' and descriptor['value'] == event_type
                   for descriptor in self.shot['tags'])

    def change_free_kick(self):
        """Set ``self.free_kick`` from the 'Free kick' qualifier."""
        self.free_kick = self.has_attribute('Free kick')

    def change_corner(self):
        """Set ``self.corner`` from the 'From corner' qualifier."""
        self.corner = self.has_attribute('From corner')

    def change_throw_in(self):
        """Set ``self.throw_in`` from the 'Throw-in set piece' qualifier."""
        self.throw_in = self.has_attribute('Throw-in set piece')

    def change_header(self):
        """Set ``self.header`` from the 'Head' qualifier."""
        self.header = self.has_attribute('Head')

    def change_distance(self):
        """Recompute ``self.distance`` as the Euclidean norm of (x - goal_x, y)."""
        self.distance = np.sqrt((self.x - self.goal_x)**2 + self.y**2)

    @property
    def more_than_one_defender(self):
        """True when more than one defender is in the shot triangle."""
        return bool(self.defenders_in_shot_triangle > 1)

    @property
    def more_than_two_defenders(self):
        """True when more than two defenders are in the shot triangle."""
        return bool(self.defenders_in_shot_triangle > 2)

    @property
    def more_than_three_defenders(self):
        """True when more than three defenders are in the shot triangle."""
        return bool(self.defenders_in_shot_triangle > 3)

    @property
    def nobody_in_shot_triangle(self):
        """True when no defender is in the shot triangle."""
        return bool(self.defenders_in_shot_triangle == 0)

    @property
    def speed_more_than_3(self):
        """True when ``speed_5`` is at least 3.0."""
        return bool(self.speed_5 >= 3.0)

    @property
    def speed_more_than_5(self):
        """True when ``speed_5`` is at least 5.0."""
        return bool(self.speed_5 >= 5.0)

    @property
    def cross_section_more_than_average(self):
        """True when ``cross_section`` is at or above the average threshold."""
        return bool(self.cross_section >= self.CROSS_SECTION_AVERAGE)

    @property
    def cross_section_top_25_percent(self):
        """True when ``cross_section`` is at or above the top-quartile threshold."""
        return bool(self.cross_section >= self.CROSS_SECTION_TOP_QUARTILE)

    @property
    def cross_section_bottom_25_percent(self):
        """True when ``cross_section`` is at or below the bottom-quartile threshold."""
        # The bottom quartile lies *below* its threshold; the previous ``>=``
        # comparison selected the other three quarters instead.
        return bool(self.cross_section <= self.CROSS_SECTION_BOTTOM_QUARTILE)

time: 80.8 ms


In [859]:
# Load every per-game shot file, wrap each record in a Shot and keep the usable ones.
shots = []
excluded = 0
shot_dir = '/Users/jason.katz/Downloads/shot_jsons'
# NOTE(review): the first directory entry is skipped -- presumably a hidden file
# such as .DS_Store. os.listdir() returns entries in arbitrary order, so confirm
# that this really skips what is intended.
for game_data in os.listdir(shot_dir)[1:]:
    with open(os.path.join(shot_dir, game_data), 'r') as fd:
        loaded_game_dict_list = json.load(fd)
    for shot in loaded_game_dict_list:
        shot_obj = Shot()
        # Copy every field of the record onto the object as an attribute.
        for key, value in shot.items():
            setattr(shot_obj, key, value)
        usable = (not isnan(shot_obj.distance)
                  and not isnan(shot_obj.goalkeeper_x)
                  and shot_obj.distance < shot_obj.pitch_width / 2)
        if usable:
            shot_obj.change_made()
            shot_obj.change_free_kick()
            shot_obj.change_corner()
            shot_obj.change_throw_in()
            shot_obj.change_header()
            shots.append(shot_obj)
        else:
            # Count every shot that was not kept. The former second condition
            # used a strict ">" and so missed shots whose distance is exactly
            # pitch_width / 2: they were neither kept nor counted.
            excluded += 1
print("{} shots have been filtered out".format(excluded))

263 shots have been filtered out
time: 2.99 s


In [864]:
# Fit one curve per shot segment and print the evaluation summary.
# Each dict below describes one segment as {attribute name: required value}.
test = FitData(shots)
test.create_model_fit(attribute_dict_list=[
    {
        'goalkeeper_in_shot_triangle': False,
        'nobody_in_shot_triangle': True,
    },
    {
        'goalkeeper_in_shot_triangle': False,
        'nobody_in_shot_triangle': False,
    },
    {
        'goalkeeper_in_shot_triangle': True,
        'free_kick': True,
    },
    {
        'goalkeeper_in_shot_triangle': True,
        'free_kick': False,
        'corner': True,
        'header': True,
    },
    {
        'goalkeeper_in_shot_triangle': True,
        'free_kick': False,
        'corner': True,
        'header': False,
    },
    {
        'goalkeeper_in_shot_triangle': True,
        'free_kick': False,
        'corner': False,
        'header': True,
    },
    {
        'goalkeeper_in_shot_triangle': True,
        'free_kick': False,
        'corner': False,
        'header': False,
        'more_than_one_defender': True,
    },
    {
        'goalkeeper_in_shot_triangle': True,
        'free_kick': False,
        'corner': False,
        'header': False,
        'more_than_one_defender': False,
        'cross_section_top_25_percent': True,
    },
    {
        'goalkeeper_in_shot_triangle': True,
        'free_kick': False,
        'corner': False,
        'header': False,
        'more_than_one_defender': False,
        'cross_section_top_25_percent': False,
    },
])


Goal Percentage: 27.3%, Shots Taken: 1246 -> {'goalkeeper_in_shot_triangle': False, 'nobody_in_shot_triangle': True}
Goal Percentage: 13.3%, Shots Taken: 653 -> {'goalkeeper_in_shot_triangle': False, 'nobody_in_shot_triangle': False}
Goal Percentage: 5.7%, Shots Taken: 441 -> {'goalkeeper_in_shot_triangle': True, 'free_kick': True}
Goal Percentage: 8.5%, Shots Taken: 611 -> {'goalkeeper_in_shot_triangle': True, 'free_kick': False, 'corner': True, 'header': True}
Goal Percentage: 6.2%, Shots Taken: 791 -> {'goalkeeper_in_shot_triangle': True, 'free_kick': False, 'corner': True, 'header': False}
Goal Percentage: 14.1%, Shots Taken: 688 -> {'goalkeeper_in_shot_triangle': True, 'free_kick': False, 'corner': False, 'header': True}
Goal Percentage: 4.9%, Shots Taken: 3272 -> {'goalkeeper_in_shot_triangle': True, 'free_kick': False, 'corner': False, 'header': False, 'more_than_one_defender': True}
Goal Percentage: 23.7%, Shots Taken: 586 -> {'goalkeeper_in_shot_triangle': True, 'free_kick': F

In [651]:
class NaiveModel(object):
    """Baseline that gives every shot the same scoring probability.

    The probability is the share of the supplied shots that were made.
    """

    def __init__(self, shots):
        """
        :param shots: objects exposing a numeric ``made`` attribute
        :type shots: list
        """
        goal_total = 0
        for shot in shots:
            goal_total += shot.made
        self.made = goal_total
        self.prob = goal_total / float(len(shots))

    def probability(self):
        """Return the constant scoring probability."""
        return self.prob

    def likelihood(self, made):
        """Return the probability this model assigns to the observed outcome."""
        return self.prob if made else 1.0 - self.prob

    def nllh(self, made):
        """Return the negative log-likelihood of the observed outcome."""
        outcome_probability = self.likelihood(made)
        return -np.log(outcome_probability)


class ExponentialFitter(object):
    """Fit ``P(made | x) = exp(-theta * x)`` to binary outcomes by maximum likelihood.

    Until :meth:`fit` has been called, the ``fitted_*`` callables describe a
    coin flip: probability 0.5 for either outcome.
    """

    def __init__(self):
        # Fitted decay rate; None until fit() has run.
        self.theta = None
        self.fitted_exponential = lambda x: 0.5
        self.fitted_likelihood = lambda xy: 0.5
        # The negative log-likelihood that matches a 0.5 likelihood is
        # -log(0.5), not 0.5 itself.
        self.fitted_nllh = lambda xy: -np.log(0.5)

    @staticmethod
    def exponential(x, theta):
        """Return ``exp(-theta * x)``."""
        return np.exp(-theta * x)

    def likelihood(self, xy, theta):
        """Return the probability of one observation under decay rate ``theta``.

        :param xy: pair ``(x, y)``; a truthy ``y`` means the shot was made
        :type xy: tuple
        :param theta: decay rate of the exponential
        """
        x, y = xy
        if y:
            return self.exponential(x, theta)
        return 1.0 - self.exponential(x, theta)

    def nllh(self, xy, theta):
        """Return the negative log-likelihood of one observation ``(x, y)``."""
        return -np.log(self.likelihood(xy, theta))

    def fit(self, x_data, y_data, verbose=False):
        """Estimate ``theta`` by minimising the summed negative log-likelihood.

        The result is stored in ``self.theta`` and the ``fitted_*`` callables
        are rebound to it.

        :param x_data: values of the explanatory variable, one per observation
        :param y_data: outcomes, truthy when the shot was made
        :param verbose: print the optimizer result when True
        :type verbose: bool
        :return: the fitted curve, a callable mapping ``x`` to ``exp(-theta * x)``
        """
        # Materialise the observations once so every objective evaluation is a
        # single vectorised numpy expression rather than a Python-level loop.
        pairs = list(zip(x_data, y_data))
        xs = np.array([x for x, _ in pairs], dtype=float)
        scored = np.array([bool(y) for _, y in pairs], dtype=bool)

        def sum_likelihood(theta):
            # minimize() passes a length-1 array; return a plain scalar.
            rate = np.ravel(theta)[0]
            goal_probability = self.exponential(xs, rate)
            outcome_probability = np.where(scored, goal_probability, 1.0 - goal_probability)
            return float(np.sum(-np.log(outcome_probability)))

        result = minimize(sum_likelihood, np.array([0.1]))
        if verbose:
            print(result)
        theta_hat = result.x[0]
        self.theta = theta_hat
        self.fitted_exponential = lambda x: self.exponential(x, theta_hat)
        self.fitted_likelihood = lambda xy: self.likelihood(xy, theta_hat)
        self.fitted_nllh = lambda xy: self.nllh(xy, theta_hat)
        return self.fitted_exponential

    def total_likelihood(self, x_data, y_data):
        """Return the summed *negative log*-likelihood of the data under the fitted model."""
        llh = 0.0
        for xy in zip(x_data, y_data):
            llh += -np.log(self.fitted_likelihood(xy))
        return llh

    @staticmethod
    def naive_likelihood(made, base_probability):
        """Return the probability of the outcome under a constant scoring probability."""
        return base_probability if made else 1.0 - base_probability

    def naive_total_likelihood(self, y_data):
        """Return the summed negative log-likelihood when every shot gets ``mean(y_data)``."""
        base_probability = np.mean(y_data)
        llh = 0.0
        for y in y_data:
            llh += -np.log(self.naive_likelihood(y, base_probability))
        return llh

time: 63.1 ms


In [583]:
class FitData(object):
    """Segment shots by attribute values, fit one curve per segment and evaluate the result.

    A segment is a dict ``{attribute name: required value}``; a shot belongs to
    it when every listed attribute equals the required value.
    """

    def __init__(self, shots):
        """
        :param shots: shot objects, expected to be grouped by ``game_id``
        :type shots: list
        """
        self.shots = shots
        self.base_fitter = None
        # NOTE(review): these default segments compare flags against 'yes'/'no'
        # strings. If the shots carry boolean flags instead, none of them will
        # match -- confirm which representation the shots use.
        self.attribute_dict_list = [{'free_kick': 'yes'},
                           {'free_kick': 'no', 'corner': 'yes', 'header': 'yes'},
                           {'free_kick': 'no', 'corner': 'yes', 'header': 'no'},
                           {'free_kick': 'no', 'corner': 'no', 'throw_in': 'yes'},
                           {'free_kick': 'no', 'corner': 'no', 'throw_in': 'no', 'header': 'yes'},
                           {'free_kick': 'no', 'corner': 'no', 'throw_in': 'no', 'header': 'no'}]
        self.dependent = None

    @staticmethod
    def _matches(shot, attribute_dict):
        """Return True when every attribute in ``attribute_dict`` equals the shot's value."""
        return all(getattr(shot, name) == value for name, value in attribute_dict.items())

    @staticmethod
    def _record_game_residuals(residuals, predictions, goals):
        """Append ``prediction - goals`` of the finished game for both teams."""
        for side in ('home', 'away'):
            residuals[side].append(predictions[side] - goals[side])

    def filter_by_attribute(self, attribute_names, attribute_values, dependent_variable='distance', verbose=True):
        """
        Select the shots matching all ``name == value`` pairs.

        :type attribute_names: list
        :type attribute_values: list
        :type dependent_variable: str
        :type verbose: bool
        :return: ``(dependent_list, goal_list)`` -- the dependent variable and the
            ``made`` value of every matching shot, in shot order
        :rtype: tuple
        """
        attribute_name_value_dict = dict(zip(attribute_names, attribute_values))
        self.dependent = dependent_variable
        # attribute_value must be the name associated to its value, not integer
        goal_count = 0
        dependent_list = []
        goal_list = []
        for shot in self.shots:
            if self._matches(shot, attribute_name_value_dict):
                dependent_list.append(getattr(shot, dependent_variable))
                goal_list.append(shot.made)
                if shot.made == 1:
                    goal_count += 1
        filtered_shots_count = len(dependent_list)
        if verbose:
            # An empty segment has no defined goal percentage; report NaN
            # instead of dividing by zero.
            if filtered_shots_count:
                goal_percentage = round(goal_count * 100.0 / filtered_shots_count, 1)
            else:
                goal_percentage = float('nan')
            print("Goal Percentage: {}%, Shots Taken: {} -> {}".format(goal_percentage, filtered_shots_count, attribute_name_value_dict))
        return dependent_list, goal_list

    def create_base_fit(self):
        """Fit a single curve and store it as ``self.base_fitter``.

        The fit uses the shots whose ``always_true`` attribute equals 'yes'
        (presumably a field set on every shot -- verify against the data).
        """
        dependent_list, goal_list = self.filter_by_attribute(['always_true'], ['yes'], verbose=False)
        base_fitter = ExponentialFitter()
        base_fitter.fit(dependent_list, goal_list, verbose=False)
        self.base_fitter = base_fitter

    def check_attribute_dict(self, attribute_dict_list=None):
        """Print goal percentage and shot count for each segment without fitting.

        :param attribute_dict_list: segments to check; defaults to ``self.attribute_dict_list``
        :type attribute_dict_list: list
        """
        if attribute_dict_list is None:
            attribute_dict_list = self.attribute_dict_list
        for attributes in attribute_dict_list:
            attribute_names = list(attributes.keys())
            attribute_values = list(attributes.values())
            self.filter_by_attribute(attribute_names, attribute_values)

    def create_model_fit(self, attribute_dict_list=None, dependent='distance', run=True):
        """Fit one ExponentialFitter per segment.

        :param attribute_dict_list: segments to fit; when given it replaces
            ``self.attribute_dict_list``
        :type attribute_dict_list: list
        :param dependent: name of the shot attribute used as explanatory variable
        :type dependent: str
        :param run: evaluate the fitted model with :meth:`run_model` when True
        :type run: bool
        :return: ``None`` when ``run`` is True, otherwise
            ``(fitters, fit_funcs, attribute_dict_list)``
        """
        self.dependent = dependent
        if attribute_dict_list is not None:
            self.attribute_dict_list = attribute_dict_list
        fitters = []
        fit_funcs = []
        for attributes in self.attribute_dict_list:
            attribute_names = list(attributes.keys())
            attribute_values = list(attributes.values())
            dependent_list, goal_list = self.filter_by_attribute(attribute_names, attribute_values,
                                                                 dependent_variable=dependent, verbose=True)
            fitter = ExponentialFitter()
            fitters.append(fitter)
            fit_funcs.append(fitter.fit(dependent_list, goal_list, verbose=False))
        if run:
            # Evaluate with the same variable the curves were fitted on;
            # run_model would otherwise fall back to its 'distance' default.
            self.run_model(fitters, fit_funcs, self.attribute_dict_list, dependent=dependent)
        else:
            return fitters, fit_funcs, self.attribute_dict_list

    def run_model(self, fitters, fit_funcs, attribute_dict_list, dependent='distance'):
        """Evaluate fitted segment models on ``self.shots`` and print a summary.

        Each shot is scored by the first segment it matches. Predictions and
        goals are accumulated per game and team; a game ends when ``game_id``
        changes, so shots must be grouped by game.

        :param fitters: fitted ExponentialFitter objects, one per segment
        :type fitters: list
        :param fit_funcs: fitted curves, one per segment
        :type fit_funcs: list
        :param attribute_dict_list: segment definitions, aligned with ``fitters``
        :type attribute_dict_list: list
        :param dependent: name of the shot attribute fed to the curves
        :type dependent: str
        :raises IndexError: when ``self.shots`` is empty
        """
        self.dependent = dependent
        game_id = self.shots[0].game_id
        goals = {'home': 0, 'away': 0}
        predictions = {'home': 0.0, 'away': 0.0}
        residuals = {'home': [], 'away': []}
        likelihood = 0
        naive_likelihood = 0
        base_likelihood = 0
        naive_model = NaiveModel(self.shots)
        for shot in self.shots:
            if shot.game_id != game_id:
                # Close the previous game before this shot is accumulated.
                self._record_game_residuals(residuals, predictions, goals)
                goals = {'home': 0, 'away': 0}
                predictions = {'home': 0.0, 'away': 0.0}
                game_id = shot.game_id
            team = shot.team_type
            made = shot.made
            dependent_value = getattr(shot, dependent)
            goals[team] += made
            for idx, attribute_dict in enumerate(attribute_dict_list):
                if self._matches(shot, attribute_dict):
                    predictions[team] += fit_funcs[idx](dependent_value)
                    likelihood += fitters[idx].fitted_nllh((dependent_value, made))
                    naive_likelihood += naive_model.nllh(made)
                    if self.base_fitter is not None:
                        base_likelihood += self.base_fitter.fitted_nllh((dependent_value, made))
                    break
        # Close the final game only after its last shot has been accumulated.
        self._record_game_residuals(residuals, predictions, goals)
        # noinspection PyTypeChecker
        home_within_goal = np.round(np.mean(np.abs(np.array(residuals['home'])) < 1) * 100, 1)
        # noinspection PyTypeChecker
        away_within_goal = np.round(np.mean(np.abs(np.array(residuals['away'])) < 1) * 100, 1)
        print("Dependent variable used: {}".format(self.dependent))
        print("xG within one goal for home: {}%".format(home_within_goal))
        print("xG within one goal for away: {}%".format(away_within_goal))
        print("Home Team Residuals -> Mean: {}, Variance: {}".format(np.round(np.mean(residuals['home']), 3),
                                                                     np.round(np.var(residuals['home']), 3)))
        print("Away Team Residuals -> Mean: {}, Variance: {}".format(np.round(np.mean(residuals['away']), 3),
                                                                     np.round(np.var(residuals['away']), 3)))
        print("Model Likelihood: {}".format(round(likelihood)))
        print("Naive Likelihood: {}".format(round(naive_likelihood)))
        if self.base_fitter is not None:
            print("Base Likelihood: {}".format(round(base_likelihood)))

time: 185 ms
