## Imports

In [30]:
import numpy as np
import pandas as pd
import wget
import os
import zipfile

In [40]:
## Download and Unzip Data

In [19]:
# First and last season to fetch (both inclusive), and one archive URL per season.
start, end = 2007, 2018
years = list(range(start, end + 1))
url_format = "http://peter-tanner.com/moneypuck/downloads/shots_{year}.zip"
urls = [url_format.format(year=season) for season in years]

In [29]:
%time list(map(lambda x: wget.download(x, os.getcwd()), urls))

CPU times: user 1.85 s, sys: 750 ms, total: 2.6 s
Wall time: 25 s


['/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2007.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2008.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2009.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2010.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2011.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2012.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2013.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2014.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2015.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2016.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2017.zip',
 '/Users/jai.ghose/NHL-MoneyPuck-Prediction/shots_2018.zip']

In [34]:
def unzipper(filePath, dest=None):
    """Extract every member of a zip archive into a directory.

    Parameters
    ----------
    filePath : str or path-like
        Path of the zip archive to read.
    dest : str or path-like, optional
        Directory to extract into. When ``None`` (the default), the current
        working directory at call time is used.

    Returns
    -------
    None

    Raises
    ------
    Whatever ``zipfile.ZipFile`` / ``extractall`` raise, e.g.
    ``FileNotFoundError`` for a missing archive or ``zipfile.BadZipFile``
    for a file that is not a valid zip.
    """
    # The context manager closes the archive even when extraction fails, so a
    # bad archive does not leave an open file handle behind.
    with zipfile.ZipFile(filePath, 'r') as zip_ref:
        dest_path = dest if dest is not None else os.getcwd()
        zip_ref.extractall(dest_path)

In [38]:
# Path of each season's archive inside the current working directory.
fPaths = [
    os.path.join(os.getcwd(), "shots_{}.zip".format(season))
    for season in years
]

In [39]:
%time list(map(unzipper, fPaths))

CPU times: user 3.88 s, sys: 380 ms, total: 4.26 s
Wall time: 4.37 s


[None, None, None, None, None, None, None, None, None, None, None, None]

## Load Data

In [48]:
# Read the 2018 shots CSV (path is relative to the working directory) and use
# its first column as the index.
# NOTE(review): presumably this file is produced by extracting shots_2018.zip above - confirm.
data = pd.read_csv("shots_2018.csv", index_col=0)

In [49]:
# Show summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

Unnamed: 0,arenaAdjustedShotDistance,arenaAdjustedXCord,arenaAdjustedXCordABS,arenaAdjustedYCord,arenaAdjustedYCordAbs,averageRestDifference,awayEmptyNet,awayPenalty1Length,awayPenalty1TimeLeft,awaySkatersOnIce,...,xCordAdjusted,xFroze,xGoal,xPlayContinuedInZone,xPlayContinuedOutsideZone,xPlayStopped,xRebound,xShotWasOnGoal,yCord,yCordAdjusted
count,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0,...,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0,70754.0
mean,35.81771,0.230814,58.908768,-0.208469,15.859118,-1.992824,0.014091,13.560505,6.333819,4.890084,...,59.739011,0.160741,0.066269,0.393554,0.302334,0.023097,0.048431,0.707702,-0.202872,-0.048223
std,19.170421,61.921737,19.080429,19.464177,11.286385,12.755147,0.117867,41.309812,22.723453,0.389794,...,18.949264,0.062249,0.094252,0.101082,0.119791,0.009574,0.023081,0.130724,18.906039,18.907066
min,1.0,-97.0,0.0,-51.0,0.0,-139.4,0.0,0.0,0.0,3.0,...,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,-42.0,-42.0
25%,19.0,-61.0,44.0,-15.0,6.0,-5.6,0.0,0.0,0.0,5.0,...,45.0,0.121912,0.014314,0.361146,0.255028,0.019329,0.035289,0.659357,-14.0,-14.0
50%,35.0,0.0,61.0,-0.0,15.0,0.0,0.0,0.0,0.0,5.0,...,63.0,0.154591,0.034659,0.414955,0.295628,0.02159,0.043101,0.717448,0.0,0.0
75%,50.0,61.0,75.0,14.0,25.0,0.0,0.0,0.0,0.0,5.0,...,76.0,0.193398,0.080957,0.453054,0.324,0.024844,0.057464,0.773692,14.0,14.0
max,98.0,99.0,99.0,49.0,51.0,104.8,1.0,600.0,568.0,6.0,...,99.0,0.725385,0.987757,0.685085,0.959,0.409448,0.239279,0.992156,42.0,42.0


In [None]:
## http://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas

In [59]:
# Temporary attribute names: can be used to hold intermediary things for processing.
tmp_properties = ["tmp_col"]

# Regular attribute names of a team.
# TODO: add more attributes of a team
properties = [
    "team_name",
    "players",
    "data_range",
]

In [60]:
class Team(pd.DataFrame):
    """DataFrame subclass that can carry team-level attributes next to the data.

    Follows the pandas subclassing recipe: attribute names registered in
    ``_internal_names`` / ``_internal_names_set`` are temporary attributes,
    names in ``_metadata`` are regular custom attributes, and ``_constructor``
    tells pandas which class to build for DataFrame results.
    """

    # temporary properties (names come from the module-level `tmp_properties`
    # list; the previous reference to `tmp_vars` was never defined and raised
    # NameError on a fresh kernel)
    _internal_names = pd.DataFrame._internal_names + tmp_properties
    _internal_names_set = set(_internal_names)

    # normal properties (names come from the module-level `properties` list)
    _metadata = properties

    @property
    def _constructor(self):
        """Return the class pandas should use when building new frames from this one."""
        return Team

In [61]:
# Wrap the loaded shot data in the Team subclass.
tmp = Team(data)

In [62]:
# Attach a custom attribute to the Team instance ("players" is one of the names in `properties`).
tmp.players = set(["Ovi", "Crosby"])

In [63]:
# Display the current value of the `players` attribute.
tmp.players

{'Crosby', 'Ovi'}

## Matrix Factorization Idea

Let $G_{H,A,(t, t+\Delta)}$ be the data matrix for two teams, $H$ and $A$, built from the data between times $t$ and $t+\Delta$, as available at time $t+\Delta$. We wish to learn a transformation $P$, such that:
$$G_{H, A, (t_0, t)}P = G_{h, a,(t, t+1)}$$
Basically, we wish to learn the data matrix for the following game for two other opponents ($h$ and $a$ in the equation above). We need to structure $G$ so that it has a fixed dimension and $G_{H, A, (t_0, t)} = G_{A, H, (t_0, t)}$, so that $P$ can have a fixed dimension as well. We need to experiment with how to construct such data matrices from the granular shot data. Once we figure out how to aggregate and subset the data, we can decide which entries of the input or output to substitute and which to leave empty (for example, goals), and then use the models in practice.

In [None]:
## http://blog.echen.me/2011/10/24/winning-the-netflix-prize-a-summary/

## Decision Tree

In [None]:
## https://towardsdatascience.com/interactive-visualization-of-decision-trees-with-jupyter-widgets-ca15dd312084

## Simulation Based on PGM

We create a probabilistic graphical model (PGM) based on our observations of the data and randomly sample shots from it, in some form, to make up a game.

sample shots in a game -> sample shot characteristics with probability according to some rules -> sum example goals

e.g. shots against defense A vs defense B

In [None]:
## https://en.wikipedia.org/wiki/Graphical_model