In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from cleaner import cleaner

import pyprind  # progress bar

import pandas as pd
import numpy as np
from numpy.random import choice

## Data Preparation

In [3]:
## uncomment for train-test split
# df = pd.read_csv('../data/train.csv', index_col='shot_id')

# inds = choice(df.index, int(len(df)*0.2))
# df_train = df.drop(inds)
# df_test = df.loc[inds,:].copy()  # without copy pandas throws a lot of `SettingWithCopy` warning


## uncomment for submission
df_train = pd.read_csv('../data/train.csv', index_col='shot_id')
df_test  = pd.read_csv('../data/submission.csv', index_col='shot_id')

In [4]:
df_train = cleaner(df_train)
df_test  = cleaner(df_test)

### Verify

In [5]:
print df_train.shape
print df_test.shape

(25697, 17)
(5000, 17)


In [6]:
df_train.head()

Unnamed: 0_level_0,playoffs,shot_made_flag,game_date,3ptr,last_3_years,backcourt,C+RA,better_when_homefield,combined_shot_type_Dunk,combined_shot_type_Hook Shot,combined_shot_type_Jump Shot,combined_shot_type_Layup,combined_shot_type_Tip Shot,final_4s,final_s,4th_period,extra_time
shot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
22902,0,0.0,1996-11-03,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0,0,0,0
22903,0,0.0,1996-11-05,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0,0,0,0,0
22904,0,1.0,1996-11-06,1,0,0,0,1,0.0,0.0,1.0,0.0,0.0,0,0,0,0
22905,0,0.0,1996-11-06,1,0,0,1,1,0.0,0.0,1.0,0.0,0.0,0,0,0,0
22906,0,1.0,1996-11-06,0,0,0,1,1,0.0,0.0,1.0,0.0,0.0,0,0,0,0


## Models

### Baseline Model (Constant Probability)

In [None]:
i,j = 0,0                     # for train
k, y_pred, p_pred = 0, [], [] # for test

bar = pyprind.ProgBar(len(df_test))

n = 10   # for debugging

while k < len(df_test):
    isTrain = (df_train['game_date'].iat[j] < df_train['game_date'].iat[k]) or \
              (df_train['game_date'].iat[j] == df_train['game_date'].iat[k] and \
               df_train.index[k] < df_test.index[k])
    
    if isTrain:
        j += 1

    else:
        if j == 0:
            p_pred += 0.5,
        else:
            p_pred += df_train[:j+1]['shot_made_flag'].sum() / float(j),
            
        y_pred += int(p_pred[-1] >= 0.5),
        
        k += 1
        bar.update()
        
## for debugging:
#     n -= 1
#     print n, p_pred, y_pred
#     if n < 0:
#         break

| Model | Accuracy | Log Loss | Time Elapsed |
| -- | -- | -- | -- |
| Constant Probability | 0.554193422845 | 0.687805243243 | 00:00:03 |

### Bayesian Update

In [None]:
import scipy.stats as scs

In [None]:
i,j = 0,0                     # for train
k, y_pred, p_pred = 0, [], [] # for test

bar = pyprind.ProgBar(len(df_test))

alpha = 1
beta  = 1
prior = scs.beta(a=alpha, b=beta)   # uniform distribution

n = 10  # for debugging

while k < len(df_test):
    isTrain = (df_train['game_date'].iat[j] < df_train['game_date'].iat[k]) or \
              (df_train['game_date'].iat[j] == df_train['game_date'].iat[k] and \
               df_train.index[k] < df_test.index[k])
    
    if isTrain:
        
        if df_train['shot_made_flag'].iat[j] == 1:
            alpha += 1
        else:
            beta += 1
        
        j += 1

    else:
        
        y_pred += int(prior.rvs() >= 0.5),
        
        if j == 0:
            p_pred += 0.5,
        else:
            p_pred += (alpha-1) / float(alpha + beta - 2),
        
        k += 1
        bar.update()

        
    # update prior        
    prior = scs.beta(a=alpha, b=beta)       # posterior becomes the new prior 
        
## for debugging:        
#     n -= 1
#     print n, p_pred, y_pred
#     if n < 0:
#         break

| Model | Accuracy | Log Loss | Time Elapsed |
| -- | -- | -- | -- |
| Bayesian | 0.549717843939 | 0.687771751616 | 00:00:12 |

### Online Models

In [7]:
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

In [8]:
i,j = 0,0                     # for train
k, y_pred, p_pred = 0, [], [] # for test

bar = pyprind.ProgBar(len(df_test))

## online learning methods
# model = SGDClassifier(loss='log', warm_start=True)  # only logistic regression can compute probability
# model = RandomForestClassifier(n_estimators=50, n_jobs=-1, warm_start=True)
model = MultinomialNB()
# model = GaussianNB()

n = 10   # for debugging

while k < len(df_test):
    isTrain = (df_train['game_date'].iat[j] < df_train['game_date'].iat[k]) or \
              (df_train['game_date'].iat[j] == df_train['game_date'].iat[k] and \
               df_train.index[k] < df_test.index[k])
    
    if isTrain:
        j += 1

    else:
        if j == 0:
            y_pred += choice([0,1]),  # randomly guess
            p_pred += 0.5,
        
        else:
            # train
            row = df_train[i:j+1]
            
            if model.__class__.__name__ in ['MultinomialNB', 'SGDClassifier']:
                model.partial_fit(row.drop(['shot_made_flag', 'game_date'], axis=1), 
                                  row['shot_made_flag'],
                                  classes=(0,1))
            else:
                model.fit(row.drop(['shot_made_flag', 'game_date'], axis=1), 
                          row['shot_made_flag'])
            
            # predict
            row = df_test[k:k+1]  # must use list slicing for pythonic indexing
            y_pred += model.predict(row.drop(['shot_made_flag', 'game_date'], axis=1)),
            p_pred += model.predict_proba(row.drop(['shot_made_flag', 'game_date'], axis=1))[0][1],
            
            
            # prep for next round: need to increase no. of trees for warm start
            if model.__class__.__name__ in ['RandomForestClassifier']:
                model.n_estimators += 10  
            
            
        k += 1
        bar.update()

## for debugging:
#     n -= 1
#     print n, p_pred, y_pred
#     if n < 0:
#         break

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:25


| Model | Accuracy | Log Loss | Time Elapsed |
| -- | -- | -- | -- |
| SGD (logistic) | 0.564506713368 | 1.05732119307 | 00:00:23 |
| Random Forest |  |  | takes forever |
| Naive Bayes (multinomial) | 0.604397742752 | 0.66269579501 | 00:00:29 |
| Naive Bayes (gaussian) | 0.594084452228 | 2.23953381407 | 00:00:27 | 

n.b.:
* SGD must use `partial_fit`, otherwise `warm_start` is ignored!
* Random forest must increase `n_estimators`, otherwise no new tree is created with `warm_start`!

### Cold Start Everytime

In [None]:
i,j = 0,0                     # for train
k, y_pred, p_pred = 0, [], [] # for test

bar = pyprind.ProgBar(len(df_test))

## new model every time
model = RandomForestClassifier(n_estimators=50, n_jobs=-1)
movingWindow = False

n = 10   # for debugging

while k < len(df_test):
    isTrain = (df_train['game_date'].iat[j] < df_train['game_date'].iat[k]) or \
              (df_train['game_date'].iat[j] == df_train['game_date'].iat[k] and \
               df_train.index[k] < df_test.index[k])
    
    if isTrain:
        j += 1

    else:
        if j == 0:
            y_pred += choice([0,1]),  # randomly guess
            p_pred += 0.5,
        
        else:
            # train
            row = df_train[i:j+1]
            model.fit(row.drop(['shot_made_flag', 'game_date'], axis=1), 
                      row['shot_made_flag'])
            
            
            if movingWindow:
                i = j-100 if j-100 > 0 else 0  # train on only the last 100 shots
                
                
                
            # predict
            row = df_test[k:k+1]  # must use list slicing for pythonic indexing
            y_pred += model.predict(row.drop(['shot_made_flag', 'game_date'], axis=1)),
            p_pred += model.predict_proba(row.drop(['shot_made_flag', 'game_date'], axis=1))[0][1],
            
        k += 1
        bar.update()

## for debugging:        
#     n -= 1
#     print n, y_pred, p_pred
#     if n < 0:
#         break

| Model | Accuracy | Log Loss | Time Elapsed |
| -- | -- | -- | -- |
| Random Forest | 0.603424790815 | 0.68181330833 | 00:49:49 |
| Random Forest (moving window) | 0.595251994551 | 0.768489946031 | 00:49:17 |

In [None]:
from itertools import izip

In [None]:
features = list(df_test.columns)

In [None]:
features.remove('shot_made_flag')
features.remove('game_date')

In [None]:
sorted(izip(features, model.feature_importances_), key=lambda t: t[1], reverse=True)[:10]

## Scoring Metric: [Logarithmic Loss](https://www.kaggle.com/wiki/LogarithmicLoss)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
import scipy as sp

def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

In [None]:
print 'accuracy:', accuracy_score(df_test['shot_made_flag'], y_pred)
print 'log loss:', logloss(df_test['shot_made_flag'], p_pred)

## Prepare submission csv

In [9]:
df_submission = pd.Series(p_pred, index=df_test.index, name='shot_made_flag')

In [10]:
df_submission.sort_index(inplace=True)
df_submission.head()

shot_id
1     0.375249
8     0.425787
17    0.000569
20    0.508281
33    0.381880
Name: shot_made_flag, dtype: float64

In [11]:
df_submission.to_csv('../data/prediction.csv', header=True)