# Load packages

## Note that fastai v0.7 is needed rather than fastai v1.0.
### This link tells how to install fastai v0.7. https://forums.fast.ai/t/fastai-v0-7-install-issues-thread/24652

In [1]:
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
import os
import pandas as pd
import numpy as np

## Read Data

In [2]:
# os.chdir("/home/rk9cx/Kaggle/Kobe Shot Selection/")
df = pd.read_csv("data.csv", index_col = False, low_memory=False, parse_dates=["game_date"])
test = df[df['shot_made_flag'].isna()]
train = df[~df['shot_made_flag'].isna()]

In [22]:
train.dtypes

action_type                   object
combined_shot_type            object
game_event_id                  int64
game_id                        int64
lat                          float64
loc_x                          int64
loc_y                          int64
lon                          float64
minutes_remaining              int64
period                         int64
playoffs                       int64
season                        object
seconds_remaining              int64
shot_distance                  int64
shot_made_flag               float64
shot_type                     object
shot_zone_area                object
shot_zone_basic               object
shot_zone_range               object
team_id                        int64
team_name                     object
game_date             datetime64[ns]
matchup                       object
opponent                      object
shot_id                        int64
dtype: object

In [30]:
# how many features in the dataset. 25
print(train.columns.size)

25
<bound method NDFrame.head of               action_type combined_shot_type  game_event_id   game_id  \
1               Jump Shot          Jump Shot             12  20000012   
2               Jump Shot          Jump Shot             35  20000012   
3               Jump Shot          Jump Shot             43  20000012   
4       Driving Dunk Shot               Dunk            155  20000012   
5               Jump Shot          Jump Shot            244  20000012   
...                   ...                ...            ...       ...   
30691  Driving Layup Shot              Layup            382  49900088   
30692           Jump Shot          Jump Shot            397  49900088   
30694   Running Jump Shot          Jump Shot            426  49900088   
30695           Jump Shot          Jump Shot            448  49900088   
30696           Jump Shot          Jump Shot            471  49900088   

           lat  loc_x  loc_y       lon  minutes_remaining  period  ...  \
1      34.0443  

## Preprocessing

In [3]:
#converting date into different features
# add_datepart: add_datepart converts a column of df from a datetime64 to 13 columns containing 
# the information from the date. This applies changes inplace.
# new columns include: Year Month Week Day Dayofweek Dayofyear Is_month_end Is_month_start Is_quarter_end Is_quarter_start Is_year_end Is_year_start Elapsed
add_datepart(train, 'game_date')
add_datepart(test, 'game_date')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


# Approach 1: using label encoding

In [4]:
#converting categorical variables into label coding
train_cats(train)
apply_cats(test, train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n] = pd.Categorical(c, categories=trn[n].cat.categories, ordered=True)


In [5]:
#test.drop(["shot_made_flag"], axis = 1, inplace= True)
# proc_df: takes a data frame df and splits off the response variable, and
# changes the df into an entirely numeric dataframe.
df_test, y_test, nas = proc_df(test, y_fld='shot_made_flag')

In [6]:
#imputing missing values with median
df, y, nas = proc_df(train, 'shot_made_flag')

In [7]:
#function for train-test split
def split_vals(a,n): 
    return a[:n].copy(), a[n:].copy()

n_valid = 5000  # same as Kaggle's test set size
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(train, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

## Modelling - Basic Random Forest

In [29]:
# how many columns in X_train
print(X_train.columns.size)
# inspect the X_train and y_train
X_train.dtypes

36


action_type                 int8
combined_shot_type          int8
game_event_id              int64
game_id                    int64
lat                      float64
loc_x                      int64
loc_y                      int64
lon                      float64
minutes_remaining          int64
period                     int64
playoffs                   int64
season                      int8
seconds_remaining          int64
shot_distance              int64
shot_type                   int8
shot_zone_area              int8
shot_zone_basic             int8
shot_zone_range             int8
team_id                    int64
team_name                   int8
matchup                     int8
opponent                    int8
shot_id                    int64
game_Year                  int64
game_Month                 int64
game_Week                  int64
game_Day                   int64
game_Dayofweek             int64
game_Dayofyear             int64
game_Is_month_end           bool
game_Is_mo

In [8]:
#cross validation
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_train, y_train)
metrics.log_loss(y_valid,m.predict_proba(X_valid))



0.8672047189463101

# Approach 2: One-hot Encoding

## Read Data

In [9]:
# os.chdir("/home/rk9cx/Kaggle/Kobe Shot Selection/")
df = pd.read_csv("data.csv", index_col = False, low_memory=False, parse_dates=["game_date"])
test = df[df['shot_made_flag'].isna()]
train = df[~df['shot_made_flag'].isna()]

## One-hot Encoding and Preprocessing

In [10]:
def make_onehot_feat_dict(df, feat_key, feat_name):
    # Create features for each day of the week
    feat_vals = df[feat_key].values
    all_vals = np.unique(feat_vals)
    N_vals = len(all_vals)
    N_feat = N_vals - 1

    # Create keys
    keys = [0]*N_feat
    for i in range(N_feat):
        keys[i] = 'f_'+feat_name+'_'+ str(all_vals[i])

    # Create value for each training example in dict
    feat_dict = {}
    for i, k in enumerate(keys):
        this_day = all_vals[i]
        feat_dict[k] = feat_vals == this_day
    return feat_dict
#converting top 2 categorical variable to one-hot encoding
action_type_dict = make_onehot_feat_dict(df, 
                                      'action_type', 'action_type')
combined_shot_type_dict = make_onehot_feat_dict(df, 
                                      'combined_shot_type', 'combined_shot_type')

In [11]:
#convert one-hot to dictionary
all_dicts = [action_type_dict, combined_shot_type_dict]
feat_dict = all_dicts[0].copy()
for d in all_dicts[1:]:
    feat_dict.update(d)

In [12]:
#subset dataframe to eliminate categorical variables
df_new = df.drop(["action_type","combined_shot_type"], axis = 1)

In [13]:
#modify dataframe to accomodate the on-hot encoded features
df_feat = pd.DataFrame.from_dict(feat_dict)
df_feat = pd.concat([df_feat, df_new], axis=1)
df_feat.shape

(30697, 84)

In [14]:
test_new = df[df['shot_made_flag'].isna()]
train_new = df[~df['shot_made_flag'].isna()]

In [15]:
#converting date into different fatures
add_datepart(train_new, 'game_date')
add_datepart(test_new, 'game_date')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [16]:
#converting categorical variables into label coding
train_cats(train_new)
apply_cats(test_new, train_new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[n] = pd.Categorical(c, categories=trn[n].cat.categories, ordered=True)


In [17]:
#imputing missing values with median
df_test, y_test, nas = proc_df(test_new, 'shot_made_flag')
df, y, nas = proc_df(train_new, 'shot_made_flag')

## Splitting Data for Cross Validation

In [18]:
#function for train-test split
def split_vals(a,n): 
    return a[:n].copy(), a[n:].copy()

n_valid = 5000  # same as Kaggle's test set size
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(train, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

In [19]:
#cross validation
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_train, y_train)
metrics.log_loss(y_valid,m.predict_proba(X_valid))



0.7685790546897757