### League of Legends: Model Tuning
Standard Methodology:

1. Exploratory plots to get a sense of data (e.g. relationships, distribution etc.)
2. Perform transformations (standardization, log-transform, PCA etc.)
3. Experiment with algorithms that make sense, feature selection and compare cross-validated performance.Algos to thinks about: Tree-Based, Basis Expansion, Logistic Regression, Discriminant  Analysis, Boosting, Neural Nets...

4. **Tune model hyperparameters**
5. **Run on test set**

In [2]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import math
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve, roc_auc_score
import scipy.stats as stats
import sklearn.metrics as metrics
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
def read_pickle(path):
    
    input_file = open(path,'rb')
    variable = pickle.load(input_file)
    input_file.close()
    return(variable)

In [4]:
x_train = '../data/x_train_v2.pickle'
x_test = '../data/x_test_v2.pickle'
y_train = '../data/y_train_v2.pickle'
y_test = '../data/y_test_v2.pickle'

x_train = read_pickle(x_train) 
x_test = read_pickle(x_test) 
y_train = read_pickle(y_train) 
y_test = read_pickle(y_test) 

In [6]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, select_features):
        self.select_features = select_features
    
    def fit(self, x_df):
        return self
    
    def transform(self, x_df):
        select_features = self.select_features
        x_df = x_df[select_features]
        return(x_df)
    
class ContinuousFeatureEngineering(BaseEstimator, TransformerMixin):
    
    #Initiate class
    def __init__(self): 
        return None
        
    #We don't need to fit anything, so leave this as is
    def fit(self, x_df):
        return self
    
    #Perform our feature transformations
    def transform(self, x_df):
        
        #Log cs field
        add_constant = abs(min(x_df['delta_total_cs']))
        x_df['log_delta_total_cs'] = x_df['delta_total_cs'].apply(lambda x : math.log(x + add_constant + 0.01))
        x_df = x_df.drop('delta_total_cs', axis = 1)
        
        #Create per_level * gamelength variables
        feature_columns = list(x_df.columns)
        per_level = [feature for feature in feature_columns if "perlevel" in feature]
        
        for i in per_level:
            field_name = i + str('_average_gamelength')
            x_df[field_name] = x_df[i] * x_df['average_gamelength']
        
        #Standardize data
        standard_scaler = preprocessing.StandardScaler()
        x_df = standard_scaler.fit_transform(x_df)
        
        return(x_df)
    

In [8]:
num_attributes = ['delta_assists', 'delta_damagetochampions', 'delta_deaths',
       'delta_kills', 'delta_monsterkills', 'delta_total_cs',
       'delta_totalgold', 'delta_wardskilled', 'delta_wardsplaced',
       'delta_armor', 'delta_armorperlevel', 'delta_attackdamage',
       'delta_attackdamageperlevel', 'delta_attackrange', 'delta_attackspeed',
       'delta_attackspeedperlevel','delta_gap_closer_value', 'delta_hard_cc_value', 'delta_hp',
       'delta_hpperlevel', 'delta_hpregen', 'delta_hpregenperlevel',
       'delta_movespeed', 'delta_mp', 'delta_mpperlevel', 'delta_mpregen',
       'delta_mpregenperlevel', 'delta_protection_value',
       'delta_soft_cc_value', 'delta_spellblock', 'delta_spellblockperlevel',
       'delta_spells_average_range_value', 'delta_Assassin', 'delta_Fighter',
       'delta_Mage', 'delta_Marksman', 'delta_Support', 'delta_Tank',
       'average_gamelength']

categorical_attributes = ['soul_point', 'red_soul_point']

In [9]:
numerical_pipeline = Pipeline([
    
    ('FeatureSelector', FeatureSelector(num_attributes)),
    ('FeatureEngineering', ContinuousFeatureEngineering()),
    ('PCA', PCA(n_components = 30))
])

categorical_pipeline = Pipeline([
    
    ('FeatureSelector', FeatureSelector(num_attributes))
])

full_pipeline = FeatureUnion(transformer_list = [
    ('numerical_pipeline', numerical_pipeline),
    ('categorical_pipeline', categorical_pipeline),
])

x_train_prepared  = full_pipeline.fit_transform(x_train)
x_test_prepared = full_pipeline.fit_transform(x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Initial Tuning Range

Before performing grid-search, which computationally expensive especially if the range & number of hyperparameters are big, we can start by first plotting the fitting curves of each algorithm with respect to their complexity controls. We will compare the training score vs cross validated training scores.

1. Logistic Regresion: L1/L2 regularization
2. CART: max_depth, min_samples_split, min_samples_leaf
3. Random Forest: max_depth, min_samples_split, min_samples_leaf 
4. Gradient Boosting: max_depth, min_samples_split, min_samples_leaf 

In [None]:
cart_model = DecisionTreeClassifier(min_samples_leaf = 30) #blanket default, to avoid 1 observation per leaf
cart_model.fit(x_train_prepared, y_train)

cv_score = np.mean(cross_val_score(cart_model, x_train_prepared, y_train,scoring = "roc_auc", cv = 3))
train_score = 

In [None]:
auc_train = []
auc_test = []
maxdepth = 20
depths = range(1, maxdepth+1)

fig, ax = plt.subplots(figsize=(10, 7))

for md in depths:
    model = DecisionTreeClassifier(max_depth=md)
    model.fit(X_train, Y_train)
    
    # Get the probability of Y_test records being = 1
    Y_train_probability_1 = model.predict_proba(X_train)[:, 1]
    Y_test_probability_1 = model.predict_proba(X_test)[:, 1]

    # Use the metrics.roc_curve function to get the true positive rate (tpr) and false positive rate (fpr)
    fpr_train, tpr_train, thresholds_train = metrics.roc_curve(Y_train, Y_train_probability_1)
    fpr_test, tpr_test, thresholds_test = metrics.roc_curve(Y_test, Y_test_probability_1)
    
    auc_train.append(metrics.auc(fpr_train, tpr_train))
    auc_test.append(metrics.auc(fpr_test, tpr_test))
    
plt.plot(depths, auc_train, label="Train")
plt.plot(depths, auc_test, label="Test")
plt.title("Decision Tree AUC Performance on train and test data")
plt.xlabel("Max depth")
plt.ylabel("AUC")
plt.ylim([min(auc_test), 1.0])
plt.xlim([1,maxdepth])
plt.legend()
plt.grid()
plt.show()