### League of Legends: Model Training
Standard Methodology:

1. Exploratory plots to get a sense of data (e.g. relationships, distribution etc.)
2. **Perform transformations (standardization, log-transform, PCA etc.)**
3. **Experiment with algorithms that make sense, feature selection and compare cross-validated performance.Algos to thinks about: Tree-Based, Basis Expansion, Logistic Regression, Discriminant  Analysis, Boosting, Neural Nets...**

4. **Run on test set**

In [153]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import math
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve

#BaseEstimator will inherit get_parms and set_parms methods. 
#TransformerMixin will inherit fit_transform, which calls fit and transform. We can customize our fit and transform
#These are used for consistency with existing sklearn classes
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
def read_pickle(path):
    
    input_file = open(path,'rb')
    variable = pickle.load(input_file)
    input_file.close()
    return(variable)

In [134]:
x_train = '../data/x_train.pickle'
x_test = '../data/x_test.pickle'
y_train = '../data/y_train.pickle'
y_test = '../data/y_test.pickle'

x_train = read_pickle(x_train) 
x_test = read_pickle(x_test) 
y_train = read_pickle(y_train) 
y_test = read_pickle(y_test) 

### Transformations ###

Let's begin with applying the transformations we deemed suitable during EDA. 
1. Standardize the data, 
2. Remove crit and crit per level variables 
3. Log cs feature 
4. Create per level * gamelength variable
5. Perform PCA with 30 components

In [135]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, remove_features):
        self.remove_features = remove_features
    
    def fit(self, x_df):
        return self
    
    def transform(self, x_df):
        keep_features = list(x_df.columns)
        
        for feature in self.remove_features:
            keep_features.remove(feature)
        
        x_df = x_df[keep_features]
        return(x_df)
    
class FeatureEngineering(BaseEstimator, TransformerMixin):
    
    #Initiate class
    def __init__(self): 
        return None
        
    #We don't need to fit anything, so leave this as is
    def fit(self, x_df):
        return self
    
    #Perform our feature transformations
    def transform(self, x_df):
        
        #Log cs field
        add_constant = abs(min(x_df['delta_total_cs']))
        x_df['log_delta_total_cs'] = x_df['delta_total_cs'].apply(lambda x : math.log(x + add_constant + 0.01))
        x_df = x_df.drop('delta_total_cs', axis = 1)
        
        #Create per_level * gamelength variables
        feature_columns = list(x_df.columns)
        per_level = [feature for feature in feature_columns if "perlevel" in feature]
        
        for i in per_level:
            field_name = i + str('_gamelength')
            x_df[field_name] = x_df[i] * x_df['gamelength']
        
        #Standardize data
        standard_scaler = preprocessing.StandardScaler()
        x_df = standard_scaler.fit_transform(x_df)
        
        return(x_df)
    

In [137]:
pipeline = Pipeline([
    
    ('FeatureSelector', FeatureSelector(['delta_crit', 'delta_critperlevel', 'gameid'])),
    ('FeatureEngineering', FeatureEngineering()),
    ('PCA', PCA(n_components = 30))
])

x_train_prepared  = pipeline.fit_transform(x_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Model Training ###

List of initial candiate models:

1. Logistic Regression
2. CART models
3. Bagging (Random Forest)
4. Boosting
5. Others...

In [142]:
#Plot ROC Curve
def plot_roc_curve(fpr, tpr, label = None):
    plt.plot(fpr, tpr, linewidth = 2, label = label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    

Begin with zero tuning, just to grasp a baseline for each algorithm. Then we'll start tuning and compare the best model of each algorithm

In [141]:
#Begin with baseline logistic regression
log_reg_model = LogisticRegression()
log_reg_model.fit(x_train_prepared, y_train)

scores = cross_val_score(log_reg_model, x_train_prepared, y_train,
                        scoring = "neg_mean_squared_error", cv = 10)
rmse_scores = np.sqrt(-scores)
print(rmse_scores)
print(np.mean(rmse_scores))

[0.41803981 0.38161639 0.44280744 0.39605902 0.50487816 0.42008403
 0.45374261 0.43159531 0.47720307 0.4975186 ]
0.44235444283898906


In [150]:
#CART Model
cart_model = DecisionTreeClassifier()
cart_model.fit(x_train_prepared, y_train)

scores = cross_val_score(cart_model, x_train_prepared, y_train,
                        scoring = "neg_mean_squared_error", cv = 10)
rmse_scores = np.sqrt(-scores)
print(rmse_scores)
print(np.mean(rmse_scores))

[0.52138725 0.59119757 0.57735027 0.48507125 0.60228324 0.56879646
 0.57735027 0.56011203 0.55401326 0.60525749]
0.564281908567188


In [152]:
#Random Forest Model
rand_forest_model = RandomForestClassifier()
rand_forest_model.fit(x_train_prepared, y_train)

scores = cross_val_score(rand_forest_model, x_train_prepared, y_train,
                        scoring = "neg_mean_squared_error", cv = 10)
rmse_scores = np.sqrt(-scores)
print(rmse_scores)
print(np.mean(rmse_scores))

[0.49266464 0.49266464 0.47485808 0.46442036 0.55129082 0.54232614
 0.50487816 0.51449576 0.56287804 0.5073714 ]
0.5107848047502787


In [154]:
#Boosting Model
gboost_model = GradientBoostingClassifier()
gboost_model.fit(x_train_prepared, y_train)

scores = cross_val_score(gboost_model, x_train_prepared, y_train,
                        scoring = "neg_mean_squared_error", cv = 10)
rmse_scores = np.sqrt(-scores)
print(rmse_scores)
print(np.mean(rmse_scores))

[0.41803981 0.44065265 0.44280744 0.42008403 0.48507125 0.48507125
 0.49507377 0.48507125 0.52652419 0.54500431]
0.47433999522447784


### Tuning Simple Classification Tree ####

In [149]:
#y_scores = cross_val_predict(log_reg_model, x_train_prepared, y_train, cv = 10, method = 'decision_function')
#fpr, tpr, thresholds = roc_curve(y_train, y_scores)
#plot_roc_curve(fpr, tpr)
#plt.show()