In this nontebook we explore feature importance according to the best performing model as described in the data analysis notebook, e.g., random forest. 

In [1]:
import pandas as pd

df = pd.read_csv('tl-data.csv')

In [2]:
df['ratio_won_played'] = df['gameswon'] / df['gamesplayed']
df['verified'] = df['verified'].apply(lambda x: 'yes' if x == True else 'no')


In [3]:
from sklearn.ensemble import RandomForestRegressor

# model
random_forest = RandomForestRegressor(max_depth=12, max_features='auto', n_estimators = 200)

In [4]:
cols_cat = ['verified']
cols_num = ['ratio_won_played', 'apm', 'pps', 'vs', 'blitz']

In [5]:
# build pipelines
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

transformer_numeric = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ]
)

transformer_categorical = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", transformer_numeric, cols_num),
        ("cat", transformer_categorical, cols_cat),
    ]
)

preprocessor

In [6]:
X = df[cols_cat + cols_num]
y = df['tr']

In [7]:
# split data into: train/validation/holdout: 70/20/10
from sklearn.model_selection import train_test_split

X_tmp, X_hold, y_tmp, y_hold = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=1234)

X_train, X_test, y_train, y_test = train_test_split(X_tmp, y_tmp, test_size=0.2/0.9, shuffle=True, random_state=1234)

print(f"all: {X.shape}, train: {X_train.shape}, validation: {X_test.shape}, holdout: {X_hold.shape}")

all: (40790, 6), train: (28552, 6), validation: (8159, 6), holdout: (4079, 6)


In [8]:
train_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('random_forest', random_forest)
    ]
)

In [9]:
train_pipeline.fit(X_train, y_train)

  warn(


In [11]:
train_pipeline.predict(X_test)

array([23431.75790091,  5362.67082986,  2911.68081614, ...,
       20515.26540072, 22849.72104653,  7907.97845396])