In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import model_selection, preprocessing, metrics

In [2]:
df = pd.read_csv('Pokemon.csv')
df.fillna(value='None', inplace = True)
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [3]:
selected_columns = ['Type 1','Type 2','Legendary','Generation']
target_column = 'Speed'

In [4]:
def run_lgb(df, target):
    train_X, test_X, train_Y, test_Y = model_selection.train_test_split(df.loc[:, ~df.columns.isin([target])], 
                                                        df[target], 
                                                        test_size=0.2)
    params = {
        "objective" : "regression",
        "metric" : "rmse", 
        "learning_rate" : 0.01,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.5,
        "bagging_frequency" : 5,
        "bagging_seed" : 1989,
        'seed': 1989,
        'min_data':1,
        'min_data_in_bin':1
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_Y)
    lgval = lgb.Dataset(test_X, label=test_Y)
    model = lgb.train(params, lgtrain, 1000, valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=50, verbose_eval=20)

## Label Encoder

In [5]:
df_copy = df.copy()
for column in selected_columns:
    df_copy[column] = preprocessing.LabelEncoder().fit_transform(df_copy[column])
df_copy.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,9,14,318,45,49,49,65,65,45,0,0
1,2,Ivysaur,9,14,405,60,62,63,80,80,60,0,0
2,3,Venusaur,9,14,525,80,82,83,100,100,80,0,0
3,3,VenusaurMega Venusaur,9,14,625,80,100,123,122,120,80,0,0
4,4,Charmander,6,12,309,39,52,43,60,50,65,0,0


In [6]:
run_lgb(df_copy[selected_columns+[target_column]], target_column)

Training until validation scores don't improve for 50 rounds.
[20]	training's rmse: 27.8245	valid_1's rmse: 28.8414
[40]	training's rmse: 26.9592	valid_1's rmse: 28.2187
[60]	training's rmse: 26.2635	valid_1's rmse: 27.7209
[80]	training's rmse: 25.685	valid_1's rmse: 27.3163
[100]	training's rmse: 25.1905	valid_1's rmse: 26.9568
[120]	training's rmse: 24.7788	valid_1's rmse: 26.6486
[140]	training's rmse: 24.4405	valid_1's rmse: 26.4119
[160]	training's rmse: 24.1596	valid_1's rmse: 26.2226
[180]	training's rmse: 23.9235	valid_1's rmse: 26.0769
[200]	training's rmse: 23.6867	valid_1's rmse: 25.9852
[220]	training's rmse: 23.4778	valid_1's rmse: 25.9221
[240]	training's rmse: 23.2992	valid_1's rmse: 25.8921
[260]	training's rmse: 23.1405	valid_1's rmse: 25.8831
[280]	training's rmse: 22.9973	valid_1's rmse: 25.8628
[300]	training's rmse: 22.8616	valid_1's rmse: 25.8478
[320]	training's rmse: 22.7401	valid_1's rmse: 25.853
[340]	training's rmse: 22.6289	valid_1's rmse: 25.8534
Early sto

## Frequency encoding

In [7]:
def frequency_encoding(frame, col):
    freq_encoding = frame.groupby([col]).size()/frame.shape[0] 
    freq_encoding = freq_encoding.reset_index().rename(columns={0:'{}_Frequency'.format(col)})
    frame[col] = freq_encoding['{}_Frequency'.format(col)]
    return frame

In [8]:
df_copy = df.copy()
for column in selected_columns:
    df_copy = frequency_encoding(df_copy, column)
df_copy.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,0.08625,0.00375,318,45,49,49,65,65,45,0.2075,0.91875
1,2,Ivysaur,0.03875,0.025,405,60,62,63,80,80,60,0.1325,0.08125
2,3,Venusaur,0.04,0.0225,525,80,82,83,100,100,80,0.2,
3,3,VenusaurMega Venusaur,0.055,0.0075,625,80,100,123,122,120,80,0.15125,
4,4,Charmander,0.02125,0.02875,309,39,52,43,60,50,65,0.20625,


In [9]:
run_lgb(df_copy[selected_columns+[target_column]], target_column)

Training until validation scores don't improve for 50 rounds.
[20]	training's rmse: 29.1343	valid_1's rmse: 28.501
[40]	training's rmse: 29.1024	valid_1's rmse: 28.49
[60]	training's rmse: 29.0766	valid_1's rmse: 28.4825
[80]	training's rmse: 29.0557	valid_1's rmse: 28.4766
[100]	training's rmse: 29.0388	valid_1's rmse: 28.4709
[120]	training's rmse: 29.025	valid_1's rmse: 28.4669
[140]	training's rmse: 29.0138	valid_1's rmse: 28.464
[160]	training's rmse: 29.0047	valid_1's rmse: 28.4618
[180]	training's rmse: 28.9972	valid_1's rmse: 28.46
[200]	training's rmse: 28.9912	valid_1's rmse: 28.4589
[220]	training's rmse: 28.9862	valid_1's rmse: 28.4581
[240]	training's rmse: 28.9822	valid_1's rmse: 28.4575
[260]	training's rmse: 28.9789	valid_1's rmse: 28.4571
[280]	training's rmse: 28.9762	valid_1's rmse: 28.4569
[300]	training's rmse: 28.974	valid_1's rmse: 28.4568
[320]	training's rmse: 28.9722	valid_1's rmse: 28.4569
[340]	training's rmse: 28.9707	valid_1's rmse: 28.4569
Early stopping,

## Mean encoding

In [10]:
from sklearn.model_selection import KFold

def mean_k_fold_encoding(df, target_name, col, alpha):
    target_mean_global = df[target_name].mean()
    
    nrows_cat = df.groupby(col)[target_name].count()
    target_means_cats = df.groupby(col)[target_name].mean()
    target_means_cats_adj = (target_means_cats*nrows_cat + 
                             target_mean_global*alpha)/(nrows_cat+alpha)
    
    kfold = KFold(n_splits=5, shuffle=True, random_state=1989)
    parts = []
    for trn_inx, val_idx in kfold.split(df):
        df_for_estimation, df_estimated = df.iloc[trn_inx], df.iloc[val_idx]
        nrows_cat = df_for_estimation.groupby(col)[target_name].count()
        target_means_cats = df_for_estimation.groupby(col)[target_name].mean()

        target_means_cats_adj = (target_means_cats * nrows_cat + 
                                target_mean_global * alpha) / (nrows_cat + alpha)

        encoded_col_train_part = df_estimated[col].map(target_means_cats_adj)
        parts.append(encoded_col_train_part)
        
    encoded_col_train = pd.concat(parts, axis=0)
    encoded_col_train.fillna(target_mean_global, inplace=True)
    encoded_col_train.sort_index(inplace=True)
    
    return encoded_col_train

In [11]:
df_copy = df.copy()
for column in selected_columns:
    df_copy[column] = mean_k_fold_encoding(df_copy, target_column, column, 5)
df_copy.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,61.799802,67.668359,318,45,49,49,65,65,45,72.002981,66.054801
1,2,Ivysaur,61.856458,64.74625,405,60,62,63,80,80,60,71.767555,64.882807
2,3,Venusaur,61.856458,64.74625,525,80,82,83,100,100,80,71.767555,64.882807
3,3,VenusaurMega Venusaur,61.558405,64.011071,625,80,100,123,122,120,80,72.044052,64.621864
4,4,Charmander,76.103606,65.741627,309,39,52,43,60,50,65,72.002981,66.054801


In [12]:
run_lgb(df_copy[selected_columns+[target_column]], target_column)

Training until validation scores don't improve for 50 rounds.
[20]	training's rmse: 28.2842	valid_1's rmse: 26.177
[40]	training's rmse: 27.2129	valid_1's rmse: 25.7708
[60]	training's rmse: 26.3201	valid_1's rmse: 25.4576
[80]	training's rmse: 25.531	valid_1's rmse: 25.2634
[100]	training's rmse: 24.8602	valid_1's rmse: 25.0573
[120]	training's rmse: 24.3124	valid_1's rmse: 24.9282
[140]	training's rmse: 23.8142	valid_1's rmse: 24.805
[160]	training's rmse: 23.3809	valid_1's rmse: 24.7501
[180]	training's rmse: 23.0024	valid_1's rmse: 24.7698
[200]	training's rmse: 22.6802	valid_1's rmse: 24.7625
Early stopping, best iteration is:
[165]	training's rmse: 23.2909	valid_1's rmse: 24.7468
