In [2]:
import os

import pandas as pd
import numpy as np

from pylightgbm.models import GBMRegressor

from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [4]:
# Regressor configuration used for the hold-out fit and the cross-validation
# loop further down.  num_iterations is an upper bound; early_stopping_round
# is set alongside it (the fits below pass validation data via `test_data`).
model = GBMRegressor(
    # boosting schedule
    learning_rate=0.03,
    num_iterations=5000,
    early_stopping_round=50,
    # tree size
    num_leaves=250,
    # sampling fractions
    feature_fraction=0.8,
    bagging_fraction=0.8,
    # runtime
    num_threads=-1,
    verbose=False,
)

In [5]:
# Load the raw training and test tables (paths are relative to the notebook).
df = pd.read_csv('../raw_data/train.csv')
df_test = pd.read_csv('../raw_data/test.csv')

# Features are every training column except the 'loss' target.  `axis` is
# passed by keyword: the positional form `drop(labels, 1)` is deprecated in
# pandas and rejected by pandas 2.x.
X = df.drop(['loss'], axis=1)
y = df.loss

# The test table carries no target here, so it is used as-is (same object).
X_test = df_test

In [6]:
# 3-fold splitter for the cross-validation loop below.  No shuffle/random_state
# is passed, so KFold's defaults apply.
cv = KFold(n_splits=3)

In [7]:
class FeatureExtractor(TransformerMixin):
    """Label-encode the categorical columns and drop the ``id`` column.

    A column is treated as categorical when its name starts with ``'cat'``.
    After ``fit`` the instance exposes:

    * ``cat_columns`` -- list of categorical column names found while fitting;
    * ``le_dict`` -- mapping of column name to its fitted ``LabelEncoder``.
    """

    def fit(self, df_1, df_2=None):
        """Fit one ``LabelEncoder`` per categorical column.

        Parameters
        ----------
        df_1 : pandas.DataFrame
            First frame (e.g. the training features).
        df_2 : pandas.DataFrame, optional
            Second frame (e.g. the test features).  When given, the encoders
            are fitted on the row-wise concatenation of both frames so that
            labels present in either one can be transformed later.  When
            omitted, only ``df_1`` is used.

        Returns
        -------
        FeatureExtractor
            ``self``, to allow chaining.
        """
        frames = [df_1] if df_2 is None else [df_1, df_2]
        df = pd.concat(frames, axis=0)

        self.cat_columns = [col for col in df.columns if col.startswith('cat')]
        self.le_dict = {
            col: LabelEncoder().fit(df[col]) for col in self.cat_columns
        }
        return self

    def transform(self, df):
        """Return an encoded copy of ``df``; the input frame is not modified.

        The ``id`` column is removed and every column listed in
        ``cat_columns`` is replaced by the output of its fitted encoder.
        """
        # `drop` returns a new frame, so the caller's data stays untouched.
        # `axis` is given by keyword because the positional form is deprecated
        # in pandas and rejected by pandas 2.x.
        df = df.drop(['id'], axis=1)

        for col in self.cat_columns:
            df[col] = self.le_dict[col].transform(df[col])
        return df

In [8]:
def target_transform(y, shift=200):
    """Map the target to log space: ``log(y + shift)``.

    Parameters
    ----------
    y : scalar or array-like accepted by numpy
        Target values on the original scale.
    shift : number, optional
        Offset added before taking the logarithm (default 200, the value this
        notebook has always used).  Use the same ``shift`` when inverting.

    Returns
    -------
    The element-wise natural logarithm of ``y + shift``.
    """
    return np.log(y + shift)


def target_inverse_transform(y_tr, shift=200):
    """Map log-space values back to the original scale: ``exp(y_tr) - shift``.

    Parameters
    ----------
    y_tr : scalar or array-like accepted by numpy
        Values in log space.
    shift : number, optional
        Offset subtracted after exponentiating (default 200).  It must equal
        the offset that was added when the values were moved to log space.

    Returns
    -------
    The element-wise value of ``exp(y_tr) - shift``.
    """
    return np.exp(y_tr) - shift

def mae_eval(y_true, y_pred_tr):
    """Mean absolute error between ``y_true`` and back-transformed predictions.

    ``y_pred_tr`` is first passed through ``target_inverse_transform``; the
    result is compared with ``y_true`` using ``mean_absolute_error``.
    """
    return mean_absolute_error(y_true, target_inverse_transform(y_pred_tr))

In [9]:
# Fit the encoders on the train and test features together, then encode the
# training features.
fe = FeatureExtractor()
fe.fit(X, X_test)
X_tr = fe.transform(X)

In [10]:
# Hold out 20% of the encoded training rows for validation; the fixed seed
# keeps the split the same between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_tr,
    y,
    test_size=0.2,
    random_state=100,
)

In [11]:
# Fit on the training split with a log-transformed target.  The validation
# split is handed over as `test_data` (presumably what early stopping
# monitors -- confirm against the pylightgbm documentation).
model.fit(
    X_train.values,
    target_transform(y_train.values),
    test_data=[(X_val.values, target_transform(y_val.values))],
)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(model.best_round)

# Score on the original scale: predictions are mapped back before the MAE.
y_pred = target_inverse_transform(model.predict(X_val))
score = mean_absolute_error(y_val, y_pred)
print('MAE = {:.2f}'.format(score))

366
MAE = 1137.26


In [37]:
%%time

# K-fold evaluation: refit `model` on each training fold and record the MAE
# (on the original target scale) for the matching validation fold.
# NOTE: y_train / y_val are rebound here, replacing the hold-out split's
# targets created earlier in the notebook.
scores = []
for train, val in cv.split(X_tr):
    x_train = X_tr.iloc[train, :]
    x_val = X_tr.iloc[val, :]
    # cv.split yields positional indices, so the target is selected with
    # .iloc as well; plain y[train] is label-based and only matches while the
    # index is the default 0..n-1 range.
    y_train = y.iloc[train]
    y_val = y.iloc[val]

    model.fit(
        x_train.values,
        target_transform(y_train.values),
        test_data=[(x_val.values, target_transform(y_val.values))],
    )
    # print() with one argument works on both Python 2 and Python 3.
    print(model.best_round)

    y_pred = target_inverse_transform(model.predict(x_val))
    score = mean_absolute_error(y_val, y_pred)
    scores.append(score)

# Mean and standard deviation of the per-fold MAE.
print('MAE = {:.2f} +- {:.2f}'.format(np.mean(scores), np.std(scores)))

697
473
534
MAE = 1141.31 +- 2.67
CPU times: user 46.3 s, sys: 2.49 s, total: 48.8 s
Wall time: 1min 32s


In [20]:
# Sanity check: largest value in `y_pred`.  NOTE(review): this cell's execution
# count (20) is out of sequence, so which `y_pred` it inspected depends on the
# order the cells were run in -- re-run top to bottom before trusting it.
np.max(y_pred)

38069.174202785995

In [38]:
# Regressor for the final fit on all training rows, with a fixed budget of
# 700 iterations.
# NOTE(review): num_leaves is not set here, unlike the earlier model
# definition (num_leaves=250) -- confirm that the library default is intended.
# NOTE(review): early_stopping_round is set, but the fit call that follows
# passes no test_data -- confirm whether it has any effect.
model = GBMRegressor(
    # boosting schedule
    learning_rate=0.03,
    num_iterations=700,
    early_stopping_round=50,
    # sampling fractions
    feature_fraction=0.8,
    bagging_fraction=0.8,
    # runtime
    num_threads=-1,
    verbose=False,
)

In [39]:
# Final fit: every encoded training row, log-transformed target, no
# validation data.
model.fit(X_tr.values, target_transform(y.values))

In [40]:
# Encode the test features with the extractor fitted earlier (`fe`).
X_tr_test = fe.transform(X_test)

In [41]:
# Predict on the encoded test features and map the predictions back from log
# space to the original target scale.
y_pred = target_inverse_transform(model.predict(X_tr_test))

In [42]:
def get_submission(y_sub):
    """Build the submission table: test ``id`` values plus a ``loss`` column.

    Reads the notebook-level ``df_test`` frame; a new frame is returned and
    ``df_test`` itself is left unchanged.
    """
    return df_test[['id']].assign(loss=y_sub)

In [43]:
# Pair the test ids with the final predictions held in `y_pred`.
submission = get_submission(y_pred)

In [44]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission.to_csv('../submissions/11_19_2.csv', index=False)