In [1]:
"""
Based on Vladimir Iglovikov's method:
    https://www.kaggle.com/iglovikov/allstate-claims-severity/xgb-1114/discussion
"""
import pandas as pd
import numpy as np
import lightgbm as lgb
import time
import gc
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Read the raw train and test CSVs and stack them into one frame so that the
# categorical encoding below is applied to both in a single pass.
print('loading train')
train = pd.read_csv('../../data/train.csv')
print('loading test')
# A NaN `loss` column is attached to the test rows; later cells use
# notnull()/isnull() on this column to split `joined` back apart.
test = pd.read_csv('../../data/test.csv').assign(loss=np.nan)
# No ignore_index here: each frame keeps its own row labels, so index values
# repeat across the train and test halves of `joined`.
joined = pd.concat((train, test))
def logregobj(preds, dtrain):
    """Return the gradient and Hessian of the "fair" loss for a custom objective.

    With residual ``x = preds - labels`` and constant ``c = 2`` the values are
    ``grad = c * x / (|x| + c)`` and ``hess = c**2 / (|x| + c)**2``.

    Parameters
    ----------
    preds : array-like
        Current model predictions.
    dtrain : object exposing ``get_label()``
        Dataset wrapper that supplies the target values.

    Returns
    -------
    tuple
        ``(grad, hess)``, element-wise for every prediction.
    """
    fair_c = 2
    residual = preds - dtrain.get_label()
    # Both derivatives share the same denominator term.
    denom = np.abs(residual) + fair_c
    gradient = fair_c * residual / denom
    hessian = fair_c ** 2 / denom ** 2
    return gradient, hessian


def evalerror(preds, dtrain):
    """Report the mean absolute error after exponentiating predictions and labels.

    Parameters
    ----------
    preds : array-like
        Model predictions; exponentiated before scoring.
    dtrain : object exposing ``get_label()``
        Dataset wrapper that supplies the target values; also exponentiated.

    Returns
    -------
    tuple
        ``('mae', value)``.
    """
    # presumably both inputs are on a log scale, hence the np.exp - confirm with caller
    actual = dtrain.get_label()
    predicted_raw = np.exp(preds)
    actual_raw = np.exp(actual)
    return 'mae', mean_absolute_error(predicted_raw, actual_raw)



This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


loading train
loading test


In [2]:
print('setting features')

# Integer-encode every string-typed column of `joined`.
# Levels that occur in only one of train/test are blanked to NaN first, so
# they all end up with the code -1 that pd.factorize assigns to missing values.
for column in tqdm(list(train.select_dtypes(include=['object']).columns)):
    # Compare the level sets themselves. Comparing only the number of distinct
    # levels misses columns where train and test have equally many levels but
    # not the same ones.
    unshared = set(train[column].unique()) ^ set(test[column].unique())
    if unshared:
        # Vectorised replacement for a per-row Python callback: `where` keeps
        # values whose mask is True and writes NaN everywhere else.
        is_unshared = joined[column].isin(list(unshared))
        joined[column] = joined[column].where(~is_unshared)

    # sort=True numbers the remaining levels in sorted order.
    joined[column] = pd.factorize(joined[column].values, sort=True)[0]


  0%|          | 0/116 [00:00<?, ?it/s]

setting features


100%|██████████| 116/116 [00:45<00:00,  2.56it/s]


In [3]:

# Split the encoded frame back apart: rows with a loss value are training
# rows, rows whose loss is NaN are test rows.
train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]
print('log loss')
# Offset added to the loss before taking the log; the same value is
# subtracted again after np.exp when predictions are mapped back.
shift = 200
# Untransformed alternative: y = train['loss']
y = np.log(train['loss'] + shift)
ids = test['id']
# `columns=` instead of a positional axis argument, which pandas 2.0 and
# later no longer accept.
X = train.drop(columns=['loss', 'id'])
X_test = test.drop(columns=['loss', 'id'])

# Every column whose name contains 'cat' is passed to LightGBM as categorical.
categorical_columns = [c for c in train.columns if ('cat' in c)]

# LightGBM training parameters shared by every round of the search.
# NOTE(review): 'num_iterations' is set here while lgb.train is also called
# with num_round = 10000 - confirm which of the two limits is intended.
param = dict(
    num_leaves=200,
    min_data_in_leaf=9,
    num_iterations=20000,
    num_thread=4,
    early_stopping_round=200,
    objective='regression',  # notice: the default value is regression
    max_depth=-1,
    learning_rate=0.002,
    boosting='gbdt',
    feature_fraction=0.3149,
    bagging_freq=100,
    bagging_fraction=0.8,
    bagging_seed=2019,
    metric='l1',
    lambda_l1=0.1,
    random_state=2019,
    verbosity=-1,
)

log loss


In [None]:
# Repeated random hold-out: train on a fresh 90/10 split each round and write
# a submission file only for the ten best validation scores seen so far.

# CV MAE of every round that produced a submission file, ascending.
# Starts empty: the former placeholder entry (11328) never changed which
# rounds were saved, it only appeared as a bogus value in the final printout.
lst_cv = []
# Loop-invariant, so set once.
# NOTE(review): `param` also carries 'num_iterations': 20000 - confirm which
# of the two limits LightGBM actually applies.
num_round = 10000

for i in range(100):
    print('##### round {:d}#####'.format(i))
    print('spliting data')
    # The current Unix second both seeds the split and tags the output file,
    # so a round can be re-created from its file name.
    time0 = round(time.time())
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=time0)
    d_train = lgb.Dataset(x_train, label=y_train, categorical_feature=categorical_columns)
    d_valid = lgb.Dataset(x_valid, label=y_valid, categorical_feature=categorical_columns)

    print('start training')
    model = lgb.train(param, d_train, num_round, valid_sets=[d_train, d_valid], verbose_eval=500)
    gc.collect()

    print('calculating CV')
    # Undo the log(loss + shift) transform so the MAE is in original loss units.
    oof = np.exp(model.predict(x_valid, num_iteration=model.best_iteration)) - shift
    cv = mean_absolute_error(np.exp(y_valid) - shift, oof)
    print("CV score: {:<8.5f}".format(cv))

    # Keep the round if fewer than ten have been kept so far, or if it beats
    # the current tenth-best score.
    if len(lst_cv) < 10 or cv < lst_cv[9]:
        lst_cv = sorted(lst_cv + [cv])
        print('start predicting')
        # Score the test set with the same iteration that produced `cv`.
        prediction = np.exp(model.predict(X_test, num_iteration=model.best_iteration)) - shift
        print('preparing output')
        # .values pairs ids with predictions by position instead of relying
        # on index alignment between `ids` and a freshly built frame.
        submission = pd.DataFrame({'loss': prediction, 'id': ids.values})
        tm = str(time0) + '_' + str(round(cv*10))
        print('time: ',tm)
        submission.to_csv('submit_'+ tm +'.csv', index=False)
        #submission.to_csv('submit_'+ tm +'.csv.gz', compression='gzip', index=False)

        # Disabled: persist the fitted model alongside the submission.
        # pck = open('pretrained_'+ tm +'.pkl', 'wb')
        # pickle.dump(model, pck)
        # pck.close()
print(lst_cv)

In [5]:
# Disabled experiment: one train/validate pass on a fixed split seed
# (time0 = 1549662358). The code is wrapped in a string literal, so nothing
# in it runs; as the only expression in the cell, the string itself is what
# the cell displays.
'''
print('spliting data')
time0 = round(time.time())
time0 = 1549662358
x_train, x_valid, y_train, y_valid =train_test_split(X, y, test_size=0.1, random_state=time0)
d_train = lgb.Dataset(x_train, label=y_train, categorical_feature = categorical_columns)
d_valid = lgb.Dataset(x_valid, label=y_valid, categorical_feature = categorical_columns)

print('start training')
num_round = 10000

model = lgb.train(param, d_train, num_round, valid_sets = [d_train, d_valid], verbose_eval=500)
gc.collect()

print('calculating CV')
oof  = np.exp(model.predict(x_valid, num_iteration=model.best_iteration)) - shift
cv = mean_absolute_error(np.exp(y_valid)-shift, oof)
print("CV score: {:<8.5f}".format(cv))
'''

'print(\'spliting data\')\ntime0 = round(time.time())\ntime0 = 1549662358\nx_train, x_valid, y_train, y_valid =train_test_split(X, y, test_size=0.1, random_state=time0)\nd_train = lgb.Dataset(x_train, label=y_train, categorical_feature = categorical_columns)\nd_valid = lgb.Dataset(x_valid, label=y_valid, categorical_feature = categorical_columns)\n\nprint(\'start training\')\nnum_round = 10000\n\nmodel = lgb.train(param, d_train, num_round, valid_sets = [d_train, d_valid], verbose_eval=500)\ngc.collect()\n\nprint(\'calculating CV\')\noof  = np.exp(model.predict(x_valid, num_iteration=model.best_iteration)) - shift\ncv = mean_absolute_error(np.exp(y_valid)-shift, oof)\nprint("CV score: {:<8.5f}".format(cv))'

### without using categorical_feature = categorical_columns
[9983]	training's l1: 0.322764	valid_1's l1: 0.366759
CV score: 1136.30073

### with: categorical_feature = categorical_columns
[8945]	training's l1: 0.31769	valid_1's l1: 0.370906
CV score: 1132.81941

### with t0 = 1549661848
CV score: 1129.79278

### with t0 = 1549662358
CV score: 1123.17918

# run the model 100 times, and record the random_state (t0) and cv scores.

##### round 0#####
CV score: 1129.79278
1549661848 1129.7927782815498
##### round 1#####
CV score: 1136.51924
1549662123 1136.5192416569341
##### round 2#####
CV score: 1123.17918
1549662358 1123.179182642565
##### round 3#####
CV score: 1132.07467
1549662604 1132.0746656433714
##### round 4#####
CV score: 1135.50915
1549662799 1135.5091455530624
##### round 5#####
CV score: 1137.78329
1549663022 1137.783288284275

In [11]:
# Rank the recorded split seeds by their CV score, best (lowest) first.
# Assumes output.txt repeats a three-line pattern whose third line is
# "<seed> <cv score>" - only every third line is parsed.

dct = {}
with open('output.txt', 'r') as log_file:
    for line_no, line in enumerate(log_file, start=1):
        if line_no % 3 == 0:
            seed_text, score_text = line.split(' ')
            dct[int(seed_text)] = float(score_text)

print(dct)
top = sorted(dct.items(), key=lambda kv: kv[1])
print(top)

{1549661848: 1129.7927782815498, 1549662123: 1136.5192416569341, 1549662358: 1123.179182642565, 1549662604: 1132.0746656433714, 1549662799: 1135.5091455530624, 1549663022: 1137.783288284275, 1549663248: 1114.6283538761509, 1549663495: 1107.54382844742, 1549663736: 1153.052261414065, 1549663969: 1139.2455491007345, 1549664216: 1140.2105253885306, 1549664481: 1139.2710443417081, 1549664733: 1127.5950969440396, 1549664978: 1147.7933065604948, 1549665207: 1147.649831426128, 1549665471: 1124.7687147852087, 1549665742: 1149.9824618573034, 1549665996: 1125.2262189515152, 1549666246: 1132.9058776181228, 1549666476: 1136.4724553095907, 1549666707: 1148.2555535277206, 1549666957: 1136.569249619499, 1549667198: 1126.0090995993924, 1549667467: 1162.0702028292685, 1549667705: 1133.5121320955172, 1549667947: 1140.888875923199, 1549668171: 1132.9828559170223, 1549668422: 1133.1794257586664, 1549668675: 1147.7777698624245, 1549668906: 1155.4665076823535, 1549669143: 1115.5866950100128, 1549669364: 114

In [None]:
# The ten best (split seed, CV score) pairs, lowest score first.

top_10 = [
    (1549671965, 1101.2077218618037),
    (1549663495, 1107.54382844742),
    (1549683571, 1112.9327088227114),
    (1549663248, 1114.6283538761509),
    (1549669143, 1115.5866950100128),
    (1549675782, 1117.3292056910147),
    (1549682774, 1118.8597725723334),
    (1549680354, 1118.8780890290184),
    (1549683827, 1119.2518437220297),
    (1549681046, 1120.45433460368),
]
