In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from prepare_data import prepare
from load_data import load

from sklearn.model_selection import KFold

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

In [2]:
# load() is the project-local helper imported from load_data. It returns the
# train and test frames, `y` (used below as the regression target) and
# `train_dict` (forwarded to prepare() in the next cell).
train, test, y, train_dict = load()

100%|████████████████████████████████████████████| 8/8 [00:08<00:00,  1.03s/it]


In [3]:
# Run prepare() over train and test stacked into one frame, then split the
# result back into the two parts by row label.
# NOTE: this cell rebinds `train` and `test`. Re-running it without first
# re-running the load() cell would pass already-prepared frames to prepare().
n_train = train.shape[0]  # captured before `train` is rebound below
all_data = prepare(pd.concat([train, test]).reset_index(drop=True), train_dict)

# The index was reset to 0..N-1 before prepare(), so labels below n_train are
# the original training rows (.loc slices are inclusive on both ends).
train = all_data.loc[:n_train - 1, :]
test = all_data.loc[n_train:, :]

In [5]:
# Cross-validation configuration shared by the training cells below.
SEED = 42  # seed for the fold splitter, NumPy's global RNG and the model params
K = 10     # number of cross-validation folds

# Materialise the (train_idx, valid_idx) pairs once so that every training
# cell iterates over exactly the same folds.
splitter = KFold(n_splits=K, shuffle=True, random_state=SEED)
fold = list(splitter.split(train))
np.random.seed(SEED)

In [6]:
def xgb_model(trn_X, trn_y, val_X, val_y, test, verbose):
    """Train one XGBoost regressor on a fold and predict the validation and test rows.

    Parameters
    ----------
    trn_X, trn_y : features and target used to fit the booster.
    val_X, val_y : features and target passed to ``xgb.train`` as the
        ``'valid'`` evaluation set (early stopping patience: 200 rounds).
    test : feature matrix predicted with the fitted booster.
    verbose : forwarded to ``xgb.train(verbose_eval=...)``.

    Returns
    -------
    dict
        'val'        - predictions for ``val_X`` limited to ``best_ntree_limit`` trees,
        'test'       - predictions for ``test`` limited to ``best_ntree_limit`` trees,
        'error'      - lowest validation RMSE recorded during training,
        'importance' - the values of ``model.get_score()``.
    """
    params = {'objective': 'reg:linear',
              'eta': 0.01,
              'max_depth': 5,
              'subsample': 0.6,
              'colsample_bytree': 0.7,
              # XGBoost spells this parameter 'eval_metric'; the previous
              # 'eval_metrics' key is not a recognised parameter name.
              'eval_metric': 'rmse',
              'seed': SEED,
              'silent': True}

    record = dict()

    # Build the training matrix once and reuse it for fitting and evaluation.
    dtrain = xgb.DMatrix(trn_X, trn_y)
    dvalid = xgb.DMatrix(val_X, val_y)

    model = xgb.train(params, dtrain, 1000,
                      [(dtrain, 'train'),
                       (dvalid, 'valid')],
                      verbose_eval=verbose,
                      early_stopping_rounds=200,
                      callbacks=[xgb.callback.record_evaluation(record)])

    valid_rmse = record['valid']['rmse']
    best_idx = np.argmin(np.array(valid_rmse))

    val_pred = model.predict(xgb.DMatrix(val_X), ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)

    # NOTE(review): get_score() returns a dict keyed by feature name; only the
    # values are kept here, so the list carries no feature names - confirm that
    # consumers do not expect one entry per input column.
    return {'val': val_pred,
            'test': test_pred,
            'error': valid_rmse[best_idx],
            'importance': [i for k, i in model.get_score().items()]}

In [7]:
def lgb_model(trn_x, trn_y, val_x, val_y, test, verbose) :
    """Fit one LightGBM regressor on a fold and predict the validation and test rows.

    Parameters
    ----------
    trn_x, trn_y : features and target used to fit the booster.
    val_x, val_y : features and target of the single validation set
        (early stopping patience: 200 rounds).
    test : feature matrix predicted with the fitted booster.
    verbose : forwarded to ``lgb.train(verbose_eval=...)``.

    Returns
    -------
    dict
        'val'        - predictions for ``val_x`` at ``best_iteration``,
        'test'       - predictions for ``test`` at ``best_iteration``,
        'error'      - lowest validation RMSE recorded during training,
        'importance' - ``feature_importance('gain')`` of the booster.
    """
    lgb_params = dict(
        objective='regression',
        num_leaves=40,
        min_data_in_leaf=20,
        max_depth=5,
        learning_rate=0.01,
        feature_fraction=0.8,
        bagging_freq=1,
        bagging_fraction=0.8,
        bagging_seed=SEED,
        metric='rmse',
        random_state=SEED,
        verbosity=-1,
    )

    # record_evaluation() fills this dict with the per-round metric history.
    history = {}
    train_set = lgb.Dataset(trn_x, trn_y)
    valid_set = lgb.Dataset(val_x, val_y)

    booster = lgb.train(
        lgb_params,
        train_set,
        num_boost_round=10000,
        valid_sets=[valid_set],
        verbose_eval=verbose,
        early_stopping_rounds=200,
        callbacks=[lgb.record_evaluation(history)],
    )

    valid_rmse = history['valid_0']['rmse']
    best_pos = np.argmin(np.array(valid_rmse))
    best_round = booster.best_iteration

    return {
        'val': booster.predict(val_x, num_iteration=best_round),
        'test': booster.predict(test, num_iteration=best_round),
        'error': valid_rmse[best_pos],
        'importance': booster.feature_importance('gain'),
    }

In [19]:
def cat_model(trn_x, trn_y, val_x, val_y, test, verbose) :
    """Fit one CatBoost regressor on a fold and predict the validation and test rows.

    Parameters
    ----------
    trn_x, trn_y : features and target used to fit the model.
    val_x, val_y : features and target passed as ``eval_set``; with
        ``use_best_model=True`` the model is fitted against this set.
    test : feature matrix predicted with the fitted model.
    verbose : forwarded to ``model.fit(verbose=...)`` (previously ignored).

    Returns
    -------
    dict
        'val'        - predictions for ``val_x``,
        'test'       - predictions for ``test``,
        'error'      - best validation RMSE reported by ``get_best_score()``,
        'importance' - ``model.get_feature_importance()``.
    """
    model = CatBoostRegressor(iterations=10000,
                              learning_rate=0.01,
                              depth=5,
                              eval_metric='RMSE',
                              colsample_bylevel=0.7,
                              random_seed=SEED,
                              bagging_temperature=0.2,
                              metric_period=None)

    # Honour the caller's verbosity, as the xgb/lgb helpers in this notebook do.
    model.fit(trn_x, trn_y,
              eval_set=(val_x, val_y),
              use_best_model=True,
              verbose=verbose)

    val_pred = model.predict(val_x)
    test_pred = model.predict(test)

    best_score = model.get_best_score()
    # NOTE(review): the key for a single eval_set appears to differ between
    # CatBoost releases ('validation_0' vs 'validation'); accept either so the
    # lookup does not raise KeyError - confirm against the installed version.
    if 'validation_0' in best_score:
        valid_scores = best_score['validation_0']
    else:
        valid_scores = best_score['validation']

    return {'val': val_pred,
            'test': test_pred,
            'error': valid_scores['RMSE'],
            'importance': model.get_feature_importance()}

In [8]:
# K-fold training of the first model set (xgb_model + lgb_model).
# In every fold each model is fitted, the models' validation/test predictions
# are averaged, and the averages are accumulated into:
#   val_pred  - out-of-fold blended prediction for each training row
#   test_pred - blended test prediction averaged over the K folds
#   final_err - mean over folds of the per-fold average model RMSE
result_dict = dict()  # not populated in this cell; kept because the name is notebook-global
val_pred = np.zeros(train.shape[0])
test_pred = np.zeros(test.shape[0])
final_err = 0
verbose = False

# (label, training function) pairs blended in each fold.
# CatBoost is currently left out; append ("cat", cat_model) to include it.
fold_models = [("xgb", xgb_model), ("lgb", lgb_model)]

for i, (trn, val) in enumerate(fold) :
    print(i+1, "fold.    RMSE")

    trn_x = train.loc[trn, :]
    trn_y = y[trn]
    val_x = train.loc[val, :]
    val_y = y[val]

    fold_val_pred = []
    fold_test_pred = []
    fold_err = []

    for label, fit_fn in fold_models:
        start = datetime.now()
        result = fit_fn(trn_x, trn_y, val_x, val_y, test, verbose)
        fold_val_pred.append(result['val'])
        fold_test_pred.append(result['test'])
        fold_err.append(result['error'])
        # total_seconds() is the full duration; timedelta.seconds is only the
        # seconds component and wraps around after a day.
        elapsed_min = int((datetime.now() - start).total_seconds() // 60)
        print(label + " model.", "{0:.5f}".format(result['error']), '(' + str(elapsed_min) + 'm)')

    # Average across models once and reuse it for accumulation and reporting.
    fold_blend_val = np.mean(np.array(fold_val_pred), axis = 0)
    fold_avg_err = sum(fold_err) / len(fold_err)

    val_pred[val] += fold_blend_val
    test_pred += np.mean(np.array(fold_test_pred), axis = 0) / K
    final_err += fold_avg_err / K

    print("---------------------------")
    print("avg   err.", "{0:.5f}".format(fold_avg_err))
    print("blend err.", "{0:.5f}".format(np.sqrt(np.mean((fold_blend_val - val_y)**2))))

    print('')

print("final avg   err.", final_err)
print("final blend err.", np.sqrt(np.mean((val_pred - y)**2)))

1 fold.    RMSE
xgb model. 2.20203 (0m)
lgb model. 2.18165 (0m)
---------------------------
avg   err. 2.19184
blend err. 2.18668

2 fold.    RMSE
xgb model. 2.60756 (0m)
lgb model. 2.57574 (0m)
---------------------------
avg   err. 2.59165
blend err. 2.58534

3 fold.    RMSE
xgb model. 2.32793 (0m)
lgb model. 2.29481 (0m)
---------------------------
avg   err. 2.31137
blend err. 2.30251

4 fold.    RMSE
xgb model. 2.29672 (0m)
lgb model. 2.24911 (0m)
---------------------------
avg   err. 2.27291
blend err. 2.26502

5 fold.    RMSE
xgb model. 2.43987 (0m)
lgb model. 2.44855 (0m)
---------------------------
avg   err. 2.44421
blend err. 2.43433

6 fold.    RMSE
xgb model. 2.33555 (0m)
lgb model. 2.32395 (0m)
---------------------------
avg   err. 2.32975
blend err. 2.32382

7 fold.    RMSE
xgb model. 2.16296 (0m)
lgb model. 2.14415 (0m)
---------------------------
avg   err. 2.15356
blend err. 2.14695

8 fold.    RMSE
xgb model. 2.29163 (0m)
lgb model. 2.28621 (0m)
-------------------

In [13]:
# Write the first model set's test predictions in the sample-submission layout.
# NOTE(review): expm1 presumably inverts a log1p transform applied to the
# target upstream - confirm in load()/prepare().
sub = pd.read_csv('data/sample_submission.csv')
df_sub = pd.DataFrame({'id': sub['id'], 'revenue': np.expm1(test_pred)})
df_sub.to_csv("submission.csv", index=False)

In [9]:
def xgb_model_2(trn_X, trn_y, val_X, val_y, test, verbose):
    """Train the second XGBoost configuration on a fold and predict validation and test rows.

    Same procedure as ``xgb_model`` but with eta 0.1, max_depth 3,
    subsample 0.4 and colsample_bytree 0.5.

    Parameters
    ----------
    trn_X, trn_y : features and target used to fit the booster.
    val_X, val_y : features and target passed to ``xgb.train`` as the
        ``'valid'`` evaluation set (early stopping patience: 200 rounds).
    test : feature matrix predicted with the fitted booster.
    verbose : forwarded to ``xgb.train(verbose_eval=...)``.

    Returns
    -------
    dict
        'val'        - predictions for ``val_X`` limited to ``best_ntree_limit`` trees,
        'test'       - predictions for ``test`` limited to ``best_ntree_limit`` trees,
        'error'      - lowest validation RMSE recorded during training,
        'importance' - the values of ``model.get_score()``.
    """
    params = {'objective': 'reg:linear',
              'eta': 0.1,
              'max_depth': 3,
              'subsample': 0.4,
              'colsample_bytree': 0.5,
              # XGBoost spells this parameter 'eval_metric'; the previous
              # 'eval_metrics' key is not a recognised parameter name.
              'eval_metric': 'rmse',
              'seed': SEED,
              'silent': True}

    record = dict()

    # Build the training matrix once and reuse it for fitting and evaluation.
    dtrain = xgb.DMatrix(trn_X, trn_y)
    dvalid = xgb.DMatrix(val_X, val_y)

    model = xgb.train(params, dtrain, 1000,
                      [(dtrain, 'train'),
                       (dvalid, 'valid')],
                      verbose_eval=verbose,
                      early_stopping_rounds=200,
                      callbacks=[xgb.callback.record_evaluation(record)])

    valid_rmse = record['valid']['rmse']
    best_idx = np.argmin(np.array(valid_rmse))

    val_pred = model.predict(xgb.DMatrix(val_X), ntree_limit=model.best_ntree_limit)
    test_pred = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)

    # NOTE(review): get_score() returns a dict keyed by feature name; only the
    # values are kept here, so the list carries no feature names - confirm that
    # consumers do not expect one entry per input column.
    return {'val': val_pred,
            'test': test_pred,
            'error': valid_rmse[best_idx],
            'importance': [i for k, i in model.get_score().items()]}

In [10]:
def lgb_model_2(trn_x, trn_y, val_x, val_y, test, verbose) :
    """Fit the second LightGBM configuration on a fold and predict validation and test rows.

    Same procedure as ``lgb_model`` but with objective 'regression_l2',
    num_leaves 50, min_data_in_leaf 15 and max_depth 4.

    Parameters
    ----------
    trn_x, trn_y : features and target used to fit the booster.
    val_x, val_y : features and target of the single validation set
        (early stopping patience: 200 rounds).
    test : feature matrix predicted with the fitted booster.
    verbose : forwarded to ``lgb.train(verbose_eval=...)``.

    Returns
    -------
    dict
        'val'        - predictions for ``val_x`` at ``best_iteration``,
        'test'       - predictions for ``test`` at ``best_iteration``,
        'error'      - lowest validation RMSE recorded during training,
        'importance' - ``feature_importance('gain')`` of the booster.
    """
    lgb_params = dict(
        objective='regression_l2',
        num_leaves=50,
        min_data_in_leaf=15,
        max_depth=4,
        learning_rate=0.01,
        feature_fraction=0.8,
        bagging_freq=1,
        bagging_fraction=0.8,
        bagging_seed=SEED,
        metric='rmse',
        random_state=SEED,
        verbosity=-1,
    )

    # record_evaluation() fills this dict with the per-round metric history.
    history = {}
    train_set = lgb.Dataset(trn_x, trn_y)
    valid_set = lgb.Dataset(val_x, val_y)

    booster = lgb.train(
        lgb_params,
        train_set,
        num_boost_round=10000,
        valid_sets=[valid_set],
        verbose_eval=verbose,
        early_stopping_rounds=200,
        callbacks=[lgb.record_evaluation(history)],
    )

    valid_rmse = history['valid_0']['rmse']
    best_pos = np.argmin(np.array(valid_rmse))
    best_round = booster.best_iteration

    return {
        'val': booster.predict(val_x, num_iteration=best_round),
        'test': booster.predict(test, num_iteration=best_round),
        'error': valid_rmse[best_pos],
        'importance': booster.feature_importance('gain'),
    }

In [12]:
# K-fold training of the second model set (xgb_model_2 + lgb_model_2).
# In every fold each model is fitted, the models' validation/test predictions
# are averaged, and the averages are accumulated into:
#   val_pred_2  - out-of-fold blended prediction for each training row
#   test_pred_2 - blended test prediction averaged over the K folds
#   final_err_2 - mean over folds of the per-fold average model RMSE
result_dict_2 = dict()  # not populated in this cell; kept because the name is notebook-global
val_pred_2 = np.zeros(train.shape[0])
test_pred_2 = np.zeros(test.shape[0])
final_err_2 = 0
verbose = False

# (label, training function) pairs blended in each fold.
# CatBoost is currently left out; append ("cat", cat_model) to include it.
fold_models_2 = [("xgb", xgb_model_2), ("lgb", lgb_model_2)]

for i, (trn, val) in enumerate(fold) :
    print(i+1, "fold.    RMSE")

    trn_x = train.loc[trn, :]
    trn_y = y[trn]
    val_x = train.loc[val, :]
    val_y = y[val]

    fold_val_pred = []
    fold_test_pred = []
    fold_err = []

    for label, fit_fn in fold_models_2:
        start = datetime.now()
        result = fit_fn(trn_x, trn_y, val_x, val_y, test, verbose)
        fold_val_pred.append(result['val'])
        fold_test_pred.append(result['test'])
        fold_err.append(result['error'])
        # total_seconds() is the full duration; timedelta.seconds is only the
        # seconds component and wraps around after a day.
        elapsed_min = int((datetime.now() - start).total_seconds() // 60)
        print(label + " model.", "{0:.5f}".format(result['error']), '(' + str(elapsed_min) + 'm)')

    # Average across models once and reuse it for accumulation and reporting.
    fold_blend_val = np.mean(np.array(fold_val_pred), axis = 0)
    fold_avg_err = sum(fold_err) / len(fold_err)

    val_pred_2[val] += fold_blend_val
    test_pred_2 += np.mean(np.array(fold_test_pred), axis = 0) / K
    final_err_2 += fold_avg_err / K

    print("---------------------------")
    print("avg   err.", "{0:.5f}".format(fold_avg_err))
    print("blend err.", "{0:.5f}".format(np.sqrt(np.mean((fold_blend_val - val_y)**2))))

    print('')

print("final avg   err.", final_err_2)
print("final blend err.", np.sqrt(np.mean((val_pred_2 - y)**2)))

1 fold.    RMSE
xgb model. 2.20590 (0m)
lgb model. 2.19485 (0m)
---------------------------
avg   err. 2.20037
blend err. 2.18922

2 fold.    RMSE
xgb model. 2.61319 (0m)
lgb model. 2.56509 (0m)
---------------------------
avg   err. 2.58914
blend err. 2.56123

3 fold.    RMSE
xgb model. 2.33092 (0m)
lgb model. 2.29304 (0m)
---------------------------
avg   err. 2.31198
blend err. 2.29923

4 fold.    RMSE
xgb model. 2.31830 (0m)
lgb model. 2.24868 (0m)
---------------------------
avg   err. 2.28349
blend err. 2.26996

5 fold.    RMSE
xgb model. 2.40779 (0m)
lgb model. 2.42948 (0m)
---------------------------
avg   err. 2.41863
blend err. 2.40144

6 fold.    RMSE
xgb model. 2.37108 (0m)
lgb model. 2.30686 (0m)
---------------------------
avg   err. 2.33897
blend err. 2.32200

7 fold.    RMSE
xgb model. 2.18863 (0m)
lgb model. 2.14278 (0m)
---------------------------
avg   err. 2.16571
blend err. 2.14810

8 fold.    RMSE
xgb model. 2.36428 (0m)
lgb model. 2.29333 (0m)
-------------------

In [14]:
# Write the second model set's test predictions in the sample-submission layout.
# NOTE(review): expm1 presumably inverts a log1p transform applied to the
# target upstream - confirm in load()/prepare().
sub = pd.read_csv('data/sample_submission.csv')
df_sub = pd.DataFrame({'id': sub['id'], 'revenue': np.expm1(test_pred_2)})
df_sub.to_csv("submission_2.csv", index=False)

In [15]:
# Reload both submission files written by the earlier cells so they can be blended.
sub1, sub2 = (pd.read_csv(path) for path in ('submission.csv', 'submission_2.csv'))

In [16]:
# Quick visual check of the first submission (columns: id, revenue).
sub1.head()

Unnamed: 0,id,revenue
0,3001,6415744.0
1,3002,2414149.0
2,3003,7401852.0
3,3004,7022518.0
4,3005,1025829.0


In [17]:
# Copy rather than alias: writing the blended column into `fin_sub` must not
# modify `sub1`, otherwise re-running the blend cell would average an
# already-blended column with `sub2` again.
fin_sub = sub1.copy()

In [19]:
# Final prediction: equal-weight (50/50) average of the two runs' revenue columns.
fin_sub['revenue'] = 0.5 * sub1['revenue'] + 0.5 * sub2['revenue']

In [20]:
# Persist the blended submission; index=False keeps the row index out of the CSV.
fin_sub.to_csv('fin_sub.csv', index=False)