## Consider `object_collection` features

In [1]:
# ==================
# library
# ==================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import os
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
%matplotlib inline
pd.set_option('display.max_columns', 300)

In [2]:
# Input / output locations (relative to the notebook's working directory).
data = '../datasets/'
out_dir = '../output/'

# Competition files read by the notebook.
TRAIN_PATH = os.path.join(data, "train.csv")
TEST_PATH = os.path.join(data, "test.csv")
SUB_PATH = os.path.join(data, "atmacup10__sample_submission.csv")

# Where the final submission is written.
SAVE_TEST_SUB_PATH = os.path.join(out_dir, "sub.csv")

In [3]:
# ===============
# Settings
# ===============

# Cross-validation setup.
SEED = 0
N_SPLITS = 5
SHUFFLE = True

# Parameters forwarded to LightGBM as the `params` dict of `lgb.train`.
LGBM_PARAMS = {
    # tree shape
    'num_leaves': 32,
    'min_data_in_leaf': 64,
    'objective': 'regression',
    'max_depth': -1,
    'learning_rate': 0.05,
    "boosting": "gbdt",
    # row subsampling, seeded with the notebook-wide SEED
    "bagging_freq": 1,
    "bagging_fraction": 0.8,
    "bagging_seed": SEED,
    "verbosity": -1,
    # regularisation and column subsampling
    'reg_alpha': 0.1,
    'reg_lambda': 0.3,
    'colsample_bytree': 0.7,
    'metric': "rmse",
    'num_threads': 6,
}

# Keyword arguments unpacked into `lgb.train(...)` by `train_lgbm`.
LGBM_FIT_PARAMS = {
    'num_boost_round': 10000,
    'early_stopping_rounds': 200,
    'verbose_eval': 200,
}



In [4]:
# =====================
# Function
# =====================
def train_lgbm(X_train, y_train, X_valid, y_valid, X_test, categorical_features, feature_name, fold_id,lgb_params, fit_params, loss_func, calc_importances=True):
    """Train one LightGBM model and return its predictions and diagnostics.

    Parameters
    ----------
    X_train, y_train
        Training features and target, wrapped in an ``lgb.Dataset``.
    X_valid, y_valid
        Validation features and target. Pass ``X_valid=None`` to train
        without a validation set.
    X_test
        Features to predict on after training, or ``None`` to skip.
    categorical_features
        Passed as ``categorical_feature`` to every ``lgb.Dataset``.
    feature_name
        Passed as ``feature_name`` to every ``lgb.Dataset`` and used as the
        ``feature`` column of the importance table.
    fold_id
        Value stored in the ``fold`` column of the importance table.
    lgb_params
        Parameter dict handed to ``lgb.train``.
    fit_params
        Extra keyword arguments unpacked into ``lgb.train``. The caller's
        dict is never modified.
    loss_func
        Callable ``loss_func(y_true, y_pred)`` used to score the validation
        predictions.
    calc_importances
        When true, build a DataFrame with gain and split importances.

    Returns
    -------
    tuple
        ``(y_pred_valid, y_pred_test, valid_loss, importances,
        model.best_iteration)``. ``y_pred_valid`` and ``valid_loss`` are
        ``None`` without a validation set, ``y_pred_test`` is ``None``
        without ``X_test``, and ``importances`` is ``None`` when
        ``calc_importances`` is false.
    """
    has_valid = X_valid is not None

    train_set = lgb.Dataset(X_train, y_train,
                            categorical_feature=categorical_features,
                            feature_name=feature_name)

    if has_valid:
        valid_set = lgb.Dataset(X_valid, y_valid,
                                categorical_feature=categorical_features,
                                feature_name=feature_name)
        model = lgb.train(
            lgb_params,
            train_set,
            valid_sets=[train_set, valid_set],
            **fit_params
        )
    else:
        # Early stopping has nothing to monitor without a validation set and
        # LightGBM refuses to train in that configuration, so drop the option
        # here (on a copy) instead of failing.
        no_valid_fit_params = {
            key: value for key, value in fit_params.items()
            if key != 'early_stopping_rounds'
        }
        model = lgb.train(
            lgb_params,
            train_set,
            **no_valid_fit_params
        )

    # validation score
    if has_valid:
        y_pred_valid = model.predict(X_valid)
        valid_loss = loss_func(y_valid, y_pred_valid)
    else:
        y_pred_valid = None
        valid_loss = None

    # test predictions
    y_pred_test = model.predict(X_test) if X_test is not None else None

    if calc_importances:
        importances = pd.DataFrame()
        importances['feature'] = feature_name
        importances['gain'] = model.feature_importance(importance_type='gain')
        importances['split'] = model.feature_importance(importance_type='split')
        importances['fold'] = fold_id
    else:
        importances = None

    return y_pred_valid, y_pred_test, valid_loss, importances, model.best_iteration

def calc_loss(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
def get_collection(df, collections, obj_collection_df=None):
    """Add 0/1 collection-membership flags to ``df`` and return it.

    Parameters
    ----------
    df
        DataFrame with an ``object_id`` column. It is modified in place
        (new columns are added) and also returned.
    collections
        Iterable of collection names. For each name a column
        ``'has_' + name`` is added, set to 1 where the row's ``object_id``
        is listed under that name in the collection table.
    obj_collection_df
        Optional pre-loaded collection table with ``object_id`` and ``name``
        columns. When omitted, it is read from
        ``'../datasets/object_collection.csv'``.

    Returns
    -------
    The same ``df`` object, with ``has_collection`` (1 if the object appears
    in the collection table at all) and one ``has_<name>`` column per entry
    of ``collections``.
    """
    if obj_collection_df is None:
        obj_collection_df = pd.read_csv('../datasets/object_collection.csv')

    df['has_collection'] = df['object_id'].isin(obj_collection_df['object_id']) * 1
    for collect in collections:
        print(collect)
        # object ids that belong to this particular collection
        members = obj_collection_df.loc[obj_collection_df['name'] == collect, 'object_id']
        df['has_' + collect] = df['object_id'].isin(members) * 1
    return df

In [5]:
# Read the train and test tables (train first, then test).
train_df, test_df = (pd.read_csv(path) for path in (TRAIN_PATH, TEST_PATH))
# Number of training rows; used later to split the combined frame again.
len_train = train_df.shape[0]
# The model is fitted on log1p(likes).
y = train_df["likes"].pipe(np.log1p)

In [6]:
# Replace the free-text description by a 0/1 flag: 1 where it is missing.
for frame in (train_df, test_df):
    frame['description'] = frame['description'].isnull() * 1

In [7]:
# Collections for which a dedicated has_<name> flag is created.
collections = ['paintings', 'prints']
# Train is processed first, then test (same order as the printed names).
train_df, test_df = (get_collection(frame, collections) for frame in (train_df, test_df))

paintings
prints
paintings
prints


In [8]:
# Columns kept as model features (order is preserved downstream).
cols = [
    'principal_maker',
    'principal_or_first_maker',
    'copyright_holder',
    'acquisition_method',
    'acquisition_credit_line',
    'dating_period',
    'dating_year_early',
    'dating_year_late',
    'description',
    'has_prints',
    'has_paintings',
    'has_collection',
]

# String-valued columns that are cleaned and label-encoded below.
cat_cols = [
    'principal_maker',
    'principal_or_first_maker',
    'copyright_holder',
    'acquisition_method',
    'acquisition_credit_line',
]

In [9]:
# Blank out category values that do not occur in the other split, so the
# model only sees levels shared by train and test.
for col in cat_cols:
    seen_in_test = train_df[col].isin(test_df[col].unique())
    train_df.loc[~seen_in_test, col] = np.nan
    # Uses the already-masked train column, as in the step above.
    seen_in_train = test_df[col].isin(train_df[col].unique())
    test_df.loc[~seen_in_train, col] = np.nan

# From here on `train_df` holds train rows followed by test rows
# (feature columns only) until it is split again.
train_df = pd.concat([train_df[cols], test_df[cols]]).reset_index(drop=True)

In [10]:
# Integer-encode every categorical column on the combined train+test frame.
# Casting to str first turns missing values into a regular string level.
for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str))

In [11]:
# Split the combined frame back into its train and test parts. Both slices
# are taken from the combined frame before either name is rebound.
train_df, test_df = (
    train_df.iloc[:len_train].reset_index(drop=True),
    train_df.iloc[len_train:].reset_index(drop=True),
)

In [12]:
# K-fold cross-validation: train one model per fold, collect out-of-fold
# predictions, per-fold test predictions and feature importances.
kf = KFold(n_splits=N_SPLITS,random_state=SEED, shuffle=SHUFFLE)
# Out-of-fold predictions. Zero-initialised so the buffer never holds
# arbitrary memory; every position is overwritten by its validation fold.
y_oof = np.zeros(len(train_df))
y_test = []
features = list(train_df.columns)
drop_cols = []
features = [i for i in features if i not in drop_cols]
# NOTE(review): only these two of the label-encoded columns are declared
# categorical; the others reach LightGBM as plain integers. Confirm intended.
categorical_features = ["principal_maker","principal_or_first_maker"]
fold_scores = []        # validation RMSE of each fold, in fold order
fold_importances = []   # one importance table per fold
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df,y)):
    print('Fold {}'.format(fold + 1))
    x_train, y_train = train_df.iloc[train_idx][features], y.iloc[train_idx]
    x_val, y_val = train_df.iloc[valid_idx][features], y.iloc[valid_idx]

    y_pred_valid, y_pred_test, valid_loss, importances, best_iter = train_lgbm(
                x_train, y_train, x_val, y_val,test_df[features],
                categorical_features=categorical_features,
                feature_name=features,
                fold_id=fold,
                lgb_params=LGBM_PARAMS,
                fit_params=LGBM_FIT_PARAMS,
                loss_func=calc_loss,
                calc_importances=True
            )

    y_oof[valid_idx] = y_pred_valid
    # train_lgbm already scored this fold with calc_loss on (y_val, y_pred_valid);
    # keep that value instead of recomputing it with label-based indexing.
    fold_scores.append(valid_loss)
    y_test.append(y_pred_test)
    fold_importances.append(importances)

# Concatenate once instead of growing a DataFrame inside the loop.
feature_importances = pd.concat(fold_importances, axis=0, sort=False)

# Overall out-of-fold RMSE (on the log1p scale of the target).
score = calc_loss(y, y_oof)
print(score)

Fold 1
Training until validation scores don't improve for 200 rounds




[200]	training's rmse: 1.01674	valid_1's rmse: 1.10872
[400]	training's rmse: 0.97806	valid_1's rmse: 1.10928
Early stopping, best iteration is:
[275]	training's rmse: 0.999658	valid_1's rmse: 1.10831
Fold 2
Training until validation scores don't improve for 200 rounds




[200]	training's rmse: 1.02368	valid_1's rmse: 1.08111
[400]	training's rmse: 0.985182	valid_1's rmse: 1.07619
[600]	training's rmse: 0.959583	valid_1's rmse: 1.07617
Early stopping, best iteration is:
[464]	training's rmse: 0.975572	valid_1's rmse: 1.07543
Fold 3




Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 1.00816	valid_1's rmse: 1.13668
[400]	training's rmse: 0.970542	valid_1's rmse: 1.13422
Early stopping, best iteration is:
[341]	training's rmse: 0.979807	valid_1's rmse: 1.13337
Fold 4
Training until validation scores don't improve for 200 rounds




[200]	training's rmse: 1.02254	valid_1's rmse: 1.08884
[400]	training's rmse: 0.985733	valid_1's rmse: 1.08207
[600]	training's rmse: 0.960648	valid_1's rmse: 1.08317
Early stopping, best iteration is:
[501]	training's rmse: 0.971908	valid_1's rmse: 1.08107
Fold 5
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 1.02202	valid_1's rmse: 1.08356




[400]	training's rmse: 0.98352	valid_1's rmse: 1.08233
Early stopping, best iteration is:
[286]	training's rmse: 1.00272	valid_1's rmse: 1.08051
1.0959608250524109


In [13]:
# Average the per-fold test predictions (still on the log1p scale), then
# invert the log1p transform applied to the target.
y_test_sub = np.mean(y_test, axis=0)
y_test_sub = np.expm1(y_test_sub)

sub = pd.read_csv(SUB_PATH)
sub["likes"] = y_test_sub
# Non-positive predictions are replaced by 0.
sub.loc[sub.likes <= 0, "likes"] = 0

# Make sure the target directory exists so the write cannot fail after
# all the training work has been done.
sub_dir = os.path.dirname(SAVE_TEST_SUB_PATH)
if sub_dir:
    os.makedirs(sub_dir, exist_ok=True)
sub.to_csv(SAVE_TEST_SUB_PATH, index=False)