In [None]:
import numpy as np 
import pandas as pd 
import os
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import lightgbm as lgb
import xgboost as xgb
import time
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV
import gc
from catboost import CatBoostRegressor


import warnings
warnings.filterwarnings("ignore")

# Widen pandas' display limits so long cell values and wide frames are shown in full.
# The fully qualified `display.*` keys are used on purpose: option names are matched as
# patterns, and a bare 'max_columns' is ambiguous on pandas versions that also define
# `styler.render.max_columns`, where it raises OptionError.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

# import workalendar
# from workalendar.america import Brazil

In [None]:
%%time
# Read the train and test tables (train first, then test) from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train_df.csv', '../input/test_df.csv')
)

In [None]:
# Columns fed to every base model. The order is kept as-is because it fixes the
# column order of X and X_test.
features = [
    "new_month_lag_max", "hist_month_diff_max", "new_month_lag_min", "hist_month_diff_min", "hist_quarter_max", "new_month_max", "hist_month_max",
    "hist_month_lag_min", "new_category_1_mean", "feature_1", "new_month_min", "new_category_1_sum", "new_Black_Friday_2017_mean",
    "month_lag_min", "new_quarter_min", "hist_month_diff_mean", "new_Children_day_2017_mean", "quarter", "new_month_lag_mean",
    "new_month_mean", "feature_min", "hist_category_1_sum", "new_quarter_max", "new_purchase_date_uptonow", "hist_category_3_sum",
    "new_purchase_amount_max", "hist_purchase_date_diff", "new_Christmas_Day_2017_mean", "hist_quarter_min",
    "hist_purchase_date_uptonow", "new_weekofyear_mean", "hist_duration_min", "new_purchase_date_diff", "hist_category_1_mean", "hist_installments_sum",
    "elapsed_time", "category_1_mean", "days_feature3", "new_duration_max", "feature_3", "hist_fathers_day_2017_mean", "purchase_amount_diff_v2",
    "hist_merchant_id_nunique", "hist_authorized_flag_mean", "hist_last_buy", "hist_month_lag_max", "hist_duration_max", "month_lag_max",
    "hist_Valentine_Day_2017_mean", "duration_min", "hist_amount_month_ratio_min", "card_id_cnt_ratio", "hist_duration_mean",
    "hist_CLV", "new_merchant_category_id_nunique", "hist_month_min", "purchase_amount_mlag_ratio", "new_purchase_amount_mean", "new_card_id_size", "hist_purchase_amount_sum",
    "new_day_mean", "new_price_min", "hist_purchase_date_uptomin", "hist_month_lag_mean", "hist_Mothers_Day_2017_mean", "hist_merchant_category_id_nunique",
    "month_diff_mean", "new_duration_mean", "hist_subsector_id_nunique", "month_lag_mean", "hist_category_3_mean_mean", "hist_day_nunique",
    "new_quarter_mean", "new_purchase_date_average", "installments_total", "new_amount_month_ratio_max", "purchase_amount_ratio_v3",
    "purchase_amount_diff_v3", "new_installments_max", "new_installments_mean", "hist_month_lag_var", "hist_card_id_count",
    "hist_purchase_date_average", "new_day_min", "feature_sum", "days_feature2", "hist_Children_day_2017_mean", "new_duration_min",
    "new_merchant_id_nunique", "new_card_id_count", "new_last_buy", "new_CLV", "feature_mean", "card_id_total", "new_purchase_date_uptomin",
    "new_month_lag_var", "hist_price_sum", "price_mean", "installments_mean", "new_category_3_mean_mean",
]

# Design matrices share the same column selection; the target comes from train only.
X = train.loc[:, features]
y = train['target']
X_test = test.loc[:, features]

In [None]:
# Shuffled K-fold splitter used as the default `folds` of train_model; the fixed
# random_state keeps the fold assignment the same between runs.
n_fold = 10
folds = KFold(n_fold, shuffle=True, random_state=42)

In [None]:
def train_model(X=X, X_test=X_test, y=y, params=None, folds=folds, model_type='lgb'):
    """Fit one model per CV fold; return out-of-fold and fold-averaged test predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Rows are selected positionally (``.iloc``) per fold.
    X_test : pandas.DataFrame
        Test features; every fold model predicts on all of it.
    y : pandas.Series
        Training target, positionally aligned with ``X``.
    params : dict, optional
        Passed to ``lgb.train`` / ``xgb.train``, or unpacked into
        ``CatBoostRegressor``. Not read by the ``'rcv'`` branch. ``None`` is
        treated as an empty dict.
    folds : object with ``split(X)``
        Yields ``(train_index, valid_index)`` pairs of positional indices.
    model_type : {'lgb', 'xgb', 'rcv', 'cat'}
        Which learner to fit in each fold.

    Returns
    -------
    oof : numpy.ndarray
        Length ``len(X)``; each entry is the prediction from the fold in which
        that row was held out.
    prediction : numpy.ndarray
        Length ``len(X_test)``; mean of the test predictions over the folds
        that were actually run.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values, or if
        ``folds.split(X)`` yields no folds.

    Notes
    -----
    The defaults for ``X``, ``X_test``, ``y`` and ``folds`` are bound to the
    globals of the same name at the moment this cell runs; re-run the cell
    after redefining them, or pass them explicitly.
    """
    supported = ('lgb', 'xgb', 'rcv', 'cat')
    if model_type not in supported:
        # Fail before any training instead of hitting a NameError after the first fold.
        raise ValueError(f'Unknown model_type {model_type!r}; expected one of {supported}.')
    if params is None:
        params = {}

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []

    # The test matrix does not change between folds, so build the DMatrix once.
    xgb_test = xgb.DMatrix(X_test) if model_type == 'xgb' else None

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            lgb_train = lgb.Dataset(X_train,
                                    label=y_train,
                                    free_raw_data=False)
            lgb_test = lgb.Dataset(X_valid,
                                   label=y_valid,
                                   free_raw_data=False)

            model = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000, verbose_eval=500,
                        early_stopping_rounds=200)

            y_pred_valid = model.predict(X_valid, num_iteration=model.best_iteration)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            # NOTE(review): only 100 boosting rounds are allowed here, versus 10000 for
            # LightGBM above - confirm this cap is intentional.
            model = xgb.train(dtrain=train_data, num_boost_round=100, evals=watchlist,
                              early_stopping_rounds=20, params=params, verbose_eval=10)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)

        elif model_type == 'rcv':
            # Ridge with the penalty chosen by an inner 3-fold CV on the training part.
            model = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0, 100.0), scoring='neg_mean_squared_error', cv=3)
            model.fit(X_train, y_train)
            print(model.alpha_)

            y_pred_valid = model.predict(X_valid)
            score = mean_squared_error(y_valid, y_pred_valid) ** 0.5
            print(f'Fold {fold_n}. RMSE: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test)

        else:  # model_type == 'cat'
            model = CatBoostRegressor(iterations=1000, eval_metric='RMSE', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[],
                      use_best_model=True, verbose=300, early_stopping_rounds=100)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = np.ravel(y_pred_valid)
        scores.append(mean_squared_error(y_valid, y_pred_valid) ** 0.5)

        prediction += np.ravel(y_pred)

    if not scores:
        raise ValueError('folds.split(X) produced no folds; nothing was trained.')

    # Average over the folds that actually ran rather than a global fold count, so the
    # result stays a true mean for any splitter passed in.
    prediction /= len(scores)

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    return oof, prediction

In [None]:
# LightGBM configuration handed to train_model for the 'lgb' base model.
params = dict(
    # task / objective
    task='train',
    boosting='goss',
    objective='regression',
    metric='rmse',
    # learning rate, sampling and tree shape
    learning_rate=0.01,
    subsample=0.9855232997390695,
    max_depth=7,
    top_rate=0.9064148448434349,
    num_leaves=63,
    min_child_weight=41.9612869171337,
    other_rate=0.0721768246018207,
    # regularisation and split constraints
    reg_alpha=9.677537745007898,
    colsample_bytree=0.5665320670155495,
    min_split_gain=9.820197773625843,
    reg_lambda=8.2532317400459,
    min_data_in_leaf=21,
    # logging and seeds
    verbose=-1,
    seed=5687,
    bagging_seed=12479,
    drop_seed=745,
)

In [None]:
# LightGBM base model: out-of-fold predictions plus fold-averaged test predictions.
oof_lgb, prediction_lgb = train_model(model_type='lgb', params=params)

In [None]:
# XGBoost base model: configuration, then the same CV routine as the other learners.
xgb_params = dict(
    eta=0.01,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:linear',
    eval_metric='rmse',
    silent=True,
    nthread=4,
)
oof_xgb, prediction_xgb = train_model(model_type='xgb', params=xgb_params)

In [None]:
# CatBoost base model. These entries are unpacked into the CatBoostRegressor
# constructor inside train_model.
cat_params = dict(
    learning_rate=0.02,
    depth=7,
    l2_leaf_reg=4,
    bootstrap_type='Bernoulli',
    # 'metric_period': 500 is left disabled.
    od_type='Iter',
    od_wait=50,
    random_seed=11,
    allow_writing_files=False,
)
oof_cat, prediction_cat = train_model(model_type='cat', params=cat_params)

In [None]:
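# Disabled: plain average blend of the individual models. As written it needs
# `submission` and `prediction_rcv`, neither of which is defined in the cells shown here.
#submission['target'] = (prediction_lgb + prediction_xgb + prediction_rcv + prediction_cat) / 4
#submission.to_csv('blend.csv', index=False)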

In [None]:
# Level-2 inputs: one column per base model, in the order lgb, xgb, cat.
# Rows of train_stack are out-of-fold predictions; rows of test_stack are the
# fold-averaged test predictions.
train_stack = pd.DataFrame(np.column_stack((oof_lgb, oof_xgb, oof_cat)))
test_stack = pd.DataFrame(np.column_stack((prediction_lgb, prediction_xgb, prediction_cat)))

In [None]:
# Second-level model: RidgeCV fitted on the stacked base-model predictions.
# `params` is intentionally not passed: the 'rcv' branch of train_model never reads it,
# and the only `params` in scope is the LightGBM configuration.
oof_rcv_stack, prediction_rcv_stack = train_model(X=train_stack, X_test=test_stack, model_type='rcv')

In [None]:
# Attach the stacked predictions and write the submission file.
# No reset_index() here: rebinding `test` to a reset copy added an extra index column on
# every run of this cell, without changing the two columns that are written out.
test.loc[:, 'target'] = prediction_rcv_stack
test[['card_id', 'target']].to_csv('stacking.csv', index=False)

In [None]:
# Show links to the files in the working directory as the cell's output.
from IPython.display import HTML, FileLinks

FileLinks('.')