# Load packages

In [None]:
import os
import gc

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,train_test_split

import lightgbm as lgb

import warnings
# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Show what is available in the input directory.
input_dir_entries = os.listdir("../input")
print(input_dir_entries)

In [None]:
# Load the train and test CSV files into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train_df.csv', '../input/test_df.csv')
)

In [None]:
# Display the shape of the train and test frames as a pair.
(train.shape, test.shape)

In [None]:
def missing_impute(df):
    """Fill missing values in ``df`` in place and return the same frame.

    Columns with ``object`` dtype have their missing entries replaced by the
    placeholder string ``"other"``. Floating-point columns of any width
    (``float16``/``float32``/``float64``) have their missing entries replaced
    by that column's mean. All other columns are left untouched.

    A float column made up entirely of missing values has a NaN mean and
    therefore stays unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in df.columns:
        column = df[name]
        if column.dtype == "object":
            df[name] = column.fillna("other")
        elif pd.api.types.is_float_dtype(column.dtype):
            # Match "any float" instead of the literal "float64" so that
            # down-cast columns (e.g. float32) are not silently skipped.
            # NumPy integer columns cannot hold NaN, so they need no fill.
            df[name] = column.fillna(column.mean())
    return df

In [None]:
# Report which columns contain at least one missing value in each frame.
train_missing_cols = train.columns[train.isnull().any()]
test_missing_cols = test.columns[test.isnull().any()]

print('Columns with missing values in train {}'.format(train_missing_cols))
print('Columns with missing values in test {}'.format(test_missing_cols))

In [None]:
# Optional: impute missing values in place (currently disabled; uncomment to enable)
#for df in [train, test]:
#    missing_impute(df)

In [None]:
# Prepare data for training
X = train.copy()
y = X['target']

# Columns removed from the feature matrix before training.
# (Each name is listed once; the original list repeated
# 'hist_purchase_date_min'.)
FEATS_EXCLUDED = [
    'first_active_month', 'target', 'card_id', 'outliers',
    'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size',
    'new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size',
]


# train test split: hold out 20% of the rows for validation, with a fixed
# random_state so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X.drop(FEATS_EXCLUDED, axis=1), y, random_state=123, test_size=0.2
)
print(X_train.shape, X_valid.shape)

# LightGBM Regressor estimator
# Hyper-parameters are gathered in one dict and unpacked into the constructor.
# NOTE(review): with the sklearn wrapper, `subsample` only takes effect when
# `subsample_freq` > 0, which is not set here, and `drop_seed` is documented
# as a DART-only option -- confirm whether these entries are intentional.
lgb_params = {
    'boosting': 'goss',
    'n_estimators': 3000,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'subsample': 0.9855232997390695,
    'max_depth': 7,
    'top_rate': 0.9064148448434349,
    'num_leaves': 63,
    'min_child_weight': 41.9612869171337,
    'other_rate': 0.0721768246018207,
    'reg_alpha': 9.677537745007898,
    'colsample_bytree': 0.5665320670155495,
    'min_split_gain': 9.820197773625843,
    'reg_lambda': 8.2532317400459,
    'min_data_in_leaf': 21,
    'verbose': -1,
    'seed': 123,
    'bagging_seed': 123,
    'drop_seed': 123,
}
model = lgb.LGBMRegressor(**lgb_params)

# Fit
# RMSE is tracked on both the training split and the hold-out split.
eval_sets = [(X_train, y_train), (X_valid, y_valid)]

# Newer LightGBM releases no longer accept `verbose` / `early_stopping_rounds`
# as fit() arguments; the same behaviour (log every 100 rounds, stop after
# 500 rounds without improvement) is requested through callbacks instead.
# Older releases that lack `log_evaluation` keep using the legacy arguments.
if hasattr(lgb, 'log_evaluation'):
    fit_control = {
        'callbacks': [
            lgb.early_stopping(stopping_rounds=500),
            lgb.log_evaluation(period=100),
        ]
    }
else:
    fit_control = {'verbose': 100, 'early_stopping_rounds': 500}

model.fit(
    X_train, y_train,
    eval_set=eval_sets,
    eval_metric='rmse',
    **fit_control
)
    
# predictions on valid and test set, both taken at the model's best iteration
best_round = model.best_iteration_
valid_predictions = model.predict(X_valid, num_iteration=best_round)

# Select the test columns in the same order as the training features.
test_features = test[X_train.columns]
test_preds = model.predict(test_features, num_iteration=best_round)

# Root mean squared error on the hold-out split.
valid_rmse = mean_squared_error(y_valid, valid_predictions) ** .5
print('Validation error  {}'.format(valid_rmse))

In [None]:
# Make submission
# Attach the predictions to the test frame and write the two submission
# columns to CSV without the index.
test.loc[:, 'target'] = test_preds
# drop=True: a plain reset_index() inserts the old index as a new column each
# time this cell runs ('index', then 'level_0') and raises on the third run.
test = test.reset_index(drop=True)
test[['card_id', 'target']].to_csv('simple_lgb.csv', index=False)

In [None]:
# Display links to the files under the current working directory in the
# notebook output. The bare expression on the last line relies on the
# notebook rendering the value of a cell's final expression.
from IPython.display import HTML,FileLinks
FileLinks('.')