In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in the input directory (as a Python list).
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Requested storage type for the three feature columns.
# NOTE(review): confirm these keys match the CSV headers exactly; presumably a
# key that matches no column has no effect on the loaded frame.
dtype = {name: 'int16' for name in ('feature1', 'feature2', 'feature3')}

# Load the train and test tables with the same options, parsing
# `first_active_month` as a date column in both.
train_data, test_data = (
    pd.read_csv(csv_path, dtype=dtype, parse_dates=['first_active_month'])
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [None]:
# Column types shared by both transaction files: three columns are read as
# strings and five as 16-bit integers.
data_types = dict(
    authorized_flag='str',
    card_id='str',
    city_id='int16',
    category_1='str',
    installments='int16',
    merchant_category_id='int16',
    state_id='int16',
    subsector_id='int16',
)

# Historical transactions first, then the new-merchant transactions.
hist_trans, new_trans = (
    pd.read_csv(csv_path, dtype=data_types)
    for csv_path in ("../input/historical_transactions.csv",
                     "../input/new_merchant_transactions.csv")
)

In [None]:
# Encode the Y/N flag columns as 1/0 and fill missing category values in both
# transaction tables.
mapping = {'Y': 1, 'N': 0}
values = {'category_2': 2, 'category_3': 'A'}  # replacement for missing entries

for data in [hist_trans, new_trans]:
    data['authorized_flag'] = data['authorized_flag'].map(mapping)
    data['category_1'] = data['category_1'].map(mapping)
    # fillna returns a new object instead of modifying its input, so the filled
    # column has to be assigned back for the change to take effect.
    for column, fill_value in values.items():
        data[column] = data[column].fillna(fill_value)

In [None]:
#pd.to_datetime(hist_trans['purchase_date'])

We start by preparing the historical transactions dataset.

In [None]:
# One-hot encode the two categorical columns of the historical transactions.
one_hot_columns = ['category_2', 'category_3']
hist_trans = pd.get_dummies(hist_trans, columns=one_hot_columns)

In [None]:
# Show the first rows of the historical transactions as the cell output.
hist_trans.head()

In [None]:
#merchants = pd.get_dummies(merchants,columns = ['most_recent_sales_range','most_recent_purchases_range'])
#merchants

Aggregate the transactions per `card_id` so that the result can be joined one-to-one with the training data.

In [None]:
# Statistics to compute per card for each transaction column.  Aggregations are
# named by string (instead of passing the builtin `sum`) so pandas uses its own
# grouped implementations and does not emit a deprecation warning.
aggregations = {
    'category_1': ['sum', 'mean'],
    'authorized_flag': ['sum', 'mean'],
    'installments': ['sum', 'mean', 'max', 'min', 'std'],
    'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
}
# The one-hot indicator columns are only averaged.
aggregations.update({
    indicator: ['mean']
    for indicator in ['category_3_A', 'category_3_B', 'category_3_C',
                      'category_2_1.0', 'category_2_2.0', 'category_2_3.0',
                      'category_2_4.0', 'category_2_5.0']
})

temp = hist_trans.groupby('card_id').agg(aggregations)
# agg() with lists yields two-level (column, statistic) labels; iterating the
# column index gives those pairs, which are joined into names such as
# "purchase_amount_mean".
temp.columns = ["_".join(label) for label in temp.columns]
temp.head()

In [None]:
# Left-join the per-card aggregates onto the train and test tables, matching on
# `card_id` and keeping every row of the left-hand table.
merge_options = dict(on='card_id', how='left')
new_train = train_data.merge(temp, **merge_options)
new_test = test_data.merge(temp, **merge_options)

In [None]:
# Pick the model inputs by column position: positions 2-4 plus everything from
# position 6 onward (positions 0, 1 and 5 are left out).
# NOTE(review): this depends on the column order of new_train — presumably the
# skipped positions hold the date, card_id and target columns; confirm.
all_columns = new_train.columns
keep_positions = np.r_[2:5, 6:len(all_columns)]
features = all_columns[keep_positions]

In [None]:
# Split the merged training table into the regression target and the model inputs.
y = new_train['target']
X = new_train[features]

In [None]:
# Display the dimensions of the target as an array.
np.asarray(y).shape

## Training and fitting the model

In [None]:
# Train one LightGBM regressor per cross-validation fold and average the
# folds' predictions on the test set.
N_SPLITS = 5   # number of cross-validation folds; also the averaging divisor
SEED = 42      # fixes the shuffled fold assignment so reruns give the same folds

test_set = new_test[features]
prediction = np.zeros(len(test_set))

# The training parameters are the same for every fold, so define them once.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.01,
    'verbose': 0,
}


def _fold_train_options():
    """Return the early-stopping / logging options for a single lgb.train call.

    Newer LightGBM releases dropped the `early_stopping_rounds` and
    `verbose_eval` keyword arguments in favour of callbacks, so the callbacks
    are used when this installation provides them and the keywords otherwise.
    Fresh callback objects are created on every call so that no early-stopping
    state is carried over from one fold to the next.
    """
    if hasattr(lgb, 'early_stopping') and hasattr(lgb, 'log_evaluation'):
        return {'callbacks': [lgb.early_stopping(stopping_rounds=200),
                              lgb.log_evaluation(period=100)]}
    return {'early_stopping_rounds': 200, 'verbose_eval': 100}


kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for train_index, test_index in kfold.split(X):
    # KFold yields positional row numbers, so select with .iloc; label-based
    # selection would only be correct on a default 0..n-1 index.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=10000,
                    valid_sets=[lgb_train, lgb_eval],
                    **_fold_train_options())

    # Predict with the iteration chosen by early stopping and accumulate.
    pred = gbm.predict(test_set, num_iteration=gbm.best_iteration)
    prediction += pred

# Average the accumulated predictions over the folds.
prediction = prediction / N_SPLITS

In [None]:
# Pair each test card_id with its predicted target value.
submission_columns = {
    'card_id': test_data['card_id'],
    'target': prediction,
}
submission = pd.DataFrame(submission_columns)
#submission.set_index('card_id')

In [None]:
# Write the submission to submission.csv in the working directory, leaving out
# the row index.
submission.to_csv('submission.csv', index = False)