## Loading Libraries and Data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import os
import datetime as dt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from lightgbm import LGBMRegressor

# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Read the train/test tables and the sample submission from the competition
# input directory.
INPUT_DIR = '../input/elo-merchant-category-recommendation'
train = pd.read_csv(f'{INPUT_DIR}/train.csv')
test = pd.read_csv(f'{INPUT_DIR}/test.csv')
sample = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')

In [None]:
# Show the (rows, columns) shape of the train and test frames side by side.
print(train.shape, test.shape)

## EDA and Preprocessing

In [None]:
# Preview the first ten rows of the training data.
train.head(n=10)

In [None]:
# Column dtypes and non-null counts for the training data.
train.info()

In [None]:
# Column dtypes and non-null counts for the test data.
test.info()

In [None]:
# Impute missing first_active_month values in the test set with the column's
# most frequent value.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: that form is chained
# assignment, which pandas has deprecated and which no longer updates `test`
# once Copy-on-Write is enabled.
most_common_month = test['first_active_month'].mode().iloc[0]
test['first_active_month'] = test['first_active_month'].fillna(most_common_month)

We have one null value in the test dataset. Let's fill it with the mode.

In [None]:
# Count the distinct card_id values in the training data.
train['card_id'].nunique() 

Okay, so there are no missing values, and all values in the card_id column are unique. There is only one categorical column (first_active_month).

In [None]:
# Compare the raw target distribution (left) with its log1p transform (right).
# sns.distplot is deprecated in seaborn; sns.histplot is its replacement.
# stat='density' with kde=True reproduces distplot's default histogram + KDE,
# and bins=50 matches the cap distplot applied to its automatic bin count.
log_target = np.log1p(train['target'])
# log1p yields NaN/-inf for target <= -1, so only the finite values are plotted.
log_target = log_target[np.isfinite(log_target)]

fig, ax = plt.subplots(1, 2, figsize=(20, 6))
sns.histplot(train['target'], ax=ax[0], color='magenta', bins=50, stat='density', kde=True)
ax[0].set_title('target')
sns.histplot(log_target, ax=ax[1], bins=50)
ax[1].set_title('log1p(target), finite values only');

The loyalty score of customers varies from -10 to +10. But there is also a slight bump at -35: either some customers have a very low loyalty score, or these are hidden missing values.

In [None]:
# Log-transform the target and keep only the rows where the transform is finite.
# NOTE: log1p is only finite for target > -1, so every training row with
# target <= -1 is dropped here.
log_target = np.log1p(train['target'])
finite_rows = np.isfinite(log_target)
# .copy() makes the filtered frame independent of the original, so the column
# assignment below (and those in later cells) does not write into a slice of
# another DataFrame (SettingWithCopyWarning).
train = train.loc[finite_rows].copy()
train['target'] = log_target.loc[finite_rows]

In [None]:
# Correlation heatmap over the numeric columns only.
# The numeric columns are selected explicitly because DataFrame.corr() raises
# on string columns (first_active_month is still a string at this point) in
# pandas >= 2.0, whereas older versions dropped them silently.
# select_dtypes works on both.
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True)

feature_1 and feature_3 have a slightly high correlation. The correlations of all features with the target are almost 0. Let's create some features out of the first_active_month column.


In [None]:
# Split the 'YYYY-MM...' string on '-' and store the first two pieces as
# integer year and month columns, for both the train and the test frame.
for frame in (train, test):
    date_parts = frame['first_active_month'].str.split('-', expand=True)
    frame['first_active_year'] = date_parts[0].astype('int64')
    frame['first_active_month_num'] = date_parts[1].astype('int64')

In [None]:
# Convert first_active_month from text to datetime in both frames.
for frame in (train, test):
    frame['first_active_month'] = pd.to_datetime(frame['first_active_month'])

In [None]:
# The reference point is the moment this cell runs, so the resulting feature
# values shift with the execution date.
date = pd.Timestamp.today()
for frame in (train, test):
    # Negate the (negative) "first_active_month - date" difference rather than
    # computing "date - first_active_month": the .days component is floored,
    # so the two forms differ by one day whenever `date` has a time-of-day part.
    frame['days_since_first_active'] = -(frame['first_active_month'] - date).dt.days

In [None]:
# Correlation heatmap including the engineered date features.
# Only numeric columns are passed to corr(): in pandas >= 2.0 it no longer
# drops non-numeric columns silently and raises instead, so they are excluded
# explicitly with select_dtypes (which works on old and new pandas alike).
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True)

## Baseline Submission

In [None]:
# List the columns now available on the training frame.
train.columns

In [None]:
# Model inputs: the three anonymized features plus the date-derived columns.
features = [
    'feature_1',
    'feature_2',
    'feature_3',
    'first_active_year',
    'first_active_month_num',
    'days_since_first_active',
]

In [None]:
# Design matrix (selected feature columns) and regression target.
X, y = train[features], train['target']

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the shuffled
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=111,
)

In [None]:
# Baseline: ordinary least squares on the selected features.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises a TypeError on current releases. For unregularized
# least squares, rescaling the features does not change the fitted
# predictions, so the argument is simply dropped.
lr = LinearRegression()
lr.fit(X_train, y_train)
preds = lr.predict(X_val)
# Validation RMSE, measured on the same scale as y (the transformed target).
np.sqrt(mean_squared_error(y_val, preds))

In [None]:
# Gradient-boosted baseline: 100 trees with a small learning rate.
lgbm = LGBMRegressor(learning_rate=0.01, n_estimators=100)
lgbm.fit(X_train, y_train)
preds = lgbm.predict(X_val)
# Validation RMSE, measured on the same scale as y.
val_mse = mean_squared_error(y_val, preds)
np.sqrt(val_mse)

In [None]:
# Preview the first rows of the sample submission frame.
sample.head()

In [None]:
# Score the test set with the LightGBM model, using the same feature columns
# it was trained on.
test_features = test[features]
test_predictions = lgbm.predict(test_features)

In [None]:
# Distribution of the test predictions as produced by the model (left) and
# after undoing the log1p transform with expm1 (right).
# sns.distplot is deprecated in seaborn; sns.histplot is its replacement.
# stat='density' with kde=True reproduces distplot's default histogram + KDE,
# and bins=50 matches the cap distplot applied to its automatic bin count.
fig, ax = plt.subplots(1, 2, figsize=(20, 6))
sns.histplot(test_predictions, ax=ax[0], color='magenta', bins=50, stat='density', kde=True)
ax[0].set_title('predictions (model scale)')
sns.histplot(np.expm1(test_predictions), ax=ax[1], bins=50)
ax[1].set_title('expm1(predictions)');

In [None]:
# Undo the log1p transform and write the predictions out as the submission.
# The values are assigned positionally, so this relies on `sample` having the
# same row order as `test`.
submission_target = np.expm1(test_predictions)
sample['target'] = submission_target
sample.to_csv('submission.csv', index=False)