# Jeju Credit Card Usage Prediction - July


In [None]:
# Import necessary packages
import pandas as pd
import numpy as np 
import os
import itertools
from tqdm.notebook import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from pandas.plotting import scatter_matrix
from sklearn import preprocessing, metrics
import category_encoders as ce

"""
For model building, we consider
- XGBoost Regressor
- Random Forest Regressor
"""

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
import warnings

# Silence every warning so library notices do not clutter the cell output.
warnings.filterwarnings("ignore")

# Font used for all matplotlib text
# (presumably chosen so Korean labels render; assumes NanumBarunGothic is
# installed on this machine — TODO confirm).
plt.rcParams['font.family'] = 'NanumBarunGothic'

# IPython magic: draw matplotlib figures inline in the notebook.
%matplotlib inline

In [None]:

# Read files
train_data = pd.read_csv('201901-202003.csv') # Unfortunately, this file is not available for download currently.
apr_data = pd.read_csv('202004.csv')
submission = pd.read_csv('submission.csv')

# Append the April data to the training data.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# frame (the row labels of both inputs are kept as they are, like append did).
train_data = pd.concat([train_data, apr_data])

# Fill all empty entries with '' so missing names become an ordinary category
train_data = train_data.fillna('')

# Drop unnecessary features - the home cities/provinces of the customers are not important
train_data.drop(['HOM_SIDO_NM','HOM_CCG_NM'], axis=1, inplace=True)

# Rename SEX column
train_data.rename(columns={'SEX_CTGO_CD':'SEX'}, inplace=True)

# Store SEX as a string so it can be concatenated with other categorical columns
train_data['SEX'] = train_data['SEX'].astype(str)

In [None]:
# Map every card-usage province (SIDO) to the list of cities/districts (CCG)
# that occur under it in the training data.

# Provinces, in order of first appearance in train_data
sidos = list(train_data['CARD_SIDO_NM'].unique())

# {province: [districts seen for that province]}
SIDO_CCG_dict = {
    province: list(
        train_data.loc[train_data['CARD_SIDO_NM'] == province, 'CARD_CCG_NM'].unique()
    )
    for province in sidos
}


In [None]:
# Total AMT per (district, industry, age band, sex); these four keys are the
# only raw features kept for modelling.
group_keys = ['CARD_CCG_NM', 'STD_CLSS_NM', 'AGE', 'SEX']
train_2 = train_data.groupby(group_keys)['AMT'].sum().reset_index()

# Features: the four grouping keys. Target: natural log of the summed AMT,
# kept as a single-column DataFrame.
train_x = train_2[group_keys]
train_y = np.log(train_2[['AMT']])


In [None]:
# Build the prediction grid in the same format as the training features.
# Keep only REG_YYMM = 202007 (the month to predict)
test_data = submission[submission['REG_YYMM'] == 202007]

test_x = test_data[['CARD_SIDO_NM','STD_CLSS_NM']]
test_y = test_data['AMT']

# Make the test_x include CARD_CCG_NM as well.
# The (province, district) lookup frame is built in a single constructor call:
# appending one row at a time with DataFrame.append copies the whole frame on
# every iteration and the method no longer exists in pandas 2.0.
columns = ['CARD_SIDO_NM','CARD_CCG_NM']
SIDO_CCG = pd.DataFrame(
    [(sido, ccg) for sido in sidos for ccg in SIDO_CCG_dict[sido]],
    columns=columns,
)

# Joins on the shared CARD_SIDO_NM column: one row per district of the province
test_x = pd.merge(test_x, SIDO_CCG)

# Province/district of every grid row, saved before CARD_SIDO_NM is dropped
test_sido_ccg = test_x[['CARD_SIDO_NM','CARD_CCG_NM']]

# Add SEX column: a constant 'key' on both sides turns the merge into a cross join
temp_sex = pd.DataFrame({'SEX':['1','2']})
temp_sex['key'] = 0
test_x['key'] = 0

test_x = test_x.merge(temp_sex, on='key')

# Add AGE column (same cross-join trick)
temp_age = pd.DataFrame({'AGE':['10s','20s','30s','40s','50s','60s','70s']})
temp_age['key'] = 0
test_x = test_x.merge(temp_age, on='key')

# The model features do not include the province, and 'key' was only a join helper
test_x.drop(['CARD_SIDO_NM','key'], axis=1, inplace=True)


# Feature Generation
* There are now only 4 features in our train_x (STD_CLSS_NM, CARD_CCG_NM, SEX, AGE).
* Combine the features pairwise, and use target encoding to convert each combined categorical variable into a numerical variable.
* Apply the same encoding to both the train and the test data.

In [None]:

def feature_combine(df_1, df_2, source=None, target='AMT'):
    """Add one target-encoded column for every pair of categorical features.

    For each unordered pair of columns of ``df_1`` a column named
    ``"<first>_<second>"`` is added to copies of both frames. Its value is the
    natural log of the mean ``target`` over the rows of ``source`` that carry
    the same pair of values. Pairs that never occur in ``source`` get NaN.

    Args:
        df_1: Training features; its columns define the pairs to combine.
        df_2: Test features; must contain the same columns as ``df_1``.
        source: Frame holding the feature columns plus ``target``, from which
            the per-pair means are computed. Defaults to the notebook-level
            ``train_data`` so existing two-argument calls keep working.
        target: Name of the column of ``source`` to average.

    Returns:
        Tuple ``(df_train, df_test)`` of new frames; the inputs are not
        modified.
    """
    if source is None:
        source = train_data

    df_train = df_1.copy()
    df_test = df_2.copy()
    cat_features = df_train.columns.tolist()

    # One new column per unordered pair: C(k, 2) of them, i.e. 6 for 4 features.
    for first, second in itertools.combinations(cat_features, 2):
        pair = [first, second]
        new_feature = first + '_' + second

        # log(mean target) for every value pair observed in the source rows;
        # groupby guarantees each pair appears exactly once.
        encoder = (
            np.log(source.groupby(pair)[target].mean())
            .rename(new_feature)
            .reset_index()
        )

        # Look the encoding up by joining on the two key columns rather than
        # on a '_'-joined string: values that contain '_' themselves can
        # produce identical joined strings for different pairs. A left join
        # against unique keys keeps the row order and row count of the frame.
        for frame in (df_train, df_test):
            looked_up = frame[pair].merge(encoder, how='left', on=pair)
            frame[new_feature] = looked_up[new_feature].to_numpy()

    return df_train, df_test


# Add the pairwise target-encoded columns to copies of the train and test features.
train_final_x, test_final_x = feature_combine(train_x, test_x)

In [None]:

# Turn the four raw categorical columns of both feature frames into numbers.

# Rank assigned to each age band.
# NOTE(review): the ranks are not chronological — presumably a hand-picked
# ordering; confirm the intent before changing it.
age_dict = {'10s':1,'70s':2, '20s':3, '60s':4, '30s':5, '40s':6, '50s':7}

# Columns replaced by a '<name>_encoded' target encoding
raw_categoricals = ['STD_CLSS_NM', 'CARD_CCG_NM']

# log(mean AMT) per category value, computed from the raw training rows
category_means = {
    name: np.log(train_data.groupby([name])['AMT'].mean())
    for name in raw_categoricals
}

for frame in (train_final_x, test_final_x):
    # SEX holds the strings '1'/'2'; store it as an integer label
    frame['SEX'] = frame['SEX'].astype('int64')
    # An age band missing from age_dict raises KeyError
    frame['AGE'] = frame['AGE'].apply(lambda band: age_dict[band])
    for name in raw_categoricals:
        frame[name + '_encoded'] = frame[name].map(category_means[name])
    # The raw string columns are no longer needed once encoded
    frame.drop(columns=raw_categoricals, inplace=True)


In [None]:
# Count the missing encodings per column of the test features, replace them
# with 0, then show the per-column counts again to confirm none remain.
missing_before = test_final_x.isna().sum()
test_final_x.fillna(0, inplace=True)
test_final_x.isna().sum()

In [None]:
# Check column match
print(train_final_x.shape, test_final_x.shape)

# `Index.all()` only reports whether every label is truthy, so comparing the
# two `.all()` results was always True and verified nothing. Compare the
# actual column labels instead.
missing_in_test = train_final_x.columns.difference(test_final_x.columns)
extra_in_test = test_final_x.columns.difference(train_final_x.columns)
if len(missing_in_test) or len(extra_in_test):
    raise ValueError(
        "train/test feature columns differ: "
        f"missing in test={list(missing_in_test)}, extra in test={list(extra_in_test)}"
    )

# Same labels are not enough: train_x is built with AGE before SEX while
# test_x gets SEX merged in before AGE, so the two frames list their columns
# in a different order. Put the test columns in the training order so each
# feature reaches the model in the position it was fitted on.
test_final_x = test_final_x[train_final_x.columns]

# EDA 
* Check correlation among features

In [None]:
# Correlation between every input variable and the (log) target
corr_df = pd.concat([train_final_x,train_y], axis=1)
corr = corr_df.corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0; an
# explicit format string renders the same 4 decimals on every pandas version.
display(corr.style.background_gradient(cmap='coolwarm').format('{:.4f}'))
# Features ranked by their correlation with the target column
print(corr['AMT'].sort_values(ascending=False))


# Train

In [None]:
# Model selection and prediction 

model_1 = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1, verbose=10)
# Pass the target as a 1-D array; train_y is a single-column DataFrame and
# scikit-learn expects shape (n_samples,) for a single-output regressor.
model_1.fit(train_final_x, train_y.values.ravel())
print("Fit Complete")
# Predict with the columns in exactly the order used for fitting. The test
# frame lists SEX before AGE while the training frame lists AGE before SEX;
# passing it as-is either swaps the two features silently (older scikit-learn)
# or fails the feature-name check (newer scikit-learn).
prediction = model_1.predict(test_final_x[train_final_x.columns])

# The string literal below is never executed; it only keeps the alternative
# XGBoost experiment for reference.
"""
#model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.01, n_jobs=-1, verbose=10)
#model_2.fit(train_x, train_y)
#print("Fit Complete")
prediction = model_2.predict(apr_x)
"""

In [None]:
# The model was trained on log(AMT); transform the predictions back to amounts
prediction = pd.DataFrame(np.exp(prediction))
prediction.columns = ['AMT']
final_df = pd.concat([test_x, prediction], axis=1)

# Add CARD_SIDO column
# Joining back on CARD_CCG_NM is unsafe: the same district name can occur in
# several provinces, so that join is many-to-many and pairs a prediction with
# every province that has a district of that name. Recover the province by
# position instead: test_x was expanded from test_sido_ccg by cross-joining
# each of its rows with every SEX and AGE value, so each (province, district)
# row is repeated the same number of consecutive times.
n_pairs = len(test_sido_ccg)
rows_per_pair = len(final_df) // n_pairs if n_pairs else 0
expected_ccg = np.repeat(test_sido_ccg['CARD_CCG_NM'].to_numpy(), rows_per_pair)
# Refuse to continue if the rows do not line up with that expansion
if not np.array_equal(expected_ccg, final_df['CARD_CCG_NM'].to_numpy()):
    raise ValueError("test_x rows are not aligned with test_sido_ccg; cannot restore CARD_SIDO_NM")
final_df['CARD_SIDO_NM'] = np.repeat(test_sido_ccg['CARD_SIDO_NM'].to_numpy(), rows_per_pair)

# Total predicted amount per (province, industry), sorted by those keys
final_df = final_df.groupby(['CARD_SIDO_NM','STD_CLSS_NM'])['AMT'].sum().reset_index()
final_df

# Submission

In [None]:
# Now fit the data into submission 
key_cols = ['REG_YYMM','CARD_SIDO_NM','STD_CLSS_NM']

# First, groupby for REG_YYMM == 202004: actual amount per (month, province, industry)
apr_data = pd.DataFrame(apr_data.groupby(key_cols)['AMT'].sum())
apr_data.reset_index(inplace=True)
display(apr_data)

# Reinitialize 'AMT' column to 0 before insertion
submission['AMT'] = 0

# Tag the July predictions with their month so they can be matched by key.
# Assigning final_df['AMT'].values positionally only works when the template
# rows and final_df have identical order and length; a key-based join does not
# depend on either.
july_pred = final_df[['CARD_SIDO_NM','STD_CLSS_NM','AMT']].assign(REG_YYMM=202007)
filled = pd.concat([apr_data, july_pred], ignore_index=True, sort=False)

# Fill in: keep the template's rows and order, take AMT from the matching key
submission_final = submission.drop(['id','AMT'], axis=1).merge(filled, how='left', on=key_cols)
# Template rows without a matching amount get 0
submission_final.fillna(0, inplace=True)

# NOTE(review): this overwrites the template read at the top of the notebook,
# and the file is written without an 'id' header, so a second run would fail
# when dropping 'id' — consider a different output name.
submission_final.to_csv('submission.csv', encoding='UTF-8-sig')