# Data Preprocessing

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
def make_dummies(data,feature):
    """Return ``data`` with one-hot indicator columns for ``feature`` appended.

    One indicator column is created per distinct value found in
    ``data[feature]`` and is named ``<feature>_<value>``. The source column
    itself is kept, and ``data`` is not modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column ``feature``.
    feature : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the indicator columns.
    """
    indicators = pd.get_dummies(data[feature])
    # Prefix every level with the feature name so that indicator columns
    # coming from different features cannot collide.
    indicators.columns = [feature + '_' + str(level) for level in indicators.columns]
    return pd.concat([data, indicators], axis=1)

In [4]:
from sklearn.cluster import KMeans

def add_Kmean_group(data, feature_list, n_clusters=8, random_state=5):
    """Append one K-means cluster-label column per feature.

    Every feature in ``feature_list`` is clustered on its own (a separate
    single-column ``KMeans`` fit) and the predicted labels are stored in a
    new column named ``<feature>_group``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named in ``feature_list``. It is not
        modified in place.
    feature_list : iterable of str
        Names of the columns to cluster, processed in order.
    n_clusters : int, default 8
        Number of clusters requested from each ``KMeans`` fit.
    random_state : int, default 5
        Seed handed to ``KMeans`` so the labels are reproducible.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by one ``<feature>_group`` column per feature.
    """
    for feature in feature_list:
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        labels = model.fit_predict(pd.DataFrame(data[feature]))
        # Re-use the caller's index: concat(axis=1) aligns on index labels, so
        # a default RangeIndex here would misalign the labels (and pad both
        # sides with NaN rows) whenever ``data`` is not indexed 0..n-1.
        groups = pd.DataFrame(labels, columns=[feature + '_group'], index=data.index)
        data = pd.concat([data, groups], axis=1)
    return data

In [5]:
def smape(A, F):
    """Score forecasts ``F`` against actual values ``A`` (100 is a perfect match).

    For every element the squared absolute error is divided by
    ``(min(2*|A|, |F|) + |A|) ** 2``; the mean of those ratios, scaled by 100,
    is subtracted from 100.

    Parameters
    ----------
    A : array-like
        Actual values.
    F : array-like
        Forecast values, same length as ``A``.

    Returns
    -------
    float
        The score; equal to 100 when ``F`` matches ``A`` exactly.
    """
    actual = np.array(A)
    forecast = np.array(F)
    abs_actual = np.abs(actual)

    squared_error = np.power(np.abs(forecast - actual), 2)
    # The denominator caps the forecast's contribution at twice the actual value.
    denominator = np.power(np.minimum(2 * abs_actual, np.abs(forecast)) + abs_actual, 2)

    scale = 100 / len(actual)
    return 100 - scale * np.sum(squared_error / denominator)

In [6]:
# --- Load raw inputs ---------------------------------------------------------
cc_data = pd.read_csv('./data/cc.csv')
data = pd.read_csv('data/demographics.csv')
kp_data = pd.read_csv('./data/kplus.csv')
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')
# The target is stored on the log1p scale.
y_train = pd.DataFrame(np.log1p(train['income']))

# Missing occupation codes become code 0 so the column can be cast to int.
data['ocp_cd'] = data['ocp_cd'].fillna(0).astype(int)

# --- Per-customer transaction statistics -------------------------------------
# Order matters: it fixes the order of the generated feature columns.
AMOUNT_STATS = ['mean', 'std', 'max', 'min', 'median', 'count', 'sum']


def _summarise_amounts(frame, keys, amount_col, prefix):
    """Aggregate ``amount_col`` per ``keys`` into ``<prefix>_<stat>`` columns.

    A single groupby pass computes every statistic in ``AMOUNT_STATS``.
    Statistics that come out as NaN are replaced by 0, and the group keys
    are returned as ordinary columns.
    """
    stats = frame.groupby(keys)[amount_col].agg(AMOUNT_STATS)
    stats.columns = [prefix + '_' + stat for stat in AMOUNT_STATS]
    return stats.fillna(0).reset_index()


# Credit-card transactions are linked to customers through the card number.
df_cc_data = pd.merge(data, cc_data, on=['cc_no'], how="left")
# NOTE(review): groupby drops rows whose key is NaN by default; 'gender' and
# 'age' are not imputed here - confirm they contain no missing values,
# otherwise those ids silently disappear from the feature table.
df_cc_data = _summarise_amounts(
    df_cc_data, ['id', 'gender', 'ocp_cd', 'age'], 'cc_txn_amt', 'cc_data')
df_kp_data = _summarise_amounts(kp_data, ['id'], 'kp_txn_amt', 'kp_data')

# Ids with no row in the kplus table get 0 for every kp_* statistic.
sum_all_data = pd.merge(df_cc_data, df_kp_data, on=['id'], how="left").fillna(0)
sum_all_data['sum_amt'] = sum_all_data['kp_data_sum'] + sum_all_data['cc_data_sum']

# --- Cluster each numeric feature, then one-hot encode -----------------------
feature_list = (['sum_amt']
                + ['cc_data_' + stat for stat in AMOUNT_STATS]
                + ['kp_data_' + stat for stat in AMOUNT_STATS])
sum_all_data = add_Kmean_group(sum_all_data, feature_list)

feature_list = (['gender', 'ocp_cd', 'age']
                + [name + '_group' for name in feature_list])
for feature in feature_list:
    sum_all_data = make_dummies(sum_all_data, feature)

sum_all_data = sum_all_data.drop(columns=['id'])

# --- Split back into train / test --------------------------------------------
# Rows come out of the groupby sorted by id. The positional split below
# therefore relies on every training id sorting before every test id, and on
# train.csv listing ids in that same order so that x_train lines up with
# y_train. NOTE(review): confirm both against the data files.
n_train, n_test = len(train), len(test)
assert len(sum_all_data) == n_train + n_test, (
    'feature table has %d rows but train + test have %d'
    % (len(sum_all_data), n_train + n_test))
x_train, x_test = sum_all_data.head(n_train), sum_all_data.tail(n_test)

x_train.to_csv('x_train.csv', index=False)
x_test.to_csv('x_test.csv', index=False)
y_train.to_csv('y_train_log.csv', index=False)


# Train Model

In [7]:
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from datetime import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.externals import joblib



In [8]:
# Reload the target and the feature matrix from the CSV files written by the
# preprocessing section, so training can start from those files alone.
y_train = pd.read_csv('y_train_log.csv')
x_train = pd.read_csv('x_train.csv')

In [9]:
print('START ML', datetime.now())

# One shuffled 10-fold splitter shared by all cross-validated linear models.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

# Candidate regularisation strengths searched by the *CV estimators.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# max_iter is an iteration count: pass an int, not the float literal 1e7.
max_iter = int(1e7)

ridge = make_pipeline(RobustScaler(),
                      RidgeCV(alphas=alphas_alt, cv=kfolds))

lasso = make_pipeline(RobustScaler(),
                      LassoCV(max_iter=max_iter, alphas=alphas2,
                              random_state=42, cv=kfolds))

elasticnet = make_pipeline(RobustScaler(),
                           ElasticNetCV(max_iter=max_iter, alphas=e_alphas,
                                        cv=kfolds, l1_ratio=e_l1ratio,
                                        random_state=42))

# Defined for experimentation; it is not one of the stacked regressors below.
svr = make_pipeline(RobustScaler(),
                    SVR(C=20, epsilon=0.008, gamma=0.0003))

gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                max_depth=4, max_features='sqrt',
                                min_samples_leaf=15, min_samples_split=10,
                                loss='huber', random_state=42)

xgboost = XGBRegressor(learning_rate=0.1, n_estimators=40,
                       max_depth=6, min_child_weight=0,
                       gamma=0.8, subsample=0.5,
                       colsample_bytree=0.5, colsample_bylevel=1,
                       nthread=4, random_state=42)

# Stack the linear and boosted models; the same XGBoost configuration is used
# as the meta-regressor, and it also receives the original features.
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet,
                                            gbr, xgboost),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True, random_state=42)

print('START Fit')

# y_train was read from CSV as a one-column frame. Flatten it to shape (n,)
# so the stack is not handed a column vector (the source of the
# `column_or_1d(y, warn=True)` warning).
y_train_1d = np.ravel(y_train)

print('stack_gen')
stack_model = stack_gen.fit(np.array(x_train), y_train_1d)

print('xgboost')
xgb_model = xgboost.fit(x_train, y_train)

# Persist both fitted models for the prediction section.
filename = 'xgb_model.sav'
joblib.dump(xgb_model, filename)

filename = 'stack_model.sav'
joblib.dump(stack_model, filename)


START ML 2019-11-11 13:24:09.666291
START Fit
stack_gen


  y = column_or_1d(y, warn=True)


xgboost


['stack_model.sav']

# Prediction

In [10]:
from sklearn.externals import joblib
import pandas as pd
import numpy as np

In [26]:
def blend_models_predict(X):
    """Return the 50/50 blend of the XGBoost and stacked-model predictions.

    Both models are read from the notebook globals ``xgb_model`` and
    ``stack_model``. Each prediction is passed through ``expm1`` before the
    two are averaged with equal weight.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; handed to ``xgb_model`` as-is and to ``stack_model``
        as a NumPy array.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column ``0``) holding the blended predictions.
    """
    xgb_income = pd.DataFrame(np.expm1(xgb_model.predict(X)))
    stack_income = pd.DataFrame(np.expm1(stack_model.predict(np.array(X))))
    return 0.5 * xgb_income + 0.5 * stack_income

In [27]:
# Load the two fitted models and the test feature matrix from disk.
# NOTE: joblib.load unpickles arbitrary Python objects - only load .sav files
# that were produced by this notebook or another trusted source.
stack_model = joblib.load('stack_model.sav')
xgb_model = joblib.load('xgb_model.sav')
x_test = pd.read_csv('x_test.csv')



In [28]:
# Flat amount added to every blended prediction before submission.
# NOTE(review): the notebook does not record why 1000 was chosen - confirm.
INCOME_OFFSET = 1000

pred = blend_models_predict(x_test)

# Build the submission from the test file plus the shifted predictions.
ans = pd.read_csv('data/test.csv')
ans['income'] = pred
ans['income'] = ans['income'].add(INCOME_OFFSET)
ans.to_csv('O_0044.csv', index=False)

In [29]:
# Sanity check: average income written to the submission file.
ans.loc[:, 'income'].mean()

33025.16