In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt, gc, os
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#pca
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA

#lightgbm
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import LGBMClassifier, early_stopping

#xgbm
import xgboost as xgb
from xgboost import XGBClassifier

#catboost
import catboost as cb
from catboost import CatBoostClassifier

#decision tree
from sklearn.tree import DecisionTreeClassifier

#random forest
from sklearn.ensemble import RandomForestClassifier

#logistic regression
from sklearn.linear_model import LogisticRegression

#metrics
from pathlib import Path
input_path = Path('/kaggle/input/amex-default-prediction/')

# Load Data

In [23]:
TRAIN_PATH = '/Users/wangjiaxin/Desktop/study program/data analysis/project/amex/train.parquet'
train = pd.read_parquet(TRAIN_PATH)
train.shape

(5531451, 190)

In [24]:
#change format of ['customer_ID'],['S_2']
train['customer_ID'] = train['customer_ID'].str[-16:].apply(lambda x: int(x, 16))
train.S_2 = pd.to_datetime( train.S_2 )
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,13914591055249847850,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,13914591055249847850,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,13914591055249847850,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,13914591055249847850,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,13914591055249847850,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


In [25]:
y = pd.read_csv('/Users/wangjiaxin/Desktop/study program/data analysis/project/amex/train_labels.csv')
#from string to int
y['customer_ID'] = y['customer_ID'].str[-16:].apply(lambda x: int(x, 16))
y=y.set_index('customer_ID')
y.head()

Unnamed: 0_level_0,target
customer_ID,Unnamed: 1_level_1
13914591055249847850,0
11750091188498716901,0
7128959966677571777,0
6537921148391624412,0
2065103583825424365,0


In [26]:
len(y)

458913

In [27]:
#target value
c = dict(y['target'].value_counts())
c

{0: 340085, 1: 118828}

# AMEX Metric

In [28]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

# Feature Engineering

In [29]:
#handle duplicates
# keep only the last record for each customer
train.drop_duplicates('customer_ID',keep='last',inplace=True)
train=train.set_index('customer_ID')

### Deal with Missing Values

In [30]:
#dealing with missing value
def check_value(df):
        zero_val = (df == 0.00).astype(int).sum(axis=0)
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
        mz_table = mz_table.rename(
        columns = {0 : 'Zero Values', 1 : 'Missing Values', 2 : '% Missing'})
        mz_table['Data Type'] = df.dtypes
        mz_table = mz_table[
            mz_table.iloc[:,1] != 0].sort_values(
        '% Missing', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"      
            "There are " + str(mz_table.shape[0]) +
              " columns that have missing values.")
        return mz_table

In [31]:
train.shape

(458913, 189)

In [32]:
Missing_values_table_train = check_value(train)
train_null_tables= Missing_values_table_train.loc[Missing_values_table_train['% Missing'] > 70]
train_null_tables['Unique'] = [train[col].nunique() for col in train_null_tables.index ]
print (train_null_tables)

Your selected dataframe has 189 columns and 458913 Rows.
There are 58 columns that have missing values.
       Zero Values  Missing Values  % Missing Data Type  Unique
D_88             0          458086       99.8   float32     827
D_110            0          455235       99.2   float32    3665
B_39             0          454808       99.1   float32    4105
D_73             0          454674       99.1   float32    4239
B_42             0          452771       98.7   float32    6141
D_134            0          442518       96.4   float32   16377
B_29             0          431589       94.0   float32   27308
D_76             0          409597       89.3   float32   49301
D_132            0          407153       88.7   float32   51736
D_42             0          399003       86.9   float32   59874
D_142            0          378598       82.5   float32   80151
D_53             0          325932       71.0   float32  132847


In [33]:
#drop columns with more than 80% missing values
train = train.drop(columns = ['D_88','D_110','B_39','D_73','B_42','D_134','B_29','D_132','D_76','D_42','D_142'])


In [34]:
train.dtypes.value_counts()

int8              86
float32           82
int16              9
datetime64[ns]     1
dtype: int64

In [35]:
#dealing with categorical data, fill in with mode
#dealing with numeric data, fill in with mean 
cat_col=['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
for col in list(train.columns):
    if col in cat_col:
        train[col] = train[col].fillna(train[col].mode().iloc[0])
    else:
        train[col] = train[col].fillna(train[col].mean())
        
print('missing values in training data set :', train.isnull().sum())

missing values in training data set : S_2      0
P_2      0
D_39     0
B_1      0
B_2      0
        ..
D_140    0
D_141    0
D_143    0
D_144    0
D_145    0
Length: 178, dtype: int64


### Deal with date time

In [36]:
#feature creation
train['S_2_dayofweek'] = train['S_2'].dt.weekday
train['S_2_dayofmonth'] = train['S_2'].dt.day
train = train.drop(columns=['S_2'])

### PCA Feature Selection

In [37]:
train = train.merge(y, left_index=True, right_index=True, how='left')

In [39]:
train.head()

Unnamed: 0_level_0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_43,D_44,...,D_138,D_139,D_140,D_141,D_143,D_144,D_145,S_2_dayofweek,S_2_dayofmonth,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13914591055249847850,0.934745,0,0.009382,1.007647,0.006104,0.135021,0.0,0.007174,0.163763,0,...,-1,0,0,0.0,0,0.00297,0,1,13,0
11750091188498716901,0.880519,6,0.034684,1.004028,0.006911,0.165509,0.0,0.005068,0.060646,0,...,-1,0,0,0.0,0,0.003169,0,6,25,0
7128959966677571777,0.880875,0,0.004284,0.812649,0.00645,0.229062,0.0,0.007196,0.163763,0,...,-1,0,0,0.0,0,0.000834,0,0,12,0
6537921148391624412,0.621776,0,0.012564,1.006183,0.007829,0.287766,0.0,0.009937,0.046104,0,...,-1,0,0,0.0,0,0.00556,0,3,29,0
2065103583825424365,0.8719,0,0.007679,0.815746,0.001247,0.229062,0.0,0.005528,0.044671,0,...,-1,0,0,0.0,0,0.006944,0,4,30,0


In [42]:
#Cumulative Explained Variance explained by the components
# number of Principal components:27
transformer = IncrementalPCA(n_components=27)
train_pca = transformer.fit_transform(train.iloc[:,:-1])

In [43]:
train_pca_df=pd.DataFrame(train_pca)

# Machine Learning Models

### Split Data

In [44]:
train_pca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,652.873216,307.498018,-12.418427,1.017803,-0.810467,-7.448045,-4.135032,0.733977,-14.169576,-3.136355,...,0.012161,-0.374496,1.264482,0.633252,-0.258036,-0.633671,0.206105,-2.049039,-0.04936,-1.205019
1,303.724847,-32.221681,-20.177674,4.396501,-0.274523,-5.930358,2.778015,-0.141011,-8.607245,8.389138,...,-0.005056,0.05527,-1.62342,-1.229234,-1.521698,-1.031182,0.270474,2.983509,-2.316054,-0.205647
2,-1012.458015,-13.772412,-28.045417,5.482018,0.023204,-5.895226,-2.298894,0.39468,-10.073151,-5.557609,...,-0.025895,0.082064,-2.227237,-1.677394,-1.737196,-1.84898,0.265594,-3.166713,-0.518928,-0.697512
3,-397.415252,285.560808,-20.122561,2.977111,-0.427864,-9.093937,-1.423652,0.382674,5.067887,12.497593,...,-0.060106,-0.217144,1.024727,0.904255,0.907871,2.365351,-0.064884,0.415232,2.133228,-0.75671
4,-1012.523641,-13.803814,-20.470712,3.132295,-0.041359,10.968365,-8.850462,0.201472,1.054519,12.628264,...,-0.117518,-0.270248,2.159601,1.206614,0.027384,-0.216611,-0.077358,1.248184,-0.256863,-0.192523


In [45]:
# 30% for testing, 70% for training
x_train, x_test, y_train, y_test = train_test_split(train_pca_df, train['target'], test_size=0.3, random_state=0, stratify=y)
                                                                        

In [50]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((321239, 27), (137674, 27), (321239,), (137674,))

### Lightgbm

In [51]:
param_lgbm = {
            'metric': "binary_logloss",
            'boosting_type': "dart",
            'n_estimators':1000,
            'verbosity': -1,
            'lambda_l1': 3.1412416493672213e-06,
             'lambda_l2': 1.919550703890871,
             'num_leaves': 53,
             'feature_fraction': 0.8361911823947347,
             'bagging_fraction': 0.5246353885003125,
             'bagging_freq': 7,
             'min_child_samples': 91}

lgbm =LGBMClassifier(**param_lgbm).fit(x_train, y_train)

In [52]:
#prediction
prdeictions_lgbm = lgbm.predict_proba(x_test)
preds_lgbm = pd.DataFrame(prdeictions_lgbm)
pred_final_lgbm = np.array(preds_lgbm[1])
pred_final_lgbm

array([0.00149071, 0.00315772, 0.00201955, ..., 0.01090927, 0.04437709,
       0.78643622])

### xgbm

In [53]:
xgb_parms ={
    'booster': 'dart',
     'n_jobs':4,
     'n_estimators':500,
    'lambda': 4.091409953463271e-08,
    'alpha': 3.6353429991712695e-08,
    'subsample': 0.6423675532438815,
    'colsample_bytree': 0.7830450413657872,
    'max_depth': 9,
    'min_child_weight': 5,
    'eta': 0.3749337530972536,
    'gamma': 0.0745370910451703,
    'grow_policy': 'depthwise',
    'sample_type': 'uniform',
    'normalize_type': 'tree',
    'rate_drop': 0.0723975209176045,
    'skip_drop': 0.9026367296518939}

xgbm = XGBClassifier(**xgb_parms)
xgbm.fit(x_train, y_train)  



In [54]:
#prediction
prdeictions_xgbm = xgbm.predict_proba(x_test)
preds_xgbm = pd.DataFrame(prdeictions_xgbm)
pred_final_xgbm = np.array(preds_xgbm[1])
pred_final_xgbm

array([2.5311316e-04, 6.3488398e-05, 4.3588417e-04, ..., 9.6202539e-03,
       2.6996108e-03, 8.5414588e-01], dtype=float32)

### Catboost

In [56]:
Params_cat={ 
    'objective': 'CrossEntropy',
    'n_estimators':1000,
    'colsample_bylevel': 0.07868805912943484,
    'depth': 9,
    'boosting_type': 'Plain',
    'bootstrap_type': 'MVS',
    }

cbm =CatBoostClassifier(**Params_cat).fit(x_train, y_train)

0:	learn: 0.6819678	total: 77.2ms	remaining: 1m 17s
1:	learn: 0.6749746	total: 86.1ms	remaining: 42.9s
2:	learn: 0.6555902	total: 110ms	remaining: 36.6s
3:	learn: 0.6307222	total: 133ms	remaining: 33.2s
4:	learn: 0.6083639	total: 156ms	remaining: 31.1s
5:	learn: 0.6011658	total: 167ms	remaining: 27.6s
6:	learn: 0.5900201	total: 179ms	remaining: 25.4s
7:	learn: 0.5740742	total: 200ms	remaining: 24.8s
8:	learn: 0.5569249	total: 223ms	remaining: 24.6s
9:	learn: 0.5400625	total: 248ms	remaining: 24.5s
10:	learn: 0.5304341	total: 262ms	remaining: 23.6s
11:	learn: 0.5251195	total: 275ms	remaining: 22.6s
12:	learn: 0.5120639	total: 300ms	remaining: 22.8s
13:	learn: 0.5097468	total: 312ms	remaining: 21.9s
14:	learn: 0.4960095	total: 335ms	remaining: 22s
15:	learn: 0.4893641	total: 353ms	remaining: 21.7s
16:	learn: 0.4875672	total: 363ms	remaining: 21s
17:	learn: 0.4809951	total: 376ms	remaining: 20.5s
18:	learn: 0.4733542	total: 393ms	remaining: 20.3s
19:	learn: 0.4635915	total: 415ms	remainin

In [57]:
#prediction
prdeictions_cbm = cbm.predict_proba(x_test)
preds_cbm = pd.DataFrame(prdeictions_cbm)
pred_final_cbm = np.array(preds_cbm[1])
pred_final_cbm 

array([0.00306359, 0.00590065, 0.00402013, ..., 0.01308868, 0.03674958,
       0.80463141])

### Random Forest

In [60]:
#random forest classifier
forest = RandomForestClassifier(criterion='entropy',random_state=50,n_estimators=100,)
forest.fit(x_train,y_train)

In [61]:
#prediction
predict_y_rf = forest.predict_proba(x_test)
preds_rf = pd.DataFrame(predict_y_rf)
pred_final_rf = np.array(preds_rf[1])
pred_final_rf

array([0.  , 0.01, 0.  , ..., 0.01, 0.03, 0.77])

### Logistic Regression

In [62]:
logreg = LogisticRegression()
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
logreg.fit(x_train,y_train)

In [63]:
#prediction
predict_y_logistic=logreg.predict_proba(x_test)
preds_logistic = pd.DataFrame(predict_y_logistic)
pred_final_logistic = np.array(preds_logistic[1])
pred_final_logistic

array([0.0536075 , 0.10505094, 0.10627271, ..., 0.13032637, 0.29519976,
       0.78970713])

### Evluation

In [73]:
y_true=y_test.reset_index()
y_true
#pred_final_lgbm,pred_final_xgbm,pred_final_cbm,pred_final_rf,pred_final_logistic

Unnamed: 0,customer_ID,target
0,1438258030537934421,0
1,7862704852926137339,0
2,14820336679247132372,0
3,16399199556160497916,1
4,13982988533756619543,0
...,...,...
137669,13311305499010140177,0
137670,17811584250307115404,0
137671,15119483344209594545,0
137672,5084016308273490973,0


In [85]:
#pred_final_lgbm,pred_final_xgbm,pred_final_cbm,pred_final_rf,pred_final_logistic
pred_lgbm_df=pd.DataFrame(pred_final_lgbm)
pred_lgbm_df=pd.DataFrame(pred_final_lgbm)
pred_lgbm_df=pd.DataFrame(pred_final_lgbm)
pred_lgbm_df=pd.DataFrame(pred_final_lgbm)

Unnamed: 0,0
0,0.001491
1,0.003158
2,0.00202
3,0.13
4,0.12166


In [97]:
pred_lgbm_df=pd.DataFrame(pred_final_lgbm)
pred_xgbm_df=pd.DataFrame(pred_final_xgbm)
pred_cbm_df=pd.DataFrame(pred_final_cbm)
pred_rf_df=pd.DataFrame(pred_final_rf)
pred_logistic_df=pd.DataFrame(pred_final_logistic)
pred_lgbm_df.columns=['prediction']
pred_xgbm_df.columns=['prediction']
pred_cbm_df.columns=['prediction']
pred_rf_df.columns=['prediction']
pred_logistic_df.columns=['prediction']

In [98]:
print('lgbm:',amex_metric(y_true,pred_lgbm_df))
print('xgbm:',amex_metric(y_true,pred_xgbm_df))
print('cbm:',amex_metric(y_true,pred_cbm_df))
print('rf:',amex_metric(y_true,pred_rf_df))
print('logistic:',amex_metric(y_true,pred_logistic_df))

lgbm: 0.7192125156065636
xgbm: 0.6711487964956283
cbm: 0.7085156413873195
rf: 0.6959572093431239
logistic: 0.6426848643280002
