# Description

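This notebook trains several models (logistic regression, random forest, ridge regression, LightGBM and a Keras neural network) on the PCA features of the Home Credit data and writes one submission file per model.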

In [9]:
import numpy as np
import pandas as pd
import feather

import os
import warnings
warnings.filterwarnings('ignore')

import gc

# Number of principal components used as model features
npc = 611
# Column names of those components: 'pc_0', 'pc_1', ..., one per component
pc_list = list(map('pc_{}'.format, range(npc)))

In [10]:
# Read only the columns that are needed: the selected principal components,
# the applicant id and (for the training set) the label.
train = feather.read_dataframe('../input/home-credit-merged-pca/train.feather', columns=pc_list + ['SK_ID_CURR', 'TARGET'])
test = feather.read_dataframe('../input/home-credit-merged-pca/test.feather', columns=pc_list + ['SK_ID_CURR'])

# Submission dataframe
# An explicit copy makes `submit` independent of `test`, so the TARGET column
# added by the later cells is written to its own frame rather than to a slice
# of `test` (the chained-assignment warning is otherwise only hidden by the
# global warnings filter).
submit = test[['SK_ID_CURR']].copy()

In [11]:
# Keep the labels aside, then remove the id column from both feature frames.
# TARGET is still part of `train` here, so it is counted in the printed shape.
train_labels = train['TARGET']
id_column = ['SK_ID_CURR']
train, test = train.drop(columns=id_column), test.drop(columns=id_column)

for label, frame in (('Training', train), ('Testing', test)):
    print('{} Features shape: '.format(label), frame.shape)

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Make sure the label column is not among the features (the check makes the
# cell safe to run again after TARGET has already been removed)
if 'TARGET' in train:
    train = train.drop(columns=['TARGET'])

# Every remaining column is a model feature
features = list(train.columns)

for feat in features:
    # The scaler is fed 2-D input, so each column is reshaped to (n_rows, 1)
    train_col = train[feat].values.reshape(-1, 1)
    test_col = test[feat].values.reshape(-1, 1)

    # Fit the 0-1 scaling on the training column only, then apply the same
    # transform to both the training and the testing column
    scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_col)
    train[feat] = scaler.transform(train_col)
    test[feat] = scaler.transform(test_col)

for label, frame in (('Training', train), ('Testing', test)):
    print('{} data shape: '.format(label), frame.shape)

# Release the last scaler and the temporary column arrays
gc.enable()
del scaler, train_col, test_col
gc.collect()

In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the regularization parameter C fixed at 1e-4
log_reg = LogisticRegression(C=0.0001)

# Fit on the scaled training features and their labels
log_reg.fit(X=train, y=train_labels)

In [6]:
# Column 1 of predict_proba is used as the predicted TARGET value
log_reg_pred = log_reg.predict_proba(test)[:, 1]
submit['TARGET'] = log_reg_pred

# The fitted model is no longer needed once its predictions are stored
del log_reg
gc.enable()
gc.collect()

# Preview the first rows of the submission
submit.iloc[:5]

In [7]:
# Write the logistic regression submission to disk without the index column
log_reg_csv = 'log_reg_pca.csv'
submit.to_csv(log_reg_csv, index=False)

Private Score: 0.63584, Public Score: 0.64655

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed random_state
rf_params = dict(n_estimators=100, random_state=233, verbose=1, n_jobs=-1)
random_forest = RandomForestClassifier(**rf_params)

# Fit on the training data, then keep column 1 of predict_proba for the test set
random_forest.fit(train, train_labels)
rf_pred = random_forest.predict_proba(test)[:, 1]

# Free the fitted forest once the predictions have been extracted
gc.enable()
del random_forest
gc.collect()

In [None]:
# Store the random forest predictions as the submission's TARGET column
submit['TARGET'] = rf_pred

# Preview the first rows of the submission
submit.iloc[:5]

In [None]:
# Write the random forest submission to disk without the index column
rf_csv = 'rf_pca.csv'
submit.to_csv(rf_csv, index=False)

Private Score: 0.62734, Public Score: 0.61795

In [13]:
# K-fold cross validation
from sklearn.model_selection import KFold
# 10 shuffled folds with a fixed seed; this splitter is shared by the
# cross-validated Ridge, LightGBM and neural network cells
folds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=233,
)

In [14]:
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    trn_x, trn_y = train.iloc[trn_idx], train_labels.iloc[trn_idx]
    val_x, val_y = train.iloc[val_idx], train_labels.iloc[val_idx]
    # `normalize` is deliberately not passed: False was its default, and the
    # argument was deprecated in scikit-learn 1.0 and removed in 1.2, where
    # passing it raises a TypeError. Omitting it gives the same model on
    # every version.
    clf = Ridge(alpha=20, copy_X=True, fit_intercept=True, solver='auto', max_iter=10000, random_state=0, tol=0.0025)
    clf.fit(trn_x, trn_y)

    # Ridge is a regressor: predict() returns raw scores, not probabilities
    oof_preds[val_idx] = clf.predict(val_x)
    # Each fold contributes an equal share to the averaged test prediction
    sub_preds += clf.predict(test) / folds.n_splits
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    # Drop the per-fold objects before the next iteration to limit memory use
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()

In [15]:
# Store the averaged Ridge predictions, then clamp them into [0, 1]:
# values below 0 become 0 and values above 1 become 1
submit['TARGET'] = sub_preds
below_zero, above_one = submit['TARGET'] < 0, submit['TARGET'] > 1
submit.loc[below_zero, 'TARGET'] = 0
submit.loc[above_one, 'TARGET'] = 1

# Preview the first rows of the submission
submit.iloc[:5]

In [16]:
# Write the Ridge submission to disk without the index column
ridge_csv = 'ridge_pca.csv'
submit.to_csv(ridge_csv, index=False)

Private Score: 0.75292, Public Score: 0.75754

In [None]:
# Build Lightgbm
import lightgbm as lgb

In [None]:
import re
# Keep only letters, digits and underscores in the column names
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

from sklearn.metrics import roc_auc_score

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` arguments of
# fit() in favour of callbacks. Use the callbacks when the installed version
# provides them and fall back to the legacy arguments otherwise, so the cell
# runs on both old and new releases.
use_callbacks = hasattr(lgb, 'early_stopping') and hasattr(lgb, 'log_evaluation')

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    trn_x, trn_y = train.iloc[trn_idx], train_labels.iloc[trn_idx]
    val_x, val_y = train.iloc[val_idx], train_labels.iloc[val_idx]
    
    # Create Lightgbm model
    # NOTE(review): LightGBM appears to apply `subsample` only when
    # `subsample_freq` is > 0 - confirm whether bagging was intended here.
    model = lgb.LGBMClassifier(n_estimators=10000, objective='binary', 
                               class_weight='balanced', learning_rate=0.05, 
                               reg_alpha=0.1, reg_lambda=0.1, 
                               subsample=0.8, n_jobs=-1, random_state=233)
    
    if use_callbacks:
        # Callbacks are created anew for every fold because the early-stopping
        # callback keeps internal state between iterations
        stop_args = {'callbacks': [lgb.early_stopping(100), lgb.log_evaluation(200)]}
    else:
        stop_args = {'early_stopping_rounds': 100, 'verbose': 200}
    
    # Stop after 100 rounds without improvement; log the metric every 200 rounds
    model.fit(trn_x, trn_y, eval_metric='auc', eval_set=[(val_x, val_y), (trn_x, trn_y)],
              eval_names=['valid', 'train'], **stop_args)
    
    oof_preds[val_idx] = model.predict_proba(val_x)[:, 1]
    # Each fold contributes an equal share to the averaged test prediction
    sub_preds += model.predict_proba(test)[:, 1] / folds.n_splits
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    # Drop the per-fold objects before the next iteration to limit memory use
    del model, trn_x, trn_y, val_x, val_y
    gc.collect()

In [None]:
submit['TARGET'] = sub_preds

submit.head()

In [None]:
# Save the submission to a csv file
submit.to_csv('lgb_pca.csv', index = False)

Private Score: 0.76954, Public Score: 0.77358

In [None]:
# Use keras to build the neural network
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization

In [None]:
def _build_network(n_inputs):
    """Build and compile the dense network trained in every fold.

    Three ReLU hidden layers (400, 100 and 20 units), each followed by batch
    normalization and 10% dropout, feed a single sigmoid output unit. The
    model is compiled with binary cross-entropy loss and the Adam optimizer.

    Parameters
    ----------
    n_inputs : int
        Number of input features (the `input_dim` of the first layer).

    Returns
    -------
    The compiled Sequential model.
    """
    net = Sequential()
    # Only the first layer declares the input size
    net.add(Dense(units=400, kernel_initializer='normal', activation='relu', input_dim=n_inputs))
    net.add(BatchNormalization())
    net.add(Dropout(.1))
    for width in (100, 20):
        net.add(Dense(units=width, kernel_initializer='normal', activation='relu'))
        net.add(BatchNormalization())
        net.add(Dropout(.1))
    net.add(Dense(units=1, kernel_initializer='normal', activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam')
    return net


# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    trn_x, val_x = train.iloc[trn_idx], train.iloc[val_idx]
    trn_y, val_y = train_labels.iloc[trn_idx], train_labels.iloc[val_idx]

    # A fresh network is built and trained for 10 epochs in every fold
    nn = _build_network(train.shape[1])
    nn.fit(trn_x, trn_y, epochs=10, verbose=2)

    # predict() output is flattened to one value per row before it is stored
    fold_val_pred = nn.predict(val_x).flatten()
    oof_preds[val_idx] = fold_val_pred
    sub_preds += nn.predict(test).flatten() / folds.n_splits

    fold_auc = roc_auc_score(val_y, oof_preds[val_idx])
    print('Fold %2d AUC : %.6f' % (n_fold + 1, fold_auc))

    # Drop the per-fold objects before the next iteration to limit memory use
    del nn, trn_x, trn_y, val_x, val_y
    gc.collect()

In [None]:
# Store the fold-averaged neural network predictions as the TARGET column
submit['TARGET'] = sub_preds

# Preview the first rows of the submission
submit.iloc[:5]

In [None]:
# Write the neural network submission to disk without the index column
nn_csv = 'nn_pca.csv'
submit.to_csv(nn_csv, index=False)

Private Score: 0.76720, Public Score: 0.76852