# Description

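This notebook trains several models (logistic regression, random forest, ridge regression, LightGBM, and a Keras neural network) on the pre-selected features and writes one submission file per model.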

In [1]:
import numpy as np
import pandas as pd

import os
import warnings
warnings.filterwarnings('ignore')

import gc

In [2]:
# Directly load selected dataframe
train = pd.read_csv('../input/home-credit-selected/train.csv')
test = pd.read_csv('../input/home-credit-selected/test.csv')

# Submission dataframe.
# Take an explicit copy: a TARGET column is assigned to `submit` later on, and
# that write should land on an independent frame rather than on a selection of
# `test` (which triggers pandas' SettingWithCopyWarning).
submit = test[['SK_ID_CURR']].copy()

In [3]:
import sys

def return_size(df):
    """Size of ``df`` in gigabytes (as reported by ``sys.getsizeof``), rounded to 2 decimals."""
    n_bytes = sys.getsizeof(df)
    gigabytes = n_bytes / 1e9
    return round(gigabytes, 2)

def _is_binary(col):
    """Return True when ``col`` holds exactly the two values 0 and 1 (in any order, no missing values)."""
    uniq = col.unique()
    return len(uniq) == 2 and set(uniq) == {0, 1}


def _fits_int32(col):
    """Return True when every value of the integer column ``col`` is representable as int32."""
    if col.empty:
        return True
    limits = np.iinfo(np.int32)
    return bool(col.min() >= limits.min and col.max() <= limits.max)


def convert_types(df, print_info = False):
    """Downcast the columns of ``df`` to cheaper dtypes to reduce memory usage.

    The frame is modified in place and also returned. Each column is handled by
    the first rule that matches:

    * name contains ``'SK_ID'``: missing values become 0, then cast to int32
    * object dtype with repeated values: cast to ``category``
    * exactly the values 0 and 1: cast to bool
    * float64: cast to float32
    * int64 whose values fit in 32 bits: cast to int32

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are overwritten.
    print_info : bool, default False
        When True, print the memory usage before and after conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with converted columns.
    """
    original_memory = df.memory_usage().sum()

    # Iterate through each column
    for c in df:
        col = df[c]

        # Convert ids to integers (non-string labels cannot be id columns)
        if isinstance(c, str) and ('SK_ID' in c):
            df[c] = col.fillna(0).astype(np.int32)

        # Convert objects to category
        elif (col.dtype == 'object') and (col.nunique() < df.shape[0]):
            df[c] = col.astype('category')

        # 0/1 indicator columns become booleans; the check ignores which of
        # the two values happens to appear first in the column
        elif _is_binary(col):
            df[c] = col.astype(bool)

        # Float64 to float32
        elif col.dtype == np.float64:
            df[c] = col.astype(np.float32)

        # Int64 to int32, only when no value would overflow and wrap around
        elif col.dtype == np.int64 and _fits_int32(col):
            df[c] = col.astype(np.int32)

    new_memory = df.memory_usage().sum()

    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')

    return df

In [4]:
# Downcast both frames (train first, then test), reporting memory before/after for each
train, test = (convert_types(frame, print_info=True) for frame in (train, test))

In [5]:
# Keep the labels aside before the TARGET column is removed from the features
train_labels = train['TARGET']

# Report the dimensions of both frames
for caption, frame in (('Training Features shape: ', train),
                       ('Testing Features shape: ', test)):
    print(caption, frame.shape)

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# The label must not be part of the feature matrix
if 'TARGET' in train.columns:
    train = train.drop(columns = ['TARGET'])

# Every remaining training column is treated as a feature
features = train.columns.tolist()

for col in features:
    # Fresh transformers per column: median fill, then rescale to [0, 1]
    imputer = SimpleImputer(strategy = 'median')
    scaler = MinMaxScaler(feature_range = (0, 1))

    # Learn the median on the training column and fill both frames with it
    train[col] = imputer.fit_transform(train[col].values.reshape(-1, 1))
    test[col] = imputer.transform(test[col].values.reshape(-1, 1))

    # Learn the min/max on the (imputed) training column and rescale both frames
    train[col] = scaler.fit_transform(train[col].values.reshape(-1, 1))
    test[col] = scaler.transform(test[col].values.reshape(-1, 1))

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

# Release the last pair of transformers
gc.enable()
del imputer, scaler
gc.collect()

In [7]:
from sklearn.linear_model import LogisticRegression

# Logistic regression; C = 1e-4 is the inverse regularization strength
log_reg = LogisticRegression(C = 1e-4)

# Fit on the preprocessed training features and their labels
log_reg.fit(X = train, y = train_labels)

In [8]:
# Column 1 of predict_proba holds the second class's probability
log_reg_pred = log_reg.predict_proba(test)[:, 1]

# The fitted model is no longer needed; free it
del log_reg
gc.enable()
gc.collect()

# Attach the predictions to the submission frame and preview it
submit = submit.assign(TARGET = log_reg_pred)

submit.head()

In [9]:
# Write the logistic regression submission without the index column
submit.to_csv(path_or_buf = 'log_reg_selected.csv', index = False)

Logistic regression — Private Score: 0.71423, Public Score: 0.70805

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings: 100 trees, fixed seed, progress output, all cores
rf_settings = dict(n_estimators = 100, random_state = 233, verbose = 1, n_jobs = -1)
random_forest = RandomForestClassifier(**rf_settings)

# Fit on the training features, then take column 1 of predict_proba for the test rows
random_forest.fit(train, train_labels)
rf_pred = random_forest.predict_proba(test)[:, 1]

# Free the fitted forest
gc.enable()
del random_forest
gc.collect()

In [11]:
# Replace the TARGET column with the random forest predictions and preview
submit = submit.assign(TARGET = rf_pred)

submit.head()

In [12]:
# Write the random forest submission without the index column
submit.to_csv(path_or_buf = 'rf_selected.csv', index = False)

Random forest — Private Score: 0.70517, Public Score: 0.69213

In [13]:
# Shared splitter for the cross-validated models below:
# 10 shuffled folds with a fixed seed so the splits are repeatable
from sklearn.model_selection import KFold

N_FOLDS = 10
FOLD_SEED = 233
folds = KFold(n_splits=N_FOLDS, shuffle=True, random_state=FOLD_SEED)

In [14]:
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score

# Out-of-fold predictions for the training rows, and the fold-averaged
# predictions for the test rows
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    trn_x, trn_y = train.iloc[trn_idx], train_labels.iloc[trn_idx]
    val_x, val_y = train.iloc[val_idx], train_labels.iloc[val_idx]

    # `normalize` is deliberately not passed: scikit-learn 1.2 removed the
    # parameter (passing it raises TypeError) and False was its default, so
    # the fitted model is the same on older versions.
    clf = Ridge(alpha=20, copy_X=True, fit_intercept=True, solver='auto', max_iter=10000, random_state=0, tol=0.0025)
    clf.fit(trn_x, trn_y)

    # Ridge is a regressor: its raw outputs are used as scores for AUC
    oof_preds[val_idx] = clf.predict(val_x)
    # Each fold contributes an equal share to the test prediction
    sub_preds += clf.predict(test) / folds.n_splits
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()

In [15]:
# Use the averaged ridge outputs as TARGET, replacing negative values with 0
submit = submit.assign(TARGET = np.where(sub_preds < 0, 0, sub_preds))

submit.head()

In [16]:
# Write the ridge submission without the index column
submit.to_csv(path_or_buf = 'ridge_selected.csv', index = False)

Ridge regression — Private Score: 0.75925, Public Score: 0.76517

In [17]:
# Build Lightgbm
import lightgbm as lgb

In [18]:
import inspect
import re

# Strip every character outside [A-Za-z0-9_] from the column names of both frames
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

from sklearn.metrics import roc_auc_score

# LightGBM 4.0 removed the `early_stopping_rounds` / `verbose` keywords from
# fit() in favour of callbacks. Use the keywords when the installed version
# still accepts them (unchanged behaviour there), otherwise the callbacks.
legacy_fit_api = 'early_stopping_rounds' in inspect.signature(lgb.LGBMClassifier.fit).parameters

# Out-of-fold predictions for the training rows, and the fold-averaged
# predictions for the test rows
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    trn_x, trn_y = train.iloc[trn_idx], train_labels.iloc[trn_idx]
    val_x, val_y = train.iloc[val_idx], train_labels.iloc[val_idx]

    # Create Lightgbm model
    model = lgb.LGBMClassifier(n_estimators=10000, objective='binary',
                               class_weight='balanced', learning_rate=0.05,
                               reg_alpha=0.1, reg_lambda=0.1,
                               subsample=0.8, n_jobs=-1, random_state=233)

    # Stop after 100 rounds without improvement; log every 200 rounds.
    # Callbacks are created per fold so no state carries over between folds.
    if legacy_fit_api:
        stopping_args = {'early_stopping_rounds': 100, 'verbose': 200}
    else:
        stopping_args = {'callbacks': [lgb.early_stopping(stopping_rounds=100),
                                       lgb.log_evaluation(period=200)]}

    model.fit(trn_x, trn_y, eval_metric='auc', eval_set=[(val_x, val_y), (trn_x, trn_y)],
              eval_names=['valid', 'train'], **stopping_args)

    oof_preds[val_idx] = model.predict_proba(val_x)[:, 1]
    # Each fold contributes an equal share to the test prediction
    sub_preds += model.predict_proba(test)[:, 1] / folds.n_splits
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del model, trn_x, trn_y, val_x, val_y
    gc.collect()

In [19]:
# Replace the TARGET column with the fold-averaged LightGBM predictions and preview
submit = submit.assign(TARGET = sub_preds)

submit.head()

In [20]:
# Write the LightGBM submission without the index column
submit.to_csv(path_or_buf = 'lgb_selected.csv', index = False)

LightGBM — Private Score: 0.77942, Public Score: 0.77878

In [21]:
# Use keras to build the neural network
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization

In [22]:
def _build_network(n_inputs):
    """Assemble and compile the per-fold network: 400-100-20 ReLU layers, each
    followed by batch normalization and 10% dropout, then a sigmoid output unit."""
    net = Sequential()
    for position, width in enumerate((400, 100, 20)):
        # Only the first layer declares the input width
        first_layer_args = {'input_dim': n_inputs} if position == 0 else {}
        net.add(Dense(units=width, kernel_initializer='normal', activation='relu', **first_layer_args))
        net.add(BatchNormalization())
        net.add(Dropout(.1))
    net.add(Dense(units=1, kernel_initializer='normal', activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam')
    return net


# Out-of-fold predictions for the training rows, and the fold-averaged
# predictions for the test rows
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    trn_x, trn_y = train.iloc[trn_idx], train_labels.iloc[trn_idx]
    val_x, val_y = train.iloc[val_idx], train_labels.iloc[val_idx]

    # A new, untrained network for every fold
    nn = _build_network(train.shape[1])
    nn.fit(trn_x, trn_y, epochs=10, verbose=2)

    fold_val_pred = nn.predict(val_x).flatten()
    fold_test_pred = nn.predict(test).flatten()

    oof_preds[val_idx] = fold_val_pred
    # Each fold contributes an equal share to the test prediction
    sub_preds += fold_test_pred / folds.n_splits
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del nn, trn_x, trn_y, val_x, val_y
    gc.collect()

In [23]:
# Replace the TARGET column with the fold-averaged neural network predictions and preview
submit = submit.assign(TARGET = sub_preds)

submit.head()

In [24]:
# Write the neural network submission without the index column
submit.to_csv(path_or_buf = 'nn_selected.csv', index = False)

Neural network — Private Score: 0.76850, Public Score: 0.77016