# Description

This notebook merges the Home Credit tables into a single feature table and then applies feature selection to the merged table.

In [1]:
import numpy as np
import pandas as pd

import os
import warnings
warnings.filterwarnings('ignore')

import gc

In [2]:
# Show the entries of the ../input/ directory that the later cells read from
print(os.listdir("../input/"))

In [3]:
def agg_numeric(df, parent_var, df_name):
    """
    Groups and aggregates the numeric values in a child dataframe
    by the parent variable.
    
    Parameters
    --------
        df (dataframe): 
            the child dataframe to calculate the statistics on
        parent_var (string): 
            the parent variable used for grouping and aggregating
        df_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the count, mean, max, min and sum of every numeric
            column, aggregated by `parent_var`. Each observation of the parent
            variable has one row, with the parent variable as the index.
            Columns are named `<df_name>_<column>_<stat>`. When several columns
            hold identical values only one of them is kept, and the remaining
            columns come back in the order produced by `np.unique`.
    
    """
    
    # Remove id variables other than grouping variable
    id_columns = [col for col in df if col != parent_var and 'SK_ID' in col]
    df = df.drop(columns = id_columns)
            
    # Only want the numeric variables (the grouping column is re-attached
    # explicitly in case it is not numeric itself)
    parent_ids = df[parent_var].copy()
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[parent_var] = parent_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Build the flat names from the actual (variable, stat) column tuples.
    # Iterating `agg.columns.levels` instead is not safe: the levels are not
    # guaranteed to be in the same order as the columns, which mislabels stats.
    agg.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in agg.columns]
    
    # Remove the columns with all redundant values
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

In [4]:
def agg_categorical(df, parent_var, df_name):
    """
    Aggregates the categorical features in a child dataframe
    for each observation of the parent variable.
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    parent_var : string
        The variable by which to group and aggregate the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with the sum, count and mean of every one-hot encoded category,
        one row per value of `parent_var` (used as the index). Columns are named
        `<df_name>_<dummy column>_<stat>`; when several columns hold identical
        values only one of them is kept.
        
    """
    
    # One-hot encode the columns of dtype 'category'
    categorical = pd.get_dummies(df.select_dtypes('category'))

    # Make sure to put the identifying id on the column
    categorical[parent_var] = df[parent_var]

    # Groupby the group var and calculate the sum, count and mean
    categorical = categorical.groupby(parent_var).agg(['sum', 'count', 'mean'])
    
    # Name each column from its actual (dummy, stat) tuple. Walking
    # `columns.levels[0]` instead can visit the dummies in a different order
    # than the columns themselves and attach names to the wrong data.
    categorical.columns = ['%s_%s_%s' % (df_name, var, stat) for var, stat in categorical.columns]
    
    # Remove duplicate columns by values
    _, idx = np.unique(categorical, axis = 1, return_index = True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

In [2]:
import sys

def return_size(df):
    """Size of `df` as reported by ``sys.getsizeof``, in gigabytes rounded to 2 decimals."""
    size_in_bytes = sys.getsizeof(df)
    size_in_gb = size_in_bytes / 1e9
    return round(size_in_gb, 2)

def convert_types(df, print_info = False):
    """
    Downcast the columns of a dataframe to smaller dtypes.

    The dataframe is modified in place and also returned. Each column is
    handled by the first rule that matches:

    * name contains 'SK_ID': missing values filled with 0, cast to int32
    * object dtype with fewer unique values than rows: cast to category
    * exactly the two values 0 and 1 (in any order, no missing values): cast to bool
    * float64: cast to float32
    * int64: cast to int32

    Parameters
    --------
    df : dataframe
        The dataframe to convert (mutated in place).
    print_info : bool, default = False
        If True, print the (shallow) memory usage before and after conversion.

    Return
    --------
    df : dataframe
        The same dataframe object with converted columns.
    """
    
    original_memory = df.memory_usage().sum()
    
    # Iterate through each column
    for c in df:
        
        # Convert ids to integers
        if ('SK_ID' in c):
            df[c] = df[c].fillna(0).astype(np.int32)
            continue
            
        # Convert objects to category
        if (df[c].dtype == 'object') and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')
            continue
        
        # 0/1 flags mapped to booleans. Comparing as a set makes the check
        # independent of which value happens to appear first in the column;
        # a NaN shows up as a third unique value, so columns with missing
        # values are never turned into booleans.
        uniques = df[c].unique()
        if len(uniques) == 2 and set(uniques) == {0, 1}:
            df[c] = df[c].astype(bool)
        
        # Float64 to float32
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)
            
        # Int64 to int32
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)
        
    new_memory = df.memory_usage().sum()
    
    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')
        
    return df

In [6]:
def aggregate_client(df, group_vars, df_names):
    """Aggregate a dataframe with data at the loan level 
    at the client level
    
    The data is first aggregated per loan (`group_vars[0]`); the per-loan
    statistics are then aggregated again per client (`group_vars[1]`).
    
    Args:
        df (dataframe): data at the loan level
        group_vars (list of two strings): grouping variables for the loan 
        and then the client (example ['SK_ID_PREV', 'SK_ID_CURR'])
        df_names (list of two strings): names to call the resulting columns
        (example ['cash', 'client'])
        
    Returns:
        df_client (dataframe): aggregated numeric stats at the client level. 
        Each client will have a single row with all the numeric data aggregated
    """
    
    loan_var, client_var = group_vars[0], group_vars[1]
    loan_name, client_name = df_names[0], df_names[1]
    
    # Aggregate the numeric columns per loan
    df_by_loan = agg_numeric(df, parent_var=loan_var, df_name=loan_name)
    
    # If there are categorical variables, add their per-loan counts
    if any(df.dtypes == 'category'):
        df_counts = agg_categorical(df, parent_var=loan_var, df_name=loan_name)

        # Merge the numeric and categorical
        df_by_loan = df_counts.merge(df_by_loan, on=loan_var, how='outer')

        del df_counts
        gc.collect()

    # Mapping loan id -> client id with one row per loan. `df` holds many rows
    # per loan, so merging it without de-duplicating would repeat every loan
    # once per raw row and weight the client statistics by that row count.
    loan_to_client = df[[loan_var, client_var]].drop_duplicates()

    # Merge to get the client id in dataframe
    df_by_loan = df_by_loan.merge(loan_to_client, on=loan_var, how='left')

    # Remove the loan id
    df_by_loan = df_by_loan.drop(columns=[loan_var])

    # Aggregate the per-loan stats for each client
    df_by_client = agg_numeric(df_by_loan, parent_var=client_var, df_name=client_name)
        
    # Memory management
    del df, df_by_loan, loan_to_client
    gc.collect()

    return df_by_client

In [7]:
# Read the bureau table and downcast its dtypes in one step, then preview it
bureau = convert_types(
    pd.read_csv('../input/home-credit-default-risk/bureau.csv'),
    print_info=True,
)
bureau.head()

In [8]:
# Numeric statistics of the bureau table per client (loan id removed first)
bureau_agg = agg_numeric(
    df=bureau.drop(columns=['SK_ID_BUREAU']),
    parent_var='SK_ID_CURR',
    df_name='bureau',
)
bureau_agg.head()

In [9]:
# Categorical statistics of the bureau table per client
bureau_counts = agg_categorical(
    df=bureau,
    parent_var='SK_ID_CURR',
    df_name='bureau',
)
bureau_counts.head()

In [10]:
# Read the bureau_balance table and downcast its dtypes, then preview it
bureau_balance = convert_types(
    pd.read_csv('../input/home-credit-default-risk/bureau_balance.csv'),
    print_info=True,
)
bureau_balance.head()

In [11]:
# Categorical statistics of bureau_balance per bureau loan
bureau_balance_counts = agg_categorical(
    df=bureau_balance,
    parent_var='SK_ID_BUREAU',
    df_name='bureau_balance',
)
bureau_balance_counts.head()

In [12]:
# Numeric statistics of bureau_balance per bureau loan
bureau_balance_agg = agg_numeric(
    df=bureau_balance,
    parent_var='SK_ID_BUREAU',
    df_name='bureau_balance',
)
bureau_balance_agg.head()

In [13]:
# Per-loan view: numeric stats joined with the categorical counts
bureau_by_loan = bureau_balance_agg.merge(
    bureau_balance_counts,
    how='outer',
    left_on='SK_ID_BUREAU',
    right_index=True,
)

# Bring in the client id (SK_ID_CURR) that owns each bureau loan
bureau_by_loan = bureau[['SK_ID_BUREAU', 'SK_ID_CURR']].merge(
    bureau_by_loan,
    how='left',
    on='SK_ID_BUREAU',
)

# Roll the per-loan stats up to one row per client
bureau_balance_by_client = agg_numeric(
    df=bureau_by_loan.drop(columns=['SK_ID_BUREAU']),
    parent_var='SK_ID_CURR',
    df_name='client',
)

bureau_balance_by_client.head()

In [14]:
# Load training data
app_train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
# Keep the converted frame bound to app_train (it was previously assigned to
# app_test, which only worked because convert_types mutates its argument in place)
app_train = convert_types(app_train, print_info=True)
print('Training data shape: ', app_train.shape)
app_train.head()

In [15]:
# Read the testing applications and downcast their dtypes
app_test = convert_types(
    pd.read_csv('../input/home-credit-default-risk/application_test.csv'),
    print_info=True,
)
print('Testing data shape: ', app_test.shape)
app_test.head()

In [16]:
from sklearn.preprocessing import LabelEncoder

# A single encoder is re-fitted for each qualifying column
le = LabelEncoder()
le_count = 0

for col in app_train:
    # Only categorical columns are candidates
    if app_train[col].dtype != 'category':
        continue

    # Columns with more than two distinct values are left for one-hot encoding
    if len(app_train[col].unique()) > 2:
        continue

    # Fit on the training data, then apply the same mapping to train and test
    le.fit(app_train[col])
    app_train[col] = le.transform(app_train[col])
    app_test[col] = le.transform(app_test[col])

    # Keep track of how many columns were label encoded
    le_count += 1

print('%d columns were label encoded.' % le_count)

In [17]:
# One-hot encoding of the remaining categorical variables.
# Train and test are encoded independently of each other.
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# The resulting tables (ignoring the target column) can have a different number of
# columns, because some category values occur in only one of the two datasets
print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [18]:
# Left-join onto the training applications, in order: the bureau value counts,
# the bureau numeric stats, and the monthly information grouped by client
train = (
    app_train
    .merge(bureau_counts, on='SK_ID_CURR', how='left')
    .merge(bureau_agg, on='SK_ID_CURR', how='left')
    .merge(bureau_balance_by_client, on='SK_ID_CURR', how='left')
)

print('Training data shape: ', train.shape)

In [19]:
# Left-join onto the testing applications, in order: the bureau value counts,
# the bureau numeric stats, and the monthly information grouped by client
test = (
    app_test
    .merge(bureau_counts, on='SK_ID_CURR', how='left')
    .merge(bureau_agg, on='SK_ID_CURR', how='left')
    .merge(bureau_balance_by_client, on='SK_ID_CURR', how='left')
)

print('Testing data shape: ', test.shape)

In [20]:
# Release every bureau-related frame now that train / test hold the merged features
gc.enable()
del bureau, bureau_counts, bureau_agg, bureau_balance, bureau_by_loan, bureau_balance_by_client
# The per-loan intermediates were only needed to build bureau_by_loan
del bureau_balance_agg, bureau_balance_counts
gc.collect()

In [21]:
# Read the previous_application table and downcast its dtypes, then preview it
previous = convert_types(
    pd.read_csv('../input/home-credit-default-risk/previous_application.csv'),
    print_info=True,
)
previous.head()

In [22]:
# Per-client statistics for every numeric column of the previous applications
previous_agg = agg_numeric(
    df=previous,
    parent_var='SK_ID_CURR',
    df_name='previous',
)
print('Previous aggregation shape: ', previous_agg.shape)
previous_agg.head()

In [23]:
# Per-client statistics for every categorical column of the previous applications
previous_counts = agg_categorical(
    df=previous,
    parent_var='SK_ID_CURR',
    df_name='previous',
)
print('Previous counts shape: ', previous_counts.shape)
previous_counts.head()

In [24]:
# Left-join the previous-application features (counts first, then numeric stats)
train = (
    train
    .merge(previous_counts, on='SK_ID_CURR', how='left')
    .merge(previous_agg, on='SK_ID_CURR', how='left')
)

test = (
    test
    .merge(previous_counts, on='SK_ID_CURR', how='left')
    .merge(previous_agg, on='SK_ID_CURR', how='left')
)

# The source frames are no longer needed once merged
gc.enable()
del previous, previous_agg, previous_counts
gc.collect()

In [25]:
# Read the POS_CASH_balance table and downcast its dtypes, then preview it
cash = convert_types(
    pd.read_csv('../input/home-credit-default-risk/POS_CASH_balance.csv'),
    print_info=True,
)
cash.head()

In [26]:
# Aggregate the cash table per previous loan (SK_ID_PREV), then per client (SK_ID_CURR)
cash_by_client = aggregate_client(cash, group_vars=['SK_ID_PREV', 'SK_ID_CURR'], df_names=['cash', 'client'])
cash_by_client.head()

In [27]:
print('Cash by Client Shape: ', cash_by_client.shape)

# Left-join the client-level cash features onto train and test
train = train.merge(cash_by_client, on='SK_ID_CURR', how='left')
test = test.merge(cash_by_client, on='SK_ID_CURR', how='left')

# Release the cash frames once they have been merged
gc.enable()
del cash, cash_by_client
gc.collect()

In [28]:
# Read the credit_card_balance table and downcast its dtypes, then preview it
credit = convert_types(
    pd.read_csv('../input/home-credit-default-risk/credit_card_balance.csv'),
    print_info=True,
)
credit.head()

In [29]:
# Aggregate the credit card table per previous loan (SK_ID_PREV), then per client (SK_ID_CURR)
credit_by_client = aggregate_client(credit, group_vars=['SK_ID_PREV', 'SK_ID_CURR'], df_names=['credit', 'client'])
credit_by_client.head()

In [30]:
print('Credit by client shape: ', credit_by_client.shape)

# Left-join the client-level credit card features onto train and test
train = train.merge(credit_by_client, on='SK_ID_CURR', how='left')
test = test.merge(credit_by_client, on='SK_ID_CURR', how='left')

# Release the credit frames once they have been merged
gc.enable()
del credit, credit_by_client
gc.collect()

In [31]:
# Read the installments_payments table and downcast its dtypes, then preview it
installments = convert_types(
    pd.read_csv('../input/home-credit-default-risk/installments_payments.csv'),
    print_info=True,
)
installments.head()

In [32]:
# Aggregate the installments table per previous loan (SK_ID_PREV), then per client (SK_ID_CURR)
installments_by_client = aggregate_client(installments, group_vars = ['SK_ID_PREV', 'SK_ID_CURR'], df_names = ['installments', 'client'])
installments_by_client.head()

In [33]:
print('Installments by client shape: ', installments_by_client.shape)

# Left-join the client-level installment features onto train and test
train = train.merge(installments_by_client, on='SK_ID_CURR', how='left')
test = test.merge(installments_by_client, on='SK_ID_CURR', how='left')

# Release the installment frames once they have been merged
gc.enable()
del installments, installments_by_client
gc.collect()

In [34]:
def remove_missing_columns(train, test, threshold = 70):
    """
    Drop the columns that are mostly missing in either dataframe.

    Parameters
    --------
    train : dataframe
        Training data.
    test : dataframe
        Testing data.
    threshold : number, default = 70
        A column is dropped from both dataframes when its percentage of
        missing values is strictly greater than this value in `train` or in `test`.

    Return
    --------
    train, test : dataframes
        Copies of the inputs without the flagged columns. A flagged column
        that exists in only one of the two dataframes is dropped from that
        one and ignored for the other.
    """
    # Percentage of missing values per column (remember to calculate a percent!)
    train_percent = 100 * train.isnull().sum() / len(train)
    test_percent = 100 * test.isnull().sum() / len(test)
    
    # list of missing columns for train and test
    missing_train_columns = list(train_percent.index[train_percent > threshold])
    missing_test_columns = list(test_percent.index[test_percent > threshold])
    
    # Combine the two lists together
    missing_columns = list(set(missing_train_columns + missing_test_columns))
    
    # Print information
    print('There are %d columns with greater than %d%% missing values.' % (len(missing_columns), threshold))
    
    # Drop the missing columns and return. errors='ignore' is needed because
    # the two frames do not have to share the same columns, so a column
    # flagged in one of them may simply not exist in the other.
    train = train.drop(columns = missing_columns, errors = 'ignore')
    test = test.drop(columns = missing_columns, errors = 'ignore')
    
    return train, test

In [35]:
# Drop the mostly-missing columns using the function's default threshold
train, test = remove_missing_columns(train, test)
print('Final Training Shape: ', train.shape)
print('Final Testing Shape: ', test.shape)

In [36]:
# Keep a reference to the target before aligning the two frames
train_labels = train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
train, test = train.align(test, join = 'inner', axis = 1)

# Add the target back in
train['TARGET'] = train_labels

print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

In [37]:
# Save the merged train and test datasets to the working directory (row index not written)
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [3]:
# Directly load the previously saved merged dataframes
train = pd.read_csv('../input/home-credit-merged/train.csv')
test = pd.read_csv('../input/home-credit-merged/test.csv')

In [4]:
# Downcast the dtypes of the merged training data and print the memory usage
train = convert_types(train, print_info=True)

In [5]:
# Downcast the dtypes of the merged testing data and print the memory usage
test = convert_types(test, print_info=True)

In [6]:
# Create the absolute correlation matrix from a fixed-seed sample of 1000 training rows
corr_matrix = train.sample(n=1000, random_state=233).corr().abs()
corr_matrix.head()

In [7]:
# Select upper triangle of correlation matrix
# (the builtin `bool` replaces the `np.bool` alias, which newer NumPy no longer provides)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find the feature columns with correlation greater than 0.95.
# The id and the target are never candidates: later cells still read both.
to_drop = [column for column in upper.columns
           if column not in ('SK_ID_CURR', 'TARGET') and any(upper[column] > 0.95)]

print(len(to_drop), 'columns need to be deleted.')

In [8]:
# Drop the highly correlated columns by label (no need to slice out a
# sub-dataframe just to pass its column names)
train = train.drop(columns=to_drop)
test = test.drop(columns=to_drop)

In [9]:
# Shapes after removing the highly correlated columns
print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

In [10]:
import lightgbm as lgb
import re

# Set the label and the id columns aside before removing them from the features
train_labels = train['TARGET']
train_id = train[['SK_ID_CURR']]
test_id = test[['SK_ID_CURR']]

# Only feature columns remain in train / test
train = train.drop(columns=['SK_ID_CURR', 'TARGET'])
test = test.drop(columns=['SK_ID_CURR'])

# Strip every character that is not a letter, a digit or an underscore from the column names
train = train.rename(columns = lambda name: re.sub('[^A-Za-z0-9_]+', '', name))
test = test.rename(columns = lambda name: re.sub('[^A-Za-z0-9_]+', '', name))

# Accumulator for the feature importances, one slot per feature column
feature_importances = np.zeros(len(train.columns))

In [11]:
# K-fold cross validation: 10 shuffled folds with a fixed seed
from sklearn.model_selection import KFold
folds = KFold(n_splits=10, shuffle=True, random_state=233)

from sklearn.metrics import roc_auc_score

# Out-of-fold predictions for train and fold-averaged predictions for test
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train)):
    # Split features and labels into this fold's training and validation parts
    trn_x, trn_y = train.iloc[trn_idx], train_labels.iloc[trn_idx]
    val_x, val_y = train.iloc[val_idx], train_labels.iloc[val_idx]
    
    # Create Lightgbm model
    # NOTE(review): `subsample` presumably has no effect unless a bagging
    # frequency is also set - verify against the LightGBM documentation.
    model = lgb.LGBMClassifier(n_estimators=10000, objective='binary', 
                               class_weight='balanced', learning_rate=0.05, 
                               reg_alpha=0.1, reg_lambda=0.1, 
                               subsample=0.8, n_jobs=-1, random_state=233)
    
    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keyword arguments
    # appear to have been replaced by callbacks in newer LightGBM releases -
    # confirm against the installed version.
    model.fit(trn_x, trn_y, eval_metric='auc', eval_set=[(val_x, val_y), (trn_x, trn_y)],
              eval_names=['valid', 'train'], early_stopping_rounds=100, verbose=200)
    
    # Positive-class probability for the held-out rows
    oof_preds[val_idx] = model.predict_proba(val_x)[:, 1]
    # Each fold contributes 1 / n_splits of the final test prediction
    sub_preds += model.predict_proba(test)[:, 1] / folds.n_splits
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    # Importances are summed here; they are averaged after the loop
    feature_importances += model.feature_importances_
    del model, trn_x, trn_y, val_x, val_y
    gc.collect()

In [12]:
# Average the summed importances over the number of folds actually used
# (taken from the KFold object instead of a hard-coded 10)
feature_importances = feature_importances / folds.n_splits
feature_importances = pd.DataFrame({'feature': list(train.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)

feature_importances.head()

In [13]:
# Names of the features whose averaged importance is exactly zero
zero_features = feature_importances.loc[feature_importances['importance'] == 0.0, 'feature'].tolist()
print('There are %d features with 0.0 importance' % len(zero_features))
feature_importances.tail()

In [14]:
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 22
import seaborn as sns

def plot_feature_importances(df, threshold = 0.9, n_top = 5):
    """
    Plots the `n_top` most important features and the cumulative importance of features.
    Prints the number of features needed to reach threshold cumulative importance.
    
    Parameters
    --------
    df : dataframe
        Dataframe of feature importances. Columns must be feature and importance
    threshold : float, default = 0.9
        Threshold for printing information about cumulative importances
    n_top : int, default = 5
        Number of features shown in the bar chart
        
    Return
    --------
    df : dataframe
        Dataframe ordered by feature importances with a normalized column (sums to 1)
        and a cumulative importance column
    
    """
    
    # Note: this changes the global matplotlib font size
    plt.rcParams['font.size'] = 18
    
    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()
    
    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    df['cumulative_importance'] = np.cumsum(df['importance_normalized'])

    # Rows shown in the bar chart and their y positions.
    # The positions are reversed so that the most important feature is on top.
    top = df.head(n_top)
    positions = list(reversed(list(top.index)))

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(positions, 
            top['importance_normalized'], 
            align = 'center', edgecolor = 'k')
    
    # Set the yticks and labels
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])
    
    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()
    
    # Cumulative importance plot
    plt.figure(figsize = (8, 6))
    plt.plot(list(range(len(df))), df['cumulative_importance'], 'r-')
    plt.xlabel('Number of Features'); plt.ylabel('Cumulative Importance'); 
    plt.title('Cumulative Feature Importance');
    plt.show();
    
    # Positions whose cumulative importance exceeds the threshold. This can be
    # empty (for example with threshold >= 1), in which case there is no
    # feature count to report.
    reached = np.where(df['cumulative_importance'] > threshold)[0]
    if len(reached) > 0:
        print('%d features required for %0.2f of cumulative importance' % (reached[0] + 1, threshold))
    else:
        print('Cumulative importance never exceeds %0.2f' % threshold)
    
    return df

In [15]:
# Plot the importances and keep the table with normalized / cumulative columns that is returned
norm_feature_importances = plot_feature_importances(feature_importances)

In [16]:
# Remove the features whose importance was exactly zero
train = train.drop(columns = zero_features)
test = test.drop(columns = zero_features)
# Put the client id column back (it was removed before model training)
train['SK_ID_CURR'] = train_id['SK_ID_CURR']
test['SK_ID_CURR'] = test_id['SK_ID_CURR']

print('Training shape: ', train.shape)
print('Testing shape: ', test.shape)

In [17]:
# Put the target column back before saving
train['TARGET'] = train_labels

# Save the selected train and test datasets to the working directory (row index not written)
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)