# Introduction
This notebook focuse on using auto features to train model

In [17]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
%matplotlib inline
import seaborn as sns
import gc

RSEED = 50

In [18]:
# read in data and check basic info
def read_check_data(file_path):
    data = pd.read_csv(file_path)
    #print('Training data shape: ', data.shape)
    #print(data.head())
    return data

In [19]:
# read in train and test file and combine them
app_train = read_check_data("./data/application_train.csv")
app_test = read_check_data("./data/application_test.csv")
app = app_train.append(app_test,sort=False)
del app_train, app_test
app.tail()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
48739,456221,,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,...,0,0,0,0,,,,,,
48741,456223,,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
48743,456250,,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0


In [20]:
def label_encoder(df, categorical_columns=None):
    """Encode categorical values as integers (0,1,2,3...) with pandas.factorize. """
    # if categorical_colunms are not given than treat object as categorical features
    if not categorical_columns:
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    for col in categorical_columns:
        df[col], uniques = pd.factorize(df[col])
    return df, categorical_columns

In [21]:
# remove 4 people code_gender value 'XNA'
app = app[app['CODE_GENDER'] != 'XNA']  # 4 people with XNA code gender
app['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
app['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)
# change all categorical feature to numerical 
app_clean, categorical_columns = label_encoder(app, categorical_columns=None)

In [22]:
def agg_numeric(df_child, parent_var, df_col_name):
    """
    Groups and aggregates the numeric values in a child dataframe
    by the parent variable.
    
    Parameters
    --------
        df_child (dataframe): 
            the child dataframe to calculate the statistics on
        parent_var (string): 
            the parent variable used for grouping and aggregating
        df_col_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated by the `parent_var` for 
            all numeric columns. The aggregate function are 'count', 'mean', 'max', 'min', 'sum'
            Each observation of the parent variable will have 
            one row in the dataframe with the parent variable as the index. 
            The columns are also renamed using the `df_col_name`. Columns with all duplicate
            values are removed. 
    
    """
    
    # Remove id variables other than grouping variable
    # e.g. SK_ID_BUREAU
    for col in df_child:
        if col != parent_var and 'SK_ID' in col:
            df_child = df_child.drop(columns = col)
            
    # Only want the numeric variables
    parent_ids = df_child[parent_var].copy()
    numeric_df = df_child.select_dtypes('number').copy()
    numeric_df[parent_var] = parent_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Need to create new column names
    columns = []

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        if var != parent_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_col_name, var, stat))
    
    agg.columns = columns
    
    # Remove the columns with all redundant values
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

In [23]:
def agg_categorical(df, parent_var, df_name):
    """
    Aggregates the categorical features in a child dataframe
    for each observation of the parent variable.
    
    Parameters
    --------
    df : dataframe 
        The dataframe to calculate the value counts for.
        
    parent_var : string
        The variable by which to group and aggregate the dataframe. For each unique
        value of this variable, the final dataframe will have one row
        
    df_name : string
        Variable added to the front of column names to keep track of columns

    
    Return
    --------
    categorical : dataframe
        A dataframe with aggregated statistics for each observation of the parent_var
        The columns are also renamed and columns with duplicate values are removed.
        
    """
    
    # Select the categorical columns
    categorical = pd.get_dummies(df.select_dtypes('object'))

    # Make sure to put the identifying id on the column
    categorical[parent_var] = df[parent_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(parent_var).agg(['sum', 'count', 'mean'])
    
    column_names = []
    
    # Iterate through the columns in level 0
    for var in categorical.columns.levels[0]:
        # Iterate through the stats in level 1
        for stat in ['sum', 'count', 'mean']:
            # Make a new column name
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    
    categorical.columns = column_names
    
    # Remove duplicate columns by values
    _, idx = np.unique(categorical, axis = 1, return_index = True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

In [24]:
# memory management
def agg_grandchild(df, parent_df, parent_var, grandparent_var, df_name):
    """
    Aggregate a grandchild dataframe at the grandparent level.
    
    Parameters
    --------
        df : dataframe
            Data with each row representing one observation
            
        parent_df : dataframe
            Parent table of df that must have the parent_var and 
            the grandparent_var. Used only to get the grandparent_var into
            the dataframe after aggregations
            
        parent_var : string
            Variable representing each unique observation in the parent.
            For example, `SK_ID_BUREAU` or `SK_ID_PREV`
            
        grandparent_var : string
            Variable representing each unique observation in the grandparent.
            For example, `SK_ID_CURR`. 
            
        df_name : string
            String for renaming the resulting columns.
            The columns are name with the `df_name` and with the 
            statistic calculated in the column
    
    Return
    --------
        df_info : dataframe
            A dataframe with one row for each observation of the grandparent variable.
            The grandparent variable forms the index, and the resulting dataframe
            can be merged with the grandparent to be used for training/testing. 
            Columns with all duplicate values are removed from the dataframe before returning.
    
    """
    
    # set the parent_var as the index of the parent_df for faster merges
    parent_df = parent_df[[parent_var, grandparent_var]].copy().set_index(parent_var)
    
    # Aggregate the numeric variables at the parent level
    df_agg = agg_numeric(df, parent_var, '%s_LOAN' % df_name)
    
    # Merge to get the grandparent variable in the data
    df_agg = df_agg.merge(parent_df, 
                          on = parent_var, how = 'left')
    
    # Aggregate the numeric variables at the grandparent level
    df_agg_client = agg_numeric(df_agg, grandparent_var, '%s_CLIENT' % df_name)
    
    # Can only apply one-hot encoding to categorical variables
    if any(df.dtypes == 'object'):
    
        # Aggregate the categorical variables at the parent level
        df_agg_cat = agg_categorical(df, parent_var, '%s_LOAN' % df_name)
        df_agg_cat = df_agg_cat.merge(parent_df,
                                      on = parent_var, how = 'left')

        # Aggregate the categorical variables at the grandparent level
        df_agg_cat_client = agg_numeric(df_agg_cat, grandparent_var, '%s_CLIENT' % df_name)
        df_info = df_agg_client.merge(df_agg_cat_client, on = grandparent_var, how = 'outer')
        
        gc.enable()
        del df_agg, df_agg_client, df_agg_cat, df_agg_cat_client
        gc.collect()
    
    # If there are no categorical variables, then we only need the numeric aggregations
    else:
        df_info = df_agg_client.copy()
    
        gc.enable()
        del df_agg, df_agg_client
        gc.collect()
    
    # Drop the columns with all duplicated values
    _, idx = np.unique(df_info, axis = 1, return_index=True)
    df_info = df_info.iloc[:, idx]
    
    return df_info

In [25]:
def cross_validate(train):
    """Compute cross validation ROC AUC of a gradient boosting model for a given training dataset"""
    
    # Extract the labels
    train_labels = np.array(train['TARGET'].astype(np.int32)).reshape((-1, ))
    train = train.drop(columns = ['TARGET', 'SK_ID_CURR'])

    # Create a  lgb training set
    train_set = lgb.Dataset(train, label = train_labels)

    # Find default hyperparameters
    model = lgb.LGBMClassifier()
    params = model.get_params()

    # Number of estimators will be selected through early stopping
    del params['n_estimators'], params['silent']

    # Early stoppping with 5 fold cross validation
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, metrics = 'auc', 
                        early_stopping_rounds = 100, seed = RSEED, nfold = 5)

    print('Cross Validation ROC AUC: {:.5f} with std: {:.5f}.'.format(cv_results['auc-mean'][-1],
                                                                               cv_results['auc-stdv'][-1]))

    print('Number of estimators trained: {}'.format(len(cv_results['auc-mean'])))
    
    return cv_results

In [26]:
def get_model(cv_results, train):
     # Extract the labels
    train_labels = np.array(train['TARGET'].astype(np.int32)).reshape((-1, ))
    train = train.drop(columns = ['TARGET', 'SK_ID_CURR'])
    
    # Make model with optimal number of estimators and train on training data
    model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), random_state=RSEED)
    model.fit(train, train_labels)
    
    return model

In [27]:
def get_feature_importance(model,train):
    fi = pd.DataFrame({'feature': train.drop(columns = ['TARGET', 'SK_ID_CURR']).columns, 
                   'importance': model.feature_importances_})
    fi = fi.sort_values('importance', ascending = False)
    return fi

In [28]:
def make_submission(model, test):
    """Make a submission dataframe for the Kaggle competition for a given dataset."""
    
    # Extract the labels
    test_ids = list(test['SK_ID_CURR'])
    test = test.drop(columns = ['TARGET','SK_ID_CURR'])
     
    # Make predictions on the testing data
    preds = model.predict_proba(test)[:, 1]
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 
                                'TARGET': preds})
    
    return submission

In [37]:
def get_submission(features,sub_filename):
    features_train_temp = features[features['TARGET'].notnull()]
    features_test_temp = features[features['TARGET'].isnull()]
    del features
    cv_results_temp = cross_validate(features_train_temp)
    model_temp = get_model(cv_results_temp, features_train_temp)
    
    fi_temp = get_feature_importance(model_temp,features_train_temp)
    fi_unimportant = fi_temp[fi_temp['importance'] < 5]
    
    features_train = features_train_temp.drop(np.array(fi_unimportant['feature']), axis=1)
    features_test = features_test_temp.drop(np.array(fi_unimportant['feature']), axis=1)
    del features_train_temp
    del features_test_temp
    
    cv_results = cross_validate(features_train)
    model = get_model(cv_results, features_train)
    submission = make_submission(model, features_test)
    submission.to_csv(sub_filename, index = False)

In [30]:
def add_ratios_features(df):
    df['BUREAU_INCOME_CREDIT_RATIO'] = df['BUREAU_AMT_CREDIT_SUM_mean'] / df['AMT_INCOME_TOTAL']
    

In [31]:
def get_bureau_feat(df):
    #add hand-design features for bureau data
    df['CREDIT_DURATION'] = -df['DAYS_CREDIT'] + df['DAYS_CREDIT_ENDDATE']
    df['ENDDATE_DIF'] = df['DAYS_CREDIT_ENDDATE'] - df['DAYS_ENDDATE_FACT']
    # Credit to debt ratio and difference
    df['DEBT_PERCENTAGE'] = df['AMT_CREDIT_SUM'] / df['AMT_CREDIT_SUM_DEBT']
    df['DEBT_CREDIT_DIFF'] = df['AMT_CREDIT_SUM'] - df['AMT_CREDIT_SUM_DEBT']
    df['CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT_SUM'] / df['AMT_ANNUITY']
    return df


In [32]:
def get_previous_feat(df):
    # Feature engineering: ratios and difference
    df['APPLICATION_CREDIT_DIFF'] = df['AMT_APPLICATION'] - df['AMT_CREDIT']
    df['APPLICATION_CREDIT_RATIO'] = df['AMT_APPLICATION'] / df['AMT_CREDIT']
    df['CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT']/df['AMT_ANNUITY']
    df['DOWN_PAYMENT_TO_CREDIT'] = df['AMT_DOWN_PAYMENT'] / df['AMT_CREDIT']
    return df

In [33]:
# read in data and make auto features
bureau = read_check_data("./data/bureau.csv")
bureau = get_bureau_feat(bureau)
bureau_agg = agg_numeric(bureau, 'SK_ID_CURR', 'BUREAU')
bureau_categorical = agg_categorical(bureau, 'SK_ID_CURR', 'BUREAU')

previous_application = read_check_data("./data/previous_application.csv")
previous_application = get_previous_feat(previous_application)
previous_agg = agg_numeric(previous_application, 'SK_ID_CURR', 'previous')
previous_categorical = agg_categorical(previous_application, 'SK_ID_CURR', 'PREVIOUS')

bureau_balance = read_check_data("./data/bureau_balance.csv")
bureau_balance_results = agg_grandchild(bureau_balance, bureau, 
                                        'SK_ID_BUREAU', 'SK_ID_CURR', 'BB')
del bureau_balance

credit_card = read_check_data('./data/credit_card_balance.csv')
credit_card_info = agg_grandchild(credit_card, previous_application,
                                  'SK_ID_PREV', 'SK_ID_CURR', 'CC')
del credit_card

cash = read_check_data('./data/POS_CASH_balance.csv')
cash_info = agg_grandchild(cash, previous_application, 
                           'SK_ID_PREV', 'SK_ID_CURR', 'CASH')
del cash

installments = read_check_data('./data/installments_payments.csv')
installments_info = agg_grandchild(installments, previous_application,
                                   'SK_ID_PREV', 'SK_ID_CURR', 'IN')
del installments

In [34]:
# Merge features
app = pd.merge(app_clean,bureau_agg,on='SK_ID_CURR',how='left')
del bureau_agg,app_clean
app = pd.merge(app,bureau_categorical,on='SK_ID_CURR',how='left')
del bureau_categorical
app = pd.merge(app,previous_agg,on='SK_ID_CURR',how='left')
del previous_agg
app = pd.merge(app,previous_categorical,on='SK_ID_CURR',how='left')
del previous_categorical
app = pd.merge(app,bureau_balance_results,on='SK_ID_CURR',how='left')
del bureau_balance_results
app = pd.merge(app,credit_card_info,on='SK_ID_CURR',how='left')
del credit_card_info
app = pd.merge(app,cash_info,on='SK_ID_CURR',how='left')
del cash_info
app = pd.merge(app,installments_info,on='SK_ID_CURR',how='left')
del installments_info

print(app.shape)

(356251, 1340)


In [38]:
get_submission(app,'./data/submission_nighth.csv')

Cross Validation ROC AUC: 0.77766 with std: 0.00301.
Number of estimators trained: 184
Cross Validation ROC AUC: 0.77873 with std: 0.00239.
Number of estimators trained: 171
