# Introduction
In this notebook I use the data clean process from the previous notebook and 

# Basic feature

In [2]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
%matplotlib inline
import seaborn as sns

RSEED = 50

In [6]:
# read in data and check basic info
def read_check_data(file_path):
    data = pd.read_csv(file_path)
    #print('Training data shape: ', data.shape)
    #print(data.head())
    return data

In [7]:
# read in train and test file and combine them
app_train = read_check_data("./data/application_train.csv")
app_test = read_check_data("./data/application_test.csv")
app = app_train.append(app_test,sort=False)
app.tail()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
48739,456221,,Cash loans,F,N,Y,0,121500.0,412560.0,17473.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
48740,456222,,Cash loans,F,N,N,2,157500.0,622413.0,31909.5,...,0,0,0,0,,,,,,
48741,456223,,Cash loans,F,Y,Y,1,202500.0,315000.0,33205.5,...,0,0,0,0,0.0,0.0,0.0,0.0,3.0,1.0
48742,456224,,Cash loans,M,N,N,0,225000.0,450000.0,25128.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
48743,456250,,Cash loans,F,Y,N,0,135000.0,312768.0,24709.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0


In [8]:
def label_encoder(df, categorical_columns=None):
    """Encode categorical values as integers (0,1,2,3...) with pandas.factorize. """
    # if categorical_colunms are not given than treat object as categorical features
    if not categorical_columns:
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    for col in categorical_columns:
        df[col], uniques = pd.factorize(df[col])
    return df, categorical_columns

In [9]:
# remove 4 people code_gender value 'XNA'
app = app[app['CODE_GENDER'] != 'XNA']  # 4 people with XNA code gender
app['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
app['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)
# change all categorical feature to numerical 
app_clean, categorical_columns = label_encoder(app, categorical_columns=None)

# Baseline Model

In [10]:
app_base_train = app_clean[app_clean['TARGET'].notnull()]
app_base_test = app_clean[app_clean['TARGET'].isnull()]
app_base_test.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,,0,1,0,0,0,135000.0,568800.0,20560.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,,0,0,0,0,0,99000.0,222768.0,17370.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,,0,0,1,0,0,202500.0,663264.0,69777.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,,0,1,0,0,2,315000.0,1575000.0,49018.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,,0,0,1,1,1,180000.0,625500.0,32067.0,...,0,0,0,0,,,,,,


In [20]:
def cross_validate(train):
    """Compute cross validation ROC AUC of a gradient boosting model for a given training dataset"""
    
    # Extract the labels
    train_labels = np.array(train['TARGET'].astype(np.int32)).reshape((-1, ))
    train = train.drop(columns = ['TARGET', 'SK_ID_CURR'])

    # Create a  lgb training set
    train_set = lgb.Dataset(train, label = train_labels)

    # Find default hyperparameters
    model = lgb.LGBMClassifier()
    params = model.get_params()

    # Number of estimators will be selected through early stopping
    del params['n_estimators'], params['silent']

    # Early stoppping with 5 fold cross validation
    cv_results = lgb.cv(params, train_set, num_boost_round = 10000, metrics = 'auc', 
                        early_stopping_rounds = 100, seed = RSEED, nfold = 5)

    print('Cross Validation ROC AUC: {:.5f} with std: {:.5f}.'.format(cv_results['auc-mean'][-1],
                                                                               cv_results['auc-stdv'][-1]))

    print('Number of estimators trained: {}'.format(len(cv_results['auc-mean'])))
    
    return cv_results

In [33]:
def get_model(cv_results, train):
     # Extract the labels
    train_labels = np.array(train['TARGET'].astype(np.int32)).reshape((-1, ))
    train = train.drop(columns = ['TARGET', 'SK_ID_CURR'])
    
    # Make model with optimal number of estimators and train on training data
    model = lgb.LGBMClassifier(n_estimators = len(cv_results['auc-mean']), random_state=RSEED)
    model.fit(train, train_labels)
    
    return model

In [34]:
def make_submission(model, test):
    """Make a submission dataframe for the Kaggle competition for a given dataset."""
    
    # Extract the labels
    test_ids = list(test['SK_ID_CURR'])
    test = test.drop(columns = ['TARGET','SK_ID_CURR'])
     
    # Make predictions on the testing data
    preds = model.predict_proba(test)[:, 1]
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 
                                'TARGET': preds})
    
    return submission

In [21]:
cv_results_baseline = cross_validate(app_base_train)

Cross Validation ROC AUC: 0.75552 with std: 0.00560.
Number of estimators trained: 142


In [35]:
model_base = get_model(cv_results_baseline, app_base_train)

In [46]:
def get_feature_importance(model,train):
    fi = pd.DataFrame({'feature': train.drop(columns = ['TARGET', 'SK_ID_CURR']).columns, 
                   'importance': model.feature_importances_})
    fi = fi.sort_values('importance', ascending = False)
    return fi

In [42]:
fi_baseline = get_feature_importance(model,app_base_train)
fi_baseline

Unnamed: 0,feature,importance
39,EXT_SOURCE_1,318
41,EXT_SOURCE_3,309
40,EXT_SOURCE_2,283
6,AMT_CREDIT,225
15,DAYS_BIRTH,213
93,DAYS_LAST_PHONE_CHANGE,204
16,DAYS_EMPLOYED,200
7,AMT_ANNUITY,191
8,AMT_GOODS_PRICE,161
18,DAYS_ID_PUBLISH,154


In [28]:
submission_baseline,model = make_submission(cv_results_baseline, app_base_train, app_base_test)
submission_baseline.to_csv('./data/submission_baseline.csv', index = False)

# Semi-Auto Feature First Try

In [14]:
def agg_numeric(df_child, parent_var, df_col_name):
    """
    Groups and aggregates the numeric values in a child dataframe
    by the parent variable.
    
    Parameters
    --------
        df_child (dataframe): 
            the child dataframe to calculate the statistics on
        parent_var (string): 
            the parent variable used for grouping and aggregating
        df_col_name (string): 
            the variable used to rename the columns
        
    Return
    --------
        agg (dataframe): 
            a dataframe with the statistics aggregated by the `parent_var` for 
            all numeric columns. The aggregate function are 'count', 'mean', 'max', 'min', 'sum'
            Each observation of the parent variable will have 
            one row in the dataframe with the parent variable as the index. 
            The columns are also renamed using the `df_col_name`. Columns with all duplicate
            values are removed. 
    
    """
    
    # Remove id variables other than grouping variable
    # e.g. SK_ID_BUREAU
    for col in df_child:
        if col != parent_var and 'SK_ID' in col:
            df_child = df_child.drop(columns = col)
            
    # Only want the numeric variables
    parent_ids = df_child[parent_var].copy()
    numeric_df = df_child.select_dtypes('number').copy()
    numeric_df[parent_var] = parent_ids

    # Group by the specified variable and calculate the statistics
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    # Need to create new column names
    columns = []

    # Iterate through the variables names
    for var in agg.columns.levels[0]:
        if var != parent_var:
            # Iterate through the stat names
            for stat in agg.columns.levels[1]:
                # Make a new column name for the variable and stat
                columns.append('%s_%s_%s' % (df_col_name, var, stat))
    
    agg.columns = columns
    
    # Remove the columns with all redundant values
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

In [16]:
# auto generated features from bureau data
bureau = read_check_data("./data/bureau.csv")
bureau_agg = agg_numeric(bureau, 'SK_ID_CURR', 'BUREAU')
bureau_agg.head()

Unnamed: 0_level_0,BUREAU_DAYS_CREDIT_sum,BUREAU_DAYS_ENDDATE_FACT_sum,BUREAU_DAYS_CREDIT_min,BUREAU_DAYS_CREDIT_ENDDATE_min,BUREAU_DAYS_ENDDATE_FACT_min,BUREAU_DAYS_ENDDATE_FACT_mean,BUREAU_DAYS_CREDIT_mean,BUREAU_DAYS_CREDIT_UPDATE_sum,BUREAU_DAYS_ENDDATE_FACT_max,BUREAU_DAYS_CREDIT_UPDATE_min,...,BUREAU_AMT_CREDIT_SUM_DEBT_mean,BUREAU_AMT_CREDIT_SUM_min,BUREAU_AMT_CREDIT_SUM_mean,BUREAU_AMT_CREDIT_SUM_DEBT_max,BUREAU_AMT_CREDIT_SUM_max,BUREAU_AMT_CREDIT_SUM_DEBT_sum,BUREAU_AMT_CREDIT_SUM_sum,BUREAU_AMT_CREDIT_MAX_OVERDUE_min,BUREAU_AMT_CREDIT_MAX_OVERDUE_mean,BUREAU_AMT_CREDIT_MAX_OVERDUE_max
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,-5145,-3302.0,-1572,-1329.0,-1328.0,-825.5,-735.0,-652,-544.0,-155,...,85240.928571,85500.0,207623.571429,373239.0,378000.0,596686.5,1453365.0,,,
100002,-6992,-4185.0,-1437,-1072.0,-1185.0,-697.5,-874.0,-3999,-36.0,-1185,...,49156.2,0.0,108131.945625,245781.0,450000.0,245781.0,865055.565,0.0,1681.029,5043.645
100003,-5603,-3292.0,-2586,-2434.0,-2131.0,-1097.333333,-1400.75,-3264,-540.0,-2131,...,0.0,22248.0,254350.125,0.0,810000.0,0.0,1017400.5,0.0,0.0,0.0
100004,-1734,-1065.0,-1326,-595.0,-683.0,-532.5,-867.0,-1064,-382.0,-682,...,0.0,94500.0,94518.9,0.0,94537.8,0.0,189037.8,0.0,0.0,0.0
100005,-572,-123.0,-373,-128.0,-123.0,-123.0,-190.666667,-163,-123.0,-121,...,189469.5,29826.0,219042.0,543087.0,568800.0,568408.5,657126.0,0.0,0.0,0.0


In [17]:
app_clean_second = pd.merge(app_clean,bureau_agg,on='SK_ID_CURR',how='left')
app_second_train = app_clean_second[app_clean_second['TARGET'].notnull()]
app_second_test = app_clean_second[app_clean_second['TARGET'].isnull()]

In [19]:
cv_results_second = cross_validate(app_second_train)

Cross Validation ROC AUC: 0.76088 with std: 0.00494.
Number of estimators trained: 170


In [44]:
model_second = get_model(cv_results_second, app_second_train)

In [45]:
fi_second = get_feature_importance(model_second,)
fi_second

ValueError: arrays must all be same length

In [None]:
# notice that althought this time the local cv gives better result
# the submission result is much worse than baseline model
# may due to overfitting
submission_second = make_submission(cv_results_second, app_second_train, app_second_test)
submission_second.to_csv('./data/submission_second.csv', index = False)