In [1]:
import pandas as pd
import numpy as np
import time
import gc
from contextlib import contextmanager

In [4]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(name, time.time() - t0))


In [7]:
with timer('App_train and App_test'):
    df = pd.read_parquet('TO_TRAIN/train_test.gzip')
    print('Shape of train_test: ', df.shape)
with timer("Bureau and Bureau_balance"):
    bureau = pd.read_parquet('TO_TRAIN/bureau.gzip')
    df = df.merge(bureau, on= 'SK_ID_CURR', how= 'left')
    print("Shape of Bureau_bb: ", bureau.shape)
    del bureau; gc.collect()
with timer("Previous"):
    prev = pd.read_parquet('TO_TRAIN/prev.gzip')
    df = df.merge(prev, on = 'SK_ID_CURR', how = 'left')
    print("Shape of previous: ", prev.shape)
    del prev; gc.collect()
with timer("POS_CASH"):
    pos = pd.read_parquet('TO_TRAIN/pos.gzip')
    df = df.merge(pos, on = 'SK_ID_CURR', how = 'left')
    print("Shape of Pos_cash: ", pos.shape)
    del pos; gc.collect()
with timer("Instalment"):
    ins = pd.read_parquet('TO_TRAIN/ins.gzip')
    df = df.merge(ins, on = 'SK_ID_CURR', how = 'left')
    print("Shape of instalments: ", ins.shape)
    del ins; gc.collect()
with timer("Credit_Card"):
    credit = pd.read_parquet('TO_TRAIN/credit.gzip')
    df = df.merge(credit, on = 'SK_ID_CURR', how = 'left')
    print("Shape of Credit_card: ", credit.shape)
    del credit; gc.collect()

Shape of train_test:  (356251, 76)
App_train and App_test - done in 1s
Shape of Bureau_bb:  (305811, 145)
Bureau and Bureau_balance - done in 5s
Shape of previous:  (338857, 193)
Previous - done in 6s
Shape of Pos_cash:  (337252, 27)
POS_CASH - done in 1s
Shape of instalments:  (339587, 133)
Instalment - done in 4s
Shape of Credit_card:  (103558, 72)
Credit_Card - done in 3s


In [43]:
# CREDIT TO INCOME RATIO
df['BUREAU_INCOME_CREDIT_RATIO'] = df['BUREAU_AMT_CREDIT_SUM_MEAN'] / df['AMT_INCOME_TOTAL']
df['BUREAU_ACTIVE_CREDIT_TO_INCOME_RATIO'] = df['BUREAU_ACTIVE_AMT_CREDIT_SUM_SUM'] / df['AMT_INCOME_TOTAL']
# PREVIOUS TO CURRENT CREDIT RATIO
df['CURRENT_TO_APPROVED_CREDIT_MIN_RATIO'] = df['APPROVED_AMT_CREDIT_MIN'] / df['AMT_CREDIT']
df['CURRENT_TO_APPROVED_CREDIT_MAX_RATIO'] = df['APPROVED_AMT_CREDIT_MAX'] / df['AMT_CREDIT']
df['CURRENT_TO_APPROVED_CREDIT_MEAN_RATIO'] = df['APPROVED_AMT_CREDIT_MEAN'] / df['AMT_CREDIT']
# PREVIOUS TO CURRENT ANNUITY RATIO
df['CURRENT_TO_APPROVED_ANNUITY_MAX_RATIO'] = df['APPROVED_AMT_ANNUITY_MAX'] / df['AMT_ANNUITY']
df['CURRENT_TO_APPROVED_ANNUITY_MEAN_RATIO'] = df['APPROVED_AMT_ANNUITY_MEAN'] / df['AMT_ANNUITY']
df['PAYMENT_MIN_TO_ANNUITY_RATIO'] = df['INS_AMT_PAYMENT_MIN'] / df['AMT_ANNUITY']
df['PAYMENT_MAX_TO_ANNUITY_RATIO'] = df['INS_AMT_PAYMENT_MAX'] / df['AMT_ANNUITY']
df['PAYMENT_MEAN_TO_ANNUITY_RATIO'] = df['INS_AMT_PAYMENT_MEAN'] / df['AMT_ANNUITY']
# PREVIOUS TO CURRENT CREDIT TO ANNUITY RATIO
# df['CTA_CREDIT_TO_ANNUITY_MAX_RATIO'] = df['APPROVED_CREDIT_TO_ANNUITY_RATIO_MAX'] / df['CREDIT_TO_ANNUITY_RATIO']
# df['CTA_CREDIT_TO_ANNUITY_MEAN_RATIO'] = df['APPROVED_CREDIT_TO_ANNUITY_RATIO_MEAN'] / df['CREDIT_TO_ANNUITY_RATIO']
# DAYS DIFFERENCES AND RATIOS
df['DAYS_DECISION_MEAN_TO_BIRTH'] = df['APPROVED_DAYS_DECISION_MEAN'] / df['DAYS_BIRTH']
df['DAYS_CREDIT_MEAN_TO_BIRTH'] = df['BUREAU_DAYS_CREDIT_MEAN'] / df['DAYS_BIRTH']
df['DAYS_DECISION_MEAN_TO_EMPLOYED'] = df['APPROVED_DAYS_DECISION_MEAN'] / df['DAYS_EMPLOYED']
df['DAYS_CREDIT_MEAN_TO_EMPLOYED'] = df['BUREAU_DAYS_CREDIT_MEAN'] / df['DAYS_EMPLOYED']

In [8]:
print("Shape of dataframe: ", df.shape)

Shape of dataframe:  (356251, 641)


In [44]:
train = df.loc[df['TARGET'].notnull()]
test = df[df.TARGET.isnull()]

In [45]:
test.drop(columns= ['TARGET'], inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.drop(columns= ['TARGET'], inplace= True)


In [46]:
print("Shape of train: {} and test: {}".format(train.shape, test.shape))

Shape of train: (307507, 655) and test: (48744, 654)


In [55]:
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CURRENT_TO_APPROVED_CREDIT_MEAN_RATIO,CURRENT_TO_APPROVED_ANNUITY_MAX_RATIO,CURRENT_TO_APPROVED_ANNUITY_MEAN_RATIO,PAYMENT_MIN_TO_ANNUITY_RATIO,PAYMENT_MAX_TO_ANNUITY_RATIO,PAYMENT_MEAN_TO_ANNUITY_RATIO,DAYS_DECISION_MEAN_TO_BIRTH,DAYS_CREDIT_MEAN_TO_BIRTH,DAYS_DECISION_MEAN_TO_EMPLOYED,DAYS_CREDIT_MEAN_TO_EMPLOYED
0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,...,0.440374,0.374558,0.374558,0.374558,2.149501,0.467976,0.064052,0.092379,0.951334,1.372057
1,100003,0.0,0,1,0,0,270000.0,1293502.5,35698.5,1129500.0,...,0.374326,2.755214,1.584212,0.186646,15.710334,1.81393,0.077841,0.083552,1.098485,1.179082
2,100004,0.0,1,0,1,0,67500.0,135000.0,6750.0,135000.0,...,0.148933,0.793667,0.793667,0.793667,1.566513,1.051282,0.042791,0.045521,3.622222,3.853333
3,100006,0.0,0,1,0,0,135000.0,312682.5,29686.5,297000.0,...,1.09929,1.345881,0.735762,0.083638,23.30308,2.120394,0.018185,,0.113722,
4,100007,0.0,0,0,0,0,121500.0,513000.0,21865.5,513000.0,...,0.324832,1.037195,0.561561,8e-06,1.037195,0.5586,0.06135,0.057646,0.402513,0.378209


In [56]:
test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,CURRENT_TO_APPROVED_CREDIT_MEAN_RATIO,CURRENT_TO_APPROVED_ANNUITY_MAX_RATIO,CURRENT_TO_APPROVED_ANNUITY_MEAN_RATIO,PAYMENT_MIN_TO_ANNUITY_RATIO,PAYMENT_MAX_TO_ANNUITY_RATIO,PAYMENT_MEAN_TO_ANNUITY_RATIO,DAYS_DECISION_MEAN_TO_BIRTH,DAYS_CREDIT_MEAN_TO_BIRTH,DAYS_DECISION_MEAN_TO_EMPLOYED,DAYS_CREDIT_MEAN_TO_EMPLOYED
307507,100001,0,1,0,0,135000.0,568800.0,20560.5,450000.0,0,...,0.04182,0.192165,0.192165,0.192165,0.846181,0.286235,0.090432,0.0382,0.747102,0.315586
307508,100005,0,0,0,0,99000.0,222768.0,17370.0,180000.0,0,...,0.180248,0.277098,0.277098,0.277098,1.016479,0.359252,0.041907,0.010555,0.169389,0.042664
307509,100013,0,0,1,0,202500.0,663264.0,69777.0,630000.0,-1,...,0.293768,0.331828,0.164498,8.8e-05,5.121283,0.139591,0.052034,0.08671,0.233887,0.389749
307510,100028,0,1,0,2,315000.0,1575000.0,49018.5,1575000.0,0,...,0.087891,0.229505,0.165072,2.4e-05,0.795384,0.088879,0.097596,0.100297,0.730975,0.751206
307511,100038,0,0,1,1,180000.0,625500.0,32067.0,625500.0,0,...,0.14805,0.346169,0.346169,0.346071,0.346169,0.346161,0.06296,,0.374715,


In [18]:
train.to_parquet('train.gzip', compression= 'gzip', index= False)
test.to_parquet('test.gzip', compression= 'gzip', index= False)

FS

In [19]:
def missing_value_table(df, print_info = True):
    # total missing value
    mis_val = df.isnull().sum()

    # percent missing value
    percent_mis_val = 100*mis_val/len(df)

    # make a table contain result
    table = pd.concat([mis_val, percent_mis_val], axis=1)

    # rename columns
    table_re_col = table.rename(columns = {0: 'Missing Values', 1: '% Missing Value'})

    # sort table by percent missing value descending
    table_re_col = table_re_col[table_re_col.iloc[:,1] != 0].sort_values('% Missing Value', ascending= False).round(1)

    if print_info:
        # print something important
        print("Dataframe has " + str(df.shape[1]) + " columns.\n"
                "There are " + str(len(table_re_col)) + " columns that have missing value."
                )

    return table_re_col

In [20]:
def drop_missing_values(df1, df2, thresh = 90, print_info = True):
    
    missing_table_df1 = missing_value_table(df1)
    missing_table_df2 = missing_value_table(df2)

    to_drop_df1 = list(missing_table_df1.loc[missing_table_df1['% Missing Value'] >= thresh].index)
    to_drop_df2 = list(missing_table_df2.loc[missing_table_df2['% Missing Value'] >= thresh].index)
    
    missing_columns = list(set(to_drop_df1 + to_drop_df2))

    df1 = df1.drop(columns = missing_columns)
    df2 = df2.drop(columns = missing_columns)

    if print_info:
        print("There are {} columns have more than {}% missing value".format(len(missing_columns), thresh))
    
    return df1, df2

In [48]:
train, test = drop_missing_values(train, test)

Dataframe has 655 columns.
There are 621 columns that have missing value.
Dataframe has 654 columns.
There are 615 columns that have missing value.
There are 5 columns have more than 90% missing value


In [54]:
print("Shape of train: {} and test: {}".format(train.shape, test.shape))

Shape of train: (307507, 650) and test: (48744, 649)


Lightgbm

In [29]:
from sklearn.model_selection import KFold
from sklearn.metrics import  roc_auc_score
import lightgbm as lgb
from sklearn.metrics import ConfusionMatrixDisplay
import gc

In [57]:
def model(features, test_features, n_folds = 10):
    """ Train and test a light gradient boosting model using cross validation
    Parameters
    ---------
        features (pd.DataFrame): dataframe of training dataset, must have TARGET column
        test_features (pd.DataFrame): dataframe of testing dataset
        encoding (str, default = ohe): method for encoding categorical variable, ohe for one-hot encoder and le for integer label encoding
        n_folds (int, default = 5): number of fold to use for cross validation
    Return
    ---------
        submission (pd.DataFrame): have two column 'SK_ID_CURR' and 'TARGET' proba predicted bt the model
        feature_importances (pd.DataFrame): dataframe with feature importance from model
        valid_metrics (pd.DataFrame): dataframe with training and validation metrics (ROC AUC) for each fold and overall

        """
    
    # Extract the ids
    train_ids = features['SK_ID_CURR']
    test_ids = test_features['SK_ID_CURR']

    # Extract the labels for training
    labels = features['TARGET']

    # Remove the ids and target

    features = features.drop(columns = ['SK_ID_CURR', 'TARGET'])
    test_features = test_features.drop(columns = ['SK_ID_CURR'])


    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)
    
    feature_names = list(features.columns)

    # Convert to np arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # Create the kfold object
    k_fold = KFold(n_splits= n_folds, shuffle= True, random_state= 50)

    # Empty array for features importance
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Empty array for out of fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for recording validation and training scores
    valid_scores = []
    train_scores = []

    confusion_matrix = np.zeros((2,2))

    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(features):
        
        # Training data for the fold
        train_features, train_labels = features.iloc[train_indices], labels.iloc[train_indices]

        # validation data for the fold
        valid_features, valid_labels = features.iloc[valid_indices], labels.iloc[valid_indices]

        # Create model
        model = lgb.LGBMClassifier(n_estimators= 10000, objective='binary', class_weight= 'balanced', learning_rate= 0.02, max_depth=8, reg_alpha= 0.04, reg_lambda= 0.073, subsample= 0.8715623, n_jobs= -1, random_state= 50, num_leaves= 34, colsample_bytree= 0.9497036, min_split_gain= 0.0222415, min_child_weight= 39.3259775, silent= -1, verbose = -1)

        # Train the model
        model = model.fit(train_features, train_labels, eval_metric = 'auc',
                    eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                    eval_names = ['valid', 'train'],
                    categorical_feature = 'auto',
                    early_stopping_rounds = 100,
                    verbose = 200   )

        # Record the best iteration
        best_iteration = model.best_iteration_

        # Record the feature importance
        feature_importance_values += model.feature_importances_ / k_fold.n_splits

        # Make predictions
        test_predictions += model.predict_proba(test_features, num_iteration = best_iteration)[:,1] / k_fold.n_splits

        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration = best_iteration)[:,1]

        # Record the best score
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']

        valid_scores.append(valid_score)
        train_scores.append(train_score)

        # Make confusion matrix
        confusion_matrix += (ConfusionMatrixDisplay.from_estimator(model,
                        valid_features,
                        valid_labels,
                        display_labels = ['0','1'],
                        normalize = 'true').confusion_matrix) / k_fold.n_splits
        
        
        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()

    # Make the submission dataframe
    submission = pd.DataFrame({'SK_ID_CURR': test_ids, 'TARGET': test_predictions})

    # Make the feature importance dataFrame
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Make confusion matrix dataframe
    confusion_matrix = pd.DataFrame(confusion_matrix, index=['0', '1'], columns= ['0', '1'])

    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Add the overall scores to the metric
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Needed for creating dataframe of validation scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores}) 
    
    # 

    return submission, feature_importances, metrics, confusion_matrix


In [58]:
submission, fi, metrics, cm = model(train, test, n_folds= 5)

MemoryError: Unable to allocate 1.43 GiB for an array with shape (625, 307507) and data type float64

In [32]:
submission.to_csv('final.csv', index= False)

In [35]:
fi = fi.sort_values('importance', ascending= False)

In [40]:
len(fi[fi.importance == 0.0])

49