In [1]:
import pandas as pd
import numpy as np
import glob

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from tqdm import tqdm

# from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import os

In [2]:
def save_dataframe(path, dataframe):
    """Persist ``dataframe`` as two NumPy files next to ``path``.

    Writes ``<path>.data.npy`` (the frame's values) and ``<path>.header.npy``
    (its column labels). The row index is not stored.
    """
    payloads = (
        (".data", dataframe.values),
        (".header", dataframe.columns),
    )
    # np.save appends the ".npy" extension to each target name itself.
    for suffix, payload in payloads:
        np.save(path + suffix, payload)
    
def load_dataframe(path):
    """Rebuild a DataFrame from the files written by ``save_dataframe``.

    Parameters
    ----------
    path : str
        Base path used when saving; ``<path>.data.npy`` and
        ``<path>.header.npy`` are read.

    Returns
    -------
    pandas.DataFrame
        Frame with the stored values and column labels and a fresh default
        row index (the original index is not stored on disk).
    """
    # The column labels (and any non-numeric values) are stored as object
    # arrays, which NumPy pickles. np.load rejects pickled arrays unless
    # allow_pickle=True is passed explicitly.
    # NOTE: unpickling can execute arbitrary code - only load files that were
    # produced locally by save_dataframe, never untrusted ones.
    data = np.load(path + ".data.npy", allow_pickle=True)
    header = np.load(path + ".header.npy", allow_pickle=True)
    return pd.DataFrame(data=data, columns=header)

def save_dataframe32(path, dataframe, keep=None):
    """Save ``dataframe`` as NumPy files, downcasting most columns to float32.

    Columns named in ``keep`` are written unchanged to ``<path>.data64.npy``
    (labels in ``<path>.header64.npy``); every other column is cast to
    ``np.float32`` and written to ``<path>.data32.npy`` (labels in
    ``<path>.header32.npy``). The row index is not stored.

    Parameters
    ----------
    path : str
        Base path; the suffixes above are appended to it.
    dataframe : pandas.DataFrame
        Frame to persist.
    keep : iterable of column labels, optional
        Columns to store without the float32 cast (e.g. text columns, which
        cannot be cast). Defaults to no columns.

    Notes
    -----
    float32 has a 24-bit significand, so integers larger than 2**24 stored
    in a non-kept column are not guaranteed to round-trip exactly.
    """
    # Materialise once into a set: avoids a shared mutable default argument,
    # makes membership tests O(1), and lets callers pass any iterable
    # (including a one-shot generator) without it being exhausted halfway.
    keep_labels = set(keep) if keep is not None else set()
    col64 = [col_ for col_ in dataframe.columns if col_ in keep_labels]
    col32 = [col_ for col_ in dataframe.columns if col_ not in keep_labels]
    # Kept columns may hold Python objects; NumPy pickles such arrays on save.
    np.save(path + ".data64", dataframe[col64].values)
    np.save(path + ".header64", col64)
    np.save(path + ".data32", dataframe[col32].values.astype(np.float32))
    np.save(path + ".header32", col32)

def load_dataframe32(path):
    """Rebuild a DataFrame from the files written by ``save_dataframe32``.

    Reads ``<path>.data32.npy`` / ``<path>.header32.npy`` (float32 columns)
    and ``<path>.data64.npy`` / ``<path>.header64.npy`` (columns stored
    unchanged), skipping whichever data file does not exist.

    Parameters
    ----------
    path : str
        Base path used when saving.

    Returns
    -------
    pandas.DataFrame
        The float32 columns followed by the kept columns. This order can
        differ from the original frame's column order. An empty DataFrame is
        returned when neither data file exists.
    """
    pieces = []
    # Fixed order: float32 block first, then the kept block.
    for suffix in ("32", "64"):
        data_path = path + ".data" + suffix + ".npy"
        if not os.path.exists(data_path):
            continue
        # The kept block usually holds text columns, i.e. a pickled object
        # array, which np.load rejects unless allow_pickle=True is given.
        # NOTE: unpickling can execute arbitrary code - only load files that
        # were produced locally by save_dataframe32, never untrusted ones.
        values = np.load(data_path, allow_pickle=True)
        header = np.load(path + ".header" + suffix + ".npy", allow_pickle=True)
        pieces.append(pd.DataFrame(data=values, columns=header))
    if not pieces:
        return pd.DataFrame()
    # Single concat instead of growing an initially empty frame step by step.
    return pd.concat(pieces, axis=1)

def get_id_columns(data):
    """Return the column labels of ``data`` that contain the substring "ID".

    The match is case-sensitive and the original column order is preserved.
    """
    id_columns = []
    for label in data.columns:
        if "ID" in label:
            id_columns.append(label)
    return id_columns
    
def get_object_columns(data):
    """Return the labels of the columns in ``data`` whose dtype is ``object``.

    Labels are returned as a list in the frame's column order.
    """
    object_columns = []
    for label, dtype in data.dtypes.items():
        if dtype == "object":
            object_columns.append(label)
    return object_columns

In [9]:
# Convert every raw CSV table under ./data into the .npy layout written by
# save_dataframe32, stored under ./bindata with the same base name.
raw_dir = "./data"
bin_dir = "./bindata"
table_names = [
    'application_test',
    'application_train',
    'bureau',
    'bureau_balance',
    'credit_card_balance',
    'HomeCredit_columns_description',
    'installments_payments',
    'POS_CASH_balance',
    'previous_application',
]

# np.save does not create missing directories, so make sure the target exists.
os.makedirs(bin_dir, exist_ok=True)

for table_name in tqdm(table_names):
    # Build both paths from the bare table name rather than with
    # str.replace("data", "bindata"), which would also rewrite any "data"
    # substring that happens to appear inside a file name.
    src_name = raw_dir + "/" + table_name + ".csv"
    tar_name = bin_dir + "/" + table_name
    data = pd.read_csv(src_name)
    # Text (object) columns cannot be cast to float32, so they are kept as-is.
    save_dataframe32(tar_name, data, keep=get_object_columns(data))

# After the loop, `data` and `tar_name` remain bound to the last table
# (previous_application); later cells read them.

100%|████████████████████████████████████████████| 9/9 [01:16<00:00,  8.51s/it]


In [10]:
# Round-trip check: reload the table most recently written by the conversion
# loop (tar_name still holds the last target path assigned there).
data2 = load_dataframe32(tar_name)

In [11]:
# Column dtypes of the raw frame from the last pd.read_csv call, as a baseline.
data.dtypes

SK_ID_PREV                       int64
SK_ID_CURR                       int64
NAME_CONTRACT_TYPE              object
AMT_ANNUITY                    float64
AMT_APPLICATION                float64
AMT_CREDIT                     float64
AMT_DOWN_PAYMENT               float64
AMT_GOODS_PRICE                float64
WEEKDAY_APPR_PROCESS_START      object
HOUR_APPR_PROCESS_START          int64
FLAG_LAST_APPL_PER_CONTRACT     object
NFLAG_LAST_APPL_IN_DAY           int64
RATE_DOWN_PAYMENT              float64
RATE_INTEREST_PRIMARY          float64
RATE_INTEREST_PRIVILEGED       float64
NAME_CASH_LOAN_PURPOSE          object
NAME_CONTRACT_STATUS            object
DAYS_DECISION                    int64
NAME_PAYMENT_TYPE               object
CODE_REJECT_REASON              object
NAME_TYPE_SUITE                 object
NAME_CLIENT_TYPE                object
NAME_GOODS_CATEGORY             object
NAME_PORTFOLIO                  object
NAME_PRODUCT_TYPE               object
CHANNEL_TYPE             

In [12]:
# Column dtypes after the save/load round trip, for comparison with the raw frame.
data2.dtypes

SK_ID_PREV                     float32
SK_ID_CURR                     float32
AMT_ANNUITY                    float32
AMT_APPLICATION                float32
AMT_CREDIT                     float32
AMT_DOWN_PAYMENT               float32
AMT_GOODS_PRICE                float32
HOUR_APPR_PROCESS_START        float32
NFLAG_LAST_APPL_IN_DAY         float32
RATE_DOWN_PAYMENT              float32
RATE_INTEREST_PRIMARY          float32
RATE_INTEREST_PRIVILEGED       float32
DAYS_DECISION                  float32
SELLERPLACE_AREA               float32
CNT_PAYMENT                    float32
DAYS_FIRST_DRAWING             float32
DAYS_FIRST_DUE                 float32
DAYS_LAST_DUE_1ST_VERSION      float32
DAYS_LAST_DUE                  float32
DAYS_TERMINATION               float32
NFLAG_INSURED_ON_APPROVAL      float32
NAME_CONTRACT_TYPE              object
WEEKDAY_APPR_PROCESS_START      object
FLAG_LAST_APPL_PER_CONTRACT     object
NAME_CASH_LOAN_PURPOSE          object
NAME_CONTRACT_STATUS     

In [5]:
# Transposed preview of the first rows of the raw frame: each column is shown as a row.
data.head().T

Unnamed: 0,0,1,2,3,4
SK_ID_PREV,2030495,2802425,2523466,2819243,1784265
SK_ID_CURR,271877,108129,122040,176158,202054
NAME_CONTRACT_TYPE,Consumer loans,Cash loans,Cash loans,Cash loans,Cash loans
AMT_ANNUITY,1730.43,25188.6,15060.7,47041.3,31924.4
AMT_APPLICATION,17145,607500,112500,450000,337500
AMT_CREDIT,17145,679671,136444,470790,404055
AMT_DOWN_PAYMENT,0,,,,
AMT_GOODS_PRICE,17145,607500,112500,450000,337500
WEEKDAY_APPR_PROCESS_START,SATURDAY,THURSDAY,TUESDAY,MONDAY,THURSDAY
HOUR_APPR_PROCESS_START,15,11,11,7,9
