## Creating File Records

In this notebook, we will create the file records for the auxiliary files, i.e. for each loan application in the master file, we will fetch the matching records from the auxiliary files, take their mean or weighted mean, and store the result.

In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
# The project folder used by the following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# project directory
current_dir = 'Home Credit_Kaggle'

# set the project folder as current working directory
import os
complete_path = os.path.join('/content/drive/My Drive/Colab Notebooks/',current_dir)
os.chdir(complete_path)

# number of batch sub-folders; defined unconditionally so the name exists
# on every run, including re-runs where the output folders are already there
batches_count = 7

# create the output folder for file snapshots plus one sub-folder per batch
# (batch1 ... batch<batches_count>).
# exist_ok=True keeps the cell re-runnable and also re-creates any batch
# folder that is missing from an already existing final_data folder.
out_path_data = os.path.join(complete_path,'final_data')
os.makedirs(out_path_data, exist_ok=True)
for b in range(batches_count):
  out_path_batch = os.path.join(out_path_data,'batch' + str(b+1))
  os.makedirs(out_path_batch, exist_ok=True)

In [None]:
import numpy as np
import pandas as pd
import time
from scipy.sparse import csr_matrix,save_npz

## Load control File

In [None]:
# load the file-level control file 'HomeCredit_Control File_File Level_ml.csv'
# from the control/ folder; the cell output shows its shape and first 6 rows
file_level_flags = pd.read_csv('control/HomeCredit_Control File_File Level_ml.csv')
print(file_level_flags.shape)
file_level_flags.head(6)

(6, 4)


Unnamed: 0,FILE_NAME,NUM_TOP_REC,ORDER_BY,ASC ORDER?
0,bureau.csv,0,"SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT",1
1,bureau_balance.csv,0,"SK_ID_BUREAU,MONTHS_BALANCE",1
2,previous_application.csv,0,"SK_ID_CURR,SK_ID_PREV,DAYS_DECISION",1
3,POS_CASH_balance.csv,0,"SK_ID_CURR,SK_ID_PREV,MONTHS_BALANCE",1
4,installments_payments.csv,0,"SK_ID_CURR,SK_ID_PREV,DAYS_INSTALMENT",1
5,credit_card_balance.csv,0,"SK_ID_CURR,SK_ID_PREV,MONTHS_BALANCE",1


In [None]:
# build a lookup dictionary from the control file for fast access:
#   key   -> FILE_NAME (converted to str, surrounding whitespace stripped)
#   value -> [NUM_TOP_REC, ORDER_BY, ASC ORDER?] of that file

# pull the needed columns out as numpy arrays (one entry per control record)
file_name_arr = np.asarray(file_level_flags['FILE_NAME'])
num_top_rec_arr = np.asarray(file_level_flags['NUM_TOP_REC'])
order_by_arr = np.asarray(file_level_flags['ORDER_BY'])
asc_order_arr = np.asarray(file_level_flags['ASC ORDER?'])
l = len(file_name_arr)

# keys: cleaned file names
keys = [str(name).strip() for name in file_name_arr]

# values: the three flags of every record, in the same order as the keys
values = [list(flags) for flags in zip(num_top_rec_arr,order_by_arr,asc_order_arr)]

# combine keys and values into the dictionary
dict_file_flags = dict(zip(keys,values))
print(dict_file_flags.keys())
print(dict_file_flags.values())

dict_keys(['bureau.csv', 'bureau_balance.csv', 'previous_application.csv', 'POS_CASH_balance.csv', 'installments_payments.csv', 'credit_card_balance.csv'])
dict_values([[0, 'SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT', 1], [0, 'SK_ID_BUREAU,MONTHS_BALANCE', 1], [0, 'SK_ID_CURR,SK_ID_PREV,DAYS_DECISION', 1], [0, 'SK_ID_CURR,SK_ID_PREV,MONTHS_BALANCE', 1], [0, 'SK_ID_CURR,SK_ID_PREV,DAYS_INSTALMENT', 1], [0, 'SK_ID_CURR,SK_ID_PREV,MONTHS_BALANCE', 1]])


## Load Preprocessed Data

In [None]:
# load preprocessed data of files
# Every table is read from the preprocessed/ folder as a keys CSV plus a
# numeric .npy array; every table except instalments payments also has a
# sparse categorical matrix (.npz) that is expanded with .todense().

from scipy.sparse import load_npz

def _print_shapes(*parts):
  # print the shape of every loaded part of one table, then a separator line
  for part in parts:
    print(part.shape)
  print('='*120)

# Application Train table
app_train_keys = pd.read_csv('preprocessed/app_train_keys.csv')
app_train_numeric_data = np.load('preprocessed/app_train_numeric_data.npy')
app_train_categ_data = load_npz('preprocessed/app_train_categ_data_csr.npz').todense()
_print_shapes(app_train_keys, app_train_numeric_data, app_train_categ_data)

# Bureau table
bureau_keys = pd.read_csv('preprocessed/bureau_keys.csv')
bureau_numeric_data = np.load('preprocessed/bureau_numeric_data.npy')
bureau_categ_data = load_npz('preprocessed/bureau_categ_data_csr.npz').todense()
_print_shapes(bureau_keys, bureau_numeric_data, bureau_categ_data)

# Bureau balance table
bureau_bal_keys = pd.read_csv('preprocessed/bureau_bal_keys.csv')
bureau_bal_numeric_data = np.load('preprocessed/bureau_bal_numeric_data.npy')
bureau_bal_categ_data = load_npz('preprocessed/bureau_bal_categ_data_csr.npz').todense()
_print_shapes(bureau_bal_keys, bureau_bal_numeric_data, bureau_bal_categ_data)

# Previous Application
prev_app_keys = pd.read_csv('preprocessed/prev_app_keys.csv')
prev_app_numeric_data = np.load('preprocessed/prev_app_numeric_data.npy')
prev_app_categ_data = load_npz('preprocessed/prev_app_categ_data_csr.npz').todense()
_print_shapes(prev_app_keys, prev_app_numeric_data, prev_app_categ_data)

# POS CASH Balance
pos_cash_bal_keys = pd.read_csv('preprocessed/pos_cash_bal_keys.csv')
pos_cash_bal_numeric_data = np.load('preprocessed/pos_cash_bal_numeric_data.npy')
pos_cash_bal_categ_data = load_npz('preprocessed/pos_cash_bal_categ_data_csr.npz').todense()
_print_shapes(pos_cash_bal_keys, pos_cash_bal_numeric_data, pos_cash_bal_categ_data)

# Instalments payments (numeric data only, no categorical matrix is loaded)
instalm_paym_keys = pd.read_csv('preprocessed/instalm_paym_keys.csv')
instalm_paym_numeric_data = np.load('preprocessed/instalm_paym_numeric_data.npy')
_print_shapes(instalm_paym_keys, instalm_paym_numeric_data)

# Credit Card Balance
credit_bal_keys = pd.read_csv('preprocessed/credit_bal_keys.csv')
credit_bal_numeric_data = np.load('preprocessed/credit_bal_numeric_data.npy')
credit_bal_categ_data = load_npz('preprocessed/credit_bal_categ_data_csr.npz').todense()
_print_shapes(credit_bal_keys, credit_bal_numeric_data, credit_bal_categ_data)

(307511, 2)
(307511, 27)
(307511, 188)
(1716428, 2)
(1716428, 10)
(1716428, 23)
(13649962, 1)
(13649962, 1)
(13649962, 8)
(1670214, 2)
(1670214, 4)
(1670214, 162)
(5000679, 2)
(5000679, 5)
(5000679, 8)
(13605401, 2)
(13605401, 6)
(3840312, 2)
(3840312, 13)
(3840312, 7)


## Function to calculate weighted mean of columns of a matrix

In [None]:
def weighted_mean(array_2D):
  """Collapse the rows of a 2-D array into a single row of weighted column sums.

  Row i (0-based) is weighted with 10**(-i) / S, where
  S = (1 - 0.1**rows) / (1 - 0.1) is the closed-form sum of the geometric
  series 1 + 0.1 + 0.01 + ... over `rows` terms. The weights therefore decay
  by a factor of 10 per row and the first row dominates the result.

  array_2D => array-like of shape (rows, cols); converted with np.asarray
  returns  => numpy array of shape (1, cols)
  """
  data = np.asarray(array_2D)
  n_rows, _ = data.shape

  # normalising constant: sum of the geometric progression 1, 0.1, 0.01, ...
  series_total = 1 * ((1 - (0.1**n_rows))/(1 - 0.1))

  # one normalised weight per row, shaped as a column vector so that numpy
  # broadcasts it across all columns of `data`
  weights = np.asarray([(10**(-k))/series_total for k in range(0,n_rows)])[:, np.newaxis]

  # weight every row, then add the rows up column by column
  weighted_rows = weights * data
  return weighted_rows.sum(axis=0)[np.newaxis, :]

## Functions to calculate "File Snapshots" of every file for one loan ID

### Bureau Table

In [None]:
def input2_calc(sk_id_curr):
  """Build the bureau.csv snapshot (numeric + categorical) for one loan ID.

  Reads the notebook-level names dict_file_flags, bureau_keys,
  bureau_numeric_data, bureau_categ_data and calc_wtd_mean.

  The matching rows are taken as one slice running from the first to the last
  matching index of bureau_keys, so the rows of one SK_ID_CURR are expected to
  be stored contiguously with a default 0..n-1 index -- TODO confirm against
  the preprocessing step.

  sk_id_curr => SK_ID_CURR value to look up in bureau_keys
  returns    => (inp2_numeric, inp2_categ, selected_recs)
                inp2_numeric / inp2_categ have num_top_rec + 1 rows: the first
                num_top_rec matching rows (zero padded when there are fewer)
                followed by one row that is the (weighted) mean of all
                remaining rows, or zeros when nothing remains;
                selected_recs are the rows of bureau_keys for this loan ID
  """
  # number of top records to keep individually, from the control file
  top_n = dict_file_flags['bureau.csv'][0]

  # key rows that belong to this loan application
  selected_recs = bureau_keys[bureau_keys['SK_ID_CURR'] == sk_id_curr]
  row_ids = selected_recs.index
  n_found = len(row_ids)

  def snapshot(table):
    # identical windowing logic for the numeric and the categorical table
    n_cols = table.shape[1]

    if 0 < n_found <= top_n:
      # not more records than required: keep all of them, fill up with zeros
      kept = table[row_ids[0]:row_ids[-1]+1,:]
      filler = np.zeros((top_n - n_found + 1,n_cols))
      return np.append(kept,filler,axis=0)

    if n_found > top_n:
      # more records than required: everything after the first top_n rows
      # is condensed into a single summary row
      rest = table[row_ids[top_n]:row_ids[-1]+1,:]
      if calc_wtd_mean:
        summary = weighted_mean(rest)
      else:
        summary = np.mean(rest,axis=0).reshape(1,-1)
      if top_n > 0:
        head = table[row_ids[0]:row_ids[top_n-1]+1,:]
        return np.append(head,summary,axis=0)
      return summary

    # no record for this loan ID: the whole output is zero padding
    return np.zeros((top_n + 1,n_cols))

  inp2_numeric = snapshot(bureau_numeric_data)
  inp2_categ = snapshot(bureau_categ_data)
  return inp2_numeric,inp2_categ,selected_recs

### Bureau Balance Table

In [None]:
def input3_calc(sk_id_bur_id):
  """Build the bureau_balance.csv snapshot for a set of bureau records.

  Reads the notebook-level names dict_file_flags, bureau_bal_keys,
  bureau_bal_numeric_data, bureau_bal_categ_data and calc_wtd_mean.

  For every distinct SK_ID_BUREAU in the input, the matching rows are taken as
  one slice from the first to the last matching index of bureau_bal_keys
  (rows of one bureau ID are expected to be contiguous with a default
  0..n-1 index -- TODO confirm against the preprocessing step). Numeric and
  categorical columns are placed side by side and the rows of all bureau IDs
  are stacked in order of first appearance.

  sk_id_bur_id => DataFrame with a 'SK_ID_BUREAU' column (presumably the
                  selected_recs returned by input2_calc -- confirm with caller)
  returns      => array with num_top_rec + 1 rows and
                  (numeric cols + categ cols) columns: the first num_top_rec
                  stacked rows (zero padded when there are fewer) followed by
                  one row holding the (weighted) mean of the remaining rows,
                  or zeros when nothing remains
  """
  # fetch the file level flags for this file
  file_name = 'bureau_balance.csv'
  num_top_rec = dict_file_flags[file_name][0] # number of top records to be selected

  # gather the rows of each bureau ID in a list and stack them once at the
  # end; growing an array with np.append inside the loop would re-copy all
  # rows collected so far on every iteration
  chunks = []
  for sk_id_bure in sk_id_bur_id['SK_ID_BUREAU'].unique():

    # key rows of this bureau ID (each lookup scans the whole key table)
    selected_inds = bureau_bal_keys[bureau_bal_keys['SK_ID_BUREAU'] == sk_id_bure].index
    if len(selected_inds) == 0: # no records selected
      continue

    s_ind = selected_inds[0] # first index
    e_ind = selected_inds[-1] # last index

    # numeric columns followed by categorical columns for these rows
    chunks.append(np.hstack((bureau_bal_numeric_data[s_ind:e_ind+1,:],
                             bureau_bal_categ_data[s_ind:e_ind+1,:])))

  if not chunks:
    # nothing found at all: the whole output is zero padding
    noofcols_final = bureau_bal_numeric_data.shape[1] + bureau_bal_categ_data.shape[1]
    return np.zeros((num_top_rec + 1,noofcols_final))

  inp3_final = chunks[0] if len(chunks) == 1 else np.concatenate(chunks,axis=0)
  rows_found = inp3_final.shape[0]

  if rows_found <= num_top_rec:
    # fewer rows than required: keep all and pad with zero rows so that the
    # output has num_top_rec + 1 rows
    padding_final = np.zeros((num_top_rec - rows_found + 1,inp3_final.shape[1]))
    return np.append(inp3_final,padding_final,axis=0)

  # more rows than required: condense everything after the first
  # num_top_rec rows into one summary row
  if calc_wtd_mean:
    mean_final = weighted_mean(inp3_final[num_top_rec:,:])
  else:
    mean_final = np.mean(inp3_final[num_top_rec:,:],axis=0).reshape(1,-1)

  if num_top_rec > 0: # individual top rows are required in front of the mean
    return np.append(inp3_final[:num_top_rec],mean_final,axis=0)
  return mean_final

### Previous Applications Table

In [None]:
def input4_calc(sk_id_curr):
  """Build the previous_application.csv snapshot (numeric + categorical) for one loan ID.

  Reads the notebook-level names dict_file_flags, prev_app_keys,
  prev_app_numeric_data, prev_app_categ_data and calc_wtd_mean.

  The matching rows are taken as one slice running from the first to the last
  matching index of prev_app_keys, so the rows of one SK_ID_CURR are expected
  to be stored contiguously with a default 0..n-1 index -- TODO confirm
  against the preprocessing step.

  sk_id_curr => SK_ID_CURR value to look up in prev_app_keys
  returns    => (inp4_numeric, inp4_categ, selected_recs)
                inp4_numeric / inp4_categ have num_top_rec + 1 rows: the first
                num_top_rec matching rows (zero padded when there are fewer)
                followed by one row that is the (weighted) mean of all
                remaining rows, or zeros when nothing remains;
                selected_recs are the rows of prev_app_keys for this loan ID
  """
  # number of top records to keep individually, from the control file
  top_n = dict_file_flags['previous_application.csv'][0]

  # key rows of the previous applications that belong to this loan ID
  selected_recs = prev_app_keys[prev_app_keys['SK_ID_CURR'] == sk_id_curr]
  row_ids = selected_recs.index
  n_found = len(row_ids)

  def snapshot(table):
    # identical windowing logic for the numeric and the categorical table
    n_cols = table.shape[1]

    if 0 < n_found <= top_n:
      # not more records than required: keep all of them, fill up with zeros
      kept = table[row_ids[0]:row_ids[-1]+1,:]
      filler = np.zeros((top_n - n_found + 1,n_cols))
      return np.append(kept,filler,axis=0)

    if n_found > top_n:
      # more records than required: everything after the first top_n rows
      # is condensed into a single summary row
      rest = table[row_ids[top_n]:row_ids[-1]+1,:]
      if calc_wtd_mean:
        summary = weighted_mean(rest)
      else:
        summary = np.mean(rest,axis=0).reshape(1,-1)
      if top_n > 0:
        head = table[row_ids[0]:row_ids[top_n-1]+1,:]
        return np.append(head,summary,axis=0)
      return summary

    # no record for this loan ID: the whole output is zero padding
    return np.zeros((top_n + 1,n_cols))

  inp4_numeric = snapshot(prev_app_numeric_data)
  inp4_categ = snapshot(prev_app_categ_data)
  return inp4_numeric,inp4_categ,selected_recs

### POS Cash Balance Table

In [None]:
def input5_calc(sk_id_prev_id):
  """Build the POS_CASH_balance.csv snapshot for a set of previous applications.

  Reads the notebook-level names dict_file_flags, pos_cash_bal_keys,
  pos_cash_bal_numeric_data, pos_cash_bal_categ_data and calc_wtd_mean.

  For every (SK_ID_CURR, SK_ID_PREV) pair of the input, the matching rows are
  taken as one slice from the first to the last matching index of
  pos_cash_bal_keys (rows of one pair are expected to be contiguous with a
  default 0..n-1 index -- TODO confirm against the preprocessing step).
  Numeric and categorical columns are placed side by side and the rows of all
  pairs are stacked in input order.

  sk_id_prev_id => DataFrame with 'SK_ID_CURR' and 'SK_ID_PREV' columns
                   (presumably the selected_recs returned by input4_calc --
                   confirm with caller)
  returns       => array with num_top_rec + 1 rows and
                   (numeric cols + categ cols) columns: the first num_top_rec
                   stacked rows (zero padded when there are fewer) followed by
                   one row holding the (weighted) mean of the remaining rows,
                   or zeros when nothing remains
  """
  # fetch the file level flags for this file
  file_name = 'POS_CASH_balance.csv'
  num_top_rec = dict_file_flags[file_name][0] # number of top records to be selected

  # gather the rows of each pair in a list and stack them once at the end;
  # growing an array with np.append inside the loop would re-copy all rows
  # collected so far on every iteration
  chunks = []
  for sk_id_curr,sk_id_prev in zip(sk_id_prev_id['SK_ID_CURR'],sk_id_prev_id['SK_ID_PREV']):

    # key rows of this pair (each lookup scans the whole key table)
    pair_mask = (pos_cash_bal_keys['SK_ID_CURR'] == sk_id_curr) & (pos_cash_bal_keys['SK_ID_PREV'] == sk_id_prev)
    selected_inds = pos_cash_bal_keys[pair_mask].index
    if len(selected_inds) == 0: # no records selected
      continue

    s_ind = selected_inds[0] # first index
    e_ind = selected_inds[-1] # last index

    # numeric columns followed by categorical columns for these rows
    chunks.append(np.hstack((pos_cash_bal_numeric_data[s_ind:e_ind+1,:],
                             pos_cash_bal_categ_data[s_ind:e_ind+1,:])))

  if not chunks:
    # nothing found at all: the whole output is zero padding
    noofcols_final = pos_cash_bal_numeric_data.shape[1] + pos_cash_bal_categ_data.shape[1]
    return np.zeros((num_top_rec + 1,noofcols_final))

  inp5_final = chunks[0] if len(chunks) == 1 else np.concatenate(chunks,axis=0)
  rows_found = inp5_final.shape[0]

  if rows_found <= num_top_rec:
    # fewer rows than required: keep all and pad with zero rows so that the
    # output has num_top_rec + 1 rows
    padding_final = np.zeros((num_top_rec - rows_found + 1,inp5_final.shape[1]))
    return np.append(inp5_final,padding_final,axis=0)

  # more rows than required: condense everything after the first
  # num_top_rec rows into one summary row
  if calc_wtd_mean:
    mean_final = weighted_mean(inp5_final[num_top_rec:,:])
  else:
    mean_final = np.mean(inp5_final[num_top_rec:,:],axis=0).reshape(1,-1)

  if num_top_rec > 0: # individual top rows are required in front of the mean
    return np.append(inp5_final[:num_top_rec],mean_final,axis=0)
  return mean_final

### Installments Payments Table

In [None]:
def input6_calc(sk_id_prev_id):
  """Build the installments_payments.csv snapshot for a set of previous applications.

  Reads the notebook-level names dict_file_flags, instalm_paym_keys,
  instalm_paym_numeric_data and calc_wtd_mean. Only numeric data is used
  for this file.

  For every (SK_ID_CURR, SK_ID_PREV) pair of the input, the matching rows are
  taken as one slice from the first to the last matching index of
  instalm_paym_keys (rows of one pair are expected to be contiguous with a
  default 0..n-1 index -- TODO confirm against the preprocessing step). The
  rows of all pairs are stacked in input order.

  sk_id_prev_id => DataFrame with 'SK_ID_CURR' and 'SK_ID_PREV' columns
                   (presumably the selected_recs returned by input4_calc --
                   confirm with caller)
  returns       => array with num_top_rec + 1 rows and one column per numeric
                   column: the first num_top_rec stacked rows (zero padded
                   when there are fewer) followed by one row holding the
                   (weighted) mean of the remaining rows, or zeros when
                   nothing remains
  """
  # fetch the file level flags for this file
  file_name = 'installments_payments.csv'
  num_top_rec = dict_file_flags[file_name][0] # number of top records to be selected

  # gather the rows of each pair in a list and stack them once at the end;
  # growing an array with np.append inside the loop would re-copy all rows
  # collected so far on every iteration
  chunks = []
  for sk_id_curr,sk_id_prev in zip(sk_id_prev_id['SK_ID_CURR'],sk_id_prev_id['SK_ID_PREV']):

    # key rows of this pair (each lookup scans the whole key table)
    pair_mask = (instalm_paym_keys['SK_ID_CURR'] == sk_id_curr) & (instalm_paym_keys['SK_ID_PREV'] == sk_id_prev)
    selected_inds = instalm_paym_keys[pair_mask].index
    if len(selected_inds) == 0: # no records selected
      continue

    s_ind = selected_inds[0] # first index
    e_ind = selected_inds[-1] # last index

    # numeric rows of this pair
    chunks.append(instalm_paym_numeric_data[s_ind:e_ind+1,:])

  if not chunks:
    # nothing found at all: the whole output is zero padding
    noofcols_final = instalm_paym_numeric_data.shape[1]
    return np.zeros((num_top_rec + 1,noofcols_final))

  inp6_final = chunks[0] if len(chunks) == 1 else np.concatenate(chunks,axis=0)
  rows_found = inp6_final.shape[0]

  if rows_found <= num_top_rec:
    # fewer rows than required: keep all and pad with zero rows so that the
    # output has num_top_rec + 1 rows
    padding_final = np.zeros((num_top_rec - rows_found + 1,inp6_final.shape[1]))
    return np.append(inp6_final,padding_final,axis=0)

  # more rows than required: condense everything after the first
  # num_top_rec rows into one summary row
  if calc_wtd_mean:
    mean_final = weighted_mean(inp6_final[num_top_rec:,:])
  else:
    mean_final = np.mean(inp6_final[num_top_rec:,:],axis=0).reshape(1,-1)

  if num_top_rec > 0: # individual top rows are required in front of the mean
    return np.append(inp6_final[:num_top_rec],mean_final,axis=0)
  return mean_final

### Credit Card Balance Table

In [None]:
def input7_calc(sk_id_prev_id):
  """Build the credit_card_balance.csv snapshot for a set of previous applications.

  Reads the notebook-level names dict_file_flags, credit_bal_keys,
  credit_bal_numeric_data, credit_bal_categ_data and calc_wtd_mean.

  For every (SK_ID_CURR, SK_ID_PREV) pair of the input, the matching rows are
  taken as one slice from the first to the last matching index of
  credit_bal_keys (rows of one pair are expected to be contiguous with a
  default 0..n-1 index -- TODO confirm against the preprocessing step).
  Numeric and categorical columns are placed side by side and the rows of all
  pairs are stacked in input order.

  sk_id_prev_id => DataFrame with 'SK_ID_CURR' and 'SK_ID_PREV' columns
                   (presumably the selected_recs returned by input4_calc --
                   confirm with caller)
  returns       => array with num_top_rec + 1 rows and
                   (numeric cols + categ cols) columns: the first num_top_rec
                   stacked rows (zero padded when there are fewer) followed by
                   one row holding the (weighted) mean of the remaining rows,
                   or zeros when nothing remains
  """
  # fetch the file level flags for this file
  file_name = 'credit_card_balance.csv'
  num_top_rec = dict_file_flags[file_name][0] # number of top records to be selected

  # gather the rows of each pair in a list and stack them once at the end;
  # growing an array with np.append inside the loop would re-copy all rows
  # collected so far on every iteration
  chunks = []
  for sk_id_curr,sk_id_prev in zip(sk_id_prev_id['SK_ID_CURR'],sk_id_prev_id['SK_ID_PREV']):

    # key rows of this pair (each lookup scans the whole key table)
    pair_mask = (credit_bal_keys['SK_ID_CURR'] == sk_id_curr) & (credit_bal_keys['SK_ID_PREV'] == sk_id_prev)
    selected_inds = credit_bal_keys[pair_mask].index
    if len(selected_inds) == 0: # no records selected
      continue

    s_ind = selected_inds[0] # first index
    e_ind = selected_inds[-1] # last index

    # numeric columns followed by categorical columns for these rows
    chunks.append(np.hstack((credit_bal_numeric_data[s_ind:e_ind+1,:],
                             credit_bal_categ_data[s_ind:e_ind+1,:])))

  if not chunks:
    # nothing found at all: the whole output is zero padding
    noofcols_final = credit_bal_numeric_data.shape[1] + credit_bal_categ_data.shape[1]
    return np.zeros((num_top_rec + 1,noofcols_final))

  inp7_final = chunks[0] if len(chunks) == 1 else np.concatenate(chunks,axis=0)
  rows_found = inp7_final.shape[0]

  if rows_found <= num_top_rec:
    # fewer rows than required: keep all and pad with zero rows so that the
    # output has num_top_rec + 1 rows
    padding_final = np.zeros((num_top_rec - rows_found + 1,inp7_final.shape[1]))
    return np.append(inp7_final,padding_final,axis=0)

  # more rows than required: condense everything after the first
  # num_top_rec rows into one summary row
  if calc_wtd_mean:
    mean_final = weighted_mean(inp7_final[num_top_rec:,:])
  else:
    mean_final = np.mean(inp7_final[num_top_rec:,:],axis=0).reshape(1,-1)

  if num_top_rec > 0: # individual top rows are required in front of the mean
    return np.append(inp7_final[:num_top_rec],mean_final,axis=0)
  return mean_final

## Generating data for each training point using above functions

In [None]:
def conv_3D_to_2D(array_3D):
  """Flatten a 3D array into 2D by merging its first two axes.

  An input of shape (batch_size, rows, columns) is returned with shape
  (batch_size*rows, columns), using reshape's default element ordering.
  """
  dims = array_3D.shape
  n_stacked_rows = int(dims[0]) * int(dims[1])
  n_columns = int(dims[2])
  return array_3D.reshape(n_stacked_rows, n_columns)
##==========end of conv_3D_to_2D===========##

# Build and save the model inputs for one batch of loan applications.
# For every row of the batch the per-table blocks are computed with the
# input*_calc functions, collected per record, stacked along the first
# (record) axis once the loop is over, and written to final_data/batch<batch_no>/.

# start time
s = time.time()
# start time for batch
s1 = time.time()

def _stack_chunks(chunks):
  """Concatenate per-record chunks along axis 0 (the record axis).

  An empty list gives back the empty (1 X 0) array, so the result
  variables always exist even when the batch has no rows.
  """
  if chunks:
    return np.concatenate(chunks, axis=0)
  return np.array([[]])

# Per-record chunks are collected in lists and concatenated once after the
# loop: growing an array with np.append copies the whole array on every record.

# data corresponding to application train table
target_chunks = [] # one (1 X 1) chunk per record
input1_numeric_chunks = [] # one (1 X no of cols) chunk per record
input1_categ_chunks = [] # one (1 X 1 X no of cols) chunk per record

# data corresponding to bureau table
input2_numeric_chunks = []
input2_categ_chunks = []

# data corresponding to bureau balance table
input3_chunks = []

# data corresponding to previous application table
input4_numeric_chunks = []
input4_categ_chunks = []

# data corresponding to POS Cash Balance table
input5_chunks = []

# data corresponding to Instalments Payments table
input6_chunks = []

# data corresponding to Credit Card Balance table
input7_chunks = []

# subset the data into batch
batch_no = 3
rec_count = 45000
s_row = (batch_no - 1) * rec_count
e_row = (batch_no) * rec_count
# positional slice; a batch that runs past the end of app_train_keys is simply shorter
app_train_keys_batch = app_train_keys[s_row:e_row]
# set weighted mean flag (read as a global inside input7_calc)
calc_wtd_mean = False

# number of records processed so far in this batch; stays 0 for an empty batch
n_done = 0

# for every SK_ID_CURR in the batch
for n_done,(i,r) in enumerate(app_train_keys_batch.iterrows(), start=1):
  xi_id = r['SK_ID_CURR'] #loan ID
  yi = np.asarray(r['TARGET']).reshape(-1,1) #reshape to (1,1)
  target_chunks.append(yi)

  # calculate inputs for xi_id

  # input1 => corresponding to application_train table
  # NOTE(review): i is the index label returned by iterrows and is used here as
  # a positional row number - this assumes app_train_keys has a default
  # 0..n-1 index aligned with app_train_numeric_data / app_train_categ_data; confirm
  inp1_num = app_train_numeric_data[i,:].reshape(1,-1) # reshape to (1 X no of cols)
  inp1_cat = app_train_categ_data[i,:].reshape(1,-1) # reshape to (1 X no of cols)
  inp1_cat = np.expand_dims(inp1_cat,axis=0) # reshape to (1 X 1 X no of cols)

  # input2 => corresponding to bureau table
  inp2_num,inp2_cat,id_plus_bureauid_keys = input2_calc(xi_id)
  # add one dimension to shape into => (1 X rows X columns)
  inp2_num = np.expand_dims(inp2_num,axis=0)
  inp2_cat = np.expand_dims(inp2_cat,axis=0)

  # input3 => corresponding to bureau balance table
  inp3 = input3_calc(id_plus_bureauid_keys)
  # add one dimension to shape into => (1 X rows X columns)
  inp3 = np.expand_dims(inp3,axis=0)

  # input4 => corresponding to previous application table
  inp4_num,inp4_cat,id_plus_prev_id_keys = input4_calc(xi_id)
  # add one dimension to shape into => (1 X rows X columns)
  inp4_num = np.expand_dims(inp4_num,axis=0)
  inp4_cat = np.expand_dims(inp4_cat,axis=0)

  # input5 => corresponding to POS cash balance table
  inp5 = input5_calc(id_plus_prev_id_keys)
  # add one dimension to shape into => (1 X rows X columns)
  inp5 = np.expand_dims(inp5,axis=0)

  # input6 => corresponding to Instalments payments table
  inp6 = input6_calc(id_plus_prev_id_keys)
  # add one dimension to shape into => (1 X rows X columns)
  inp6 = np.expand_dims(inp6,axis=0)

  # input7 => corresponding to Credit Card balance table
  inp7 = input7_calc(id_plus_prev_id_keys)
  # add one dimension to shape into => (1 X rows X columns)
  inp7 = np.expand_dims(inp7,axis=0)

  # collect the blocks of this record
  input1_numeric_chunks.append(inp1_num)
  input1_categ_chunks.append(inp1_cat)
  input2_numeric_chunks.append(inp2_num)
  input2_categ_chunks.append(inp2_cat)
  input3_chunks.append(inp3)
  input4_numeric_chunks.append(inp4_num)
  input4_categ_chunks.append(inp4_cat)
  input5_chunks.append(inp5)
  input6_chunks.append(inp6)
  input7_chunks.append(inp7)

  # for every 1000 records processed
  # print time taken for 1000 records
  # and cumulative time taken
  if n_done % 1000 == 0:
    print("{} records processed".format(n_done))
    print("Time Taken (In seconds) : ", (time.time() - s1))
    s1 = time.time()
    print("Total Time Taken (In seconds) : ", (time.time() - s))
    print('='*120)

#===========end of for loop=================#

# stack the per-record chunks; first axis of every array is the record axis
target_values = _stack_chunks(target_chunks)
input1_numeric_values = _stack_chunks(input1_numeric_chunks)
input1_categ_values = _stack_chunks(input1_categ_chunks)
input2_numeric_values = _stack_chunks(input2_numeric_chunks)
input2_categ_values = _stack_chunks(input2_categ_chunks)
input3_values = _stack_chunks(input3_chunks)
input4_numeric_values = _stack_chunks(input4_numeric_chunks)
input4_categ_values = _stack_chunks(input4_categ_chunks)
input5_values = _stack_chunks(input5_chunks)
input6_values = _stack_chunks(input6_chunks)
input7_values = _stack_chunks(input7_chunks)

# save the data once the whole batch is processed
# (done after the loop so that a batch shorter than rec_count is saved too)
if n_done > 0:

  # for categorical and sparse data, we will flatten them to 2D
  # then convert to CSR first and then save

  np.save("final_data/batch"+str(batch_no)+"/input1_numeric_values",input1_numeric_values)
  input1_categ_values_flat = conv_3D_to_2D(input1_categ_values)
  input1_categ_values_csr = csr_matrix(input1_categ_values_flat)
  save_npz("final_data/batch"+str(batch_no)+"/input1_categ_values_csr.npz",input1_categ_values_csr)

  np.save("final_data/batch"+str(batch_no)+"/target_values",target_values)

  np.save("final_data/batch"+str(batch_no)+"/input2_numeric_values",input2_numeric_values)
  input2_categ_values_flat = conv_3D_to_2D(input2_categ_values)
  input2_categ_values_csr = csr_matrix(input2_categ_values_flat)
  save_npz("final_data/batch"+str(batch_no)+"/input2_categ_values_csr.npz",input2_categ_values_csr)

  input3_values_flat = conv_3D_to_2D(input3_values)
  input3_values_csr = csr_matrix(input3_values_flat)
  save_npz("final_data/batch"+str(batch_no)+"/input3_values_csr.npz",input3_values_csr)

  np.save("final_data/batch"+str(batch_no)+"/input4_numeric_values",input4_numeric_values)
  input4_categ_values_flat = conv_3D_to_2D(input4_categ_values)
  input4_categ_values_csr = csr_matrix(input4_categ_values_flat)
  save_npz("final_data/batch"+str(batch_no)+"/input4_categ_values_csr.npz",input4_categ_values_csr)

  np.save("final_data/batch"+str(batch_no)+"/input5_values",input5_values)
  np.save("final_data/batch"+str(batch_no)+"/input6_values",input6_values)

  input7_values_flat = conv_3D_to_2D(input7_values)
  input7_values_csr = csr_matrix(input7_values_flat)
  save_npz("final_data/batch"+str(batch_no)+"/input7_values_csr.npz",input7_values_csr)

  print('='*5,'Data Saved for ' + str(n_done) + ' records','='*5)

#print("Time Taken (In seconds) :", (time.time() - s))

1000 records processed
Time Taken (In seconds) :  420.46372961997986
Total Time Taken (In seconds) :  420.46382451057434
2000 records processed
Time Taken (In seconds) :  427.01064443588257
Total Time Taken (In seconds) :  847.4754803180695
3000 records processed
Time Taken (In seconds) :  421.71967697143555
Total Time Taken (In seconds) :  1269.1952147483826
4000 records processed
Time Taken (In seconds) :  424.52227115631104
Total Time Taken (In seconds) :  1693.717541694641
5000 records processed
Time Taken (In seconds) :  411.06194710731506
Total Time Taken (In seconds) :  2104.7795383930206
6000 records processed
Time Taken (In seconds) :  448.05079221725464
Total Time Taken (In seconds) :  2552.830403327942
7000 records processed
Time Taken (In seconds) :  457.30199122428894
Total Time Taken (In seconds) :  3010.1324434280396
8000 records processed
Time Taken (In seconds) :  437.64425683021545
Total Time Taken (In seconds) :  3447.7772731781006
9000 records processed
Time Taken (

In [None]:
# Report the shape of every array built for the batch.
# Arrays are grouped by the table they come from; a separator line
# is printed after each group.
shape_report_groups = (
  (input1_numeric_values, input1_categ_values), # application train
  (target_values,),                             # target
  (input2_numeric_values, input2_categ_values), # bureau
  (input3_values,),                             # bureau balance
  (input4_numeric_values, input4_categ_values), # previous application
  (input5_values,),                             # POS cash balance
  (input6_values,),                             # instalments payments
  (input7_values,),                             # credit card balance
)
for table_arrays in shape_report_groups:
  for values_array in table_arrays:
    print(values_array.shape)
  print('='*120)

(45000, 27)
(45000, 1, 188)
(45000, 1)
(45000, 3, 10)
(45000, 3, 23)
(45000, 2, 9)
(45000, 2, 4)
(45000, 2, 162)
(45000, 11, 13)
(45000, 6, 6)
(45000, 11, 20)
