## Feature Engineering

In this notebook, we will engineer the following features using the preprocessed data in the <I>preprocessed</I> folder.
1. Credit Annuity Ratio
2. Credit Goods Price Ratio
3. Mean of the TARGET column over the 500 nearest neighbours
4. Debt Credit Ratio
5. Mean of Days Credit

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive;
# the project folder used by the following cells is addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# name of the project folder inside the "Colab Notebooks" directory on Drive
current_dir = 'Home Credit_Kaggle'

# make the project folder the working directory, so that the relative paths
# used by the later cells ('preprocessed/...', 'preprocessors/...') resolve against it
complete_path = os.path.join('/content/drive/My Drive/Colab Notebooks/', current_dir)
os.chdir(complete_path)

In [None]:
import numpy as np
import pandas as pd
import time
from scipy.sparse import csr_matrix,save_npz

### Load Preprocessed Data and Column Index Dictionaries

In [None]:
# load preprocessed data of files

import pickle

from scipy.sparse import load_npz


def _load_pickled_object(path):
  """Return the object unpickled from the file at `path`.

  The file is opened in a `with` block so the handle is closed even when
  unpickling raises.
  """
  # NOTE: pickle.load can execute arbitrary code contained in the file;
  # only load files that were written by this project.
  with open(path, 'rb') as pickled_file:
    return pickle.load(pickled_file)


# Application Train table: keys, numeric columns and categorical columns
app_train_keys = pd.read_csv('preprocessed/app_train_keys.csv')
app_train_numeric_data = np.load('preprocessed/app_train_numeric_data.npy')
# the categorical data is stored as a sparse CSR matrix and is densified here
app_train_categ_data = load_npz('preprocessed/app_train_categ_data_csr.npz').todense()
print(app_train_keys.shape)
print(app_train_numeric_data.shape)
print(app_train_categ_data.shape)
print('='*120)

# Bureau table: keys, numeric columns and categorical columns
bureau_keys = pd.read_csv('preprocessed/bureau_keys.csv')
bureau_numeric_data = np.load('preprocessed/bureau_numeric_data.npy')
bureau_categ_data = load_npz('preprocessed/bureau_categ_data_csr.npz').todense()
print(bureau_keys.shape)
print(bureau_numeric_data.shape)
print(bureau_categ_data.shape)
print('='*120)

# load dictionaries that are indexed by '<file name><column name>' keys
# and used below to look up column indices
app_train_col_index = _load_pickled_object('preprocessors/app_train_col_index')
bureau_col_index = _load_pickled_object('preprocessors/bureau_col_index')

(307511, 2)
(307511, 27)
(307511, 188)
(1716428, 2)
(1716428, 10)
(1716428, 23)


### Credit Annuity Ratio and Credit Goods Price Ratio

In [None]:
# first ratio can be calculated directly by dividing AMT_CREDIT column by AMT_ANNUITY column of numeric data of input1
# second ratio can be calculated directly by dividing AMT_CREDIT column by AMT_GOODS_PRICE column of numeric data of input1

### Train model for TARGET Mean of 500 nearest neighbours and Credit Annuity Ratio and Credit Goods Price Ratio

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer

def fetch_ind_app_train_cols(col_name):
  """Return the entry of `app_train_col_index` for an application_train column.

  The dictionary key is the file name 'application_train.csv' immediately
  followed by `col_name`.
  """
  return app_train_col_index['application_train.csv' + col_name]

def target_mean_500_and_ratios():
  """Compute the two credit ratios for all loans and fit the 500-neighbour KNN model.

  Returns a 4-tuple:
    credit_annuity_ratio     : AMT_CREDIT / AMT_ANNUITY, reshaped to (-1, 1)
    credit_goods_price_ratio : AMT_CREDIT / AMT_GOODS_PRICE, reshaped to (-1, 1)
    x_train_knn              : EXT_SOURCE_2, EXT_SOURCE_3 and credit_annuity_ratio
                               stacked column-wise (the KNN feature set)
    knn_model                : KNeighborsClassifier(n_neighbors=500) fitted on
                               x_train_knn against app_train_keys['TARGET']
  """

  def numeric_column(col_name):
    # column of the application-train numeric data for the given field, for all loan ids
    return app_train_numeric_data[:, fetch_ind_app_train_cols(col_name)]

  # Feature 1 and 2 : both ratios are computed for every loan id in one go
  amt_credit = numeric_column('AMT_CREDIT')
  credit_annuity_ratio = (amt_credit / numeric_column('AMT_ANNUITY')).reshape(-1,1)
  credit_goods_price_ratio = (amt_credit / numeric_column('AMT_GOODS_PRICE')).reshape(-1,1)

  # KNN feature set: the two external sources plus the credit annuity ratio
  x_train_knn = np.hstack((
      numeric_column('EXT_SOURCE_2').reshape(-1,1),
      numeric_column('EXT_SOURCE_3').reshape(-1,1),
      credit_annuity_ratio,
  ))
  y_train_knn = app_train_keys['TARGET']

  # fit the classifier whose class-1 probability is later used as the
  # TARGET mean of the 500 nearest neighbours
  knn_model = KNeighborsClassifier(n_neighbors=500)
  knn_model.fit(x_train_knn, y_train_knn)

  return credit_annuity_ratio, credit_goods_price_ratio, x_train_knn, knn_model

### Debt Credit Ratio and Days Credit Mean

In [None]:
def fetch_ind_bureau_cols(col_name):
  """Return the entry of `bureau_col_index` for a bureau column.

  The dictionary key is the file name 'bureau.csv' immediately followed by
  `col_name`.
  """
  return bureau_col_index['bureau.csv' + col_name]

def debt_credit_ratio_days_credit_mean(sk_id_curr):
  """Aggregate the bureau records of one loan id into two features.

  Parameters
  ----------
  sk_id_curr : value compared against bureau_keys['SK_ID_CURR'].

  Returns
  -------
  (debt_credit_ratio, days_credit_mean)
    debt_credit_ratio : sum of AMT_CREDIT_SUM_DEBT divided by sum of
                        AMT_CREDIT_SUM over the loan's bureau records;
                        0 when that denominator is 0.
    days_credit_mean  : mean of column 0 of bureau_numeric_data over the
                        loan's bureau records.
    Both are 0 when the loan id has no bureau records.
  """

  # positional row numbers of the bureau records that belong to this loan id.
  # Exactly these rows are aggregated: the matching records are not guaranteed
  # to sit next to each other, so a first-to-last slice could pull in rows of
  # other loan ids.
  row_positions = np.flatnonzero((bureau_keys['SK_ID_CURR'] == sk_id_curr).to_numpy())

  debt_credit_ratio = 0
  days_credit_mean = 0

  if row_positions.size > 0: # atleast one record selected

    # NOTE(review): the dictionary index is shifted by -1 before it is used on
    # bureau_numeric_data - presumably the dictionary is offset by one relative
    # to the numeric array; confirm against the preprocessing notebook.

    # sum of amount credit sum for all bureau records of this loan id
    ind = fetch_ind_bureau_cols('AMT_CREDIT_SUM') - 1
    amt_credit_sum_sum = np.sum(bureau_numeric_data[row_positions, ind])

    # sum of amount credit sum debt for all bureau records of this loan id
    ind = fetch_ind_bureau_cols('AMT_CREDIT_SUM_DEBT') - 1
    amt_credit_sum_debt_sum = np.sum(bureau_numeric_data[row_positions, ind])

    # guard against division by zero; the ratio stays 0 in that case
    if amt_credit_sum_sum != 0:
      debt_credit_ratio = amt_credit_sum_debt_sum/amt_credit_sum_sum

    # mean of the days_credit field
    # NOTE(review): read from hard-coded column 0 instead of a dictionary
    # lookup - assumes DAYS_CREDIT is the first numeric column; confirm.
    days_credit_mean = np.mean(bureau_numeric_data[row_positions, 0])

  return (debt_credit_ratio,days_credit_mean)

## Code to call above functions

In [None]:
def conv_3D_to_2D(array_3D):
  """Flatten the first two axes of a 3D array.

  An input of shape (batch_size, rows, columns) is returned reshaped to
  (batch_size*rows, columns).
  """
  n_batches, n_rows, n_cols = (int(array_3D.shape[axis]) for axis in (0, 1, 2))
  flat_shape = (n_batches * n_rows, n_cols)
  return array_3D.reshape(flat_shape)
##==========end of conv_3D_to_2D===========##

# Build the engineered-feature array `input8_values` for every loan id in
# app_train_keys and save it batch by batch.
# Feature order along the last axis:
#   0 credit_annuity_ratio, 1 credit_goods_price_ratio,
#   2 debt_credit_ratio,    3 days_credit_mean,
#   4 TARGET mean of 500 nearest neighbours

# start time
s = time.time()
# start time for batch
s1 = time.time()

# call below function to
# 1 calculate credit annuity ratio and credit goods_ratio
# 2 get train data set for knn and knn model itself, to calculate TARGET mean of 500 neigbors
credit_annuity_ratio,credit_goods_price_ratio,x_train_knn,knn_model_500 = target_mean_500_and_ratios()
# save knn model for future use (for test data)
import pickle
with open('preprocessors/knn_model_500','wb') as f:
  pickle.dump(knn_model_500,f)

batches = 7           # number of batches the loan ids are split into
rec_count = 45000     # records per batch; the last batch takes whatever remains
report_every = 15000  # records processed between two progress reports

sk_id_curr_all = app_train_keys['SK_ID_CURR'].to_numpy()
total_recs = len(sk_id_curr_all)

# one array of shape (records in batch) X 1 X (no of engineered features) per batch;
# collected in a list and concatenated once at the end instead of re-copying
# the whole array for every record
batch_arrays = []

# for every batch
for b in range(batches):

  # subset the data into batch
  batch_no = b + 1
  s_row = (batch_no - 1) * rec_count
  if batch_no != batches:
    e_row = min(batch_no * rec_count, total_recs)
  else:
    e_row = total_recs

  chunk_arrays = []

  # process the batch in chunks so progress can be reported per chunk
  for c_start in range(s_row, e_row, report_every):
    c_end = min(c_start + report_every, e_row)

    # Feature 3 and 4 : debt_credit_ratio and days_credit_mean, one call per loan ID
    bureau_feats = np.array(
        [debt_credit_ratio_days_credit_mean(xi_id) for xi_id in sk_id_curr_all[c_start:c_end]],
        dtype=np.float64).reshape(-1, 2)

    # Feature 5 : TARGET mean of 500 nearest neighbors
    # predicted for the whole chunk in one call;
    # column 1 is the probability corresponding to class 1
    # NOTE(review): the query rows are the same rows the model was fitted on,
    # so each loan's own TARGET can be among its neighbours - confirm this is intended.
    target_mean_500 = knn_model_500.predict_proba(x_train_knn[c_start:c_end])[:, 1]

    # Feature 1 and 2 were computed for all loan ids up front; slice them
    chunk_values = np.column_stack((
        credit_annuity_ratio[c_start:c_end, 0],
        credit_goods_price_ratio[c_start:c_end, 0],
        bureau_feats[:, 0],
        bureau_feats[:, 1],
        target_mean_500,
    )).astype(np.float64)

    # insert the middle axis to make (records) X 1 X (no of engineered features)
    chunk_arrays.append(chunk_values[:, np.newaxis, :])

    # for every 15000 records processed
    # print time taken for 15000 records
    # and cumulative time taken
    if (c_end - s_row) % report_every == 0:
      print("{} records processed".format(c_end - s_row))
      print("Time Taken (In seconds) : ", (time.time() - s1))
      s1 = time.time()
      print("Total Time Taken (In seconds) : ", (time.time() - s))
      print('='*120)

  #===========end of chunk loop=================#

  # for given batch
  # save the data: only this batch's rows are written, which matches the
  # record count reported in the message below
  if chunk_arrays:
    batch_values = np.concatenate(chunk_arrays, axis=0)
    batch_arrays.append(batch_values)
    np.save("final_data/batch"+str(batch_no)+"/input8_values",batch_values)
    print('='*5,'Data Saved for ' + str(e_row - s_row) + ' records','='*5)

  print('*'*120)
  print("Batch no ",str(b+1)," completed")
  print("Total Time Taken (In seconds) : ", (time.time() - s))
  s1 = time.time() # reset time for subset row count
  print('*'*120)

#===========end of for loop for batches===================#

# data corresponding to engineered features for all batches together
# final size is (total records) X 1 X no of engineered features
if batch_arrays:
  input8_values = np.concatenate(batch_arrays, axis=0)
else:
  input8_values = np.empty((0, 1, 5))

print(input8_values.shape)

15000 records processed
Time Taken (In seconds) :  102.19401288032532
Total Time Taken (In seconds) :  102.19408774375916
30000 records processed
Time Taken (In seconds) :  101.04014134407043
Total Time Taken (In seconds) :  203.23488545417786
45000 records processed
Time Taken (In seconds) :  111.40992665290833
Total Time Taken (In seconds) :  314.644846200943
===== Data Saved for 45000 records =====
************************************************************************************************************************
Batch no  1  completed
Total Time Taken (In seconds) :  314.9170277118683
************************************************************************************************************************
15000 records processed
Time Taken (In seconds) :  107.77772903442383
Total Time Taken (In seconds) :  422.69528126716614
30000 records processed
Time Taken (In seconds) :  109.02164673805237
Total Time Taken (In seconds) :  531.7169797420502
45000 records processed
Time Taken (

In [None]:
# min max scale the engineered features
from sklearn.preprocessing import MinMaxScaler

# names of the engineered features, in the order they are stored along the
# last axis of input8_values; also used as keys of the scaler dictionary
eng_feat_names = [
    'CREDIT_ANNUITY_RATIO',
    'CREDIT_GOODS_PRICE_RATIO',
    'DEBT_CREDIT_RATIO',
    'DAYS_CREDIT_MEAN',
    'TARGET_MEAN_500',
]

# dictionary to hold all the scalers
eng_feat_preprocessors = {}

# scaled values of each feature, each of shape (records) X 1 X 1
scaled_feature_arrays = []

for feat_ind, feat_name in enumerate(eng_feat_names):
  # init values, scaler and transform; every feature gets its own scaler
  field_scaler = MinMaxScaler(feature_range=(1e-3, 1))
  inp_array = input8_values[:,:,feat_ind].reshape(-1,1)
  inp_array_scaled = field_scaler.fit_transform(inp_array)
  # store scaler
  eng_feat_preprocessors[feat_name] = field_scaler
  # resize so the features can be joined along the last axis
  scaled_feature_arrays.append(np.expand_dims(inp_array_scaled, axis=2))

# final array of shape (records) X 1 X (no of engineered features)
input8_values_scaled = np.concatenate(scaled_feature_arrays, axis=2)

np.save("final_data/input8_values_scaled",input8_values_scaled)

# save the fitted scalers
with open('preprocessors/eng_feat_preprocessors','wb') as eng_feat_preprocessors_file:
  pickle.dump(eng_feat_preprocessors,eng_feat_preprocessors_file)