# Feature Engineering

As we have no duplicates or null values we can just directly import the dataset again and do ur feature engineering from there.

In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy  as np
DATA_DIR = "../Data/"

train_data = pd.read_csv(DATA_DIR +"train.csv")
test_data = pd.read_csv(DATA_DIR + "test.csv")



def preprocess_data(train_data : pd.DataFrame, test_data : pd.DataFrame) -> pd.DataFrame:
    # Function for preprocessing data
    # Drop id
    train_data.drop("id",axis = 1, inplace = True)

    # Get column types
    numerical_columns = list(train_data.select_dtypes(exclude=["object"]).columns)
    cat_columns = list(train_data.select_dtypes(include="object").columns)
    test_numerical_columns = list(test_data.select_dtypes(exclude=["object"]).columns)
    
    # Encode cat data with one hot encoding
    train_data = pd.get_dummies(train_data, cat_columns, drop_first= True, dtype = int)
    test_data = pd.get_dummies(test_data, cat_columns, drop_first= True, dtype = int)

    return train_data, test_data


In [12]:
def feature_engineer(train_data :pd.DataFrame, test_data : pd.DataFrame) -> pd.DataFrame:
    '''
    Function for engineering new features into the data frames

    '''

    # Create new feature for income per year of employment
    train_data["income_per_year_emp"] = train_data["person_income"] / ( 1 + train_data["person_emp_length"])
    test_data["income_per_year_emp"] = test_data["person_income"] / ( 1 + test_data["person_emp_length"])
    # Creat a risk flag feature
    train_data["risk_flag"] = np.where((train_data["cb_person_default_on_file_Y"] == 1) & ((train_data["loan_grade_C"] ==1) | (train_data["loan_grade_D"] ==1) |(train_data["loan_grade_E"] ==1)) ,1, 0)
    test_data["risk_flag"] = np.where((test_data["cb_person_default_on_file_Y"] == 1) & ((test_data["loan_grade_C"] ==1) | (test_data["loan_grade_D"] ==1) |(test_data["loan_grade_E"] ==1) ) ,1 ,0)
    # Create loan utalisation as loan amout * int / person income
    train_data["utilsation"] = ((1 + train_data["loan_int_rate"] /100)  * train_data["loan_amnt"])  / train_data["person_income"]
    test_data["utilsation"] = ((1 + test_data["loan_int_rate"] /100) * test_data["loan_amnt"])  / test_data["person_income"]
    # Create feature for owning home and using loan for home improvement
    train_data["own_home_and_use_loan_for_h_i"] = (train_data["person_home_ownership_OWN"] & train_data["loan_intent_HOMEIMPROVEMENT"])
    test_data["own_home_and_use_loan_for_h_i"] = (test_data["person_home_ownership_OWN"] & test_data["loan_intent_HOMEIMPROVEMENT"])
    # Create new features for loan per credit history and loan per int
    train_data["int_per_y_cred"] = train_data["loan_int_rate"] / (1 +train_data["cb_person_cred_hist_length"])
    train_data["loan_per_y_cred"] = train_data["loan_amnt"] / (1 +train_data["cb_person_cred_hist_length"])
    test_data["int_per_y_cred"] = test_data["loan_int_rate"] / (1 +test_data["cb_person_cred_hist_length"])
    test_data["loan_per_y_cred"] = test_data["loan_amnt"] / (1 +test_data["cb_person_cred_hist_length"])
                                 
    return train_data, test_data

In [13]:
preprocessed_train, preprocessted_test = preprocess_data(train_data, test_data)

eng_train, eng_test = feature_engineer(preprocessed_train, preprocessted_test)

eng_train.to_csv("../Data/processed_train.csv")
eng_test.to_csv("../Data/processed_test.csv")

In [14]:
preprocessed_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   person_age                     58645 non-null  int64  
 1   person_income                  58645 non-null  int64  
 2   person_emp_length              58645 non-null  float64
 3   loan_amnt                      58645 non-null  int64  
 4   loan_int_rate                  58645 non-null  float64
 5   loan_percent_income            58645 non-null  float64
 6   cb_person_cred_hist_length     58645 non-null  int64  
 7   loan_status                    58645 non-null  int64  
 8   person_home_ownership_OTHER    58645 non-null  int32  
 9   person_home_ownership_OWN      58645 non-null  int32  
 10  person_home_ownership_RENT     58645 non-null  int32  
 11  loan_intent_EDUCATION          58645 non-null  int32  
 12  loan_intent_HOMEIMPROVEMENT    58645 non-null 

In [15]:
print(eng_train.columns)

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'loan_status', 'person_home_ownership_OTHER',
       'person_home_ownership_OWN', 'person_home_ownership_RENT',
       'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT',
       'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE',
       'loan_grade_B', 'loan_grade_C', 'loan_grade_D', 'loan_grade_E',
       'loan_grade_F', 'loan_grade_G', 'cb_person_default_on_file_Y',
       'income_per_year_emp', 'risk_flag', 'utilsation',
       'own_home_and_use_loan_for_h_i', 'int_per_y_cred', 'loan_per_y_cred'],
      dtype='object')
