# Import Libraries

In [1]:
import numpy as np
import pandas as pd

# Load Data

In [2]:
applications = pd.read_csv("../Dataset/application.csv")

# Cleaning data
Change column names and remove unnecessary attributes.

In [3]:
applications = applications.rename(columns={
    'Unnamed: 0': 'entry',
    'mobile': 'own_mobile',
    'work_phone': 'own_work_phone',
    'phone': 'own_phone',
    'email': 'has_email',
    'CNT_FAM_MEMBERS': 'family_size'
})

applications = applications.drop(columns=['entry'])

How is the distribution of each attribute like?

In [4]:
for attribute in applications.columns:
    if attribute != "id":
        print(f"Attribute: {attribute}")
        print(applications[attribute].value_counts(normalize=True) * 100)

Attribute: gender
F    67.138365
M    32.861635
Name: gender, dtype: float64
Attribute: own_car
N    62.810307
Y    37.189693
Name: own_car, dtype: float64
Attribute: own_realty
Y    69.335115
N    30.664885
Name: own_realty, dtype: float64
Attribute: num_child
0     69.334431
1     20.185974
2      9.094371
3      1.238151
4      0.110818
5      0.030327
7      0.002052
9      0.001140
12     0.000912
6      0.000912
14     0.000684
19     0.000228
Name: num_child, dtype: float64
Attribute: income
135000.0    10.689374
157500.0     9.067920
180000.0     8.644714
112500.0     8.431971
225000.0     7.782341
              ...    
151425.0     0.000228
133461.0     0.000228
265950.0     0.000228
201150.0     0.000228
36679.5      0.000228
Name: income, Length: 866, dtype: float64
Attribute: income_type
Working                 51.556354
Commercial associate    22.974665
Pensioner               17.213954
State servant            8.251151
Student                  0.003876
Name: income_type, 

We remove the attribute `own_mobile` since there is no variability in the data.

In [5]:
applications = applications.drop(columns=['own_mobile'])

We adjust the employment length for all positive values to 0.

In [6]:
applications.loc[applications['employment_length'] >= 0, 'employment_length'] = 0

We convert all Y/N data to 1/0

In [7]:
applications.loc[applications['own_car'] == "Y", 'own_car'] = 1
applications.loc[applications['own_car'] == "N", 'own_car'] = 0

applications.loc[applications['own_realty'] == "Y", 'own_realty'] = 1
applications.loc[applications['own_realty'] == "N", 'own_realty'] = 0

# One Hot Encoding
We are going to OHE all non-numerical data.

These are the attributes to encode:
* gender
* income_type
* education_level
* family_status
* house_type
* job

OHE `gender`

In [8]:
gender_dummy = pd.get_dummies(applications['gender'])
gender_dummy = gender_dummy.drop(['M'], axis=1)
gender_dummy = gender_dummy.rename(columns={'F': 'is_female'})

OHE `income_type`

In [9]:
income_type_dummy = pd.get_dummies(applications['income_type'])
income_type_dummy = income_type_dummy.drop(['Student'], axis=1)
income_type_dummy = income_type_dummy.rename(columns={
    'Commercial associate': 'is_com_assoc',
    'Pensioner': 'is_pensioner',
    'State servant': 'is_state_servant',
    'Working': 'is_working'
})

OHE `education_level`

In [10]:
education_level_dummy = pd.get_dummies(applications['education_level'])
education_level_dummy = education_level_dummy.drop(['Secondary / secondary special'], axis=1)
education_level_dummy = education_level_dummy.rename(columns={
    'Academic degree': 'has_academic_degree',
    'Higher education': 'has_higher_education',
    'Incomplete higher': 'has_incomplete_higher_education',
    'Lower secondary': 'has_lower_secondary_education'
})

OHE `family_status`

In [11]:
family_status_dummy = pd.get_dummies(applications['family_status'])
family_status_dummy = family_status_dummy.drop(['Civil marriage'], axis=1)
family_status_dummy = family_status_dummy.rename(columns={
    'Married': 'is_married',
    'Separated': 'is_separated',
    'Single / not married': 'is_single',
    'Widow': 'is_widowed'
})

OHE `house_type`

In [12]:
house_type_dummy = pd.get_dummies(applications['house_type'])
house_type_dummy = house_type_dummy.drop(['With parents'], axis=1)
house_type_dummy = house_type_dummy.rename(columns={
    'Co-op apartment': 'lives_in_co-op',
    'House / apartment': 'lives_in_house',
    'Municipal apartment': 'lives_in_municipal',
    'Office apartment': 'lives_in_office',
    'Rented apartment': 'lives_in_rented'
})

OHE `job`

For job, as there are a lot of types, we decide to categorise them to these types:
* Labor_Work: Laborers, Low-skill Laborers
* Office_Work: Accountants, Secretaries, HR staff, Realty agents, High skill tech staff, IT staff
* Healthcare_Work: Medicine staff
* Tech_Work: High skill tech staff, IT staff
* On-site_Work: Managers, Core staff, Drivers, Security staff, Cleaning staff
* Service_Work: Sales staff, Waiters/barmen staff, Cooking staff, Cleaning staff, Drivers, Medicine staff
* Private_Service_Work: Private service staff 

In [13]:
job_dummy = pd.get_dummies(applications['job'])
job_dummy_new = pd.DataFrame()

job_dummy_new["Labor_Work"] = job_dummy['Laborers'] + job_dummy['Low-skill Laborers']
job_dummy_new["Office_Work"] = job_dummy['Accountants'] + job_dummy['Secretaries'] + job_dummy['HR staff'] + job_dummy['Realty agents'] + job_dummy['High skill tech staff'] + job_dummy['IT staff']
job_dummy_new["Healthcare_Work"] = job_dummy['Medicine staff']
job_dummy_new["Tech_Work"] = job_dummy['High skill tech staff'] + job_dummy['IT staff']
job_dummy_new["On-site_Work"] = job_dummy['Managers'] + job_dummy['Core staff'] + job_dummy['Drivers'] + job_dummy['Security staff'] + job_dummy['Cleaning staff']
job_dummy_new["Service_Work"] = job_dummy['Sales staff'] + job_dummy['Waiters/barmen staff'] + job_dummy['Cooking staff'] + job_dummy['Cleaning staff'] + job_dummy['Drivers'] + job_dummy['Medicine staff']


Merge all encoded columns

In [14]:
to_OHE = [
    'gender',
    'income_type',
    'education_level',
    'family_status',
    'house_type',
    'job'
]
applications = applications.drop(to_OHE, axis=1)
encoded_applications = pd.concat([
    applications, 
    gender_dummy, 
    income_type_dummy, 
    education_level_dummy, 
    family_status_dummy, 
    house_type_dummy, 
    job_dummy_new
], axis=1)

# Feature scaling

We will normalise our value attributes.

Even though standardisation is a more logical fit, due to the large number of OHE, we will try normalising first.

In [15]:
to_scale = [
    "num_child",
    "income",
    "birth_day",
    "employment_length"
]

for attribute in to_scale:
    max_value = encoded_applications[attribute].max()
    min_value = encoded_applications[attribute].min()
    encoded_applications[attribute] = (encoded_applications[attribute] - min_value) / (max_value - min_value)

Export the processed `application.csv` dataset (commented until needed)

In [16]:
encoded_applications.to_csv("../Dataset/processed_applications.csv", index=False)

In [17]:
encoded_applications

Unnamed: 0,id,own_car,own_realty,num_child,income,birth_day,employment_length,own_work_phone,own_phone,has_email,...,lives_in_house,lives_in_municipal,lives_in_office,lives_in_rented,Labor_Work,Office_Work,Healthcare_Work,Tech_Work,On-site_Work,Service_Work
0,5008804,1,1,0.0,0.059697,0.745032,0.740916,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,5008805,1,1,0.0,0.059697,0.745032,0.740916,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,5008806,1,1,0.0,0.012850,0.210422,0.935315,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,5008808,0,1,0.0,0.036274,0.343891,0.825965,0,1,1,...,1,0,0,0,0,0,0,0,0,1
4,5008809,0,1,0.0,0.036274,0.343891,0.825965,0,1,1,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,6840104,0,1,0.0,0.016196,0.140244,1.000000,0,0,0,...,1,0,0,0,0,0,0,0,0,0
438553,6840222,0,0,0.0,0.011511,0.522922,0.828475,0,0,0,...,1,0,0,0,1,0,0,0,0,0
438554,6841878,0,0,0.0,0.004149,0.961608,0.978780,1,0,0,...,0,0,0,0,0,0,0,0,0,1
438555,6842765,0,1,0.0,0.006826,0.199187,1.000000,0,0,0,...,1,0,0,0,0,0,0,0,0,0
