# Dealing with Categorical Data in ML

In [1]:
# initial imports
import pandas as pd
from path import Path

## Dataset Information

The file `loans_data.csv`, contains simulated data about loans, there are a total of 500 records. Each row represents a loan application along an arbitrary year, where every column represents the following data about every loan application.

* `amount`: The loan amount in USD.
* `term`: The loan term in months.
* `month`: The month of the year when the loan was requested.
* `age`: Age of the loan applicant.
* `education`: Educational level of the loan applicant.
* `gender`: Gender of the loan applicant.
* `bad`: Stands for a bad or good loan applicant (`1` - bad, `0` - good).

In [6]:
# Load data
file_path = Path("../Resources/loans_data.csv")
loans_df = pd.read_csv(file_path)
loans_df.head()

Unnamed: 0,amount,term,month,age,education,gender,bad
0,1000,30,June,45,High School or Below,male,0
1,1000,30,July,50,Bachelor,female,0
2,1000,30,August,33,Bachelor,female,0
3,1000,15,September,27,college,male,0
4,1000,30,October,28,college,female,0


In [9]:
from sklearn.preprocessing import LabelEncoder
# Encode education
le = LabelEncoder()
loans_df['education'] = le.fit_transform(loans_df['education'])

In [10]:
# Encode gender
le = LabelEncoder()
loans_df['gender'] = le.fit_transform(loans_df['gender'])

In [11]:
# Months dictionary
months_num = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12,
}


In [18]:
# encode months using months dictionary
loans_df['months_num'] = loans_df['month'].apply(lambda x: months_num[x])
loans_df = loans_df.drop(['month'], axis = 1)

KeyError: 'month'

In [19]:
loans_df

Unnamed: 0,amount,term,age,education,gender,bad,months_num
0,1000,30,45,1,1,0,6
1,1000,30,50,0,0,0,7
2,1000,30,33,0,0,0,8
3,1000,15,27,3,1,0,9
4,1000,30,28,3,0,0,10
...,...,...,...,...,...,...,...
495,1000,30,28,1,1,1,12
496,1000,15,26,1,1,1,7
497,800,15,30,3,1,1,6
498,1000,30,38,3,0,1,3


In [20]:
# save dataframe as loans_data_encoded.csv
loans_df.to_csv("../Resources/loans_data_encoded_mine.csv")