In [99]:
import pandas as pd
import numpy as np

In [100]:
# LOAD THE DATA, TRIM PADDING FROM THE HEADERS, THEN RENAME THE 'ID' COLUMN.
# THE HEADERS ARE STRIPPED *BEFORE* THE RENAME SO THAT A PADDED HEADER SUCH AS
# ' id' IS STILL MATCHED; OTHERWISE THE RENAME WOULD BE SILENTLY SKIPPED AND
# 'loan_id' WOULD BE MISSING DOWNSTREAM. CHAINING RETURNS A NEW FRAME AT EACH
# STEP, SO NO INPLACE MUTATION IS NEEDED.
df = (
    pd.read_csv("../data/loans.csv")
    .rename(columns=lambda label: label.strip())
    .rename(columns={'id': 'loan_id'})
)

In [101]:
# DISPLAY THE DATAFRAME AS LOADED, BEFORE ANY RESHAPING
df

Unnamed: 0,loan_id,account_id,date,amount,payments,24_A,12_B,12_A,60_D,48_C,...,60_C,24_B,48_D,24_D,48_B,36_A,36_B,60_B,12_D,60_A
0,4959,2,1994-01-05,80952,3373,X,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
1,4961,19,1996-04-29,30276,2523,-,X,-,-,-,...,-,-,-,-,-,-,-,-,-,-
2,4962,25,1997-12-08,30276,2523,-,-,X,-,-,...,-,-,-,-,-,-,-,-,-,-
3,4967,37,1998-10-14,318480,5308,-,-,-,X,-,...,-,-,-,-,-,-,-,-,-,-
4,4968,38,1998-04-19,110736,2307,-,-,-,-,X,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
677,7294,11327,1998-09-27,39168,1632,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
678,7295,11328,1998-07-18,280440,4674,-,-,-,-,-,...,X,-,-,-,-,-,-,-,-,-
679,7304,11349,1995-10-29,419880,6998,-,-,-,-,-,...,X,-,-,-,-,-,-,-,-,-
680,7305,11359,1996-08-06,54024,4502,-,-,X,-,-,...,-,-,-,-,-,-,-,-,-,-


In [102]:
# TRIM LEADING/TRAILING WHITESPACE FROM EVERY COLUMN LABEL
stripped_labels = {label: label.strip() for label in df.columns}
df = df.rename(columns=stripped_labels)

In [103]:
# PARSE THE 'date' COLUMN INTO PANDAS DATETIME VALUES
parsed_dates = pd.to_datetime(df['date'])
df = df.assign(date=parsed_dates)

In [104]:
# RESHAPE FROM WIDE TO LONG: EVERY NON-ID COLUMN BECOMES A ('variable', 'value') PAIR
id_columns = ['loan_id', 'account_id', 'date', 'amount', 'payments']
long_df = df.melt(id_vars=id_columns)

# KEEP ONLY THE ROWS WHOSE 'value' IS 'X', THEN TURN CELLS THAT ARE EMPTY OR
# WHITESPACE-ONLY INTO NAN (THE PATTERN IS A REGULAR EXPRESSION)
is_marked = long_df['value'].eq('X')
melt_df = long_df[is_marked].replace(r'^\s*$', np.nan, regex=True)

# REPORT HOW MANY MISSING VALUES REMAIN IN EACH COLUMN
missing_values_num = melt_df.isna().sum()
print(missing_values_num)

loan_id       0
account_id    0
date          0
amount        0
payments      0
variable      0
value         0
dtype: int64


In [105]:
# MAKE SURE 'loan_id' IS NUMERIC, THEN ORDER THE ROWS BY IT (ASCENDING)
numeric_ids = pd.to_numeric(melt_df['loan_id'])
melt_df = melt_df.assign(loan_id=numeric_ids).sort_values('loan_id')

melt_df.head(n=5)

Unnamed: 0,loan_id,account_id,date,amount,payments,variable,value
0,4959,2,1994-01-05,80952,3373,24_A,X
683,4961,19,1996-04-29,30276,2523,12_B,X
1366,4962,25,1997-12-08,30276,2523,12_A,X
2049,4967,37,1998-10-14,318480,5308,60_D,X
2732,4968,38,1998-04-19,110736,2307,48_C,X


In [106]:
# REMOVE THE 'value' MARKER COLUMN; ONLY ROWS WHERE IT EQUALS 'X' WERE KEPT
melt_df = melt_df.drop('value', axis='columns')
melt_df.head(n=5)

Unnamed: 0,loan_id,account_id,date,amount,payments,variable
0,4959,2,1994-01-05,80952,3373,24_A
683,4961,19,1996-04-29,30276,2523,12_B
1366,4962,25,1997-12-08,30276,2523,12_A
2049,4967,37,1998-10-14,318480,5308,60_D
2732,4968,38,1998-04-19,110736,2307,48_C


In [107]:
# BREAK 'variable' APART AT EACH UNDERSCORE: PIECE 0 -> 'terms', PIECE 1 -> 'status'
variable_parts = melt_df['variable'].str.split('_', expand=True)

# ATTACH THE TWO NEW COLUMNS AND DISCARD THE ORIGINAL COMBINED COLUMN
melt_df = (
    melt_df
    .assign(terms=variable_parts[0], status=variable_parts[1])
    .drop(columns=['variable'])
)

melt_df

Unnamed: 0,loan_id,account_id,date,amount,payments,terms,status
0,4959,2,1994-01-05,80952,3373,24,A
683,4961,19,1996-04-29,30276,2523,12,B
1366,4962,25,1997-12-08,30276,2523,12,A
2049,4967,37,1998-10-14,318480,5308,60,D
2732,4968,38,1998-04-19,110736,2307,48,C
...,...,...,...,...,...,...,...
6815,7294,11327,1998-09-27,39168,1632,24,C
7498,7295,11328,1998-07-18,280440,4674,60,C
7499,7304,11349,1995-10-29,419880,6998,60,C
2044,7305,11359,1996-08-06,54024,4502,12,A


In [93]:
# WRITE THE RESHAPED FRAME TO 'loans_py.csv' AS UTF-8, LEAVING OUT THE ROW INDEX
melt_df.to_csv(
    'loans_py.csv',
    encoding='utf-8',
    index=False,
)