In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import make_column_selector as selector
from cat_to_num import map_cat_to_num, cat_num_embed

In [2]:
# Load the source Lending Club table; the path is relative to the notebook's working directory.
df = pd.read_csv('../datasets/sf/lending_club_orig.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,loan_amnt,pub_rec_bankruptcies,emp_length,annual_inc,dti,term,grade,home_ownership,purpose,loan_status
0,12000.0,0.0,11.0,85000.0,20.91,2.0,7.0,4.0,5.0,1
1,20000.0,0.0,11.0,115000.0,16.84,2.0,6.0,1.0,1.0,0
2,30000.0,0.0,11.0,110000.0,15.08,1.0,2.0,4.0,11.0,0
3,1000.0,0.0,3.0,75704.0,17.83,2.0,6.0,1.0,6.0,1
4,10000.0,3.0,4.0,40000.0,9.84,2.0,5.0,1.0,5.0,0


In [4]:
# Name of the label column; every other column of df is treated as a feature.
target = 'loan_status'
# Labels as a standalone NumPy array (copied, so it does not alias df).
y = df[target].to_numpy(copy=True)
# Feature frame: df without the label column.
df_sub = df.drop(columns=[target])
# Feature names in frame order, plus the subset that is min-max scaled below.
cols = df_sub.columns.tolist()
num_cols = ['loan_amnt', 'pub_rec_bankruptcies', 'annual_inc', 'dti']

In [5]:
# Feature matrix without the label column, detached from df as a NumPy copy.
temp = df.drop(columns=[target])
x_sc = temp.to_numpy(copy=True)
# Wrap the matrix in a DataFrame again so columns can be addressed by name.
df_label_sc = pd.DataFrame(data=x_sc, columns=cols)
# Min-max scale only the numerical columns; the other columns keep their codes.
transformer = MinMaxScaler()
df_label_sc[num_cols] = transformer.fit_transform(df_label_sc.loc[:, num_cols])
# Re-attach the label column.
df_label_sc[target] = y

In [6]:
# Preview the frame after scaling the numerical columns.
df_label_sc.head()

Unnamed: 0,loan_amnt,pub_rec_bankruptcies,emp_length,annual_inc,dti,term,grade,home_ownership,purpose,loan_status
0,0.282051,0.0,11.0,0.009528,0.020931,2.0,7.0,4.0,5.0,1
1,0.487179,0.0,11.0,0.012899,0.016857,2.0,6.0,1.0,1.0,0
2,0.74359,0.0,11.0,0.012337,0.015095,1.0,2.0,4.0,11.0,0
3,0.0,0.0,3.0,0.008484,0.017848,2.0,6.0,1.0,6.0,1
4,0.230769,0.272727,4.0,0.004472,0.00985,2.0,5.0,1.0,5.0,0


In [7]:
# Persist the scaled frame; index=False keeps the row index out of the file.
df_label_sc.to_csv('../datasets/sf/lending_club.csv', index=False)

In [6]:
# Human-readable labels for the integer codes stored in each categorical column.
code_labels = {
    'home_ownership': {
        1: 'RENT',
        2: 'ANY',
        3: 'OTHER',
        4: 'MORTGAGE',
        5: 'OWN',
        6: 'NONE',
    },
    'grade': {
        1: 'G',
        2: 'F',
        3: 'E',
        4: 'D',
        5: 'C',
        6: 'B',
        7: 'A',
    },
    'emp_length': {
        1: '< 1 year',
        2: '1 year',
        3: '2 years',
        4: '3 years',
        5: '4 years',
        6: '5 years',
        7: '6 years',
        8: '7 years',
        9: '8 years',
        10: '9 years',
        11: '10+ years',
    },
    'purpose': {
        1: 'small_business',
        2: 'house',
        3: 'renewable_energy',
        4: 'moving',
        5: 'debt_consolidation',
        6: 'other',
        7: 'medical',
        8: 'educational',
        9: 'major_purchase',
        10: 'vacation',
        11: 'home_improvement',
        12: 'credit_card',
        13: 'car',
        14: 'wedding',
    },
    # The leading space in the term labels is intentional and kept verbatim.
    'term': {
        1: ' 60 months',
        2: ' 36 months',
    },
}

# Swap codes for labels one column at a time; each column is overwritten in df.
for column_name, labels in code_labels.items():
    df[column_name] = df[column_name].replace(labels)

In [7]:
# Check that the categorical columns now show labels instead of integer codes.
df.head()

Unnamed: 0,loan_amnt,pub_rec_bankruptcies,emp_length,annual_inc,dti,term,grade,home_ownership,purpose,loan_status
0,12000.0,0.0,10+ years,85000.0,20.91,36 months,A,MORTGAGE,debt_consolidation,1
1,20000.0,0.0,10+ years,115000.0,16.84,36 months,B,RENT,small_business,0
2,30000.0,0.0,10+ years,110000.0,15.08,60 months,F,MORTGAGE,home_improvement,0
3,1000.0,0.0,2 years,75704.0,17.83,36 months,B,RENT,other,1
4,10000.0,3.0,3 years,40000.0,9.84,36 months,C,RENT,debt_consolidation,0


In [8]:
# Inspect column dtypes; the column selectors used later split features on object vs. non-object dtype.
df.dtypes

loan_amnt               float64
pub_rec_bankruptcies    float64
emp_length               object
annual_inc              float64
dti                     float64
term                     object
grade                    object
home_ownership           object
purpose                  object
loan_status               int64
dtype: object

In [9]:
# Name of the label column.
target = 'loan_status'

In [10]:
# Metric name handed to cat_num_embed / map_cat_to_num when embedding categorical values.
distance_metric = 'm_estimate'

In [11]:
# Labels as a NumPy array, taken from the label column of df.
y = np.array(df[target])

In [12]:
# Split the feature columns into numerical and categorical groups by dtype.
df_sub = df.drop(columns=[target])

# Object-dtype columns count as categorical; everything else as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

cat_cols = categorical_columns_selector(df_sub)
num_cols = numerical_columns_selector(df_sub)

# Positional indices are looked up in df (label column included), not df_sub.
# NOTE(review): they coincide with positions in df_sub only as long as the
# label column comes after every feature column — confirm before reordering.
cat_idx = [df.columns.get_loc(name) for name in cat_cols]
num_idx = [df.columns.get_loc(name) for name in num_cols]

In [13]:
# Show categorical positions first, numerical positions second, one list per line.
print(cat_idx, num_idx, sep='\n')

[2, 5, 6, 7, 8]
[0, 1, 3, 4]


In [14]:
# Feature column names in frame order.
cols = df_sub.columns.tolist()
cols

['loan_amnt',
 'pub_rec_bankruptcies',
 'emp_length',
 'annual_inc',
 'dti',
 'term',
 'grade',
 'home_ownership',
 'purpose']

In [15]:
# get numerical embeddings of categorical variables
# cat_num_embed is a project helper; judging by the displayed result it returns
# {column position: {category label: numeric value}} — TODO confirm against cat_to_num.
cat_embed = cat_num_embed(df, target, distance_metric)
cat_embed

{2: {'< 1 year': 0.4665400719366,
  '1 year': 0.4791376828117066,
  '3 years': 0.4901687631170416,
  '8 years': 0.49225084602822355,
  '5 years': 0.49232166073648365,
  '4 years': 0.4978390959475574,
  '2 years': 0.5002896555679831,
  '6 years': 0.5055264101874272,
  '7 years': 0.5069179505354375,
  '10+ years': 0.5073694546576176,
  '9 years': 0.5111782095713101},
 5: {' 60 months': 0.3443330775541757, ' 36 months': 0.5683341614903963},
 6: {'G': 0.19058206040315268,
  'F': 0.24035543642297152,
  'E': 0.2964478430459445,
  'D': 0.35905495954091704,
  'C': 0.4494535475878429,
  'B': 0.6009572729536394,
  'A': 0.7746511130618768},
 7: {'ANY': 0.380373372635716,
  'RENT': 0.45335737165054607,
  'OTHER': 0.4580979284369115,
  'OWN': 0.48594102944964435,
  'MORTGAGE': 0.5387552625023224,
  'NONE': 0.6994350282485875},
 8: {'small_business': 0.3444768313601305,
  'renewable_energy': 0.36994350282485877,
  'house': 0.4153253435726383,
  'moving': 0.4379865652386672,
  'other': 0.480413840842

In [16]:
# Run the embedding on a copy of df, so df itself is untouched if map_cat_to_num
# modifies its argument (presumably why the copy is made — verify in cat_to_num).
dataset_cp = df.copy()
dataset_embed = map_cat_to_num(dataset_cp, target, distance_metric)
dataset_embed.head()

Unnamed: 0,loan_amnt,pub_rec_bankruptcies,emp_length,annual_inc,dti,term,grade,home_ownership,purpose,loan_status
0,12000.0,0.0,0.507369,85000.0,20.91,0.568334,0.774651,0.538755,0.482618,1
1,20000.0,0.0,0.507369,115000.0,16.84,0.568334,0.600957,0.453357,0.344477,0
2,30000.0,0.0,0.507369,110000.0,15.08,0.344333,0.240355,0.538755,0.526145,0
3,1000.0,0.0,0.50029,75704.0,17.83,0.568334,0.600957,0.453357,0.480414,1
4,10000.0,3.0,0.490169,40000.0,9.84,0.568334,0.449454,0.453357,0.482618,0


In [17]:
# Build the fully numeric, scaled dataset:
#   1. categorical features -> their numeric embedding taken from cat_embed
#   2. numerical features   -> min-max scaled
# Each categorical column is converted with a single vectorized Series.map
# rather than a Python loop over every cell of an object-dtype array; the
# embedded columns therefore end up as float64 instead of object.
df_sc = df_sub.reset_index(drop=True)
for idx in cat_idx:
    # idx is used as a position within the feature columns (same as in x_sc).
    col_name = df_sc.columns[idx]
    embedding = cat_embed[idx]
    # Fail fast on categories without an embedding, as a plain dict lookup would.
    unknown = set(df_sc[col_name].unique()).difference(embedding)
    if unknown:
        raise KeyError(
            f"no embedding for {col_name!r} values: {sorted(map(str, unknown))}"
        )
    df_sc[col_name] = df_sc[col_name].map(embedding).astype(float)

# Embedded (not yet scaled) feature matrix, kept available under its usual name.
x_sc = df_sc.to_numpy()

# transformer for numerical values
transformer = MinMaxScaler()
# normalize numerical feature values
df_sc[num_cols] = transformer.fit_transform(df_sc[num_cols])
# Re-attach the label column.
df_sc[target] = y

In [18]:
df_sc.head()

Unnamed: 0,loan_amnt,pub_rec_bankruptcies,emp_length,annual_inc,dti,term,grade,home_ownership,purpose,loan_status
0,0.282051,0.0,0.507369,0.009528,0.020931,0.568334,0.774651,0.538755,0.482618,1
1,0.487179,0.0,0.507369,0.012899,0.016857,0.568334,0.600957,0.453357,0.344477,0
2,0.74359,0.0,0.507369,0.012337,0.015095,0.344333,0.240355,0.538755,0.526145,0
3,0.0,0.0,0.50029,0.008484,0.017848,0.568334,0.600957,0.453357,0.480414,1
4,0.230769,0.272727,0.490169,0.004472,0.00985,0.568334,0.449454,0.453357,0.482618,0


In [19]:
# Save the embedded and scaled dataset; index=False keeps the row index out of the file.
df_sc.to_csv('../datasets/sf/lending_club_sc.csv', index=False)

In [15]:
# Reload the saved dataset from disk; this rebinds df_sc to the file's contents.
df_sc = pd.read_csv('../datasets/sf/lending_club_sc.csv')

In [16]:
# Preview the reloaded frame.
df_sc.head()

Unnamed: 0,loan_amnt,pub_rec_bankruptcies,emp_length,annual_inc,dti,term,grade,home_ownership,purpose,loan_status
0,0.282051,0.0,0.507369,0.009528,0.020931,0.568334,0.774651,0.538755,0.482618,1
1,0.487179,0.0,0.507369,0.012899,0.016857,0.568334,0.600957,0.453357,0.344477,0
2,0.74359,0.0,0.507369,0.012337,0.015095,0.344333,0.240355,0.538755,0.526145,0
3,0.0,0.0,0.50029,0.008484,0.017848,0.568334,0.600957,0.453357,0.480414,1
4,0.230769,0.272727,0.490169,0.004472,0.00985,0.568334,0.449454,0.453357,0.482618,0


In [17]:
# Keep only the row at position 1000; from here on df_sc is a Series, not a DataFrame.
# NOTE(review): re-running this cell would index into that Series instead, and if the
# first 1000 rows were intended this should be df_sc.iloc[:1000] — confirm.
df_sc = df_sc.iloc[1000]

In [19]:
# Display the selected row.
df_sc

loan_amnt               0.282051
pub_rec_bankruptcies    0.000000
emp_length              0.507369
annual_inc              0.006157
dti                     0.020881
term                    0.568334
grade                   0.600957
home_ownership          0.453357
purpose                 0.521885
loan_status             0.000000
Name: 1000, dtype: float64