In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import make_column_selector as selector
from cat_to_num import map_cat_to_num, cat_num_embed

In [2]:
# Dataset name; it is reused further down to build the output file names.
data = 'default_credit_card'
csv_path = '../datasets/' + data + '.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the data as loaded from the CSV.
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
0,20000.0,Female,University,Married,24,Three Month Delay,Three Month Delay,Pay Duly,Pay Duly,Pay Duly,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,0
1,120000.0,Female,University,Single,26,Pay Duly,Three Month Delay,One Month Delay,One Month Delay,One Month Delay,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,0
2,90000.0,Female,University,Single,34,One Month Delay,One Month Delay,One Month Delay,One Month Delay,One Month Delay,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,1
3,50000.0,Female,University,Married,37,One Month Delay,One Month Delay,One Month Delay,One Month Delay,One Month Delay,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,1
4,50000.0,Male,University,Married,57,Pay Duly,One Month Delay,Pay Duly,One Month Delay,One Month Delay,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,1


In [4]:
# "Default" means the client failed to make a payment on time.
# In the original dataset: 1 -> Default (fail), 0 -> Not Default (pass).
# Swap the two labels so that, from here on, the positive label 1 means
# Not Default and 0 means Default.
# NOTE: the swap is its own inverse -- running this cell a second time without
# reloading `df` restores the original encoding.
df = df.replace({'Default': {0: 1, 1: 0}})

In [5]:
# Name of the label column; every other column of `df` is treated as a feature.
target = 'Default'

In [6]:
# Metric name forwarded to cat_num_embed / map_cat_to_num below.
# NOTE(review): its exact meaning is defined in the cat_to_num module, not here.
distance_metric = 'm_estimate'

In [7]:
# Label vector as a NumPy array (taken after the 0/1 swap applied to `df` above).
y = np.array(df[target])

In [8]:
# Split the feature columns by dtype: object columns are treated as
# categorical, every other dtype as numerical.
feature_mask = df.columns != target
df_sub = df.loc[:, feature_mask]

categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

cat_cols = categorical_columns_selector(df_sub)
num_cols = numerical_columns_selector(df_sub)

# Column positions are looked up in the full frame `df` (target included).
cat_idx = list(map(df.columns.get_loc, cat_cols))
num_idx = list(map(df.columns.get_loc, num_cols))

In [9]:
# Show the numerical / categorical column names, then their positions in `df`.
for listing in (num_cols, cat_cols, num_idx, cat_idx):
    print(listing)

['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
[0, 4, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
[1, 2, 3, 5, 6, 7, 8, 9, 10]


In [10]:
# Feature column names in their original order (target excluded).
cols = df_sub.columns.tolist()

In [11]:
# get numerical embeddings of categorical variables
# The result is used below as a nested mapping:
#   cat_embed[column position][category value] -> numeric embedding
# NOTE(review): how the embedding is computed is defined in cat_to_num, not here.
cat_embed = cat_num_embed(df, target, distance_metric)
# Uncomment to inspect the mapping:
#cat_embed

In [12]:
# Pass a copy to the helper, presumably so that `df` itself cannot be modified
# by it (map_cat_to_num is defined in cat_to_num; its side effects are not
# visible here).
dataset_cp = df.copy()
# In the preview below the categorical columns hold numeric embedding values
# instead of category names.
# NOTE(review): `dataset_embed` is not referenced by the later cells visible in
# this notebook section.
dataset_embed = map_cat_to_num(dataset_cp, target, distance_metric)
dataset_embed.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
0,20000.0,0.207764,0.237347,0.234716,24,0.691237,0.556064,0.167919,0.173498,0.177694,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,0.207764,0.237347,0.209284,26,0.156195,0.556064,0.174515,0.18329,0.188531,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,0.207764,0.237347,0.209284,34,0.128119,0.159127,0.174515,0.18329,0.188531,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,0.207764,0.237347,0.234716,37,0.128119,0.159127,0.174515,0.18329,0.188531,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,0.241671,0.237347,0.234716,57,0.156195,0.159127,0.167919,0.18329,0.188531,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [13]:
# transformer for numerical values
transformer = MinMaxScaler()

# Start from the feature columns with a fresh 0..n-1 index so that the
# positional assignment of `y` at the end lines up row by row.
df_sc = df_sub.copy().reset_index(drop=True)

# Replace every categorical value by its numerical embedding, one whole column
# at a time (vectorised) instead of looping over every row of an object array.
for idx in cat_idx:
    col = cols[idx]
    mapping = cat_embed[idx]
    # Fail loudly on a category without an embedding (a plain dict lookup
    # would raise KeyError too; Series.map alone would silently yield NaN).
    unseen = set(df_sc[col].unique()) - set(mapping.keys())
    if unseen:
        raise KeyError(
            f"no embedding for categories {sorted(map(str, unseen))} in column {col!r}"
        )
    # Explicit float cast: the embedded columns end up float64, not object.
    df_sc[col] = df_sc[col].map(mapping).astype(float)

# normalize numerical feature values to [0, 1]
df_sc[num_cols] = transformer.fit_transform(df_sc[num_cols])
# re-attach the label column
df_sc[target] = y

In [14]:
# Preview the embedding-encoded, min-max-scaled dataset.
df_sc.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
0,0.010101,0.207764,0.237347,0.234716,0.051724,0.691237,0.556064,0.167919,0.173498,0.177694,...,0.160138,0.080648,0.260979,0.0,0.000409,0.0,0.0,0.0,0.0,1
1,0.111111,0.207764,0.237347,0.209284,0.086207,0.156195,0.556064,0.174515,0.18329,0.188531,...,0.16322,0.084074,0.263485,0.0,0.000594,0.001116,0.00161,0.0,0.003783,1
2,0.080808,0.207764,0.237347,0.209284,0.224138,0.128119,0.159127,0.174515,0.18329,0.188531,...,0.173637,0.09547,0.272928,0.001738,0.000891,0.001116,0.00161,0.002345,0.009458,0
3,0.040404,0.207764,0.237347,0.234716,0.275862,0.128119,0.159127,0.174515,0.18329,0.188531,...,0.186809,0.109363,0.283685,0.00229,0.001199,0.001339,0.001771,0.002506,0.001892,0
4,0.040404,0.241671,0.237347,0.234716,0.62069,0.156195,0.159127,0.167919,0.18329,0.188531,...,0.179863,0.099633,0.275681,0.00229,0.021779,0.01116,0.014493,0.001615,0.001284,0


In [15]:
# Save the embedding-encoded, scaled dataset as '<data>_sc.csv' under
# ../datasets/sf/ (the row index is not written).
df_sc.to_csv('../datasets/sf/'+data+'_sc.csv', index=False)

In [16]:
# Rank the categories of every categorical column by embedding value:
# the category with the smallest embedding gets label 1, the next one 2, ...
label_dict = {
    col_idx: {
        category: rank
        for rank, (category, _) in enumerate(
            sorted(embedding.items(), key=lambda pair: pair[1]), start=1
        )
    }
    for col_idx, embedding in cat_embed.items()
}

In [17]:
#label_dict

In [20]:
# Label-encode the categorical columns: each category is replaced by its
# integer rank from `label_dict` (1 = smallest embedding value).
df_label = df.copy()
for idx in cat_idx:
    col = df_label.columns[idx]
    codes = df_label[col].map(label_dict[idx])
    if codes.isna().any():
        unmapped = sorted(map(str, df_label.loc[codes.isna(), col].unique()))
        raise KeyError(f"no label for categories {unmapped} in column {col!r}")
    # Assign by column name with an explicit integer cast. On recent pandas,
    # `df.iloc[:, idx] = ...` writes into the existing object column in place
    # and `replace` no longer downcasts, so the column would stay object dtype.
    df_label[col] = codes.astype('int64')

In [21]:
# Preview the label-encoded dataset (numerical columns not yet scaled).
df_label.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
0,20000.0,1,3,2,24,8,6,1,1,1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,1,3,1,26,2,6,2,2,2,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,1,3,1,34,1,2,2,2,2,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,1,3,2,37,1,2,2,2,2,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,2,3,2,57,2,2,1,2,2,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [22]:
# Feature columns of the label-encoded frame (target column left out).
temp = df_label.loc[:, df_label.columns != target]
# Round-trip through a NumPy array; the frame rebuilt below gets a default index.
# NOTE(review): in the recorded outputs the integer labels come back as floats
# (1 -> 1.0) after this round-trip -- keep that in mind before simplifying it.
x_sc = np.array(temp)
# convert numpy array to df for normalizing numerical features
df_label_sc = pd.DataFrame(x_sc, columns=cols)
# normalize numerical feature values (this refits the MinMaxScaler created
# earlier on the same numerical columns)
df_label_sc[num_cols] = transformer.fit_transform(df_label_sc[num_cols])
# re-attach the label column
df_label_sc[target] = y

In [23]:
# Preview the label-encoded dataset with min-max-scaled numerical columns.
df_label_sc.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
0,0.010101,1.0,3.0,2.0,0.051724,8.0,6.0,1.0,1.0,1.0,...,0.160138,0.080648,0.260979,0.0,0.000409,0.0,0.0,0.0,0.0,1
1,0.111111,1.0,3.0,1.0,0.086207,2.0,6.0,2.0,2.0,2.0,...,0.16322,0.084074,0.263485,0.0,0.000594,0.001116,0.00161,0.0,0.003783,1
2,0.080808,1.0,3.0,1.0,0.224138,1.0,2.0,2.0,2.0,2.0,...,0.173637,0.09547,0.272928,0.001738,0.000891,0.001116,0.00161,0.002345,0.009458,0
3,0.040404,1.0,3.0,2.0,0.275862,1.0,2.0,2.0,2.0,2.0,...,0.186809,0.109363,0.283685,0.00229,0.001199,0.001339,0.001771,0.002506,0.001892,0
4,0.040404,2.0,3.0,2.0,0.62069,2.0,2.0,1.0,2.0,2.0,...,0.179863,0.099633,0.275681,0.00229,0.021779,0.01116,0.014493,0.001615,0.001284,0


In [24]:
# Save the label-encoded, scaled dataset as '<data>.csv' under ../datasets/sf/
# (the row index is not written).
df_label_sc.to_csv('../datasets/sf/'+data+'.csv', index=False)

In [25]:
# For every categorical column position, map each integer label back to the
# embedding value of the category it stands for.
dict3 = {
    col_idx: {
        label: embedding[category]
        for category, label in label_dict[col_idx].items()
    }
    for col_idx, embedding in cat_embed.items()
}

In [26]:
# Display the label -> embedding lookup, keyed by categorical column position.
dict3

{1: {1: 0.207763551040689, 2: 0.24167055261165782},
 2: {1: 0.07083411513859275,
  2: 0.1923503873039864,
  3: 0.23734738792673366,
  4: 0.2515699877999187},
 3: {1: 0.20928413404321955, 2: 0.234716046852123, 3: 0.23603492063492063},
 5: {1: 0.12811922920341973,
  2: 0.15619479043334122,
  3: 0.3394473298997018,
  4: 0.48967407407407404,
  5: 0.5184333333333333,
  6: 0.56106,
  7: 0.6781974025974027,
  8: 0.6912373313343328,
  9: 0.72212,
  10: 0.756102786377709},
 6: {1: 0.1106,
  2: 0.15912664166295848,
  3: 0.16853668259941015,
  4: 0.1800413793103448,
  5: 0.502212,
  6: 0.5560644602851323,
  7: 0.5819619047619048,
  8: 0.5854307692307692,
  9: 0.6153553516819572,
  10: 0.709323076923077},
 7: {1: 0.16791911412609736,
  2: 0.17451450681890263,
  3: 0.24424,
  4: 0.5155029319371728,
  5: 0.5553,
  6: 0.5555090909090908,
  7: 0.5735319502074689,
  8: 0.5743012987012988,
  9: 0.59255,
  10: 0.7936142857142857},
 8: {1: 0.1734975288959745,
  2: 0.18329005833738454,
  3: 0.3702000000000