In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import make_column_selector as selector
from cat_to_num import map_cat_to_num, cat_num_embed

In [2]:
# Dataset identifier; reused further down to build the output CSV paths.
data = 'heloc'
# Load the raw dataset from the sibling datasets directory.
df = pd.read_csv('../datasets/'+data+'.csv')

In [3]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MaxDelq2PublicRecLast12M,MaxDelqEver,...,NumTradesOpeninLast12M,PercentInstallTrades,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,class
0,56.0,145.0,5.0,85.0,21.0,4.0,1.0,84.0,4.0,6.0,...,2.0,44.0,1.0,1.0,34.0,9.0,2.0,2.0,70.0,0
1,68.0,67.0,6.0,25.0,10.0,1.0,1.0,101.0,8.0,9.0,...,5.0,45.0,5.0,5.0,54.0,5.0,3.0,2.0,87.0,0
2,67.0,170.0,2.0,74.0,29.0,2.0,2.0,94.0,7.0,7.0,...,4.0,58.0,6.0,5.0,73.0,7.0,5.0,4.0,92.0,0
3,82.0,334.0,28.0,133.0,13.0,1.0,1.0,101.0,8.0,9.0,...,1.0,26.0,2.0,2.0,52.0,4.0,2.0,1.0,81.0,0
4,60.0,138.0,12.0,79.0,32.0,1.0,1.0,92.0,5.0,7.0,...,2.0,48.0,1.0,1.0,63.0,13.0,5.0,4.0,95.0,0


In [4]:
# (rows, columns) of the raw frame, target column included.
df.shape

(8291, 21)

In [5]:
# Name of the label column; it is separated from the feature columns below.
target = 'class'

In [6]:
# Metric name forwarded to the cat_to_num helpers (cat_num_embed / map_cat_to_num).
distance_metric = 'm_estimate'

In [7]:
# Label vector, kept aside and re-attached to the scaled frames later on.
y = np.array(df[target])

In [8]:
# Split the feature columns by dtype: object columns are treated as
# categorical, every other dtype as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# Feature frame: every column of `df` except the target.
df_sub = df.loc[:, df.columns != target]

num_cols = numerical_columns_selector(df_sub)
cat_cols = categorical_columns_selector(df_sub)

# Column positions are looked up in the full frame `df` (target included),
# not in `df_sub`.
num_idx = list(map(df.columns.get_loc, num_cols))
cat_idx = list(map(df.columns.get_loc, cat_cols))

In [9]:
# Show the numerical/categorical column names and their positions, one list per line.
print(num_cols, cat_cols, num_idx, cat_idx, sep='\n')

['ExternalRiskEstimate', 'MSinceOldestTradeOpen', 'MSinceMostRecentTradeOpen', 'AverageMInFile', 'NumSatisfactoryTrades', 'NumTrades60Ever2DerogPubRec', 'NumTrades90Ever2DerogPubRec', 'PercentTradesNeverDelq', 'MaxDelq2PublicRecLast12M', 'MaxDelqEver', 'NumTotalTrades', 'NumTradesOpeninLast12M', 'PercentInstallTrades', 'NumInqLast6M', 'NumInqLast6Mexcl7days', 'NetFractionRevolvingBurden', 'NumRevolvingTradesWBalance', 'NumInstallTradesWBalance', 'NumBank2NatlTradesWHighUtilization', 'PercentTradesWBalance']
[]
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[]


In [10]:
# Feature column names in df_sub order; used as headers when frames are rebuilt from arrays.
cols = list(df_sub.columns)

In [11]:
# get numerical embeddings of categorical variables
# NOTE(review): later cells read the result as cat_embed[column position][category value].
# It is empty for this dataset, presumably because there are no object-dtype columns.
cat_embed = cat_num_embed(df, target, distance_metric)
cat_embed

{}

In [12]:
# Hand a copy to the helper so `df` itself is not the object passed to map_cat_to_num.
dataset_cp = df.copy()
# presumably returns the frame with categorical values replaced by their embeddings;
# confirm against cat_to_num.map_cat_to_num
dataset_embed = map_cat_to_num(dataset_cp, target, distance_metric)
dataset_embed.head()

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MaxDelq2PublicRecLast12M,MaxDelqEver,...,NumTradesOpeninLast12M,PercentInstallTrades,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,class
0,56.0,145.0,5.0,85.0,21.0,4.0,1.0,84.0,4.0,6.0,...,2.0,44.0,1.0,1.0,34.0,9.0,2.0,2.0,70.0,0
1,68.0,67.0,6.0,25.0,10.0,1.0,1.0,101.0,8.0,9.0,...,5.0,45.0,5.0,5.0,54.0,5.0,3.0,2.0,87.0,0
2,67.0,170.0,2.0,74.0,29.0,2.0,2.0,94.0,7.0,7.0,...,4.0,58.0,6.0,5.0,73.0,7.0,5.0,4.0,92.0,0
3,82.0,334.0,28.0,133.0,13.0,1.0,1.0,101.0,8.0,9.0,...,1.0,26.0,2.0,2.0,52.0,4.0,2.0,1.0,81.0,0
4,60.0,138.0,12.0,79.0,32.0,1.0,1.0,92.0,5.0,7.0,...,2.0,48.0,1.0,1.0,63.0,13.0,5.0,4.0,95.0,0


In [13]:
# transformer for numerical values
transformer = MinMaxScaler()

# Work on an array copy of the feature columns (target excluded).
x_sc = np.array(df_sub)

# Replace every categorical value by its numerical embedding.
# cat_idx holds positions in the full frame `df`, while x_sc only contains
# the columns of df_sub. Translate each position through the column name
# before indexing x_sc; otherwise the wrong column is rewritten whenever the
# target is not the last column of `df`.
for idx in cat_idx:
    embedding = cat_embed[idx]
    col_pos = cols.index(df.columns[idx])
    # One pass per categorical column instead of a Python loop over every row.
    # A category missing from the embedding still raises KeyError.
    x_sc[:, col_pos] = [float(embedding[value]) for value in x_sc[:, col_pos]]

# convert numpy array to df for normalizing numerical features
df_sc = pd.DataFrame(x_sc, columns=cols)
# normalize numerical feature values
df_sc[num_cols] = transformer.fit_transform(df_sc[num_cols])
# Re-attach the untouched labels as the last column.
df_sc[target] = y

In [14]:
# Preview the embedded and min-max scaled frame.
df_sc.head()

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MaxDelq2PublicRecLast12M,MaxDelqEver,...,NumTradesOpeninLast12M,PercentInstallTrades,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,class
0,0.327586,0.177278,0.037736,0.333333,0.24359,0.157895,0.0,0.7875,0.333333,0.5,...,0.052632,0.450549,0.0,0.0,0.142241,0.25,0.0,0.055556,0.666667,0
1,0.534483,0.0799,0.04717,0.083333,0.102564,0.0,0.0,1.0,0.777778,1.0,...,0.210526,0.461538,0.060606,0.060606,0.228448,0.125,0.045455,0.055556,0.849462,0
2,0.517241,0.208489,0.009434,0.2875,0.346154,0.052632,0.052632,0.9125,0.666667,0.666667,...,0.157895,0.604396,0.075758,0.060606,0.310345,0.1875,0.136364,0.166667,0.903226,0
3,0.775862,0.413233,0.254717,0.533333,0.141026,0.0,0.0,1.0,0.777778,1.0,...,0.0,0.252747,0.015152,0.015152,0.219828,0.09375,0.0,0.0,0.784946,0
4,0.396552,0.168539,0.103774,0.308333,0.384615,0.0,0.0,0.8875,0.444444,0.666667,...,0.052632,0.494505,0.0,0.0,0.267241,0.375,0.136364,0.166667,0.935484,0


In [15]:
# Persist the embedding-based scaled frame as <data>_sc.csv, without the index column.
df_sc.to_csv('../datasets/sf/'+data+'_sc.csv', index=False)

In [16]:
# For each categorical feature, rank its categories by embedding value and
# assign consecutive integer labels, starting at 1 for the smallest embedding.
label_dict = {
    key: {
        category: rank
        for rank, (category, _) in enumerate(
            sorted(inner_dict.items(), key=lambda pair: pair[1]), start=1
        )
    }
    for key, inner_dict in cat_embed.items()
}

In [17]:
# Inspect the category -> integer label mapping (empty when cat_embed is empty).
label_dict

{}

In [18]:
# Label-encoded copy of the raw frame: each categorical column has its values
# swapped for the integer labels stored in label_dict.
df_label = df.copy()
for idx in cat_idx:
    column = df_label.iloc[:, idx]
    df_label.iloc[:, idx] = column.replace(label_dict[idx])

In [19]:
# Preview the label-encoded frame (not yet scaled).
df_label.head()

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MaxDelq2PublicRecLast12M,MaxDelqEver,...,NumTradesOpeninLast12M,PercentInstallTrades,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,class
0,56.0,145.0,5.0,85.0,21.0,4.0,1.0,84.0,4.0,6.0,...,2.0,44.0,1.0,1.0,34.0,9.0,2.0,2.0,70.0,0
1,68.0,67.0,6.0,25.0,10.0,1.0,1.0,101.0,8.0,9.0,...,5.0,45.0,5.0,5.0,54.0,5.0,3.0,2.0,87.0,0
2,67.0,170.0,2.0,74.0,29.0,2.0,2.0,94.0,7.0,7.0,...,4.0,58.0,6.0,5.0,73.0,7.0,5.0,4.0,92.0,0
3,82.0,334.0,28.0,133.0,13.0,1.0,1.0,101.0,8.0,9.0,...,1.0,26.0,2.0,2.0,52.0,4.0,2.0,1.0,81.0,0
4,60.0,138.0,12.0,79.0,32.0,1.0,1.0,92.0,5.0,7.0,...,2.0,48.0,1.0,1.0,63.0,13.0,5.0,4.0,95.0,0


In [20]:
# Feature part of the label-encoded frame (every column except the target).
temp = df_label.loc[:, df_label.columns != target]
# np.array copies by default, so the new frame is built from its own data.
x_sc = np.array(temp)

# Rebuild a frame with the feature headers, then min-max scale the numerical
# columns only; label-encoded categorical columns are left as they are.
df_label_sc = pd.DataFrame(data=x_sc, columns=cols)
df_label_sc[num_cols] = transformer.fit_transform(df_label_sc[num_cols])

# Re-attach the labels as the last column.
df_label_sc[target] = y

In [21]:
# Preview the label-encoded and min-max scaled frame.
df_label_sc.head()

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MaxDelq2PublicRecLast12M,MaxDelqEver,...,NumTradesOpeninLast12M,PercentInstallTrades,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,class
0,0.327586,0.177278,0.037736,0.333333,0.24359,0.157895,0.0,0.7875,0.333333,0.5,...,0.052632,0.450549,0.0,0.0,0.142241,0.25,0.0,0.055556,0.666667,0
1,0.534483,0.0799,0.04717,0.083333,0.102564,0.0,0.0,1.0,0.777778,1.0,...,0.210526,0.461538,0.060606,0.060606,0.228448,0.125,0.045455,0.055556,0.849462,0
2,0.517241,0.208489,0.009434,0.2875,0.346154,0.052632,0.052632,0.9125,0.666667,0.666667,...,0.157895,0.604396,0.075758,0.060606,0.310345,0.1875,0.136364,0.166667,0.903226,0
3,0.775862,0.413233,0.254717,0.533333,0.141026,0.0,0.0,1.0,0.777778,1.0,...,0.0,0.252747,0.015152,0.015152,0.219828,0.09375,0.0,0.0,0.784946,0
4,0.396552,0.168539,0.103774,0.308333,0.384615,0.0,0.0,0.8875,0.444444,0.666667,...,0.052632,0.494505,0.0,0.0,0.267241,0.375,0.136364,0.166667,0.935484,0


In [22]:
# Persist the label-encoded scaled frame as <data>.csv, without the index column.
df_label_sc.to_csv('../datasets/sf/'+data+'.csv', index=False)

In [23]:
# For each categorical feature, map integer label -> embedding value by joining
# label_dict (category -> label) with cat_embed (category -> embedding).
dict3 = {
    key1: {
        label: value1[category]
        for category, label in label_dict[key1].items()
    }
    for key1, value1 in cat_embed.items()
}

In [24]:
# Inspect the label -> embedding mapping (empty when cat_embed is empty).
dict3

{}