In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import make_column_selector as selector
from cat_to_num import map_cat_to_num, cat_num_embed

In [2]:
# Dataset identifier; it is also used further down to build the output CSV file names.
data = 'german_credit'
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../datasets/'+data+'.csv')

In [3]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,1
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,0
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,1
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,1
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,0


In [25]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(1000, 21)

In [4]:
# Name of the label column; it is excluded from the feature columns and re-attached before saving.
target = 'class'

In [5]:
# Option string handed to cat_num_embed / map_cat_to_num below.
# How it is interpreted is defined in the local cat_to_num module, which is not visible here.
distance_metric = 'm_estimate'

In [6]:
# Label vector as a NumPy array; it is appended to the processed frames as the target column.
y = np.array(df[target])

In [7]:
# Feature frame: every column except the label.
df_sub = df.loc[:, df.columns != target]

# Columns are split purely by dtype: object columns count as categorical,
# everything else as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

num_cols = numerical_columns_selector(df_sub)
cat_cols = categorical_columns_selector(df_sub)

# Positions are looked up in the full frame `df` (label included), not in `df_sub`.
# NOTE(review): they coincide with positions in `df_sub` only while the label
# column comes after every feature column - confirm for other datasets.
num_idx = list(map(df.columns.get_loc, num_cols))
cat_idx = list(map(df.columns.get_loc, cat_cols))

In [8]:
# Show the numerical and categorical column names, then their positions, one list per line.
print(num_cols, cat_cols, num_idx, cat_idx, sep='\n')

['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
[1, 4, 7, 10, 12, 15, 17]
[0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]


In [9]:
# Feature column names in frame order; used as headers when frames are rebuilt from arrays.
cols = [*df_sub.columns]

In [10]:
# get numerical embeddings of categorical variables
# cat_num_embed comes from the local cat_to_num module (implementation not visible here).
# NOTE(review): judging by the displayed result it returns
# {column position: {category: float}} - confirm against cat_to_num.
cat_embed = cat_num_embed(df, target, distance_metric)
cat_embed

{0: {'<0': 0.508,
  '0<=X<200': 0.61,
  '>=200': 0.7765625,
  'no checking': 0.8827848101265823},
 2: {'no credits/all paid': 0.3829268292682927,
  'all paid': 0.434,
  'existing paid': 0.6811676082862523,
  'delayed previously': 0.6820224719101123,
  'critical/other existing credit': 0.8289115646258503},
 3: {'education': 0.5627450980392157,
  'other': 0.5923076923076923,
  'new car': 0.62,
  'repairs': 0.6391304347826087,
  'business': 0.65,
  'domestic appliance': 0.6692307692307692,
  'furniture/equipment': 0.6796703296703297,
  'radio/tv': 0.7782918149466191,
  'used car': 0.8336538461538462,
  'retraining': 0.8699999999999999},
 5: {'<100': 0.6402317880794702,
  '100<=X<500': 0.6701923076923078,
  '500<=X<1000': 0.8234375,
  'no known savings': 0.8244565217391304,
  '>=1000': 0.8714285714285714},
 6: {'<1': 0.5936416184971098,
  'unemployed': 0.6301587301587303,
  '1<=X<4': 0.6932352941176471,
  '>=7': 0.7468503937007873,
  '4<=X<7': 0.7754285714285714},
 8: {'male div/sep': 0.60

In [11]:
# Hand a copy to map_cat_to_num so that `df` itself is not passed to the helper.
dataset_cp = df.copy()
# NOTE(review): presumably replaces each categorical value with its numerical
# embedding (the preview is consistent with that) - confirm in cat_to_num.
dataset_embed = map_cat_to_num(dataset_cp, target, distance_metric)
dataset_embed.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,0.508,6,0.828912,0.778292,1169,0.824457,0.74685,4,0.733515,0.70011,...,0.786926,67,0.724785,0.739076,2,0.704754,1,0.720247,0.692635,1
1,0.61,48,0.681168,0.778292,5951,0.640232,0.693235,2,0.648553,0.70011,...,0.786926,22,0.724785,0.739076,1,0.704754,1,0.686265,0.692635,0
2,0.882785,12,0.828912,0.562745,2096,0.640232,0.775429,2,0.733515,0.70011,...,0.786926,49,0.724785,0.739076,1,0.7199,2,0.686265,0.692635,1
3,0.508,42,0.681168,0.67967,7882,0.640232,0.775429,2,0.733515,0.80566,...,0.693991,45,0.724785,0.593578,1,0.704754,2,0.686265,0.692635,1
4,0.508,24,0.682022,0.62,4870,0.640232,0.693235,3,0.733515,0.70011,...,0.565806,53,0.724785,0.593578,2,0.704754,2,0.686265,0.692635,0


In [12]:
# transformer for numerical values
transformer = MinMaxScaler()

# Array copy of the feature frame (object dtype when string columns are present),
# so category strings can be overwritten with floats without touching df / df_sub.
x_sc = np.array(df_sub)

# Embed one categorical column at a time instead of visiting every cell row by row.
# A category that is missing from cat_embed still raises KeyError, as before.
for idx in cat_idx:
    embedding = cat_embed[idx]
    x_sc[:, idx] = [float(embedding[category]) for category in x_sc[:, idx]]

# convert numpy array to df for normalizing numerical features
df_sc = pd.DataFrame(x_sc, columns=cols)

# A frame built from an object array keeps object dtype in every column.
# Cast the embedded columns to real floats so the frame is numeric throughout.
if cat_cols:
    df_sc[cat_cols] = df_sc[cat_cols].astype(float)

# normalize numerical feature values (MinMaxScaler default range)
df_sc[num_cols] = transformer.fit_transform(df_sc[num_cols])

# Re-attach the label column.
df_sc[target] = y

In [13]:
# Preview the embedded and scaled frame.
df_sc.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,0.508,0.029412,0.828912,0.778292,0.050567,0.824457,0.74685,1.0,0.733515,0.70011,...,0.786926,0.857143,0.724785,0.739076,0.333333,0.704754,0.0,0.720247,0.692635,1
1,0.61,0.647059,0.681168,0.778292,0.31369,0.640232,0.693235,0.333333,0.648553,0.70011,...,0.786926,0.053571,0.724785,0.739076,0.0,0.704754,0.0,0.686265,0.692635,0
2,0.882785,0.117647,0.828912,0.562745,0.101574,0.640232,0.775429,0.333333,0.733515,0.70011,...,0.786926,0.535714,0.724785,0.739076,0.0,0.7199,1.0,0.686265,0.692635,1
3,0.508,0.558824,0.681168,0.67967,0.419941,0.640232,0.775429,0.333333,0.733515,0.80566,...,0.693991,0.464286,0.724785,0.593578,0.0,0.704754,1.0,0.686265,0.692635,1
4,0.508,0.294118,0.682022,0.62,0.254209,0.640232,0.693235,0.666667,0.733515,0.70011,...,0.565806,0.607143,0.724785,0.593578,0.333333,0.704754,1.0,0.686265,0.692635,0


In [14]:
# Save the embedded + scaled frame as '<data>_sc.csv'; index=False leaves the row index out of the file.
# NOTE(review): the '../datasets/sf/' directory is not created here - confirm it exists before running.
df_sc.to_csv('../datasets/sf/'+data+'_sc.csv', index=False)

In [15]:
# For each categorical column, rank its categories by ascending embedding value
# and number them from 1, so the integer labels follow the embedding order.
label_dict = {
    col_idx: {
        category: rank
        for rank, category in enumerate(sorted(embedding, key=embedding.get), start=1)
    }
    for col_idx, embedding in cat_embed.items()
}

In [16]:
# Inspect the category -> integer label mapping per column position.
label_dict

{0: {'<0': 1, '0<=X<200': 2, '>=200': 3, 'no checking': 4},
 2: {'no credits/all paid': 1,
  'all paid': 2,
  'existing paid': 3,
  'delayed previously': 4,
  'critical/other existing credit': 5},
 3: {'education': 1,
  'other': 2,
  'new car': 3,
  'repairs': 4,
  'business': 5,
  'domestic appliance': 6,
  'furniture/equipment': 7,
  'radio/tv': 8,
  'used car': 9,
  'retraining': 10},
 5: {'<100': 1,
  '100<=X<500': 2,
  '500<=X<1000': 3,
  'no known savings': 4,
  '>=1000': 5},
 6: {'<1': 1, 'unemployed': 2, '1<=X<4': 3, '>=7': 4, '4<=X<7': 5},
 8: {'male div/sep': 1,
  'female div/dep/mar': 2,
  'male mar/wid': 3,
  'male single': 4},
 9: {'co applicant': 1, 'none': 2, 'guarantor': 3},
 11: {'no known property': 1, 'car': 2, 'life insurance': 3, 'real estate': 4},
 13: {'bank': 1, 'stores': 2, 'none': 3},
 14: {'for free': 1, 'rent': 2, 'own': 3},
 16: {'high qualif/self emp/mgmt': 1,
  'unemp/unskilled non res': 2,
  'skilled': 3,
  'unskilled resident': 4},
 18: {'none': 1, 'yes

In [17]:
# Label-encode the categorical columns on a copy, leaving `df` untouched.
df_label = df.copy()
for idx in cat_idx:
    col = df_label.columns[idx]
    # Series.map is a plain per-value lookup and yields a numeric column directly.
    # The previous Series.replace on an object column relied on implicit
    # downcasting (deprecated in pandas 2.2), and writing back through iloc made
    # the resulting dtype depend on the pandas version.
    # A category absent from label_dict now becomes NaN instead of staying a string.
    df_label[col] = df_label[col].map(label_dict[idx])

In [18]:
# Preview the label-encoded frame.
df_label.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,1,6,5,8,1169,4,4,4,4,2,...,4,67,3,3,2,3,1,2,1,1
1,2,48,3,8,5951,1,3,2,2,2,...,4,22,3,3,1,3,1,1,1,0
2,4,12,5,1,2096,1,5,2,4,2,...,4,49,3,3,1,4,2,1,1,1
3,1,42,3,7,7882,1,5,2,4,3,...,3,45,3,1,1,3,2,1,1,1
4,1,24,4,3,4870,1,3,3,4,2,...,1,53,3,1,2,3,2,1,1,0


In [19]:
# Features of the label-encoded frame (label column masked out).
temp = df_label.loc[:, df_label.columns != target]
# Note: this rebinds x_sc, which was used earlier for the embedded feature matrix.
x_sc = np.array(temp)
# convert numpy array to df for normalizing numerical features
df_label_sc = pd.DataFrame(x_sc, columns=cols)
# normalize numerical feature values
# The same MinMaxScaler instance is refitted here; only num_cols are rescaled,
# the integer category labels are left unchanged.
df_label_sc[num_cols] = transformer.fit_transform(df_label_sc[num_cols])
# Re-attach the label column.
df_label_sc[target] = y

In [20]:
# Preview the label-encoded frame with scaled numerical columns.
df_label_sc.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,1,0.029412,5,8,0.050567,4,4,1.0,4,2,...,4,0.857143,3,3,0.333333,3,0.0,2,1,1
1,2,0.647059,3,8,0.31369,1,3,0.333333,2,2,...,4,0.053571,3,3,0.0,3,0.0,1,1,0
2,4,0.117647,5,1,0.101574,1,5,0.333333,4,2,...,4,0.535714,3,3,0.0,4,1.0,1,1,1
3,1,0.558824,3,7,0.419941,1,5,0.333333,4,3,...,3,0.464286,3,1,0.0,3,1.0,1,1,1
4,1,0.294118,4,3,0.254209,1,3,0.666667,4,2,...,1,0.607143,3,1,0.333333,3,1.0,1,1,0


In [21]:
# Save the label-encoded + scaled frame as '<data>.csv'; index=False leaves the row index out of the file.
# NOTE(review): the '../datasets/sf/' directory is not created here - confirm it exists before running.
df_label_sc.to_csv('../datasets/sf/'+data+'.csv', index=False)

In [22]:
# Join the two mappings: for each column position, integer label -> embedding value.
dict3 = {
    col_idx: {
        label: embedding[category]
        for category, label in label_dict[col_idx].items()
    }
    for col_idx, embedding in cat_embed.items()
}

In [23]:
# Inspect the integer label -> embedding value mapping per column position.
dict3

{0: {1: 0.508, 2: 0.61, 3: 0.7765625, 4: 0.8827848101265823},
 2: {1: 0.3829268292682927,
  2: 0.434,
  3: 0.6811676082862523,
  4: 0.6820224719101123,
  5: 0.8289115646258503},
 3: {1: 0.5627450980392157,
  2: 0.5923076923076923,
  3: 0.62,
  4: 0.6391304347826087,
  5: 0.65,
  6: 0.6692307692307692,
  7: 0.6796703296703297,
  8: 0.7782918149466191,
  9: 0.8336538461538462,
  10: 0.8699999999999999},
 5: {1: 0.6402317880794702,
  2: 0.6701923076923078,
  3: 0.8234375,
  4: 0.8244565217391304,
  5: 0.8714285714285714},
 6: {1: 0.5936416184971098,
  2: 0.6301587301587303,
  3: 0.6932352941176471,
  4: 0.7468503937007873,
  5: 0.7754285714285714},
 8: {1: 0.6019607843137255,
  2: 0.6485530546623793,
  3: 0.7279569892473119,
  4: 0.7335154826958106},
 9: {1: 0.5642857142857143, 2: 0.7001101321585903, 3: 0.8056603773584906},
 11: {1: 0.5658064516129032,
  2: 0.6927927927927927,
  3: 0.6939914163090128,
  4: 0.7869257950530035},
 13: {1: 0.5907142857142857, 2: 0.5979166666666667, 3: 0.72478