In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import make_column_selector as selector
from cat_to_num import map_cat_to_num, cat_num_embed

In [2]:
# Name of the dataset; it is also reused further down to build the output file names.
data = 'diabetes'

# Load <data>.csv from the sibling datasets directory.
csv_path = '../datasets/' + data + '.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,0
1,0,137,40,35,168,43.1,2.288,33,1
2,3,78,50,32,88,31.0,0.248,26,1
3,2,197,70,45,543,30.5,0.158,53,1
4,1,189,60,23,846,30.1,0.398,59,1


In [4]:
# Flip the binary labels in the 'Outcome' column: original 0 becomes 1 (treated
# as the positive label from here on) and original 1 becomes 0.
# NOTE(review): the previous comment described the positive label as
# "under the limit"; it is unclear which limit applies to this dataset — confirm.
# NOTE: this cell overwrites `df`, so running it a second time flips the labels back.
label_swap = {1: 0, 0: 1}
df = df.replace({'Outcome': label_swap})

In [5]:
# Preview again to check that the Outcome labels were swapped.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,1
1,0,137,40,35,168,43.1,2.288,33,0
2,3,78,50,32,88,31.0,0.248,26,0
3,2,197,70,45,543,30.5,0.158,53,0
4,1,189,60,23,846,30.1,0.398,59,0


In [6]:
# Name of the label column; it is excluded from the feature columns below.
target = 'Outcome'

In [7]:
# Metric name passed to cat_num_embed / map_cat_to_num below.
# NOTE(review): the set of accepted values is defined in cat_to_num — confirm there.
distance_metric = 'm_estimate'

In [8]:
# Label vector as a numpy array; re-attached to the scaled frames further down.
y = np.array(df[target])

In [9]:
# select numerical and categorical data
# Feature frame: every column of df except the target.
df_sub = df.loc[:, df.columns != target]

# Columns are split purely by dtype: object columns count as categorical,
# everything else as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

num_cols = numerical_columns_selector(df_sub)
cat_cols = categorical_columns_selector(df_sub)

# Column positions are looked up in the full frame `df` (target included).
num_idx = [df.columns.get_loc(col) for col in num_cols]
cat_idx = [df.columns.get_loc(col) for col in cat_cols]

# cat_idx is used below both to index `df`-shaped frames (df_label.iloc) and to
# index rows of an array built from df_sub (target removed). Those positions
# only coincide when the target column comes after every categorical column,
# so fail loudly instead of silently embedding the wrong column.
_cat_idx_in_features = [df_sub.columns.get_loc(col) for col in cat_cols]
if cat_idx != _cat_idx_in_features:
    raise ValueError(
        "Categorical column positions differ between df and df_sub; "
        "move the target column '{}' after all categorical columns.".format(target)
    )

In [10]:
# Show the selected column names followed by their positions, one list per line.
for selection in (num_cols, cat_cols, num_idx, cat_idx):
    print(selection)

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
[]
[0, 1, 2, 3, 4, 5, 6, 7]
[]


In [11]:
# Feature column names (target excluded), used to rebuild DataFrames from arrays below.
cols = list(df_sub.columns)

In [12]:
# get numerical embeddings of categorical variables
# The result is indexed below as cat_embed[idx][value], with idx taken from cat_idx.
# NOTE(review): the exact key scheme is defined by cat_num_embed in cat_to_num — confirm.
# It is presumably empty whenever the dataset has no categorical (object) columns.
cat_embed = cat_num_embed(df, target, distance_metric)
cat_embed

{}

In [13]:
# Hand a copy of df to map_cat_to_num — presumably so the helper cannot modify
# df itself; confirm against cat_to_num.
dataset_cp = df.copy()
dataset_embed = map_cat_to_num(dataset_cp, target, distance_metric)
# Preview the first rows of the helper's output.
dataset_embed.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,1
1,0,137,40,35,168,43.1,2.288,33,0
2,3,78,50,32,88,31.0,0.248,26,0
3,2,197,70,45,543,30.5,0.158,53,0
4,1,189,60,23,846,30.1,0.398,59,0


In [14]:
# Scaler for the numerical columns (MinMaxScaler maps each column to [0, 1] by default).
transformer = MinMaxScaler()

# Feature columns (target excluded) as a plain numpy array.
x_sc = np.array(df_sub)

# Replace every categorical value by its numerical embedding.
# cat_embed is indexed first by column position, then by the raw category value.
for col_pos in cat_idx:
    for row in x_sc:
        row[col_pos] = float(cat_embed[col_pos][row[col_pos]])

# Back to a DataFrame so the numerical columns can be scaled by name.
df_sc = pd.DataFrame(x_sc, columns=cols)

# Min-max scale the numerical columns only, then re-attach the labels.
scaled_numeric = transformer.fit_transform(df_sc[num_cols])
df_sc[num_cols] = scaled_numeric
df_sc[target] = y

In [15]:
# Preview the embedded and min-max scaled frame.
df_sc.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.058824,0.232394,0.488372,0.285714,0.096154,0.202454,0.035118,0.0,1
1,0.0,0.570423,0.186047,0.5,0.185096,0.509202,0.943469,0.2,0
2,0.176471,0.15493,0.302326,0.446429,0.088942,0.261759,0.069807,0.083333,0
3,0.117647,0.992958,0.534884,0.678571,0.635817,0.251534,0.031263,0.533333,0
4,0.058824,0.93662,0.418605,0.285714,1.0,0.243354,0.134047,0.633333,0


In [16]:
# Save the embedded + scaled frame as ../datasets/sf/<data>_sc.csv, without the index column.
df_sc.to_csv('../datasets/sf/'+data+'_sc.csv', index=False)

In [17]:
# For every entry of cat_embed, rank its categories by embedding value
# (ascending) and assign them integer labels 1, 2, 3, ...
label_dict = {
    key: {
        category: rank
        for rank, (category, _) in enumerate(
            sorted(inner_dict.items(), key=lambda pair: pair[1]), start=1
        )
    }
    for key, inner_dict in cat_embed.items()
}

In [18]:
# Show the category -> integer label mapping (empty when cat_embed is empty).
label_dict

{}

In [19]:
# Label-encoded copy of df: each categorical column has its values replaced
# by the integer labels stored in label_dict for that column position.
df_label = df.copy()
for col_pos in cat_idx:
    encoded = df_label.iloc[:, col_pos].replace(label_dict[col_pos])
    df_label.iloc[:, col_pos] = encoded

In [20]:
# Preview the label-encoded frame (identical to df when cat_idx is empty).
df_label.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,89,66,23,94,28.1,0.167,21,1
1,0,137,40,35,168,43.1,2.288,33,0
2,3,78,50,32,88,31.0,0.248,26,0
3,2,197,70,45,543,30.5,0.158,53,0
4,1,189,60,23,846,30.1,0.398,59,0


In [21]:
# Feature columns of the label-encoded frame (target excluded), as a numpy array.
feature_mask = df_label.columns != target
temp = df_label.loc[:, feature_mask]
x_sc = np.array(temp)

# Rebuild a DataFrame so the numerical columns can be addressed by name.
df_label_sc = pd.DataFrame(x_sc, columns=cols)

# Re-fit the existing scaler on the numerical columns, then re-attach the labels.
label_scaled_numeric = transformer.fit_transform(df_label_sc[num_cols])
df_label_sc[num_cols] = label_scaled_numeric
df_label_sc[target] = y

In [22]:
# Preview the label-encoded and min-max scaled frame.
df_label_sc.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.058824,0.232394,0.488372,0.285714,0.096154,0.202454,0.035118,0.0,1
1,0.0,0.570423,0.186047,0.5,0.185096,0.509202,0.943469,0.2,0
2,0.176471,0.15493,0.302326,0.446429,0.088942,0.261759,0.069807,0.083333,0
3,0.117647,0.992958,0.534884,0.678571,0.635817,0.251534,0.031263,0.533333,0
4,0.058824,0.93662,0.418605,0.285714,1.0,0.243354,0.134047,0.633333,0


In [23]:
# Save the label-encoded + scaled frame as ../datasets/sf/<data>.csv, without the index column.
df_label_sc.to_csv('../datasets/sf/'+data+'.csv', index=False)

In [24]:
# Per cat_embed key, map each integer label (from label_dict) to the embedding
# value of the category that label stands for.
dict3 = {
    key1: {
        value2: value1[key2]
        for key2, value2 in label_dict[key1].items()
    }
    for key1, value1 in cat_embed.items()
}

In [25]:
# Show the integer label -> embedding value mapping (empty when cat_embed is empty).
dict3

{}