In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import make_column_selector as selector
from cat_to_num import map_cat_to_num, cat_num_embed

In [2]:
# Dataset name; it is the stem of the CSV file loaded on the next line.
data = 'blood_alcohol'
df = pd.read_csv(f'../datasets/{data}.csv')

In [3]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,gender,meal,units_consumed,weight,duration,class
0,Male,No,4.0,79.0,122.0,0
1,Male,No,5.0,74.0,110.0,1
2,Male,No,5.0,75.0,74.0,1
3,Male,Yes,4.0,83.0,119.0,0
4,Male,No,8.0,76.0,85.0,1


In [4]:
# change the labels so that the positive label (under the limit) is 1
# NOTE: this swaps 0 and 1 in the 'class' column and rebinds `df`, so the cell
# is not idempotent -- running it a second time swaps the labels back.
# Re-run the loading cell first if this cell has to be executed again.
df = df.replace({'class': {0: 1, 1: 0}})

In [5]:
# Name of the label column; it is excluded when the feature frame is built.
target = 'class'

In [6]:
# Metric name passed to the cat_to_num helpers (cat_num_embed / map_cat_to_num).
distance_metric = 'm_estimate'

In [7]:
# Labels as a NumPy array; re-attached to the scaled frames further down.
y = np.array(df[target])

In [8]:
# Split the feature columns into numerical and categorical groups by dtype.
df_sub = df.loc[:, df.columns != target]  # features only, label column removed

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# Column names of each group.
num_cols = numerical_columns_selector(df_sub)
cat_cols = categorical_columns_selector(df_sub)

# Positional indices of each group, looked up in the full frame `df`.
# NOTE(review): these are positions in `df`, not in `df_sub`; the two agree
# only while the target column sits after every feature column -- confirm.
num_idx = list(map(df.columns.get_loc, num_cols))
cat_idx = list(map(df.columns.get_loc, cat_cols))

In [9]:
# Show the detected column groups followed by their positional indices.
for _group in (num_cols, cat_cols, num_idx, cat_idx):
    print(_group)

['units_consumed', 'weight', 'duration']
['gender', 'meal']
[2, 3, 4]
[0, 1]


In [10]:
# Feature column names, in frame order, as a plain Python list.
cols = df_sub.columns.tolist()

In [11]:
# get numerical embeddings of categorical variables
# cat_num_embed comes from the project's cat_to_num module. Judging by the
# output displayed below, it returns {column position: {category: number}}.
cat_embed = cat_num_embed(df, target, distance_metric)
cat_embed

{0: {'Male': 0.17003484848484848, 'Female': 0.49538982213438737},
 1: {'Yes': 0.3140204795204795, 'No': 0.35497952047952047}}

In [12]:
# Pass a copy to the helper so that `df` itself is left untouched.
# Per the preview below, map_cat_to_num returns the frame with every category
# replaced by its numerical embedding.
dataset_cp = df.copy()
dataset_embed = map_cat_to_num(dataset_cp, target, distance_metric)
dataset_embed.head()

Unnamed: 0,gender,meal,units_consumed,weight,duration,class
0,0.170035,0.35498,4.0,79.0,122.0,1
1,0.170035,0.35498,5.0,74.0,110.0,0
2,0.170035,0.35498,5.0,75.0,74.0,0
3,0.170035,0.31402,4.0,83.0,119.0,1
4,0.170035,0.35498,8.0,76.0,85.0,0


In [13]:
# Build the scaled dataset: categorical features are replaced by their
# numerical embeddings and numerical features are min-max scaled.

# transformer for numerical values
transformer = MinMaxScaler()

# Start from the feature columns only. A fresh default index keeps the
# positional assignment of `y` at the end aligned with the rows.
df_sc = df_sub.reset_index(drop=True)

# Replace every category by its embedding, one whole column at a time.
# Columns are addressed by name; `idx` is only the key into cat_embed, so the
# result does not depend on where the target column sits in `df`.
for col, idx in zip(cat_cols, cat_idx):
    embedded = df_sc[col].map(cat_embed[idx])
    if embedded.isna().any():
        # Fail loudly, as a plain dict lookup would, instead of writing NaNs.
        missing = df_sc.loc[embedded.isna(), col].unique().tolist()
        raise KeyError(f"no embedding for {missing} in column {col!r}")
    # Store real floats rather than Python objects.
    df_sc[col] = embedded.astype(float)

# normalize numerical feature values
df_sc[num_cols] = transformer.fit_transform(df_sc[num_cols])
df_sc[target] = y

In [14]:
# Preview the embedded and min-max scaled data.
df_sc.head()

Unnamed: 0,gender,meal,units_consumed,weight,duration,class
0,0.170035,0.35498,0.272727,0.513158,0.691729,1
1,0.170035,0.35498,0.363636,0.447368,0.601504,0
2,0.170035,0.35498,0.363636,0.460526,0.330827,0
3,0.170035,0.31402,0.272727,0.565789,0.669173,1
4,0.170035,0.35498,0.636364,0.473684,0.413534,0


In [15]:
# Write the embedded + scaled data next to the other 'sf' datasets, without the index.
df_sc.to_csv(f'../datasets/sf/{data}_sc.csv', index=False)

In [16]:
# Rank the categories of every categorical column by embedding value:
# the smallest embedding gets label 1, the next one 2, and so on.
label_dict = {
    col_pos: {
        category: rank
        for rank, category in enumerate(sorted(embedding, key=embedding.get), start=1)
    }
    for col_pos, embedding in cat_embed.items()
}

In [17]:
# Show the category -> integer label mapping of each categorical column.
label_dict

{0: {'Male': 1, 'Female': 2}, 1: {'Yes': 1, 'No': 2}}

In [18]:
# Label-encode the categorical columns: every category is replaced by its
# integer label from label_dict.
df_label = df.copy()
for col, idx in zip(cat_cols, cat_idx):
    # Select the column by name and rebind it so that it takes the integer
    # dtype of the labels; writing through .iloc can keep the object dtype.
    labels = df_label[col].map(label_dict[idx])
    if labels.isna().any():
        # A category without a label would otherwise turn into NaN silently.
        missing = df_label.loc[labels.isna(), col].unique().tolist()
        raise KeyError(f"no label for {missing} in column {col!r}")
    df_label[col] = labels

In [19]:
# Preview the label-encoded data.
df_label.head()

Unnamed: 0,gender,meal,units_consumed,weight,duration,class
0,1,2,4.0,79.0,122.0,1
1,1,2,5.0,74.0,110.0,0
2,1,2,5.0,75.0,74.0,0
3,1,1,4.0,83.0,119.0,1
4,1,2,8.0,76.0,85.0,0


In [20]:
# Min-max scale the numerical columns of the label-encoded data.
# The feature columns are rebuilt from a NumPy array so that the new frame
# starts from a default index and carries only the feature columns.
temp = df_label.drop(columns=target)
x_sc = np.array(temp)
df_label_sc = pd.DataFrame(x_sc, columns=cols)

# The scaler is re-fitted on these columns before transforming them.
scaled_numeric = transformer.fit_transform(df_label_sc[num_cols])
df_label_sc[num_cols] = scaled_numeric
df_label_sc[target] = y

In [21]:
# Preview the label-encoded and min-max scaled data.
df_label_sc.head()

Unnamed: 0,gender,meal,units_consumed,weight,duration,class
0,1.0,2.0,0.272727,0.513158,0.691729,1
1,1.0,2.0,0.363636,0.447368,0.601504,0
2,1.0,2.0,0.363636,0.460526,0.330827,0
3,1.0,1.0,0.272727,0.565789,0.669173,1
4,1.0,2.0,0.636364,0.473684,0.413534,0


In [22]:
# Write the label-encoded + scaled data to the 'sf' folder, without the index.
df_label_sc.to_csv(f'../datasets/sf/{data}.csv', index=False)

In [23]:
# For every categorical column, map each integer label to the embedding value
# of the category that label stands for.
dict3 = {
    col_pos: {
        label: embedding[category]
        for category, label in label_dict[col_pos].items()
    }
    for col_pos, embedding in cat_embed.items()
}

In [24]:
# Show the integer label -> embedding value mapping of each categorical column.
dict3

{0: {1: 0.17003484848484848, 2: 0.49538982213438737},
 1: {1: 0.3140204795204795, 2: 0.35497952047952047}}