In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import make_column_selector as selector
from cat_to_num import map_cat_to_num, cat_num_embed

In [2]:
# Dataset base name; it is used to build the input path here and the output
# file names when the processed frames are written out further down.
data = 'adult_income'
# Raw dataset, read from a path relative to the notebook's working directory.
df = pd.read_csv('../datasets/'+data+'.csv')

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,income
0,39,State-gov,Bachelors,Never-Married,Admin,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,Married,White-Collar,Husband,White,Male,0,0,13,United-States,0
2,38,Private,High School,Separated,Blue-Collar,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,School,Married,Blue-Collar,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,Married,Professional,Wife,Black,Female,0,0,40,Other,0


In [4]:
# Name of the label column; it is split off from the features below.
target = 'income'

In [5]:
# Option forwarded to the cat_to_num helpers (cat_num_embed / map_cat_to_num).
# NOTE(review): presumably selects an m-estimate style target encoding —
# confirm against cat_to_num.
distance_metric = 'm_estimate'

In [6]:
# Label values as a NumPy array; re-attached positionally to the processed
# frames after the features have been transformed.
y = np.array(df[target])

In [7]:
# Split the feature columns from the label column.
df_sub = df.loc[:, df.columns != target]

# Object-dtype columns are treated as categorical, everything else as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)
num_cols = numerical_columns_selector(df_sub)
cat_cols = categorical_columns_selector(df_sub)

# Column positions are looked up in the full frame `df`, not in `df_sub`.
# NOTE(review): the two numberings only coincide while `target` sits after
# every feature column; later cells index both `df` and a `df_sub`-derived
# array with these positions — confirm before using a dataset laid out
# differently.
num_idx = list(map(df.columns.get_loc, num_cols))
cat_idx = list(map(df.columns.get_loc, cat_cols))

In [8]:
# Show the numerical/categorical column names and their positions, one list per line.
print(num_cols, cat_cols, num_idx, cat_idx, sep='\n')

['age', 'capital_gain', 'capital_loss', 'hours_per_week']
['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'country']
[0, 8, 9, 10]
[1, 2, 3, 4, 5, 6, 7, 11]


In [9]:
# Feature column names in their original order (label column excluded).
cols = df_sub.columns.tolist()

In [10]:
# get numerical embeddings of categorical variables
# cat_num_embed comes from the project module cat_to_num. Judging by the
# result displayed below, it returns a nested dict of the form
# {column position: {category value: float embedding}}.
# NOTE(review): how the embedding is derived from `target` and
# `distance_metric` is not visible here — see cat_to_num.
cat_embed = cat_num_embed(df, target, distance_metric)
cat_embed

{1: {'Without-pay': 0.017324792765636775,
  'Never-worked': 0.032483986435568955,
  'Other': 0.11998717137761922,
  'Private': 0.23723854431434338,
  'State-gov': 0.27538451379181966,
  'Self-emp-not-inc': 0.29481942564713876,
  'Local-gov': 0.29937525032316603,
  'Federal-gov': 0.38497831022789286,
  'Self-emp-inc': 0.5546677784976795},
 2: {'School': 0.06781777800273409,
  'High School': 0.1856248971100108,
  'Associates': 0.2587878225259045,
  'Bachelors': 0.41259349464816814,
  'Masters': 0.547978130684918,
  'Prof-School': 0.7288533895793379,
  'Doctorate': 0.7326547689615002},
 3: {'Never-Married': 0.05811495471754306,
  'Widowed': 0.08642038142716364,
  'Separated': 0.1057510649538709,
  'Married': 0.4832713460545632},
 4: {'Service': 0.04595611193641975,
  'Military': 0.12598718914845516,
  'Admin': 0.15115294535020268,
  'Blue-Collar': 0.18842907742711015,
  'Other': 0.21283775087264106,
  'Sales': 0.28228183825231207,
  'Professional': 0.45058545903924474,
  'White-Collar': 0

In [11]:
# Apply the helper's categorical-to-numeric mapping to a copy of the raw frame.
# NOTE(review): the copy presumably guards `df` against in-place modification
# by map_cat_to_num — confirm in cat_to_num.
dataset_cp = df.copy()
dataset_embed = map_cat_to_num(dataset_cp, target, distance_metric)
# Preview only; `dataset_embed` is not referenced again in the cells shown here.
dataset_embed.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,income
0,39,0.275385,0.412593,0.058115,0.151153,0.117782,0.280489,0.334997,2174,0,40,0.267856,0
1,50,0.294819,0.412593,0.483271,0.480326,0.49238,0.280489,0.334997,0,0,13,0.267856,0
2,38,0.237239,0.185625,0.105751,0.188429,0.117782,0.280489,0.334997,0,0,40,0.267856,0
3,53,0.237239,0.067818,0.483271,0.188429,0.49238,0.130886,0.334997,0,0,40,0.267856,0
4,28,0.237239,0.412593,0.483271,0.450585,0.470917,0.130886,0.121875,0,0,40,0.272946,0


In [12]:
# transformer for numerical values (min-max scaling to [0, 1])
transformer = MinMaxScaler()

# Build the embedding-encoded, scaled dataset.
# Start from a fresh copy of the feature frame with a clean positional index
# so the label array `y` can be re-attached by position at the end.
df_sc = df_sub.copy().reset_index(drop=True)

# Replace each categorical value by its numerical embedding. The mapping is
# applied column-wise (vectorised) instead of row-by-row on an object array,
# and the encoded columns end up with a proper float dtype.
for idx in cat_idx:
    # `idx` selects both the feature column (by position in `df_sub`) and the
    # matching entry of `cat_embed`, which is keyed by column position.
    col = df_sub.columns[idx]
    encoded = df_sc[col].map(cat_embed[idx])
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail loudly rather than writing NaNs into the output.
        raise KeyError(
            f"no embedding for values {df_sc.loc[unmapped, col].unique().tolist()} "
            f"in column {col!r}"
        )
    df_sc[col] = encoded.astype(float)

# normalize numerical feature values
df_sc[num_cols] = transformer.fit_transform(df_sc[num_cols])
# Re-attach the label column (positional; `y` is a NumPy array).
df_sc[target] = y

In [13]:
# Preview the embedding-encoded, min-max-scaled frame.
df_sc.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,income
0,0.30137,0.275385,0.412593,0.058115,0.151153,0.117782,0.280489,0.334997,0.02174,0.0,0.397959,0.267856,0
1,0.452055,0.294819,0.412593,0.483271,0.480326,0.49238,0.280489,0.334997,0.0,0.0,0.122449,0.267856,0
2,0.287671,0.237239,0.185625,0.105751,0.188429,0.117782,0.280489,0.334997,0.0,0.0,0.397959,0.267856,0
3,0.493151,0.237239,0.067818,0.483271,0.188429,0.49238,0.130886,0.334997,0.0,0.0,0.397959,0.267856,0
4,0.150685,0.237239,0.412593,0.483271,0.450585,0.470917,0.130886,0.121875,0.0,0.0,0.397959,0.272946,0


In [14]:
# Persist the embedding-encoded, scaled dataset (index column omitted).
# NOTE(review): assumes the ../datasets/sf/ directory already exists.
df_sc.to_csv('../datasets/sf/'+data+'_sc.csv', index=False)

In [15]:
# For every categorical column, rank its categories by ascending embedding
# value and assign consecutive integer labels starting at 1.
label_dict = {
    col_idx: {
        category: rank
        for rank, category in enumerate(sorted(embedding, key=embedding.get), start=1)
    }
    for col_idx, embedding in cat_embed.items()
}

In [16]:
# Inspect the mapping {column position: {category: integer label}}.
label_dict

{1: {'Without-pay': 1,
  'Never-worked': 2,
  'Other': 3,
  'Private': 4,
  'State-gov': 5,
  'Self-emp-not-inc': 6,
  'Local-gov': 7,
  'Federal-gov': 8,
  'Self-emp-inc': 9},
 2: {'School': 1,
  'High School': 2,
  'Associates': 3,
  'Bachelors': 4,
  'Masters': 5,
  'Prof-School': 6,
  'Doctorate': 7},
 3: {'Never-Married': 1, 'Widowed': 2, 'Separated': 3, 'Married': 4},
 4: {'Service': 1,
  'Military': 2,
  'Admin': 3,
  'Blue-Collar': 4,
  'Other': 5,
  'Sales': 6,
  'Professional': 7,
  'White-Collar': 8},
 5: {'Own-child': 1,
  'Other-relative': 2,
  'Unmarried': 3,
  'Not-in-family': 4,
  'Wife': 5,
  'Husband': 6},
 6: {'Other': 1,
  'Amer-Indian-Eskimo': 2,
  'Black': 3,
  'Asian-Pac-Islander': 4,
  'White': 5},
 7: {'Female': 1, 'Male': 2},
 11: {'Latin-America': 1,
  'South-America': 2,
  'Euro_2': 3,
  'SE-Asia': 4,
  'United-States': 5,
  'Other': 6,
  'China': 7,
  'Euro_1': 8,
  'British-Commonwealth': 9,
  'Yugoslavia': 10}}

In [17]:
# Label-encode the categorical columns on a copy of the raw frame.
df_label = df.copy()
for idx in cat_idx:
    # `cat_idx` holds positions in `df`, which `df_label` mirrors.
    col = df_label.columns[idx]
    # Series.map is the direct lookup for a full category -> label mapping; it
    # avoids the implicit dtype downcasting Series.replace performs on object
    # columns and yields a clean integer column.
    labels = df_label[col].map(label_dict[idx])
    unmapped = labels.isna()
    if unmapped.any():
        # Fail loudly rather than leaving raw strings / NaNs in the output.
        raise KeyError(
            f"no label for values {df_label.loc[unmapped, col].unique().tolist()} "
            f"in column {col!r}"
        )
    df_label[col] = labels.astype(int)

In [18]:
# Preview the label-encoded frame (numerical columns still unscaled).
df_label.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,income
0,39,5,4,1,3,4,5,2,2174,0,40,5,0
1,50,6,4,4,8,6,5,2,0,0,13,5,0
2,38,4,2,3,4,4,5,2,0,0,40,5,0
3,53,4,1,4,4,6,3,2,0,0,40,5,0
4,28,4,4,4,7,5,3,1,0,0,40,6,0


In [19]:
# Feature part of the label-encoded frame (label column removed).
temp = df_label.drop(columns=target)
x_sc = np.array(temp)
# Rebuild a frame from the raw values so the numerical columns can be scaled.
df_label_sc = pd.DataFrame(data=x_sc, columns=cols)
# Min-max scale the numerical features; the scaler is refit on this frame.
df_label_sc[num_cols] = transformer.fit_transform(df_label_sc[num_cols])
# Re-attach the label column by position.
df_label_sc[target] = y

In [20]:
# Preview the label-encoded frame with min-max-scaled numerical columns.
df_label_sc.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,country,income
0,0.30137,5,4,1,3,4,5,2,0.02174,0.0,0.397959,5,0
1,0.452055,6,4,4,8,6,5,2,0.0,0.0,0.122449,5,0
2,0.287671,4,2,3,4,4,5,2,0.0,0.0,0.397959,5,0
3,0.493151,4,1,4,4,6,3,2,0.0,0.0,0.397959,5,0
4,0.150685,4,4,4,7,5,3,1,0.0,0.0,0.397959,6,0


In [21]:
# Persist the label-encoded, scaled dataset (index column omitted). The file
# keeps the raw dataset's base name but is written to the sf/ subdirectory.
# NOTE(review): assumes the ../datasets/sf/ directory already exists.
df_label_sc.to_csv('../datasets/sf/'+data+'.csv', index=False)

In [24]:
# Re-key the embeddings by integer label instead of category name:
# {column position: {integer label: embedding value}}.
dict3 = {
    col_idx: {
        label: embedding[category]
        for category, label in label_dict[col_idx].items()
    }
    for col_idx, embedding in cat_embed.items()
}

In [25]:
# Inspect the mapping {column position: {integer label: embedding value}}.
dict3

{1: {1: 0.017324792765636775,
  2: 0.032483986435568955,
  3: 0.11998717137761922,
  4: 0.23723854431434338,
  5: 0.27538451379181966,
  6: 0.29481942564713876,
  7: 0.29937525032316603,
  8: 0.38497831022789286,
  9: 0.5546677784976795},
 2: {1: 0.06781777800273409,
  2: 0.1856248971100108,
  3: 0.2587878225259045,
  4: 0.41259349464816814,
  5: 0.547978130684918,
  6: 0.7288533895793379,
  7: 0.7326547689615002},
 3: {1: 0.05811495471754306,
  2: 0.08642038142716364,
  3: 0.1057510649538709,
  4: 0.4832713460545632},
 4: {1: 0.04595611193641975,
  2: 0.12598718914845516,
  3: 0.15115294535020268,
  4: 0.18842907742711015,
  5: 0.21283775087264106,
  6: 0.28228183825231207,
  7: 0.45058545903924474,
  8: 0.48032584145028573},
 5: {1: 0.018714488561904438,
  2: 0.03968037475131475,
  3: 0.06953165718110371,
  4: 0.11778179016343256,
  5: 0.47091746801295514,
  6: 0.49237991294760675},
 6: {1: 0.09286717607163437,
  2: 0.11811033189408648,
  3: 0.1308857688205829,
  4: 0.266917750619791