In [16]:
import pandas as pd
import gower
import numpy as np

In [17]:
"""
Features:

 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 2   family_history_with_overweight  2111 non-null   object 
 3   FAVC                            2111 non-null   object 
 4   FCVC                            2111 non-null   float64
 5   NCP                             2111 non-null   float64
 6   CAEC                            2111 non-null   object 
 7   SMOKE                           2111 non-null   object 
 8   CH2O                            2111 non-null   float64
 9   SCC                             2111 non-null   object 
 10  FAF                             2111 non-null   float64
 11  TUE                             2111 non-null   float64
 12  CALC                            2111 non-null   object 
 13  MTRANS                          2111 non-null   object 
 14  NObeyesdad                      2111 non-null   object 
"""

# Keep only specified features and remove Unnamed and Height/Weight columns
COLUMNS_TO_KEEP = [
    'Gender', 'Age', 'Height', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP',
    'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'NObeyesdad'
]
AGE_FEATURE = ['Age']
GENDER_FEATURE = []
NUMERICAL_FEATURES = ['Height']
BOOLEAN_FEATURES = ['family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
ORDINAL_FEATURES = ['CAEC', 'CALC']
INTEGER_FEATURES = ['FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
ONE_HOT_FEATURES = ['MTRANS', 'Gender'] #dont want to drop the first column thats why its not all in one array

"""
- Move gender to ONE HOT
- Move Age to NUMERICAL
- Remove Weight

TODO:
- Round age, height, weight
- Integer Round FCVC, NCP, CH20, FAF, TUE
"""


'\n- Move gender to ONE HOT\n- Move Age to NUMERICAL\n- Remove Weight\n\nTODO:\n- Round age, height, weight\n- Integer Round FCVC, NCP, CH20, FAF, TUE\n'

In [18]:
## Gower distance
# Main function to remove similarities
def remove_similar(X_test, synth, threshold=.2):
    to_remove = []
    removed_map = {}

    for i, row in X_test.iterrows():
        # Compute Gower distance between the real record and the synthetic data
        gower_matrix = gower.gower_matrix(row.to_frame().T, synth)
        # Find indices of synthetic records within the threshold
        matches = np.where(gower_matrix <= threshold)
        unique_indices = np.unique(matches[1])
        
        # Save the indices of the removed records
        for idx in unique_indices:
            removed_map.setdefault(idx, []).append(i)

        # Save removed indices for later analysis (number of removed records)    
        to_remove.extend(unique_indices)
    
    # Remove found similarities from synthetic data
    synth = synth.drop(index=to_remove)

    # print(f"Number of unique indices to remove: {len(np.unique(to_remove))}")
    # print(f"Length of synth before: {len(synth_raw)}")
    # print(f"Length of synth after: {len(synth)}")
    # print(f"Total synth rows removed: {len(synth_raw) - len(synth)}")

    return synth, removed_map


In [19]:
from sklearn.model_selection import train_test_split
# Load the datasets

TEST_SIZE = 0.2

raw_pseudoreal_data = pd.read_csv("./datasets/real-data-20250501-154339.csv")
raw_real_data = raw_pseudoreal_data[0:477] # as in the paper
raw_pseudoreal_data = raw_pseudoreal_data[478:]

synthetic_data = pd.read_csv("./synth.csv")
synthetic_data = synthetic_data.drop(columns=['id'])
real_test_gower = train_test_split(raw_real_data, test_size=TEST_SIZE, random_state=42, stratify=raw_real_data['NObeyesdad'])
raw_synthetic_data, removed_map = remove_similar(real_test_gower[0], synthetic_data)

raw_synthetic_data = synthetic_data

# raw_synthetic_data = pd.concat([raw_pseudoreal_data, raw_synthetic_data])

RAW_DATA = {
    'pseudoreal': raw_pseudoreal_data,
    'real': raw_real_data,
    'synthetic': raw_synthetic_data
}

for dataset_type, data in RAW_DATA.items():
    print(f"Dataset type: {dataset_type}")
    print(data.info())

Dataset type: pseudoreal
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1633 entries, 478 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          1633 non-null   object 
 1   Age                             1633 non-null   float64
 2   Height                          1633 non-null   float64
 3   Weight                          1633 non-null   float64
 4   family_history_with_overweight  1633 non-null   object 
 5   FAVC                            1633 non-null   object 
 6   FCVC                            1633 non-null   float64
 7   NCP                             1633 non-null   float64
 8   CAEC                            1633 non-null   object 
 9   SMOKE                           1633 non-null   object 
 10  CH2O                            1633 non-null   float64
 11  SCC                             1633 non-null   object 
 12  FAF    

In [20]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

import numpy as np

# yes/no to 1/0
class BooleanToBinaryTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.replace({'yes': 1, 'no': 0})

# made a custom mapper to have control over value
class GenderToBinaryTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, column='Gender'):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.column] = X[self.column].map({'Male': 0, 'Female': 1})
        return X

#made a custom mapper to have control over value
class OrdinalMapper(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mapping = {
            'no': 0,
            'Sometimes': 1,
            'Frequently': 2,
            'Always': 3
        }

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # X = X.copy()
        # # Apply the mapping to all specified columns
        # for col, mapping in self.mapping.items():
        #     if col in X.columns:
        #         X[col] = X[col].map(mapping)
        return X.replace({
            'no': 0,
            'Sometimes': 1,
            'Frequently': 2,
            'Always': 3
        })

freq_map = {
    'no': 0,
    'Sometimes': 1,
    'Frequently': 2,
    'Always': 3
}

#Pipeline for Age
round_then_scale = Pipeline([
    ('round', FunctionTransformer(np.round, validate=False)),
    ('scale', StandardScaler())
])

# ('gender', GenderToBinaryTransformer(), ['Gender']),
# Combine all transformers into one preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('age', StandardScaler(), ['Age']), # was round_then_scale
        ('height', StandardScaler(), NUMERICAL_FEATURES),
        ('bool', BooleanToBinaryTransformer(), BOOLEAN_FEATURES),
        ('ordinal', OrdinalMapper(), ORDINAL_FEATURES),
        ('int', FunctionTransformer(np.round, validate=False), INTEGER_FEATURES),
        ('one-hot', OneHotEncoder(sparse_output=False, drop="if_binary"), ONE_HOT_FEATURES)
    ],
    remainder='drop'
)

In [21]:
#TODO update
def get_final_feature_names(preprocessor, X_df):
    """
    Reconstructs the full list of feature names after transformation.

    Parameters:
        preprocessor: the fitted ColumnTransformer
        X_df: the original unprocessed DataFrame (e.g., X_train)
        age, gender, boolean_features, ordinal_features, integer_features, one_hot_features: lists of assigned features

    Returns:
        A list of final feature names in the order they appear in the transformed array
    """
    #return variable
    final_feature_names = []


    #Assigned features
    assigned_features = AGE_FEATURE + GENDER_FEATURE + NUMERICAL_FEATURES + BOOLEAN_FEATURES + ORDINAL_FEATURES + INTEGER_FEATURES + ONE_HOT_FEATURES
    print(f"Assigned features ({len(assigned_features)}): {assigned_features}")


    #Passthrough features (not transformed)
    all_features = list(X_df.columns)
    passthrough_features = []
    #f for f in all_features if f not in assigned_features

    # for loop, so that the column names can be assigned as the preprocessor transformed them
    for name, transformer, columns in preprocessor.transformers_:
        
        # Handle each transformer according to its type
        if name == 'age':
            final_feature_names.extend(AGE_FEATURE)

        elif name == 'height':
            final_feature_names.extend(NUMERICAL_FEATURES)
        
        elif name == 'gender':
            final_feature_names.extend(GENDER_FEATURE)
        
        elif name == 'bool':
            final_feature_names.extend(BOOLEAN_FEATURES)
        
        elif name == 'ordinal':
            final_feature_names.extend(ORDINAL_FEATURES)
        
        elif name == 'int':
            final_feature_names.extend(INTEGER_FEATURES)
        
        elif name == 'one-hot':
            cat_ohe = preprocessor.named_transformers_['one-hot']
            cat_feature_names = cat_ohe.get_feature_names_out(ONE_HOT_FEATURES)
            print(cat_feature_names)
            final_feature_names.extend(cat_feature_names)

    final_feature_names.extend(passthrough_features)

    return final_feature_names

In [22]:
from sklearn.model_selection import train_test_split

# Preprocess the data
X_real = raw_real_data.drop(columns=['NObeyesdad'])
y_real = raw_real_data['NObeyesdad']

# Real Real data 477, splitted
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(X_real, y_real, test_size=TEST_SIZE, random_state=42, stratify=y_real)

# Only pseudoreal data ~1666
X_pseudoreal = raw_pseudoreal_data.drop(columns=['NObeyesdad'])
y_pseudoreal = raw_pseudoreal_data['NObeyesdad']

# Only synthetic data ~20k
X_synth = raw_synthetic_data.drop(columns=['NObeyesdad'])
y_synth = raw_synthetic_data['NObeyesdad']

# Pseudoreal + real-real train
X_pseudoreal_real = pd.concat([X_train_real, X_pseudoreal])
y_pseudoreal_real = pd.concat([y_train_real, y_pseudoreal])

# Pseudoreal + real + synthetic
X_synth_pseudoreal_real = pd.concat([X_pseudoreal_real, X_synth])
Y_synth_pseudoreal_real = pd.concat([y_pseudoreal_real, y_synth])

test_data = {
    'X': X_test_real,
    'y': y_test_real
}


real_real_data = {
    'X': X_train_real,
    'y': y_train_real,
}

real_pseudoreal_data = {
    'X': X_pseudoreal_real,
    'y': y_pseudoreal_real,
}

synthetic_pseudoreal_data = {
    'X': X_synth_pseudoreal_real,
    'y': Y_synth_pseudoreal_real,
}


def preprocess(data):
    X = pd.DataFrame(preprocessor.fit_transform(data['X']), columns=get_final_feature_names(preprocessor, data['X']))
    print(X.shape)

    # y = pd.DataFrame(preprocessor.transform(data['y']), columns=get_final_feature_names(preprocessor, data['y'], AGE_FEATURE, GENDER_FEATURE, BOOLEAN_FEATURES, ORDINAL_FEATURES, INTEGER_FEATURES, ONE_HOT_FEATURES))
    # print(y.shape)

    return {
        'X': X,
        'y': data['y']
    }

In [23]:
# Check if features are equal, otherwise insert 0 in place for missing features
PROCESSED_DATA = {
    'real': preprocess(real_real_data),
    'pseudoreal': preprocess(real_pseudoreal_data),
    'synthetic': preprocess(synthetic_pseudoreal_data),
    'test': preprocess(test_data)
}

# # Get all unique columns from both datasets
# all_columns = None

# for dataset_type in PROCESSED_DATA.keys():
#     if all_columns is None:
#         all_columns = set(PROCESSED_DATA[dataset_type]['X_train'].columns)
#     else:
#         all_columns = all_columns.intersection(set(PROCESSED_DATA[dataset_type]['X_train'].columns))


# # Add missing columns with zeros to both datasets
# for dataset_type in PROCESSED_DATA.keys():
#     for split in ['X_train', 'X_test']:
#         missing_cols = all_columns - set(A[dataset_type][split].columns)
#         for col in missing_cols:
#             PROCESSED_DATA[dataset_type][split][col] = 0
            
#         # Ensure columns are in the same order
#         PROCESSED_DATA[dataset_type][split] = PROCESSED_DATA[dataset_type][split][sorted(all_columns)]

# # Verify that columns are now equal
# for dataset_type in PROCESSED_DATA.keys():
#     assert set(PROCESSED_DATA[dataset_type]['X_train'].columns) == set(PROCESSED_DATA[dataset_type]['X_test'].columns)

# print("Features are now aligned between real and synthetic datasets")

Assigned features (15): ['Age', 'Height', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'CAEC', 'CALC', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'MTRANS', 'Gender']
['MTRANS_Automobile' 'MTRANS_Bike' 'MTRANS_Motorbike'
 'MTRANS_Public_Transportation' 'MTRANS_Walking' 'Gender_Male']
(381, 19)
Assigned features (15): ['Age', 'Height', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'CAEC', 'CALC', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'MTRANS', 'Gender']
['MTRANS_Automobile' 'MTRANS_Bike' 'MTRANS_Motorbike'
 'MTRANS_Public_Transportation' 'MTRANS_Walking' 'Gender_Male']
(2014, 19)
Assigned features (15): ['Age', 'Height', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'CAEC', 'CALC', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'MTRANS', 'Gender']
['MTRANS_Automobile' 'MTRANS_Bike' 'MTRANS_Motorbike'
 'MTRANS_Public_Transportation' 'MTRANS_Walking' 'Gender_Male']
(22772, 19)
Assigned features (15): ['Age', 'Height', 'family_history_with_overweight', 'FAVC', 'SMOKE', '

  return X.replace({'yes': 1, 'no': 0})
  return X.replace({
  return X.replace({'yes': 1, 'no': 0})
  return X.replace({
  return X.replace({'yes': 1, 'no': 0})
  return X.replace({
  return X.replace({'yes': 1, 'no': 0})
  return X.replace({


In [24]:
# Write the processed data to CSV

PATH_PREFIX = './datasets/preprocessed/'

for name, data in PROCESSED_DATA.items():
    data['X'].to_csv(PATH_PREFIX + name + '_X.csv', index=False)
    data['y'].to_csv(PATH_PREFIX + name + '_y.csv', index=False)

In [38]:
mean_test = X_test_real.mean(numeric_only=True)

mean_real = X_train_real.mean(numeric_only=True)
mean_pseudoreal = X_pseudoreal.mean(numeric_only=True)
mean_synthetic = X_synth.mean(numeric_only=True)

print("Difference between test set and real set:")
print(mean_test - mean_real)

print("Difference between test set and pseudoreal set:")
print(mean_test - mean_pseudoreal)

print("Difference between test set and synthetic set:")
print(mean_test - mean_synthetic)

Difference between test set and real set:
Age      -1.094488
Height   -0.010535
Weight   -0.563279
FCVC      0.083497
NCP      -0.007710
CH2O     -0.067339
FAF      -0.032644
TUE       0.065289
dtype: float64
Difference between test set and pseudoreal set:
Age       -2.305919
Height    -0.028281
Weight   -22.261665
FCVC      -0.049115
NCP       -0.049427
CH2O      -0.143311
FAF        0.182736
TUE        0.050186
dtype: float64
Difference between test set and synthetic set:
Age       -1.508471
Height    -0.022328
Weight   -18.617977
FCVC      -0.050075
NCP       -0.115499
CH2O      -0.144002
FAF        0.164087
TUE        0.091577
dtype: float64


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a441f35e-4b4c-4c50-b56a-1aea6b800ed8' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>