In [None]:
import pandas as pd
import gower
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

## Loading the datasets
We load the datasets, we decided to split them into 3 different datasets and concatenate them later on based the use case. The concatination step is done before any preprocessing.

TODO from 12.05.2025 21:54 (add more and when chaning the todo add a new date)
- encode target variabels
- remove columns Smoke(bad representation + small correlation value) and SCC(bad representation aswell, unsure about correlation value (0.224), have a discussion about it.)


In [None]:
# Load the raw data

TEST_SIZE = 0.2

raw_pseudoreal_data = pd.read_csv("./datasets/real-data-20250501-154339.csv")

# Split the data into actual real data and generated with SMOTE(pseudoreal)
raw_real_data = raw_pseudoreal_data[0:477]
raw_pseudoreal_data = raw_pseudoreal_data[478:]

# Synthetic data generated using DeepLearning model
raw_synthetic_data = pd.read_csv("./datasets/synth.csv")
raw_synthetic_data = raw_synthetic_data.drop(columns=['id'])

# Print the shape of the datasets
print(f"Shape of raw_real_data: {raw_real_data.shape}")
print(f"Shape of raw_pseudoreal_data: {raw_pseudoreal_data.shape}")
print(f"Shape of synthetic_data: {raw_synthetic_data.shape}")

Shape of raw_real_data: (477, 17)
Shape of raw_pseudoreal_data: (1633, 17)
Shape of synthetic_data: (20758, 17)


## Gower distance
to remove similar records between the pseudoreal-real dataset and synth, we use Gowers similarity measure. The function below removes the similarities between the test set and the synthetical dataset, so that information leakage can be handled accordingly. 

In [16]:
# Function to remove similarities based on Gower distance
def remove_similar(X_test, synth, threshold=.2):
    '''
    Remove similarities between real and synthetic data

    Parameters
    ----------
    X_test : DataFrame
        Test data
    synth : DataFrame
        Synthetic data
    threshold : float, optional
        Threshold for similarity, by default .2
        
    Returns
    -------
    DataFrame
        Synthetic data without similar records
    dict
        Map of removed records
    '''
    to_remove = []
    removed_map = {}

    for i, row in X_test.iterrows():
        # Compute Gower distance between the real record and the synthetic data
        gower_matrix = gower.gower_matrix(row.to_frame().T, synth)
        # Find indices of synthetic records within the threshold
        matches = np.where(gower_matrix <= threshold)
        unique_indices = np.unique(matches[1])
        
        # Save the indices of the removed records
        for idx in unique_indices:
            removed_map.setdefault(idx, []).append(i)

        # Save removed indices for later analysis (number of removed records)    
        to_remove.extend(unique_indices)
    
    # Remove found similarities from synthetic data
    synth = synth.drop(index=to_remove)
    print(f"Removed {len(to_remove)} similar records from synthetic data.")

    return synth, removed_map


Removing a row since it has only 1 entry of it over all the datasets

In [17]:
# Remove rows where the column 'CALC' has the value 'Always'
filtered_data = raw_real_data[raw_real_data['CALC'] != 'Always']


## Initial data split
we splitted the data initially between pseudoreal-real and synth data, because at this point we haven't noticed the problem with the dataset. In this format the test-set that we use for comparison is created from the pseudoreal-real dataset. 

In [18]:
# Split the data into train and test sets for different approaches
# Initial approach: train on real+pseudoreal, train on real+pseudoreal+synthetic, test on split real+pseudoreal
# Extra approach: train on real, train on real+pseudoreal, train on real+pseudoreal+synthetic, test on split real

# Save all splits in a dictionary
data = {
    'Initial': {},
    'Extra': {}
}


# Initial approach

# Real + pseudoreal data based on different train/test split
real_pseudoreal_data = pd.concat([raw_real_data, raw_pseudoreal_data])
X_real_pseudoreal = real_pseudoreal_data.drop(columns=['NObeyesdad'])
Y_real_pseudoreal = real_pseudoreal_data['NObeyesdad']
X_train_real_pseudoreal, X_test_real_pseudoreal, y_train_real_pseudoreal, y_test_real_pseudoreal = train_test_split(
    X_real_pseudoreal, Y_real_pseudoreal, test_size=TEST_SIZE, random_state=42, stratify=Y_real_pseudoreal)

data['Initial']['real_train'] = {
    'X': X_train_real_pseudoreal,
    'y': y_train_real_pseudoreal
}

# Test split on real + pseudoreal
data['Initial']['test'] = {
    'X': X_test_real_pseudoreal,
    'y': y_test_real_pseudoreal
}

# Remove similarities between real and synthetic data
real_pseudoreal_test = pd.concat([X_test_real_pseudoreal, y_test_real_pseudoreal], axis=1)
synthetic_data_clean, removed_map = remove_similar(real_pseudoreal_test, raw_synthetic_data)

# Real + pseudoreal + synthetic train (different from the train set below)
X_synth = synthetic_data_clean.drop(columns=['NObeyesdad'])
y_synth = synthetic_data_clean['NObeyesdad']
X_real_pseudoreal_synth = pd.concat([X_train_real_pseudoreal, X_synth])
Y_real_pseudoreal_synth = pd.concat([y_train_real_pseudoreal, y_synth])


data['Initial']['synth_train'] = {
    'X': X_real_pseudoreal_synth,
    'y': Y_real_pseudoreal_synth
}


Removed 2410 similar records from synthetic data.


## Extra data split
In this step we split the data three ways, since we saw the flaw in the data. According to the paper we found online, where the dataset originates from:
<ul>
<li>entries 0-477 from "real-data.csv" are from a survey, we named them <strong>real</strong>.</li>
<li>entries 478-2111 from "real-data.csv" are created by SMOTE to even out the target class distribution, we named them <strong>pseudoreal</strong>.</li>
<li>the 20758 entries from "synth.csv" are created by a DL model, which isnt specified, so we named them <strong>synth</strong>.</li>
</ul>

In [19]:

# Extra approach

X_real = raw_real_data.drop(columns=['NObeyesdad'])
y_real = raw_real_data['NObeyesdad']

# Real Real data 477, splitted
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(X_real, y_real, test_size=TEST_SIZE, random_state=42, stratify=y_real)

data['Extra']['real_train'] = {
    'X': X_train_real,
    'y': y_train_real,
}

# Test split on real data only
data['Extra']['test'] = {
    'X': X_test_real,
    'y': y_test_real
}

# Only pseudoreal data ~1666
X_pseudoreal = raw_pseudoreal_data.drop(columns=['NObeyesdad'])
y_pseudoreal = raw_pseudoreal_data['NObeyesdad']

# Real + pseudoreal train
X_pseudoreal_real = pd.concat([X_train_real, X_pseudoreal])
y_pseudoreal_real = pd.concat([y_train_real, y_pseudoreal])

data['Extra']['real_pseudoreal_train'] = {
    'X': X_pseudoreal_real,
    'y': y_pseudoreal_real,
}

# Remove similarities between real test data and synthetic data
real_test = pd.concat([X_test_real, y_test_real], axis=1)
synthetic_data_clean, removed_map = remove_similar(real_test, raw_synthetic_data)

# Only synthetic data ~20k
X_synth = synthetic_data_clean.drop(columns=['NObeyesdad'])
y_synth = synthetic_data_clean['NObeyesdad']

# Real + pseudoreal + synthetic train
X_synth_pseudoreal_real = pd.concat([X_pseudoreal_real, X_synth])
Y_synth_pseudoreal_real = pd.concat([y_pseudoreal_real, y_synth])


data['Extra']['real_pseudoreal_synth_train'] = {
    'X': X_synth_pseudoreal_real,
    'y': Y_synth_pseudoreal_real,
}


Removed 628 similar records from synthetic data.


We split the features in categories

In [None]:
# Define features for preprocessing

# Numeric features with rounding to int
numeric_features_int = ['Age']
# Numeric features with rounding to 2 decimal places
numeric_features_2dp = ['Height']
# Boolean features
boolean_features = ['family_history_with_overweight', 'FAVC', 'SCC']
# Categorical features
categorical_features = ['MTRANS', 'Gender']
# Ordinal features with rounding to int
ordinal_features_int = ['FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
# Ordinal features with mapping to int
ordinal_features_map_int = ['CAEC', 'CALC']
# Drop features (Just for the info)
drop_features = ['Weight', 'SMOKE']

## Preprocessor
We define custom preprocessors here and also align them in a pipeline, for more clarity. Afterwards we assign them to a columntransformer, so that the preprocessing step can be done easily on multiple similar datasets.

In [None]:
# Create preprocessing pipeline

# Numerical tranformer
class NumericalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, round_type='int'):
        self.scaler = StandardScaler()
        self.round_type = round_type
        self.features = []
    
    def _round(self, X):
        if self.round_type == 'int':
            return np.round(X).astype(int)
        elif self.round_type == 'float':
            return np.round(X, 2)
        return X    

    def fit(self, X, y=None):
        X_rounded = self._round(X)
        self.scaler.fit(X_rounded)
        self.features = X.columns.tolist()
        return self

    def transform(self, X):
        X_rounded = self._round(X)
        return self.scaler.transform(X_rounded)
    
    def get_feature_names_out(self, input_features=None):
        return np.array(self.features)


# Boolean transformer
class BooleanTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.features = []

    def fit(self, X, y=None):
        self.features = X.columns.tolist()
        return self

    def transform(self, X):
        return X.replace({'yes': 1, 'no': 0})
    
    def get_feature_names_out(self, input_features=None):
        return np.array(self.features)


# Ordinal transformer
class OrdinalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, rounding=False):
        self.rounding = rounding
        self.mapping = {
            'no': 0,
            'Sometimes': 1,
            'Frequently': 2,
            'Always': 3
        }
        self.features = []

    def fit(self, X, y=None):
        self.features = X.columns.tolist()
        return self

    def transform(self, X):
        if self.rounding:
            return np.round(X).astype(int)
        else:
            return X.replace(self.mapping)
    
    def get_feature_names_out(self, input_features=None):
        return np.array(self.features)
   

# Numeric int pipeline
numeric_int_pipeline = Pipeline(steps=[
    ('scaler_int', NumericalTransformer(round_type='int'))
])

# Numeric 2dp pipeline
numeric_2dp_pipeline = Pipeline(steps=[
    ('scaler_2dp', NumericalTransformer(round_type='float'))
])

# Boolean pipeline
boolean_pipeline = Pipeline(steps=[
    ('boolean', BooleanTransformer())
])

# Ordinal int pipeline
ordinal_int_pipeline = Pipeline(steps=[
    ('ordinal_int', OrdinalTransformer(rounding=True))
])

# Ordinal map int pipeline
ordinal_map_int_pipeline = Pipeline(steps=[
    ('ordinal_map', OrdinalTransformer(rounding=False))
])

# Categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_int', numeric_int_pipeline, numeric_features_int),
        ('numeric_2dp', numeric_2dp_pipeline, numeric_features_2dp),
        ('boolean', boolean_pipeline, boolean_features),
        ('ordinal_int', ordinal_int_pipeline, ordinal_features_int),
        ('ordinal_map_int', ordinal_map_int_pipeline, ordinal_features_map_int),
        ('categorical', categorical_pipeline, categorical_features)
    ],
    remainder='drop'
)


We use the preprocessor defined above for every training dataset

In [None]:

# Preprocess the data
for approach, datasets in data.items():
    for name, dataset in datasets.items():
        if 'train' in name:
            
            X = dataset['X']
            # Fit the preprocessor on the training data
            preprocessor.fit(X)
            
            # Transform the training and test data
            X_train = preprocessor.transform(X)
            X_test = preprocessor.transform(datasets['test']['X'])
            

            # Encode the target variable
            target_map = {
                'Insufficient_Weight': 0,
                'Normal_Weight': 1,
                'Overweight_Level_I': 2,
                'Overweight_Level_II': 3,
                'Obesity_Type_I': 4,
                'Obesity_Type_II': 5,
                'Obesity_Type_III': 6
            }

            y_train = pd.DataFrame(dataset['y'].replace(target_map))
            y_test = pd.DataFrame(datasets['test']['y'].replace(target_map))
            
            # Convert to DataFrame
            X_train = pd.DataFrame(X_train, columns=preprocessor.get_feature_names_out())
            X_test = pd.DataFrame(X_test, columns=preprocessor.get_feature_names_out())
            
            # Save the preprocessed data
            folder = f"./datasets/preprocessed/{approach}/{name.replace('train', 'data')}/"
            os.makedirs(folder, exist_ok=True)
            X_train.to_csv(os.path.join(folder, "X_train.csv"), index=False)
            y_train.to_csv(os.path.join(folder, "y_train.csv"), index=False)
            X_test.to_csv(os.path.join(folder, "X_test.csv"), index=False)
            y_test.to_csv(os.path.join(folder, "y_test.csv"), index=False)
            print(f"Preprocessed data saved for {approach} - {name}")
        
            




Preprocessed data saved for Initial - real_train


  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)


Preprocessed data saved for Initial - synth_train
Preprocessed data saved for Extra - real_train
Preprocessed data saved for Extra - real_pseudoreal_train


  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)
  return X.replace({'yes': 1, 'no': 0})
  return X.replace(self.mapping)


Preprocessed data saved for Extra - real_pseudoreal_synth_train
