In [2]:
import pandas as pd
from pandas_profiling import ProfileReport
from IPython.display import display

# Read both source tables; the synthetic file carries an extra 'id' column
# that is removed so the two frames can be stacked.
real_data = pd.read_csv("./datasets/real-data-20250501-154339.csv")
synth_data = pd.read_csv("./datasets/synth.csv").drop(columns=['id'])
all_data = pd.concat([real_data, synth_data], ignore_index=True)

# Build one profiling report per dataset
profileInitialReal = ProfileReport(real_data, title="Real Data Profile Report")
profileInitialSynth = ProfileReport(synth_data, title="Synthetic Data Profile Report")

# Display name -> DataFrame lookup used by the later display/plot cells
datasetsInitial = {
    "Real Data": real_data,
    "Synthetic Data": synth_data,
}

# Class balance of the target, as absolute counts and divided by 5
value_counts = real_data['NObeyesdad'].value_counts()
print(value_counts)
print(value_counts / 5)

In [3]:
# Position in the CSV where the genuinely collected rows end and the
# SMOTE-generated ("pseudoreal") rows begin.
# NOTE(review): the previous slices were [0:477] and [478:], which silently
# dropped the row at position 477 from both subsets. The real subset is kept
# unchanged (first 477 rows) and the pseudoreal subset now starts right after
# it — confirm whether the true boundary is 477 or 478.
N_REAL_ROWS = 477

raw_full_data = pd.read_csv("./datasets/real-data-20250501-154339.csv")

# Split the data into actual real data and generated with SMOTE (pseudoreal);
# one shared boundary guarantees every row lands in exactly one subset.
raw_real_data = raw_full_data.iloc[:N_REAL_ROWS]
raw_pseudoreal_data = raw_full_data.iloc[N_REAL_ROWS:]
assert len(raw_real_data) + len(raw_pseudoreal_data) == len(raw_full_data)

# Synthetic data generated using DeepLearning model
raw_synthetic_data = pd.read_csv("./datasets/synth.csv")
raw_synthetic_data = raw_synthetic_data.drop(columns=['id'])

# Generate profile reports
profileRawReal = ProfileReport(raw_real_data, title="Raw Real Data Profile Report")
profileRawPseudoreal = ProfileReport(raw_pseudoreal_data, title="Raw Pseudoreal Data Profile Report")
profileRawSynth = ProfileReport(raw_synthetic_data, title="Raw Synthetic Data Profile Report")

# Display name -> DataFrame lookup for the three-way ("extra") comparison
datasetsExtra = {}
datasetsExtra["Raw Real Data"] = raw_real_data
datasetsExtra["Raw Pseudoreal Data"] = raw_pseudoreal_data
datasetsExtra["Raw Synthetic Data"] = raw_synthetic_data

# Class balance of the target in the real subset, as counts and divided by 5
value_counts = raw_real_data['NObeyesdad'].value_counts()
print(value_counts)
print(value_counts/5)


In [4]:
def display_data(df_name, df):
    """
    Show a basic overview of one DataFrame in the notebook output.

    Emits, in order: the shape, the `info()` summary, the first and last
    rows, the `describe()` statistics and the per-column count of missing
    values, each under a printed heading.

    Parameters:
    - df_name: Label used in the printed headings.
    - df: The DataFrame to summarise.

    Returns nothing; everything is written to the cell output.
    """
    rule = 50 * "-"

    print(50*"=")
    print(f"dataset: {df_name} \n" + rule)
    print(f"\nShape of {df_name}:\n" + rule)
    display(df.shape)

    print(f"\ndataset info {df_name}:\n" + rule)
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() only appended a stray "None" to the output.
    df.info()

    print(f"\nhead and tail of {df_name}:\n" + rule)
    display(df.head())
    display(df.tail())

    print(f"\ndescribe {df_name}:\n" + rule)
    display(df.describe())

    print(f"\nMissing Values in {df_name}:\n" + rule)
    display(df.isnull().sum())
    


## Displaying basic data value
We display basic information about each dataset to get a first idea of what the data consists of: whether there are any missing values, which data types we have, and so on.

In [5]:

# Print the overview of every dataset registered for the initial approach
for df_name in datasetsInitial:
    df = datasetsInitial[df_name]
    display_data(df_name, df)
    

## Profile reports
We display the profile reports to look at the correlation matrices, key patterns, and distributions.

In [None]:
# Render all five profiling reports, initial datasets first, then the raw splits
display(
    profileInitialReal,
    profileInitialSynth,
    profileRawReal,
    profileRawPseudoreal,
    profileRawSynth,
)

Gender with respect to the target is badly distributed for Obesity Type II and Obesity Type III; maybe we should just bin them.

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Target classes of 'NObeyesdad' in the order used on every plot axis and legend
ordered_categories = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
]


## Understanding the data Distribution
First we create functions that plot different data distributions, to see how the features are distributed with regard to the target. We assume that the synthetic data (SMOTE and Synth) have strange distributions because of the data they originate from.

In [8]:
def plot_stacked_bars_multiple_dfs(dfs, df_names, target, feature, title, ordered_categories=None, ncols=2):
    """
    Plot stacked bar charts for the same feature across multiple DataFrames.

    For every DataFrame, the rows are counted per (target, feature) pair and
    drawn as one stacked bar per target category. Float features are rounded
    to the nearest integer first so they form discrete levels.

    Parameters:
    - dfs: Sequence of DataFrames to plot.
    - df_names: Names for each DataFrame (used as subplot titles), parallel to dfs.
    - target: The target column name.
    - feature: The feature column name to plot. 'family_history' is accepted
      as shorthand for the 'family_history_with_overweight' column.
    - title: The title for the whole plot.
    - ordered_categories: Optional list of target categories fixing the bar
      order. Categories missing from a DataFrame are drawn with zero counts.
      When None, the order produced by the grouping is kept.
    - ncols: Number of columns in the plot grid (default is 2).

    Raises:
    - ValueError: if dfs is empty.
    """
    if len(dfs) == 0:
        raise ValueError("dfs must contain at least one DataFrame")

    # Resolve the column to read and the shorter label to print once, instead
    # of rewriting the `feature` argument back and forth inside the loop.
    long_name, short_name = 'family_history_with_overweight', 'family_history'
    column = long_name if feature == short_name else feature
    label = short_name if column == long_name else feature

    # Calculate the number of rows needed
    nrows = (len(dfs) + ncols - 1) // ncols

    # squeeze=False always yields a 2-D array of axes, so flattening is valid
    # for any combination of len(dfs) and ncols (including a single DataFrame
    # on a multi-column grid, which previously broke).
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12 * ncols, 8 * nrows), squeeze=False)
    fig.suptitle(f'Stacked Distribution of {feature} on {title}', fontsize=24, y=1.05)
    axes = axes.flatten()

    for i, df in enumerate(dfs):
        ax = axes[i]

        # Work on a copy of just the two needed columns so the caller's frame
        # is never modified by the rounding below.
        plot_df = df[[target, column]].copy()
        if plot_df[column].dtype == 'float64':
            plot_df[column] = plot_df[column].round()

        # Create a DataFrame with counts of the feature for each target category
        counts_df = plot_df.groupby([target, column]).size().unstack(fill_value=0)

        # Reorder the rows according to the specified order of categories;
        # absent categories become zero-count rows rather than NaN.
        if ordered_categories is not None:
            counts_df = counts_df.reindex(ordered_categories, fill_value=0)

        # Plot the stacked bar plot
        counts_df.plot(kind='bar', stacked=True, ax=ax, colormap='coolwarm')
        ax.set_title(df_names[i])
        ax.set_ylabel(f'Counts of {label} Levels')
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        ax.legend(title=f'{label} Level', bbox_to_anchor=(1.05, 1), loc='upper left')

    # Remove empty subplots if the number of dataframes is less than the number of subplots
    for unused_ax in axes[len(dfs):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

function for boxplots

In [9]:
def plot_boxplots_multiple_dfs(dfs, df_names, target, feature, title, ncols=2, order=None):
    """
    Plot box plots of the same feature per target category across multiple DataFrames.

    Parameters:
    - dfs: Sequence of DataFrames to plot.
    - df_names: Names for each DataFrame (used as subplot titles), parallel to dfs.
    - target: The target column name (categories on the x-axis).
    - feature: The feature column name whose values are plotted on the y-axis.
    - title: The title for the whole plot.
    - ncols: Number of columns in the plot grid (default is 2).
    - order: Optional list of target categories fixing the x-axis order.
      When None, the notebook-level `ordered_categories` list is used.

    Raises:
    - ValueError: if dfs is empty.
    """
    if len(dfs) == 0:
        raise ValueError("dfs must contain at least one DataFrame")

    # Fall back to the notebook-level ordering so existing calls keep working
    categories = ordered_categories if order is None else order

    # Calculate the number of rows needed
    nrows = (len(dfs) + ncols - 1) // ncols

    # squeeze=False always yields a 2-D array of axes, so flattening is valid
    # even for a single DataFrame on a multi-column grid.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12 * ncols, 8 * nrows), squeeze=False)
    # The title and y-label were copied from the stacked-bar helper and
    # described counts; a box plot shows the feature's values themselves.
    fig.suptitle(f'Boxplots of {feature} on {title}', fontsize=24, y=1.05)
    axes = axes.flatten()

    for i, df in enumerate(dfs):
        ax = axes[i]

        # One box per target category, drawn in the requested order
        sns.boxplot(x=target, y=feature, data=df, ax=ax, order=categories, color='lightgrey')
        ax.set_title(df_names[i])
        ax.set_xlabel(f'{target} Categories')
        ax.set_ylabel(f'{feature}')
        ax.set_xticks(range(len(categories)))
        ax.set_xticklabels(categories, rotation=45)

    # Remove empty subplots if the number of dataframes is less than the number of subplots
    for unused_ax in axes[len(dfs):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    # Leave headroom for the figure title after tight_layout packed the axes
    plt.subplots_adjust(top=0.9)
    plt.show()

displaying all the plots

In [10]:
# Split each name -> DataFrame mapping into two parallel tuples, the form the
# plotting helpers expect. A dict's keys() and values() iterate in the same
# order, so names and frames stay aligned.
df_names_initial = tuple(datasetsInitial.keys())
dfs_initial = tuple(datasetsInitial.values())

df_names_extra = tuple(datasetsExtra.keys())
dfs_extra = tuple(datasetsExtra.values())

# Discrete (or rounded) features shown as stacked bars per target category
df_features_for_stacking = ('Gender', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'CALC','SCC')

# Continuous features shown as box plots per target category
df_features_for_boxplot = ('Height', 'Weight', 'Age')

for feature in df_features_for_stacking:
    plot_stacked_bars_multiple_dfs(dfs_initial, df_names_initial, 'NObeyesdad', feature, 'Initial Approach', ordered_categories)
    plot_stacked_bars_multiple_dfs(dfs_extra, df_names_extra, 'NObeyesdad', feature, 'Extra Approach', ordered_categories, ncols=3)

for feature in df_features_for_boxplot:
    plot_boxplots_multiple_dfs(dfs_initial, df_names_initial, 'NObeyesdad', feature, 'Initial Approach')
    plot_boxplots_multiple_dfs(dfs_extra, df_names_extra, 'NObeyesdad', feature, 'Extra Approach', ncols=3)

## Plotting features
We plot interesting features from the ProfileReport to see how they correlate, to get a better understanding of their interaction, and to check whether it changes between datasets.

In [29]:
def plot_scatterplot_multiple_dfs(dfs, df_names, feature_0, feature_1, title):
    """
    Plot scatter plots of the same feature pair across multiple DataFrames.

    All DataFrames are drawn side by side in a single row, one subplot each,
    with the points coloured by the 'NObeyesdad' column using a fixed palette
    so a category has the same colour in every subplot.

    Parameters:
    - dfs: Sequence of DataFrames to plot.
    - df_names: Names for each DataFrame (used as subplot titles), parallel to dfs.
    - feature_0: The column name plotted on the x-axis.
    - feature_1: The column name plotted on the y-axis.
    - title: The title for the whole plot.

    Relies on the notebook-level `ordered_categories` list for the legend order.

    Raises:
    - ValueError: if dfs is empty.
    """
    if len(dfs) == 0:
        raise ValueError("dfs must contain at least one DataFrame")

    # Predefined color mapping for consistency
    fixed_colors = {
        'Normal_Weight': '#1f77b4',       # Blue
        'Overweight_Level_I': '#ff7f0e',  # Orange
        'Overweight_Level_II': '#2ca02c', # Green
        'Obesity_Type_I': '#d62728',      # Red
        'Obesity_Type_II': '#9467bd',     # Purple
        'Obesity_Type_III': '#8c564b',    # Brown
        'Insufficient_Weight': '#e377c2'  # Pink
    }

    # One row with exactly one subplot per DataFrame, so there are never
    # spare axes to remove; squeeze=False keeps `axes` an array even for a
    # single DataFrame.
    ncols = len(dfs)
    fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=(6 * ncols, 6), squeeze=False)
    fig.suptitle(f'Scatterplot of {feature_1} and {feature_0} w.r.t. Obesity Categories \n on {title}', fontsize=24, y=1.05)
    axes = axes.flatten()

    for i, df in enumerate(dfs):
        ax = axes[i]

        # Plot the scatter plot
        sns.scatterplot(
            x=feature_0, y=feature_1, hue='NObeyesdad', hue_order=ordered_categories,
            data=df, ax=ax, palette=fixed_colors, alpha=0.7
        )
        ax.set_title(df_names[i])
        ax.set_xlabel(f'{feature_0}')
        ax.set_ylabel(f'{feature_1}')
        ax.grid(True)

    plt.tight_layout()
    # Leave headroom for the two-line figure title after tight_layout
    plt.subplots_adjust(top=0.9)
    plt.show()

In [30]:
# (x-axis feature, y-axis feature) pairs to compare across the datasets
df_features_for_scattering = (
    ('Age', 'Height'),
    ('Age', 'Weight'),
    ('Height', 'Weight'),
    ('FCVC', 'Gender'),
    ('FAF', 'Height'),
    ('Age', 'TUE'),
    ('Gender', 'FAVC'),
    ('NCP', 'family_history_with_overweight'),
)

# Draw each pair once for the two-dataset view and once for the three-dataset view
for feature in df_features_for_scattering:
    plot_scatterplot_multiple_dfs(dfs_initial, df_names_initial, *feature, 'Initial Approach')
    plot_scatterplot_multiple_dfs(dfs_extra, df_names_extra, *feature, 'Extra Approach')
    

## Duplicates
9 instances are duplicated, 33 times in total, which is within 1% of the entire dataset. We decided to keep these values, since we assume that there are people who have the same body features and lifestyles.
Some targets are heavily underrepresented, for example Overweight_I, and the duplicates are actually SMOTE values from these heavily underrepresented targets. Since SMOTE values are a linear combination of the K nearest neighbours, and since there are not many different values, the neighbours will often be the same.

In [13]:
# Select every row of the real dataset that has at least one exact copy;
# keep=False marks all members of a duplicate group, not just the repeats.
duplicatedRows = real_data.loc[real_data.duplicated(keep=False)]
print(duplicatedRows)
print(f"Total number of duplicate entries (including repeated instances): {duplicatedRows.shape[0]}")


display extra datasets