In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
import random

Generate extra data for the model

In [6]:
def generate_body_type_data(body_type, num_samples):
    """
    Generates a specified number of data points for a given body type,
    based on anthropometric rules
    Args:
        body_type (str): The body type to generate data for (e.g., 'Bottom Hourglass')
        num_samples (int): The number of new samples to generate
    """
    generated_data = []
    
    # Define realistic ranges for measurements (in inches)
    bust_range = (30, 45)
    waist_range = (22, 35)
    hips_range = (32, 48)
    
    # Epsilon for safety in calculations
    safety_epsilon = 1e-6

    for _ in range(num_samples):
        bust = random.uniform(*bust_range)
        waist = random.uniform(*waist_range)
        hips = random.uniform(*hips_range)

        # Apply specific logic to fit the desired body type
        if body_type == 'Bottom Hourglass':
            # Hips slightly wider than bust, very defined waist, high_hip/waist ratio below 1.193
            bust = random.uniform(hips - 3.5, hips - 1.1)
            waist = random.uniform(bust - 9.0, bust - 5.0)
        elif body_type == 'Spoon':
            # Hips noticeably wider than bust, hips/waist diff < 9.0, bust/waist diff < 9.0
            # and a high_hip/waist ratio >= 1.193
            hips = random.uniform(bust + 3.6, bust + 9.9)
            bust = random.uniform(waist + 5.0, waist + 8.9)
        elif body_type == 'Hourglass':
            # Bust and hips are similar, waist is well-defined
            hips = random.uniform(bust - 1.0, bust + 1.0)
            waist = random.uniform(bust - 9.0, bust - 5.0)
        elif body_type == 'Rectangle':
            # All measurements are similar, waist is not well-defined
            bust = random.uniform(hips - 3.5, hips + 3.5)
            waist = random.uniform(bust - 5.0, bust - 0.1)
        elif body_type == 'Triangle':
            # Hips wider than bust, waist not well-defined
            hips = random.uniform(bust + 3.6, bust + 9.9)
            waist = random.uniform(hips - 9.0, hips - 5.0)
        elif body_type == 'Inverted Triangle':
            # Bust wider than hips, waist not well-defined
            bust = random.uniform(hips + 3.6, hips + 9.9)
            waist = random.uniform(bust - 9.0, bust - 5.0)
        elif body_type == 'Apple':
            # WHR >= 0.85
            waist = random.uniform(hips * 0.85, hips * 1.0)
            bust = random.uniform(hips * 0.8, hips * 1.2)
        
        # Ensure measurements are non-negative and somewhat realistic
        bust = max(bust, 25)
        waist = max(waist, 18)
        hips = max(hips, 28)
        
        generated_data.append({
            'bust': bust,
            'waist': waist,
            'hips': hips,
        })

    return pd.DataFrame(generated_data)

In [9]:
def generate_balanced_dataset(df, target_classes, num_per_class=20):
    """
    Generates synthetic data to balance the dataset
    Args:
        df (pd.DataFrame): The original DataFrame.
        target_classes (list): A list of the body types to generate data for.
        num_per_class (int): The number of samples to generate for each class.
    """
    original_classes = df['body_type'].unique()
    new_dataframes = []

    for body_type in target_classes:
        if body_type not in original_classes:
            print(f"Generating {num_per_class} samples for new class: {body_type}")
            generated_df = generate_body_type_data(body_type, num_per_class)
            generated_df['body_type'] = body_type
            new_dataframes.append(generated_df)
        else:
            current_count = df[df['body_type'] == body_type].shape[0]
            if current_count < num_per_class:
                samples_needed = num_per_class - current_count
                print(f"Generating {samples_needed} samples for class: {body_type}")
                generated_df = generate_body_type_data(body_type, samples_needed)
                generated_df['body_type'] = body_type
                new_dataframes.append(generated_df)

    # Combine the original data with the new synthetic data
    balanced_df = pd.concat([data] + new_dataframes, ignore_index=True)
    return balanced_df

In [21]:
if __name__ == '__main__':
    # Load your original DataFrame (from ML Body Type)
    data = pd.read_csv('body_type_merged_data.csv')
    
    # Classify the original data so we know the counts
    safety_epsilon = 1e-6
    data['whr'] = data['waist'] / (data['hips'] + safety_epsilon)
    data['bhr'] = data['bust'] / (data['hips'] + safety_epsilon)

    diff_bust_hips_small = 1.0
    diff_hips_bust_small = 3.6
    diff_waist_defined_bust = 9.0
    diff_waist_defined_hips = 10.0
    whr_apple_threshold = 0.85
    
    # Define conditions and choices in a clear, matching order
    conditions = [
        # Condition 1: Hourglass
        ( (data['bust'] - data['hips']).abs() <= diff_bust_hips_small) &
        ( (data['hips'] - data['bust']) < diff_hips_bust_small) &
        ( (data['bust'] - data['waist']) >= diff_waist_defined_bust) |
        ( (data['hips'] - data['waist']) >= diff_waist_defined_hips),
        
        # Condition 2: Bottom Hourglass
        ( (data['hips'] - data['bust']) >= diff_hips_bust_small) &
        ( (data['hips'] - data['bust']) < diff_waist_defined_hips) &
        ( (data['hips'] - data['waist']) >= diff_waist_defined_bust),
        
        # Condition 3: Top Hourglass
        ( (data['bust'] - data['hips']) > diff_bust_hips_small) &
        ( (data['bust'] - data['hips']) < diff_waist_defined_hips) &
        ( (data['bust'] - data['waist']) >= diff_waist_defined_bust),
        
        # Condition 4: Triangle
        ( (data['hips'] - data['bust']) >= diff_hips_bust_small) &
        ( (data['hips'] - data['waist']) < diff_waist_defined_bust),
        
        # Condition 5: Inverted Triangle
        ( (data['bust'] - data['hips']) >= diff_hips_bust_small) &
        ( (data['bust'] - data['waist']) < diff_waist_defined_bust),
        
        # Condition 6: Apple
        (data['whr'] >= whr_apple_threshold),

        # Condition 7: Rectangle
        ( (data['hips'] - data['bust']).abs() < diff_hips_bust_small) &
        ( (data['bust'] - data['waist']) < diff_waist_defined_bust) &
        ( (data['hips'] - data['waist']) < diff_waist_defined_hips)
    ]
    
    choices = [
        'Hourglass',
        'Bottom Hourglass',
        'Top Hourglass',
        'Triangle',
        'Inverted Triangle',
        'Apple',
        'Rectangle',
    ]
    data['body_type'] = np.select(conditions, choices, default='Undetermined')
    
    # Generate a balanced dataset
    all_body_types = [
        'Hourglass', 'Bottom Hourglass', 'Top Hourglass',
        'Triangle', 'Inverted Triangle', 'Apple', 'Rectangle'
    ]
    balanced_df = generate_balanced_dataset(data, all_body_types, num_per_class=20)
    
    # Calculate all the ratios on the final combined dataset
    # This ensures all rows, both original and generated, have these values
    balanced_df['whr'] = balanced_df['waist'] / (balanced_df['hips'] + safety_epsilon)
    balanced_df['bhr'] = balanced_df['bust'] / (balanced_df['hips'] + safety_epsilon)

    # Save the new, balanced dataset to a CSV file
    balanced_df.to_csv('balanced_df.csv', index=False)
    print("\nBalanced dataset generated and saved to 'balanced_data.csv'")
    print("\nNew dataset shape:", balanced_df.shape)
    print("\nNew class distribution:\n", balanced_df['body_type'].value_counts())


Generating 2 samples for class: Hourglass
Generating 19 samples for class: Bottom Hourglass
Generating 20 samples for new class: Top Hourglass
Generating 18 samples for class: Inverted Triangle
Generating 5 samples for class: Rectangle

Balanced dataset generated and saved to 'balanced_data.csv'

New dataset shape: (160, 8)

New class distribution:
 body_type
Apple                33
Triangle             26
Rectangle            20
Bottom Hourglass     20
Hourglass            20
Inverted Triangle    20
Top Hourglass        20
Undetermined          1
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
import random

def generate_body_type_data(body_type, num_samples):
    """
    Generates a specified number of data points for a given body type,
    based on anthropometric rules.

    Args:
        body_type (str): The body type to generate data for (e.g., 'Bottom Hourglass').
        num_samples (int): The number of new samples to generate.

    Returns:
        pd.DataFrame: A DataFrame containing the newly generated data.
    """
    generated_data = []
    
    # Define realistic ranges for measurements (in inches)
    bust_range = (30, 45)
    waist_range = (22, 35)
    hips_range = (32, 48)
    high_hip_range = (28, 40)
    
    # Epsilon for safety in calculations
    safety_epsilon = 1e-6

    for _ in range(num_samples):
        bust = random.uniform(*bust_range)
        waist = random.uniform(*waist_range)
        hips = random.uniform(*hips_range)
        high_hip = random.uniform(*high_hip_range)

        # Apply specific logic to fit the desired body type
        if body_type == 'Bottom Hourglass':
            # Hips slightly wider than bust, very defined waist, high_hip/waist ratio below 1.193
            bust = random.uniform(hips - 3.5, hips - 1.1)
            waist = random.uniform(bust - 9.0, bust - 5.0)
            high_hip = random.uniform(waist, waist * 1.192)
        elif body_type == 'Spoon':
            # Hips noticeably wider than bust, hips/waist diff < 9.0, bust/waist diff < 9.0
            # and a high_hip/waist ratio >= 1.193
            hips = random.uniform(bust + 3.6, bust + 9.9)
            bust = random.uniform(waist + 5.0, waist + 8.9)
            high_hip = random.uniform(waist * 1.193, waist * 1.25)
        elif body_type == 'Hourglass':
            # Bust and hips are similar, waist is well-defined
            hips = random.uniform(bust - 1.0, bust + 1.0)
            waist = random.uniform(bust - 9.0, bust - 5.0)
        elif body_type == 'Rectangle':
            # All measurements are similar, waist is not well-defined
            bust = random.uniform(hips - 3.5, hips + 3.5)
            waist = random.uniform(bust - 5.0, bust - 0.1)
        elif body_type == 'Triangle':
            # Hips wider than bust, waist not well-defined
            hips = random.uniform(bust + 3.6, bust + 9.9)
            waist = random.uniform(hips - 9.0, hips - 5.0)
        elif body_type == 'Inverted Triangle':
            # Bust wider than hips, waist not well-defined
            bust = random.uniform(hips + 3.6, hips + 9.9)
            waist = random.uniform(bust - 9.0, bust - 5.0)
        elif body_type == 'Apple':
            # WHR >= 0.85
            waist = random.uniform(hips * 0.85, hips * 1.0)
            bust = random.uniform(hips * 0.8, hips * 1.2)
        
        # Ensure measurements are non-negative and somewhat realistic
        bust = max(bust, 25)
        waist = max(waist, 18)
        hips = max(hips, 28)
        high_hip = max(high_hip, 25)

        generated_data.append({
            'bust': bust,
            'waist': waist,
            'hips': hips,
            'high_hip': high_hip
        })

    return pd.DataFrame(generated_data)

def generate_balanced_dataset(df, target_classes, num_per_class=20):
    """
    Generates synthetic data to balance the dataset.

    Args:
        df (pd.DataFrame): The original DataFrame.
        target_classes (list): A list of the body types to generate data for.
        num_per_class (int): The number of samples to generate for each class.

    Returns:
        pd.DataFrame: A new DataFrame with the original and generated data combined.
    """
    original_classes = df['Body_type'].unique()
    new_dataframes = []

    for body_type in target_classes:
        if body_type not in original_classes:
            print(f"Generating {num_per_class} samples for new class: {body_type}")
            generated_df = generate_body_type_data(body_type, num_per_class)
            generated_df['Body_type'] = body_type
            new_dataframes.append(generated_df)
        else:
            current_count = df[df['Body_type'] == body_type].shape[0]
            if current_count < num_per_class:
                samples_needed = num_per_class - current_count
                print(f"Generating {samples_needed} samples for class: {body_type}")
                generated_df = generate_body_type_data(body_type, samples_needed)
                generated_df['Body_type'] = body_type
                new_dataframes.append(generated_df)

    # Combine the original data with the new synthetic data
    balanced_df = pd.concat([df] + new_dataframes, ignore_index=True)
    return balanced_df

if __name__ == '__main__':
    # --- 1. Load your original DataFrame (from train_model.py) ---
    data = {
        'person_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'bust': [36, 34, 40, 32, 38, 35, 33, 37, 39, 36],
        'waist': [26, 28, 32, 24, 30, 27, 29, 25, 31, 28],
        'hips': [37, 39, 35, 38, 42, 36, 34, 37, 40, 38],
        'high_hip': [32, 34, 31, 33, 37, 31, 30, 32, 35, 33]
    }
    df = pd.DataFrame(data)

    # --- 2. Classify the original data so we know the counts ---
    safety_epsilon = 1e-6
    df['WHR'] = df['waist'] / (df['hips'] + safety_epsilon)
    df['BHR'] = df['bust'] / (df['hips'] + safety_epsilon)
    if 'high_hip' in df.columns:
        df['HHR_WHR'] = df['high_hip'] / (df['waist'] + safety_epsilon)
    else:
        df['HHR_WHR'] = np.nan

    # Define thresholds
    diff_bust_hips_small = 1.0
    diff_hips_bust_small = 3.6
    diff_waist_defined_bust = 9.0
    diff_waist_defined_hips = 10.0
    whr_apple_threshold = 0.85
    ratio_high_hip_waist_spoon = 1.193

    # Define conditions and choices in a clear, matching order
    conditions = [
        # Condition 1: Hourglass
        ( (df['bust'] - df['hips']).abs() <= diff_bust_hips_small) &
        ( (df['hips'] - df['bust']) < diff_hips_bust_small) &
        ( (df['bust'] - df['waist']) >= diff_waist_defined_bust) |
        ( (df['hips'] - df['waist']) >= diff_waist_defined_hips),
        
        # Condition 2: Bottom Hourglass
        ( (df['hips'] - df['bust']) >= diff_hips_bust_small) &
        ( (df['hips'] - df['bust']) < diff_waist_defined_hips) &
        ( (df['hips'] - df['waist']) >= diff_waist_defined_bust) &
        ( (df['HHR_WHR'] < ratio_high_hip_waist_spoon)),
        
        # Condition 3: Top Hourglass
        ( (df['bust'] - df['hips']) > diff_bust_hips_small) &
        ( (df['bust'] - df['hips']) < diff_waist_defined_hips) &
        ( (df['bust'] - df['waist']) >= diff_waist_defined_bust),
        
        # Condition 4: Triangle
        ( (df['hips'] - df['bust']) >= diff_hips_bust_small) &
        ( (df['hips'] - df['waist']) < diff_waist_defined_bust),
        
        # Condition 5: Inverted Triangle
        ( (df['bust'] - df['hips']) >= diff_hips_bust_small) &
        ( (df['bust'] - df['waist']) < diff_waist_defined_bust),
        
        # Condition 6: Apple
        (df['WHR'] >= whr_apple_threshold),
        
        # Condition 7: Rectangle
        ( (df['hips'] - df['bust']).abs() < diff_hips_bust_small) &
        ( (df['bust'] - df['waist']) < diff_waist_defined_bust) &
        ( (df['hips'] - df['waist']) < diff_waist_defined_hips),
    ]

    choices = [
        'Hourglass',
        'Bottom Hourglass',
        'Top Hourglass',
        'Triangle',
        'Inverted Triangle',
        'Apple',
        'Rectangle',
    ]

    # Corrected: Ensure the column name is consistent
    df['Body_type'] = np.select(conditions, choices, default='Undetermined')

    # --- 3. Generate a balanced dataset ---
    all_body_types = [
        'Hourglass', 'Bottom Hourglass', 'Top Hourglass',
        'Triangle', 'Inverted Triangle', 'Apple', 'Rectangle'
    ]
    balanced_df = generate_balanced_dataset(df, all_body_types, num_per_class=20)
    
    # --- 4. Calculate all the ratios on the final combined dataset ---
    # This ensures all rows, both original and generated, have these values
    balanced_df['WHR'] = balanced_df['waist'] / (balanced_df['hips'] + safety_epsilon)
    balanced_df['BHR'] = balanced_df['bust'] / (balanced_df['hips'] + safety_epsilon)
    balanced_df['HHR_WHR'] = balanced_df['high_hip'] / (balanced_df['waist'] + safety_epsilon)

    # --- 5. Save the new, balanced dataset to a CSV file ---
    balanced_df.to_csv('balanced_data.csv', index=False)
    print("\nBalanced dataset generated and saved to 'balanced_data.csv'")
    print("\nNew dataset shape:", balanced_df.shape)
    print("\nNew class distribution:\n", balanced_df['Body_type'].value_counts())


In [22]:
data = pd.read_csv('balanced_data.csv')
data.head()

Unnamed: 0,bust,hips,waist,height,gender,whr,bhr,body_type
0,8.0,8.0,7.0,24.0,female,0.875,1.0,Apple
1,8.0,19.0,14.0,63.0,female,0.736842,0.421053,Triangle
2,9.0,20.0,17.0,49.0,female,0.85,0.45,Triangle
3,9.0,45.0,41.0,60.0,female,0.911111,0.2,Triangle
4,10.0,11.0,9.0,46.0,female,0.818182,0.909091,Rectangle


In [23]:
data.tail()

Unnamed: 0,bust,hips,waist,height,gender,whr,bhr,body_type
155,32.533873,33.94888,29.78879,,,0.87746,0.958319,Rectangle
156,43.611313,41.082436,42.64061,,,1.037928,1.061556,Rectangle
157,43.74195,41.436045,39.049456,,,0.942403,1.05565,Rectangle
158,49.242914,45.84027,45.317393,,,0.988593,1.074228,Rectangle
159,42.055597,39.764537,40.228526,,,1.011668,1.057616,Rectangle


In [24]:
# drop the height and gender columns
data.drop(columns=['height','gender'], inplace=True)

In [25]:
data.shape

(160, 6)

In [26]:
data.to_csv('data.csv', index=False)