In [38]:
import pandas as pd
import numpy as np

In [40]:
# Read the CSV file
df = pd.read_csv("data/VT_Data.csv")  

In [42]:
# Remove individuals under the age of 18
df = df[df['age'] >= 18]  

In [44]:
# Check if there is any missing data in the dataset
print(df.isnull().any().any())  

# Display the number of missing values for each column
print(df.isnull().sum())  

False
gender                0
height                0
weight                0
age                   0
resting_pulse         0
sports_experience     0
body_fat              0
goals                 0
BMI                   0
W/Hip_Ratio           0
W/Height_Ratio        0
shoulder_c            0
chest_c               0
right_arm_c           0
left_arm_c            0
waist_c               0
hip_c                 0
upper_leg_c           0
lower_leg             0
chronic_conditions    0
surgeries             0
program               0
8_label               0
12_label              0
dtype: int64


In [46]:
# Fill empty cells in the 'chronic_conditions' column with 0
df['chronic_conditions'] = df['chronic_conditions'].fillna(0)

# Fill empty cells in the 'surgeries' column with 0
df['surgeries'] = df['surgeries'].fillna(0)

# Fill empty cells in specified columns with 0
df[['shoulder_c', 'chest_c', 'right_arm_c', 'left_arm_c', 'upper_leg_c', 'lower_leg']] = df[['shoulder_c', 'chest_c', 'right_arm_c', 'left_arm_c', 'upper_leg_c', 'lower_leg']].fillna(0)

# Check if there are any missing values left in the dataset
print(df.isnull().any().any())


False


In [48]:
# "K" stands for "Kadin" in Turkish and it means Female
# "E" stands for "Erkek" in Turkish and it means Male
# Encode the values in the 'gender' column as 0 and 1 
df['gender'] = df['gender'].map({'K': 0, 'E': 1})

In [50]:
# Update the 'sports_experience' column
experience_mapping = {
    "No experience": 0,
    "Some experience": 1,
    "Athlete": 2
}
df['sports_experience'] = df['sports_experience'].map(experience_mapping)

In [52]:
# Update the 'goals' column
goals_mapping = {
    "Improving overall health": "0",
    "Enhancing muscle growth": "1",
    "Improving physical appearance": "2",
    "Reducing body weight": "3"
}
df['goals'] = df['goals'].map(goals_mapping)

In [54]:
# BMI calculation function
def calculate_bmi(weight, height):
    if pd.notnull(weight) and pd.notnull(height) and height > 0:
        return round(weight / (height / 100) ** 2, 2)  # Calculate BMI and round to 2 decimal places
    return np.nan  # Return NaN for missing data

# Create the BMI column
df['BMI'] = df.apply(lambda row: calculate_bmi(row['weight'], row['height']), axis=1)

In [56]:
# WHR (Waist-to-Hip Ratio) calculation function
def calculate_waist_to_hip_ratio(waist, hip):
    if pd.notnull(waist) and pd.notnull(hip) and hip > 0:
        return round(waist / hip, 2)  # Calculate WHR and round to 2 decimal places
    return np.nan  # Return NaN for missing data

# Create the WHR column
df['W/Hip_Ratio'] = df.apply(lambda row: calculate_waist_to_hip_ratio(row['waist_c'], row['hip_c']), axis=1)

In [58]:
# Waist-to-Height Ratio calculation function
def calculate_waist_to_height_ratio(waist, height):
    if pd.notnull(waist) and pd.notnull(height) and height > 0:
        return round(waist / height, 2)  # Calculate Waist-to-Height Ratio and round to 2 decimal places
    return np.nan  # Return NaN for missing data

# Create the Waist-to-Height Ratio column
df['W/Height_Ratio'] = df.apply(lambda row: calculate_waist_to_height_ratio(row['waist_c'], row['height']), axis=1)

In [15]:
# Check and remove duplicate rows
duplicates = df[df.duplicated()]
print(f"Number of Duplicate Rows: {len(duplicates)}")

# Remove duplicate rows
df = df.drop_duplicates()
print(f"Number of Duplicate Rows (after removal): {df.duplicated().sum()}")


Number of Duplicate Rows: 8
Number of Duplicate Rows (after removal): 0


In [66]:
# Function to combine 'program' and 'difficulty' into 12 unique options
def create_combined_column(row):
    combined_mapping = {
        ("Circuit", 1.0): "0",
        ("Circuit", 3.0): "1",
        ("Faz1", 1.0): "2",
        ("Faz1", 3.0): "3",
        ("Faz2", 1.0): "4",
        ("Faz2", 3.0): "5",
        ("Faz3", 1.0): "6",
        ("Faz3", 3.0): "7",
    }
    return combined_mapping.get((row['program'], row['8_label']), None)

# Create a new column for the training program
df['Training_Program'] = df.apply(create_combined_column, axis=1)

# Count the number of occurrences for each unique option
option_counts = df['Training_Program'].value_counts()

print(option_counts)

Training_Program
0    33
2    30
3    28
4    27
1    26
5    18
6    15
7    14
Name: count, dtype: int64


In [68]:
# List unique combinations of 'program' and '8_label'
unique_combinations = df[['program', '8_label']].drop_duplicates()
print(unique_combinations)

    program  8_label
0      Faz1        3
2   Circuit        3
4   Circuit        1
13     Faz3        3
15     Faz3        1
16     Faz2        3
28     Faz2        1
34     Faz1        1


In [70]:
# Define a function to detect outliers
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)  # 1st quartile
    Q3 = df[column].quantile(0.75)  # 3rd quartile
    IQR = Q3 - Q1  # Interquartile range

    lower_bound = Q1 - 1.5 * IQR  # Lower bound
    upper_bound = Q3 + 1.5 * IQR  # Upper bound

    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]  # Outliers
    return outliers

# Identify numeric columns
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Loop to check for outliers in each numeric column
for column in numeric_columns:
    outliers = detect_outliers(df, column)
    print(f"Column: {column}")
    print(f"Number of Outliers: {len(outliers)}")
    if not outliers.empty:
        print(outliers)
    else:
        print("No outliers found.")
    print("-" * 50)


Column: gender
Number of Outliers: 0
No outliers found.
--------------------------------------------------
Column: height
Number of Outliers: 1
    gender  height  weight  age  resting_pulse  sports_experience  body_fat  \
77     NaN   206.0    95.0   33             65                NaN      18.7   

   goals    BMI  W/Hip_Ratio  ...  waist_c  hip_c  upper_leg_c  lower_leg  \
77   NaN  22.39         0.84  ...     88.0  105.0         54.0       37.5   

    chronic_conditions  surgeries  program  8_label  12_label  \
77                   0          0     Faz3        1         3   

    Training_Program  
77                 6  

[1 rows x 25 columns]
--------------------------------------------------
Column: weight
Number of Outliers: 3
     gender  height  weight  age  resting_pulse  sports_experience  body_fat  \
61      NaN   184.0   137.0   32             63                NaN      50.0   
101     NaN   185.0   130.0   35             79                NaN      14.0   
102     NaN   

In [72]:
print(df.columns)

Index(['gender', 'height', 'weight', 'age', 'resting_pulse',
       'sports_experience', 'body_fat', 'goals', 'BMI', 'W/Hip_Ratio',
       'W/Height_Ratio', 'shoulder_c', 'chest_c', 'right_arm_c', 'left_arm_c',
       'waist_c', 'hip_c', 'upper_leg_c', 'lower_leg', 'chronic_conditions',
       'surgeries', 'program', '8_label', '12_label', 'Training_Program'],
      dtype='object')


In [74]:
# List of required columns
required_columns = ['gender', 'height', 'weight', 'age', 'resting_pulse', 'sports_experience', 
                    'body_fat', 'goals', 'BMI', 'W/Hip_Ratio', 'W/Height_Ratio','shoulder_c', 'chest_c',
                    'right_arm_c', 'left_arm_c', 'waist_c', 'hip_c', 'upper_leg_c',
                    'lower_leg', 'chronic_conditions', 'surgeries', 'Training_Program']

df = df[required_columns]  # Update the DataFrame with only the required columns

In [78]:
# Save the updated DataFrame to a CSV file
df.to_csv("data/processed_VT_data_.csv", index=False)