In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw dataset from disk into a DataFrame
raw_csv_path = "data/VT_Data.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Keep only adults: rows whose 'age' is at least 18 survive the filter
is_adult = df['age'] >= 18
df = df[is_adult]

In [4]:
# Boolean frame marking every missing cell; computed once and reused below
null_mask = df.isnull()

# True when at least one cell anywhere in the dataset is missing
print(null_mask.any().any())

# Per-column count of missing values
print(null_mask.sum())

True
gender                  0
height                  0
weight                  0
age                     0
shoulder_c             69
chest_c                69
right_arm_c            69
left_arm_c             69
waist_c                 0
hip_c                   0
upper_leg_c            69
lower_leg              69
resting_pulse           0
chronic_conditions    194
surgeries             192
sports_experience       0
body_fat                0
goals                   0
program                33
difficulty             33
dtype: int64


In [5]:
# Columns whose empty cells are replaced with 0:
# the two medical-history columns followed by the body-measurement columns
zero_fill_columns = [
    'chronic_conditions',
    'surgeries',
    'shoulder_c', 'chest_c', 'right_arm_c', 'left_arm_c', 'upper_leg_c', 'lower_leg',
]

# Fill each column independently so every column keeps its own dtype handling
for column_name in zero_fill_columns:
    df[column_name] = df[column_name].fillna(0)

# Check if there are any missing values left in the dataset.
# 'program' and 'difficulty' are not filled here, so this can still print True.
print(df.isnull().any().any())


True


In [6]:
# "K" stands for "Kadin" in Turkish and it means Female
# "E" stands for "Erkek" in Turkish and it means Male
# Binary-encode 'gender': 'K' becomes 0 and 'E' becomes 1
gender_codes = {'K': 0, 'E': 1}
df['gender'] = df['gender'].map(gender_codes)

In [7]:
# Encode 'sports_experience' as an ordinal integer.
# The position of each label in this tuple is its code (0, 1, 2).
experience_levels = ("No experience", "Some experience", "Athlete")
experience_mapping = {label: rank for rank, label in enumerate(experience_levels)}

df['sports_experience'] = df['sports_experience'].map(experience_mapping)

In [8]:
# Encode 'goals' as string category codes.
# The position of each label in this tuple, rendered as text, is its code ("0".."3").
goal_labels = (
    "Improving overall health",
    "Enhancing muscle growth",
    "Improving physical appearance",
    "Reducing body weight",
)
goals_mapping = {label: str(code) for code, label in enumerate(goal_labels)}

df['goals'] = df['goals'].map(goals_mapping)

In [9]:
# BMI calculation function
def calculate_bmi(weight, height):
    """Compute weight / (height / 100) ** 2, rounded to 2 decimal places.

    Parameters:
        weight: scalar weight value.
        height: scalar height value; it is divided by 100 before squaring.

    Returns:
        The rounded ratio, or NaN when either input is missing or
        ``height`` is not strictly positive.
    """
    # Guard clause: bail out on missing inputs or a non-positive height
    if pd.isnull(weight) or pd.isnull(height) or not height > 0:
        return np.nan

    scaled_height = height / 100
    return round(weight / scaled_height ** 2, 2)

# Derive the 'BMI' column row by row from each record's weight and height
df['BMI'] = df.apply(
    lambda person: calculate_bmi(person['weight'], person['height']),
    axis='columns',
)

In [10]:
# WHR (Waist-to-Hip Ratio) calculation function
def calculate_waist_to_hip_ratio(waist, hip):
    """Compute waist / hip, rounded to 2 decimal places.

    Parameters:
        waist: scalar waist measurement.
        hip: scalar hip measurement.

    Returns:
        The rounded ratio, or NaN when either input is missing or
        ``hip`` is not strictly positive.
    """
    # Guard clause: bail out on missing inputs or a non-positive divisor
    if pd.isnull(waist) or pd.isnull(hip) or not hip > 0:
        return np.nan

    ratio = waist / hip
    return round(ratio, 2)

# Derive the 'W/Hip_Ratio' column row by row from waist and hip circumference
df['W/Hip_Ratio'] = df.apply(
    lambda person: calculate_waist_to_hip_ratio(person['waist_c'], person['hip_c']),
    axis='columns',
)

In [11]:
# Waist-to-Height Ratio calculation function
def calculate_waist_to_height_ratio(waist, height):
    """Compute waist / height, rounded to 2 decimal places.

    Parameters:
        waist: scalar waist measurement.
        height: scalar height value, used as-is as the divisor.

    Returns:
        The rounded ratio, or NaN when either input is missing or
        ``height`` is not strictly positive.
    """
    # Guard clause: bail out on missing inputs or a non-positive divisor
    if pd.isnull(waist) or pd.isnull(height) or not height > 0:
        return np.nan

    ratio = waist / height
    return round(ratio, 2)

# Derive the 'W/Height_Ratio' column row by row from waist circumference and height
df['W/Height_Ratio'] = df.apply(
    lambda person: calculate_waist_to_height_ratio(person['waist_c'], person['height']),
    axis='columns',
)

In [12]:
# Show every distinct ('program', 'difficulty') pair present in the data
program_and_difficulty = df.loc[:, ['program', 'difficulty']]
unique_combinations = program_and_difficulty.drop_duplicates()
print(unique_combinations)

     program  difficulty
12   Circuit         1.0
13   Circuit         2.0
14      Faz1         3.0
15   Circuit         3.0
16      Faz1         2.0
22      Faz3         2.0
23      Faz3         1.0
24      Faz2         2.0
25      Faz3         3.0
28      Faz2         3.0
34      Faz1         1.0
36      Faz2         1.0
160      NaN         NaN


In [13]:
# Lookup table that combines 'program' and 'difficulty' into 12 unique options.
# It is built once here rather than on every call, because the function below
# is invoked once per DataFrame row.
TRAINING_PROGRAM_CODES = {
    ("Circuit", 1.0): "0",
    ("Circuit", 2.0): "1",
    ("Circuit", 3.0): "2",
    ("Faz1", 1.0): "3",
    ("Faz1", 2.0): "4",
    ("Faz1", 3.0): "5",
    ("Faz2", 1.0): "6",
    ("Faz2", 2.0): "7",
    ("Faz2", 3.0): "8",
    ("Faz3", 1.0): "9",
    ("Faz3", 2.0): "10",
    ("Faz3", 3.0): "11",
}


def create_combined_column(row):
    """Map a row's ('program', 'difficulty') pair to its combined option code.

    Parameters:
        row: any mapping-like object supporting ``row['program']`` and
            ``row['difficulty']`` (for example a DataFrame row).

    Returns:
        The option code as a string ("0" through "11"), or None when the pair
        is not in the lookup table (for example when either value is missing).
    """
    return TRAINING_PROGRAM_CODES.get((row['program'], row['difficulty']))

# Label every row with its combined training-program option
df['Training_Program'] = df.apply(create_combined_column, axis='columns')

# Tally how many rows fall under each option and show the result
program_labels = df['Training_Program']
option_counts = program_labels.value_counts()
print(option_counts)

Training_Program
0     27
4     22
3     13
6     13
2     12
10    12
7     12
11    12
8     12
1     11
5     10
9     10
Name: count, dtype: int64


In [14]:
# Define a function to detect outliers
def detect_outliers(df, column, iqr_multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``,
    where Q1 and Q3 are the 0.25 and 0.75 quantiles of the column and
    ``IQR = Q3 - Q1``.

    Parameters:
        df: DataFrame to inspect.
        column: label of the column to test.
        iqr_multiplier: how many IQRs beyond the quartiles a value may lie
            before it is reported. Defaults to 1.5, the value that was
            previously hard-coded.

    Returns:
        A DataFrame holding the rows strictly below the lower fence or strictly
        above the upper fence. Rows whose value is NaN are not returned,
        because both comparisons evaluate to False for NaN.
    """
    values = df[column]

    q1 = values.quantile(0.25)  # 1st quartile
    q3 = values.quantile(0.75)  # 3rd quartile
    fence_width = iqr_multiplier * (q3 - q1)  # scaled interquartile range

    lower_bound = q1 - fence_width
    upper_bound = q3 + fence_width

    return df[(values < lower_bound) | (values > upper_bound)]

# Collect the labels of the float64 / int64 columns
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Separator line printed after each column's report
separator = "-" * 50

# Report the outliers of every numeric column, one section per column
for column in numeric_columns:
    outliers = detect_outliers(df, column)
    print(f"Column: {column}")
    print(f"Number of Outliers: {len(outliers)}")
    if outliers.empty:
        print("No outliers found.")
    else:
        print(outliers)
    print(separator)


Column: gender
Number of Outliers: 0
No outliers found.
--------------------------------------------------
Column: height
Number of Outliers: 1
    gender  height  weight  age  shoulder_c  chest_c  right_arm_c  left_arm_c  \
64       1   206.0    95.0   33       117.5     95.0         34.5        36.0   

    waist_c  hip_c  ...  surgeries  sports_experience  body_fat  goals  \
64     88.0  105.0  ...        0.0                  2      18.7      2   

    program  difficulty    BMI W/Hip_Ratio W/Height_Ratio  Training_Program  
64     Faz3         3.0  22.39        0.84           0.43                11  

[1 rows x 24 columns]
--------------------------------------------------
Column: weight
Number of Outliers: 3
     gender  height  weight  age  shoulder_c  chest_c  right_arm_c  \
55        1   184.0   137.0   32       142.0    126.0         45.0   
89        1   185.0   130.0   35         0.0      0.0          0.0   
171       1   185.0   130.0   35         0.0      0.0          0.0 

In [15]:
# Flag every row that repeats an earlier row, then report how many there are
duplicate_mask = df.duplicated()
duplicates = df[duplicate_mask]
print(f"Number of Duplicate Rows: {len(duplicates)}")

# Keep only the first occurrence of each row and confirm none remain
df = df[~duplicate_mask]
print(f"Number of Duplicate Rows (after removal): {df.duplicated().sum()}")


Number of Duplicate Rows: 8
Number of Duplicate Rows (after removal): 0


In [16]:
# Show the column labels currently present in the DataFrame
column_index = df.columns
print(column_index)

Index(['gender', 'height', 'weight', 'age', 'shoulder_c', 'chest_c',
       'right_arm_c', 'left_arm_c', 'waist_c', 'hip_c', 'upper_leg_c',
       'lower_leg', 'resting_pulse', 'chronic_conditions', 'surgeries',
       'sports_experience', 'body_fat', 'goals', 'program', 'difficulty',
       'BMI', 'W/Hip_Ratio', 'W/Height_Ratio', 'Training_Program'],
      dtype='object')


In [17]:
# Columns kept for the exported dataset, grouped by where they come from
input_columns = ['gender', 'height', 'weight', 'age', 'resting_pulse',
                 'sports_experience', 'body_fat', 'goals']
derived_columns = ['BMI', 'W/Hip_Ratio', 'W/Height_Ratio']
label_columns = ['Training_Program']

required_columns = input_columns + derived_columns + label_columns

# Restrict the DataFrame to the required columns, in the order listed above
df = df.loc[:, required_columns]

In [18]:
# Write the processed DataFrame to disk, leaving the index out of the file
processed_csv_path = "data/processed_VT_data.csv"
df.to_csv(processed_csv_path, index=False)