In [49]:
import numpy as numpy
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 

df = pd.read_csv('./Gym_members.csv')

In [50]:
df.shape

(973, 15)

In [51]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            973 non-null    int64  
 1   Gender                         973 non-null    object 
 2   Weight (kg)                    973 non-null    float64
 3   Height (m)                     973 non-null    float64
 4   Max_BPM                        973 non-null    int64  
 5   Avg_BPM                        973 non-null    int64  
 6   Resting_BPM                    973 non-null    int64  
 7   Session_Duration (hours)       973 non-null    float64
 8   Calories_Burned                973 non-null    float64
 9   Workout_Type                   973 non-null    object 
 10  Fat_Percentage                 973 non-null    float64
 11  Water_Intake (liters)          973 non-null    float64
 12  Workout_Frequency (days/week)  973 non-null    int

In [52]:
#show missing value in data
print(f"Missing Value = {df.isnull().sum()}")

Missing Value = Age                              0
Gender                           0
Weight (kg)                      0
Height (m)                       0
Max_BPM                          0
Avg_BPM                          0
Resting_BPM                      0
Session_Duration (hours)         0
Calories_Burned                  0
Workout_Type                     0
Fat_Percentage                   0
Water_Intake (liters)            0
Workout_Frequency (days/week)    0
Experience_Level                 0
BMI                              0
dtype: int64


In [53]:
#show duplicated value
print(f'Duplicated = {df.duplicated().sum()}')

#the shape of data
print (f'Shape = {df.shape}')

Duplicated = 0
Shape = (973, 15)


In [54]:
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [55]:
print("=== Statistiques descriptives ===")
print(df.describe())

=== Statistiques descriptives ===
              Age  Weight (kg)  Height (m)     Max_BPM     Avg_BPM  \
count  973.000000   973.000000   973.00000  973.000000  973.000000   
mean    38.683453    73.854676     1.72258  179.883864  143.766701   
std     12.180928    21.207500     0.12772   11.525686   14.345101   
min     18.000000    40.000000     1.50000  160.000000  120.000000   
25%     28.000000    58.100000     1.62000  170.000000  131.000000   
50%     40.000000    70.000000     1.71000  180.000000  143.000000   
75%     49.000000    86.000000     1.80000  190.000000  156.000000   
max     59.000000   129.900000     2.00000  199.000000  169.000000   

       Resting_BPM  Session_Duration (hours)  Calories_Burned  Fat_Percentage  \
count   973.000000                973.000000       973.000000      973.000000   
mean     62.223022                  1.256423       905.422405       24.976773   
std       7.327060                  0.343033       272.641516        6.259419   
min      50

In [56]:
# BMI (doit être entre 10 et 50 généralement)
print("\n=== Vérification BMI ===")
print(f"BMI min: {df['BMI'].min()}, max: {df['BMI'].max()}")
df_bmi_issue = df[(df['BMI'] < 10) | (df['BMI'] > 50)]
print(f"Lignes avec BMI suspect: {len(df_bmi_issue)}")


=== Vérification BMI ===
BMI min: 12.32, max: 49.84
Lignes avec BMI suspect: 0


In [57]:
# Height (doit être entre 1.40m et 2.20m)
print(f"\nHeight min: {df['Height (m)'].min()}, max: {df['Height (m)'].max()}")

# Weight (doit être entre 40kg et 200kg)
print(f"Weight min: {df['Weight (kg)'].min()}, max: {df['Weight (kg)'].max()}")

# Age (doit être entre 15 et 80)
print(f"Age min: {df['Age'].min()}, max: {df['Age'].max()}")


Height min: 1.5, max: 2.0
Weight min: 40.0, max: 129.9
Age min: 18, max: 59


In [58]:
# Resting_BPM doit être < Avg_BPM < Max_BPM
print("\n=== Vérification BPM ===")
bpm_issues = df[~((df['Resting_BPM'] < df['Avg_BPM']) & 
                  (df['Avg_BPM'] < df['Max_BPM']))]
print(f"Lignes avec ordre BPM incorrect: {len(bpm_issues)}")

# Resting_BPM normal: 40-100
# Max_BPM pendant exercice: 100-220
print(f"Resting_BPM hors norme [40-100]: {len(df[(df['Resting_BPM'] < 40) | (df['Resting_BPM'] > 100)])}")
print(f"Max_BPM hors norme [100-220]: {len(df[(df['Max_BPM'] < 100) | (df['Max_BPM'] > 220)])}")


=== Vérification BPM ===
Lignes avec ordre BPM incorrect: 28
Resting_BPM hors norme [40-100]: 0
Max_BPM hors norme [100-220]: 0


In [59]:
#  Session_Duration (ne peut pas être négative ou > 5h)
print("\n=== Vérification Session Duration ===")
print(f"Sessions > 5h: {len(df[df['Session_Duration (hours)'] > 5])}")
print(f"Sessions négatives: {len(df[df['Session_Duration (hours)'] <= 0])}")

# Workout_Frequency (doit être 1-7 jours/semaine)
print(f"\nWorkout_Frequency hors [1-7]: {len(df[(df['Workout_Frequency (days/week)'] < 1) | (df['Workout_Frequency (days/week)'] > 7)])}")

#  Water_Intake (doit être réaliste: 0.5-5 litres)
print(f"Water_Intake hors [0.5-5]: {len(df[(df['Water_Intake (liters)'] < 0.5) | (df['Water_Intake (liters)'] > 5)])}")

# Fat_Percentage (doit être 5-50%)
print(f"Fat_Percentage hors [5-50]: {len(df[(df['Fat_Percentage'] < 5) | (df['Fat_Percentage'] > 50)])}")

#  Calories_Burned (doit être cohérent avec durée)
# Approximation: 200-1000 cal/heure selon intensité
df['Calories_per_hour'] = df['Calories_Burned'] / df['Session_Duration (hours)']
print(f"Calories/h < 100 ou > 1500: {len(df[(df['Calories_per_hour'] < 100) | (df['Calories_per_hour'] > 1500)])}")


=== Vérification Session Duration ===
Sessions > 5h: 0
Sessions négatives: 0

Workout_Frequency hors [1-7]: 0
Water_Intake hors [0.5-5]: 0
Fat_Percentage hors [5-50]: 0
Calories/h < 100 ou > 1500: 0


In [60]:
df['Workout_Type'].unique()
df['Workout_Type'] = df['Workout_Type'].replace({'Yoga':0 , 'HIIT':0.3 , 'Cardio':0.6 , 'Strength':1 })

  df['Workout_Type'] = df['Workout_Type'].replace({'Yoga':0 , 'HIIT':0.3 , 'Cardio':0.6 , 'Strength':1 })


In [61]:
df['Gender'].unique()
df['Gender'] = df['Gender'].replace({'Male':0 , 'Female':1 })
df.head(5)

  df['Gender'] = df['Gender'].replace({'Male':0 , 'Female':1 })


Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI,Calories_per_hour
0,56,0,88.3,1.71,180,157,60,1.69,1313.0,0.0,12.6,3.5,4,3,30.2,776.923077
1,46,1,74.9,1.53,179,151,66,1.3,883.0,0.3,33.9,2.1,4,2,32.0,679.230769
2,32,1,68.1,1.66,167,122,54,1.11,677.0,0.6,33.4,2.3,4,2,24.71,609.90991
3,25,0,53.2,1.7,190,164,56,0.59,532.0,1.0,28.8,2.1,3,1,18.41,901.694915
4,38,0,46.1,1.79,188,158,68,0.64,556.0,1.0,29.2,2.8,3,1,14.39,868.75


In [65]:
df_cleaned = df.copy()

In [66]:
# Créer des catégories utiles pour le chatbot
df_cleaned['BMI_Category'] = pd.cut(df_cleaned['BMI'], 
                                     bins=[0, 18.5, 25, 30, 100],
                                     labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

df_cleaned['Activity_Level'] = pd.cut(df_cleaned['Workout_Frequency (days/week)'],
                                       bins=[0, 2, 4, 7],
                                       labels=['Sedentary', 'Moderate', 'Active'])

# Mapper Experience_Level
experience_map = {1: 'Beginner', 2: 'Intermediate', 3: 'Advanced'}
df_cleaned['Experience_Label'] = df_cleaned['Experience_Level'].map(experience_map)

In [67]:
# Sauvegarder
df_cleaned.to_csv('fitness_data_cleaned.csv', index=False)

# Rapport de nettoyage
print("\n=== RAPPORT FINAL ===")
print(f"Lignes originales: {len(df)}")
print(f"Lignes finales: {len(df_cleaned)}")
print(f"\nDistribution Gender: {df_cleaned['Gender'].value_counts()}")
print(f"\nDistribution Workout_Type: {df_cleaned['Workout_Type'].value_counts()}")
print(f"\nDistribution Experience: {df_cleaned['Experience_Label'].value_counts()}")


=== RAPPORT FINAL ===
Lignes originales: 973
Lignes finales: 973

Distribution Gender: Gender
0    511
1    462
Name: count, dtype: int64

Distribution Workout_Type: Workout_Type
1.0    258
0.6    255
0.0    239
0.3    221
Name: count, dtype: int64

Distribution Experience: Experience_Label
Intermediate    406
Beginner        376
Advanced        191
Name: count, dtype: int64


In [69]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 973 entries, 0 to 972
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            973 non-null    int64  
 1   Gender                         973 non-null    int64  
 2   Weight (kg)                    973 non-null    float64
 3   Height (m)                     973 non-null    float64
 4   Max_BPM                        973 non-null    int64  
 5   Avg_BPM                        973 non-null    int64  
 6   Resting_BPM                    973 non-null    int64  
 7   Session_Duration (hours)       973 non-null    float64
 8   Calories_Burned                973 non-null    float64
 9   Workout_Type                   973 non-null    float64
 10  Fat_Percentage                 973 non-null    float64
 11  Water_Intake (liters)          973 non-null    float64
 12  Workout_Frequency (days/week)  973 non-null    int

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI,Calories_per_hour
0,56,0,88.3,1.71,180,157,60,1.69,1313.0,0.0,12.6,3.5,4,3,30.2,776.923077
1,46,1,74.9,1.53,179,151,66,1.3,883.0,0.3,33.9,2.1,4,2,32.0,679.230769
2,32,1,68.1,1.66,167,122,54,1.11,677.0,0.6,33.4,2.3,4,2,24.71,609.90991
3,25,0,53.2,1.7,190,164,56,0.59,532.0,1.0,28.8,2.1,3,1,18.41,901.694915
4,38,0,46.1,1.79,188,158,68,0.64,556.0,1.0,29.2,2.8,3,1,14.39,868.75
