In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

import os

In [2]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,participant_id,date,age,gender,height_cm,weight_kg,activity_type,duration_minutes,intensity,calories_burned,...,bmi,resting_heart_rate,blood_pressure_systolic,blood_pressure_diastolic,health_condition,smoking_status,fitness_level,recommendation_1,recommendation_2,recommendation_3
0,1,2024-01-01,56,F,165.3,53.7,Dancing,41,Low,3.3,...,19.6,69.5,110.7,72.9,,Never,0.04,Berenang,Berjalan,Berjalan
1,1,2024-01-04,56,F,165.3,53.9,Swimming,28,Low,2.9,...,19.6,69.5,110.7,72.9,,Never,0.07,Berjalan,Berenang,Sepeda
2,1,2024-01-05,56,F,165.3,54.2,Swimming,21,Medium,2.6,...,19.6,69.5,110.7,72.9,,Never,0.09,Berjalan,Berenang,Sepeda
3,1,2024-01-07,56,F,165.3,54.4,Weight Training,99,Medium,10.7,...,19.6,69.5,110.7,72.9,,Never,0.21,Sepeda,Sepeda,Sepeda
4,1,2024-01-09,56,F,165.3,54.7,Swimming,100,Medium,12.7,...,19.6,69.5,110.7,72.9,,Never,0.33,Sepeda,Sepeda,Sepeda


In [3]:
df_olahraga = df.head(30000)
columns_to_keep = ['age', 'gender', 'height_cm', 'weight_kg', 'bmi', 'recommendation_1', 'recommendation_2', 'recommendation_3' ]

df_olahraga = df_olahraga[columns_to_keep]
print(df_olahraga.head())

   age gender  height_cm  weight_kg   bmi recommendation_1 recommendation_2  \
0   56      F      165.3       53.7  19.6         Berenang         Berjalan   
1   56      F      165.3       53.9  19.6         Berjalan         Berenang   
2   56      F      165.3       54.2  19.6         Berjalan         Berenang   
3   56      F      165.3       54.4  19.6           Sepeda           Sepeda   
4   56      F      165.3       54.7  19.6           Sepeda           Sepeda   

  recommendation_3  
0         Berjalan  
1           Sepeda  
2           Sepeda  
3           Sepeda  
4           Sepeda  


In [4]:
df_olahraga = df_olahraga[df_olahraga['gender']!= "Other"]
print(df_olahraga['gender'].unique())

['F' 'M']


In [5]:
df_olahraga.shape

(29299, 8)

In [6]:
df_olahraga['gender'] = df_olahraga ['gender'].replace({'F':'Female', 'M' : 'Male'})
print(df_olahraga['gender'].unique)

<bound method Series.unique of 0        Female
1        Female
2        Female
3        Female
4        Female
          ...  
29995      Male
29996      Male
29997      Male
29998      Male
29999      Male
Name: gender, Length: 29299, dtype: object>


In [8]:
df_olahraga.head()

Unnamed: 0,age,gender,height_cm,weight_kg,bmi,recommendation_1,recommendation_2,recommendation_3
0,56,Female,165.3,53.7,19.6,Berenang,Berjalan,Berjalan
1,56,Female,165.3,53.9,19.6,Berjalan,Berenang,Sepeda
2,56,Female,165.3,54.2,19.6,Berjalan,Berenang,Sepeda
3,56,Female,165.3,54.4,19.6,Sepeda,Sepeda,Sepeda
4,56,Female,165.3,54.7,19.6,Sepeda,Sepeda,Sepeda


In [9]:
# Cek missing values
print(df_olahraga.isnull().sum())

age                 0
gender              0
height_cm           0
weight_kg           0
bmi                 0
recommendation_1    0
recommendation_2    0
recommendation_3    0
dtype: int64


In [10]:
# Encode gender
label_encoder = LabelEncoder()
df_olahraga['gender'] = label_encoder.fit_transform(df_olahraga['gender'])

# Cek hasil encoding gender
print(df_olahraga['gender'].head())
print("Mapping Gender:", dict(enumerate(label_encoder.classes_)))

0    0
1    0
2    0
3    0
4    0
Name: gender, dtype: int32
Mapping Gender: {0: 'Female', 1: 'Male'}


In [11]:
unique_workout_types = df_olahraga['recommendation_1'].unique()
print(unique_workout_types)

['Berenang' 'Berjalan' 'Sepeda' 'Weight Training' 'Kardio' 'Yoga' 'Senam'
 'HIIT' 'Tidak ada rekomendasi']


In [12]:
df_olahraga = df_olahraga.loc[df_olahraga['recommendation_1'] != 'Tidak ada rekomendasi']
print(df_olahraga['recommendation_1'].unique())

['Berenang' 'Berjalan' 'Sepeda' 'Weight Training' 'Kardio' 'Yoga' 'Senam'
 'HIIT']


In [13]:
df_olahraga = df_olahraga.loc[df_olahraga['recommendation_2'] != 'Tidak ada rekomendasi']
print(df_olahraga['recommendation_2'].unique())

['Berjalan' 'Berenang' 'Sepeda' 'Kardio' 'Weight Training' 'Yoga' 'Senam'
 'HIIT']


In [14]:
df_olahraga.shape

(28862, 8)

In [15]:
df_olahraga.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28862 entries, 0 to 29999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               28862 non-null  int64  
 1   gender            28862 non-null  int32  
 2   height_cm         28862 non-null  float64
 3   weight_kg         28862 non-null  float64
 4   bmi               28862 non-null  float64
 5   recommendation_1  28862 non-null  object 
 6   recommendation_2  28862 non-null  object 
 7   recommendation_3  28862 non-null  object 
dtypes: float64(3), int32(1), int64(1), object(3)
memory usage: 1.9+ MB


In [16]:
# MultiLabel Binarizer untuk rekomendasi olahraga
mlb = MultiLabelBinarizer()
recommendation_columns = ['recommendation_1', 'recommendation_2']

# Gabungkan rekomendasi menjadi list untuk encoding
Y = mlb.fit_transform(df_olahraga[recommendation_columns].values)

# Cek hasil encoding
print("Sample encoded recommendations:\n", Y[:5])
print("Recommendation Classes:", mlb.classes_)

Sample encoded recommendations:
 [[1 1 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0]
 [0 0 0 0 0 1 0 0]]
Recommendation Classes: ['Berenang' 'Berjalan' 'HIIT' 'Kardio' 'Senam' 'Sepeda' 'Weight Training'
 'Yoga']


In [17]:
# Define input features
X = df_olahraga[['age', 'gender', 'height_cm', 'weight_kg', 'bmi']]

# Apply standard scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Cek hasil scaling
print("Sample scaled data:\n", X_scaled[:5])

Sample scaled data:
 [[ 1.02436241 -1.07127496 -0.50664695 -1.93469333 -0.97428517]
 [ 1.02436241 -1.07127496 -0.50664695 -1.92572557 -0.97428517]
 [ 1.02436241 -1.07127496 -0.50664695 -1.91227394 -0.97428517]
 [ 1.02436241 -1.07127496 -0.50664695 -1.90330619 -0.97428517]
 [ 1.02436241 -1.07127496 -0.50664695 -1.88985456 -0.97428517]]


In [18]:
# Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

# Cek ukuran data train dan test
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (23089, 5)
Test set size: (5773, 5)


In [19]:
# Model Architecture
model = models.Sequential()
model.add(layers.InputLayer(input_shape=(X_train.shape[1],)))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(Y_train.shape[1], activation='sigmoid'))





In [20]:
# Compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])

# EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)


In [21]:
# Train model
history = model.fit(X_train, Y_train, epochs=100, batch_size=64, validation_data=(X_test, Y_test), callbacks=[early_stopping])

Epoch 1/100
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.3147 - loss: 0.4607 - val_accuracy: 0.3740 - val_loss: 0.3075
Epoch 2/100
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3509 - loss: 0.3272 - val_accuracy: 0.3509 - val_loss: 0.2885
Epoch 3/100
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3521 - loss: 0.3048 - val_accuracy: 0.3021 - val_loss: 0.2775
Epoch 4/100
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3487 - loss: 0.2931 - val_accuracy: 0.3494 - val_loss: 0.2711
Epoch 5/100
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3608 - loss: 0.2819 - val_accuracy: 0.3591 - val_loss: 0.2691
Epoch 6/100
[1m361/361[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3540 - loss: 0.2782 - val_accuracy: 0.2638 - val_loss: 0.2638
Epoch 7/100
[1m361/36

In [22]:
# Evaluate model
test_loss, test_acc = model.evaluate(X_test, Y_test)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

[1m181/181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5495 - loss: 0.2575
Test Loss: 0.2575
Test Accuracy: 0.5566


In [23]:
# Save model
model.save('model_olahraga54.h5')

# Save label_encoder, scaler, mlb menggunakan pickle
import pickle

with open('label_encoder54.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

with open('scaler54.pkl', 'wb') as f:
    pickle.dump(scaler, f)

with open('mlb54.pkl', 'wb') as f:
    pickle.dump(mlb, f)

