In [34]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import pandas as pd

# Load the dataset
VT = pd.read_csv("data/processed_VT_Data.csv")
pd.set_option('display.max_columns', None)

# Separate features and target variable
X = VT.drop(columns=["Training_Program"])
y = VT["Training_Program"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12, stratify=y)

# Normalization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Generate synthetic data with SMOTE
smote = SMOTE(random_state=12)  # random_state for consistency
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Original Training Data Distribution:")
print(pd.Series(y_train).value_counts())
print("\nTraining Data Distribution After SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

# Create a model with the new balanced data
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical

# One-hot encode the target variable
y_train_resampled_encoded = to_categorical(y_train_resampled, num_classes=len(y.unique()))
y_test_encoded = to_categorical(y_test, num_classes=len(y.unique()))

# Define the model
model = Sequential([
    Dense(128),
    LeakyReLU(alpha=0.1),
    Dropout(0.5),
    Dense(64),
    LeakyReLU(alpha=0.1),
    Dropout(0.5),
    Dense(len(y.unique()), activation='softmax')
])

# Compile the model
model.compile(
    optimizer=SGD(learning_rate=0.001, momentum=0.8),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train_resampled, y_train_resampled_encoded,
    epochs=50,
    batch_size=4,
    validation_split=0.2,
    verbose=1
)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test_encoded, verbose=0)

# Print test results
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test_encoded.argmax(axis=1)

classification_rep = classification_report(y_true_classes, y_pred_classes)

print("Test Accuracy:", test_accuracy)
print("Classification Report:")
print(classification_rep)


Original Training Data Distribution:
Training_Program
0    26
2    24
3    22
1    21
5    18
4    18
6    12
7    11
Name: count, dtype: int64

Training Data Distribution After SMOTE:
Training_Program
3    26
5    26
2    26
0    26
7    26
1    26
4    26
6    26
Name: count, dtype: int64
Epoch 1/50




[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0768 - loss: 2.4536 - val_accuracy: 0.3571 - val_loss: 1.8507
Epoch 2/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1538 - loss: 2.2221 - val_accuracy: 0.3810 - val_loss: 1.8494
Epoch 3/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1758 - loss: 2.1434 - val_accuracy: 0.3810 - val_loss: 1.8512
Epoch 4/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2390 - loss: 2.0299 - val_accuracy: 0.3810 - val_loss: 1.8526
Epoch 5/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.2409 - loss: 1.9344 - val_accuracy: 0.3571 - val_loss: 1.8352
Epoch 6/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3219 - loss: 1.8240 - val_accuracy: 0.3571 - val_loss: 1.8138
Epoch 7/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━