3. Train and Evaluate a Machine Learning Model

In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Input
from sklearn.metrics import classification_report

# Load the pre-computed feature matrix (SciPy sparse, .npz) and the label
# table (CSV). File names suggest TF-IDF vectors — confirm against the
# preprocessing step that wrote them.
X_file_path = 'X_tfidf_vectors.npz'
y_file_path = 'y_labels.csv'
X = sp.load_npz(X_file_path)
y = pd.read_csv(y_file_path)

# Flatten the label frame to 1-D. ravel() would silently interleave columns
# if the CSV carried more than one (e.g. a saved index), so fail early when
# the row counts no longer line up with the feature matrix.
y_flat = y.values.ravel()
if y_flat.shape[0] != X.shape[0]:
    raise ValueError(
        f"Feature/label row mismatch: X has {X.shape[0]} rows but "
        f"{y_file_path} flattened to {y_flat.shape[0]} labels"
    )

# Encode labels as integer class ids (fitted on the FULL label set).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_flat)

# Sample a subset of the data for initial testing
sample_size = 10000  # Adjust this based on available resources
if sample_size < X.shape[0]:
    X_sample, _, y_sample, _ = train_test_split(
        X, y_encoded, train_size=sample_size, random_state=42
    )
else:
    # train_test_split rejects an integer train_size that leaves no rows for
    # the complementary split; with a small dataset just use everything.
    X_sample, y_sample = X, y_encoded

# Split the sample data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42
)

# Build and compile model: two hidden ReLU layers and a softmax output.
# NOTE(review): the output layer has one unit per class in the full label
# set, not just the classes present in the sample. If nearly every label is
# unique this layer becomes very large and the model cannot generalise —
# check len(label_encoder.classes_) before training.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# sparse_categorical_crossentropy consumes the integer class ids directly,
# so the labels do not need to be one-hot encoded.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Generator function to yield batches of data
def data_generator(X, y, batch_size, *, shuffle=False, seed=None):
    """Yield ``(X_batch, y_batch)`` tuples forever, densifying one batch at a time.

    Only ``batch_size`` rows of the sparse matrix are converted to a dense
    array per step, so the full dense matrix is never held in memory.

    Args:
        X: Row-sliceable sparse matrix exposing ``.shape`` and whose row
            slices provide ``.toarray()`` (e.g. a SciPy CSR matrix).
        y: Sequence of targets aligned with the rows of ``X``.
        batch_size: Positive number of rows per batch. The last batch of
            each pass is smaller when the row count is not a multiple.
        shuffle: When true, rows are visited in a fresh random order on
            every pass. Defaults to the original sequential order.
        seed: Optional seed for the shuffling RNG (ignored unless
            ``shuffle`` is true).

    Yields:
        Tuples of a dense 2-D array of features and the matching targets.

    Raises:
        ValueError: If ``batch_size`` is not positive, ``X`` has no rows, or
            ``X`` and ``y`` disagree on the number of samples. Raised on the
            first ``next()`` call, since this is a generator function.
    """
    num_samples = X.shape[0]
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")
    if num_samples == 0:
        # Without this guard the outer loop would spin forever and never
        # yield, hanging whatever is consuming the generator.
        raise ValueError("X has no rows; nothing to generate")
    if len(y) != num_samples:
        raise ValueError(
            f"X and y disagree on sample count: {num_samples} vs {len(y)}"
        )

    rng = np.random.default_rng(seed) if shuffle else None
    if shuffle:
        # Index-array lookups below need an ndarray rather than a list.
        y = np.asarray(y)

    while True:
        order = rng.permutation(num_samples) if shuffle else None
        for offset in range(0, num_samples, batch_size):
            if order is None:
                rows = slice(offset, offset + batch_size)
            else:
                rows = order[offset:offset + batch_size]
            yield X[rows].toarray(), y[rows]

batch_size = 32  # Rows densified per step; raise it if memory allows
epochs = 10  # Full passes over the training sample

# Train the model using the generator
train_gen = data_generator(X_train, y_train, batch_size)
val_gen = data_generator(X_test, y_test, batch_size)

# Ceiling division: the generator emits a trailing partial batch whenever the
# row count is not a multiple of batch_size. Flooring would skip those rows
# and, because the generators loop forever, shift every later epoch by one
# batch so that "epochs" no longer line up with passes over the data.
steps_per_epoch = int(np.ceil(X_train.shape[0] / batch_size))
validation_steps = int(np.ceil(X_test.shape[0] / batch_size))

model.fit(train_gen,
          steps_per_epoch=steps_per_epoch,
          epochs=epochs,
          validation_data=val_gen,
          validation_steps=validation_steps)

# Predict and evaluate. The whole test split is densified here, so this
# needs enough memory for an (n_test, n_features) dense array.
X_test_array = X_test.toarray()
y_pred = model.predict(X_test_array)
y_pred_classes = y_pred.argmax(axis=1)  # most probable class id per row

# Use unique classes in y_test for the classification report, so that
# target_names has exactly one entry per reported label.
unique_test_labels = np.unique(y_test)
# Coerce to str: the report formats names as text and breaks on numeric labels.
target_names = [str(name) for name in label_encoder.inverse_transform(unique_test_labels)]

print(classification_report(y_test, y_pred_classes, labels=unique_test_labels, target_names=target_names, zero_division=0))

print("Model trained and evaluated successfully.")

Epoch 1/10
250/250 ━━━━━━━━━━━━━━━━━━━━ 62s 238ms/step - accuracy: 0.0000e+00 - loss: 12.3147 - val_accuracy: 0.0000e+00 - val_loss: 12.3191
Epoch 2/10
250/250 ━━━━━━━━━━━━━━━━━━━━ 59s 235ms/step - accuracy: 0.0067 - loss: 10.9719 - val_accuracy: 0.0000e+00 - val_loss: 12.6765
Epoch 3/10
250/250 ━━━━━━━━━━━━━━━━━━━━ 59s 236ms/step - accuracy: 5.3182e-04 - loss: 8.0799 - val_accuracy: 0.0000e+00 - val_loss: 14.1667
Epoch 4/10
250/250 ━━━━━━━━━━━━━━━━━━━━ 60s 240ms/step - accuracy: 0.0061 - loss: 6.5273 - val_accuracy: 0.0000e+00 - val_loss: 15.5382
Epoch 5/10
250/250 ━━━━━━━━━━━━━━━━━━━━ 60s 241ms/step - accuracy: 0.0118 - loss: 5.7510 - val_accuracy: 0.0000e+00 - val_loss: 16.3956
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 242ms/step - accuracy: 0.0109 - loss: 5.4187 - val_accuracy: 0