In [None]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import SGDClassifier
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Locations of the preprocessed image batches and of the label array.
saved_images_dir = "/content/Processed Data/preprocessed_batches"
saved_labels_path = "/content/Processed Data/preprocessed_labels.npy"

# Labels for every sample in the dataset.
y = np.load(saved_labels_path)

# 'balanced' per-class weights, keyed by label, to counter class imbalance.
unique_labels = np.unique(y)
class_weights = compute_class_weight('balanced', classes=unique_labels, y=y)
class_weight_dict = dict(zip(unique_labels, class_weights))

# Stratified 80/20 split performed on sample indices rather than on the data,
# so the image batches can stay on disk.
sample_indices = np.arange(len(y))
train_indices, test_indices = train_test_split(
    sample_indices,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# SGD-trained linear classifier (logistic loss) using the class weights above.
svm_model = SGDClassifier(
    loss="log_loss",
    random_state=42,
    early_stopping=False,
    class_weight=class_weight_dict,
)

# Rows per saved batch file. Row i of batch_<k>.npy is taken to be sample
# k * batch_size + i of the label array.
# NOTE(review): confirm this matches how the preprocessing step wrote batches.
batch_size = 500


def _iter_train_batches(images_dir, labels, train_idx, rows_per_batch):
    """Yield ``(X_train, y_train)`` for every saved batch, in file order.

    Batches are read from ``batch_0.npy``, ``batch_1.npy``, ... until the
    first missing file. Each batch is flattened to 2-D and reduced to the rows
    whose global sample index is in ``train_idx``; batches that contribute no
    training rows are skipped.

    Labels are selected with the same ascending mask as the feature rows, so
    ``X_train[i]`` and ``y_train[i]`` always describe the same sample. Indexing
    the labels through ``train_idx`` directly would not guarantee that:
    ``train_test_split`` shuffles, so its index order differs from the row
    order inside a batch.

    Args:
        images_dir: Directory containing the ``batch_<k>.npy`` files.
        labels: Label array covering the whole dataset.
        train_idx: Global sample indices that belong to the training split.
        rows_per_batch: Number of rows each batch file is assumed to start at
            a multiple of.

    Yields:
        Tuples of (2-D training features, matching training labels).
    """
    batch_index = 0
    while True:
        batch_path = os.path.join(images_dir, f"batch_{batch_index}.npy")
        if not os.path.exists(batch_path):
            return  # No more batches

        X_batch = np.load(batch_path)
        X_batch = X_batch.reshape(X_batch.shape[0], -1)  # one row per sample

        first_row = batch_index * rows_per_batch
        batch_indices = np.arange(first_row, first_row + len(X_batch))
        train_mask = np.isin(batch_indices, train_idx)
        batch_index += 1

        if not train_mask.any():
            continue  # Nothing from this batch is in the training split

        yield X_batch[train_mask], labels[batch_indices[train_mask]]


# Pass 1: fit the scaler incrementally on exactly the rows the training pass
# will see, without holding the whole training set in memory.
scaler = StandardScaler()
num_train_batches = 0
for X_train_batch, _ in _iter_train_batches(saved_images_dir, y, train_indices, batch_size):
    scaler.partial_fit(X_train_batch)
    num_train_batches += 1

if num_train_batches == 0:
    raise FileNotFoundError(
        f"No batch files with training rows found in {saved_images_dir!r}"
    )

# Pass 2: train in batches
print("Training SVM model in batches...")
training_accuracies = []

# partial_fit needs the complete label set because a single batch may not
# contain every class. Class weights were supplied to the estimator's
# constructor, so they are not passed here.
all_classes = np.unique(y)

for X_train_batch, y_train_batch in _iter_train_batches(saved_images_dir, y, train_indices, batch_size):
    # Scale data
    X_train_batch = scaler.transform(X_train_batch)

    # One incremental update on this batch
    svm_model.partial_fit(X_train_batch, y_train_batch, classes=all_classes)

    # Accuracy on the batch just trained on (a progress signal, not a
    # held-out estimate)
    y_train_pred = svm_model.predict(X_train_batch)
    training_accuracies.append(accuracy_score(y_train_batch, y_train_pred))

print("SVM training complete!")

import joblib

# Write the trained classifier and the fitted scaler to the current working
# directory, one file each.
for artifact, filename in ((svm_model, 'svm_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, filename)
