In [None]:
# Cell 1: Connect to Drive & Setup
from google.colab import drive
import pandas as pd
import numpy as np
import pickle
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler

# --- KEY SETTINGS ---
# SEED is passed as random_state to the stochastic estimators further down;
# GDRIVE_PATH is the Drive folder every input and output path is joined onto.
SEED = 42
GDRIVE_PATH = '/content/drive/MyDrive/eecsi_revise/'

# Attach Google Drive to the Colab filesystem so GDRIVE_PATH becomes reachable.
drive.mount('/content/drive')

print(f"✅ Setup complete. Working inside folder: {GDRIVE_PATH}")

Mounted at /content/drive
✅ Setup complete. Working inside folder: /content/drive/MyDrive/eecsi_revise/


In [None]:
# Cell 2: Load Data and Split Definitions
# Both inputs are expected inside GDRIVE_PATH: the dataset as a CSV and the
# predefined cross-validation folds as a pickle file.
file_path_csv = os.path.join(GDRIVE_PATH, 'final_golden_dataset_eecsi.csv')
file_path_split = os.path.join(GDRIVE_PATH, 'kfold_splits.pkl')

# Load the dataset and the split file.
try:
    df = pd.read_csv(file_path_csv)
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only load a split file that you generated yourself.
    with open(file_path_split, 'rb') as f:
        kfold_splits = pickle.load(f)
except FileNotFoundError as e:
    print(f"❌ ERROR: File not found. Please ensure '{e.filename}' is in the 'eecsi_revise' folder in your Google Drive.")
    # Re-raise so "Run All" stops here. Swallowing the error would let later
    # cells fail with a NameError, or silently reuse stale `df` /
    # `kfold_splits` left in the kernel by an earlier run.
    raise
else:
    # Reached only when both files were read successfully.
    print("✅ Successfully loaded dataset and 5-fold splits.")
    print(f"Total data points: {df.shape[0]}.")
    print(f"Number of folds: {len(kfold_splits)}.")

✅ Successfully loaded dataset and 5-fold splits.
Total data points: 3030.
Number of folds: 5.


In [None]:
# Cell 3: Define the Hybrid Model Pipeline
# Three named steps, applied in order:
#   'tfidf' - turn raw text into TF-IDF features
#   'ros'   - random oversampling (data-level imbalance fix)
#   'svm'   - linear SVC with class_weight='balanced' (algorithm-level fix,
#             kept for consistent treatment with the other models)
# NOTE(review): per the imbalanced-learn Pipeline docs, sampler steps run only
# during fit, so predict() should see un-resampled text - confirm for the
# installed version.
# NOTE(review): RandomOverSampler's default strategy should already equalise
# class counts before the SVM is trained, in which case class_weight='balanced'
# would add little on top - worth confirming if the "hybrid" label matters.
pipeline_svm = ImbPipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('ros', RandomOverSampler(random_state=SEED)),
    ('svm', SVC(C=1, kernel='linear', class_weight='balanced', random_state=SEED)),
])

print("✅ TF-IDF + SVM pipeline with full hybrid strategy has been defined.")

✅ TF-IDF + SVM pipeline with full hybrid strategy has been defined.


In [None]:
# Cell 4: Run the 5-Fold Cross-Validation
# One classification report (as a dict) is collected per fold. The list is
# re-created here so re-running this cell never accumulates duplicate entries.
fold_reports = []

# Prepare features (X) and labels (y)
X = df['cleaned_text']
y = df['aspect']

# Take the fold count from the split definitions instead of hard-coding 5, so
# the progress messages stay correct if the split file ever changes.
n_folds = len(kfold_splits)

for fold_no, fold in enumerate(kfold_splits, start=1):
    print(f"--- Running Fold {fold_no}/{n_folds} ---")

    # Each fold supplies 'train' and 'test' index collections. They are used
    # with .iloc, i.e. as positional row indices into X and y.
    train_index, test_index = fold['train'], fold['test']
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit the whole pipeline on this fold's training rows only.
    pipeline_svm.fit(X_train, y_train)

    # Predict this fold's held-out rows.
    y_pred = pipeline_svm.predict(X_test)

    # Keep the full report as a dictionary; zero_division=0 scores a metric
    # as 0 instead of warning when its denominator is zero.
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    fold_reports.append(report)
    print(f"Fold {fold_no} complete. Macro F1-Score: {report['macro avg']['f1-score']:.4f}")

print(f"\n✅ {n_folds}-fold cross-validation process finished.")

--- Running Fold 1/5 ---
Fold 1 complete. Macro F1-Score: 0.6462
--- Running Fold 2/5 ---
Fold 2 complete. Macro F1-Score: 0.6803
--- Running Fold 3/5 ---
Fold 3 complete. Macro F1-Score: 0.6255
--- Running Fold 4/5 ---
Fold 4 complete. Macro F1-Score: 0.6545
--- Running Fold 5/5 ---
Fold 5 complete. Macro F1-Score: 0.6323

✅ 5-fold cross-validation process finished.


In [None]:
# Cell 5: Aggregate and Display Final Results
# Pull the macro-averaged F1 out of every stored per-fold report.
macro_f1_scores = [fold_report['macro avg']['f1-score'] for fold_report in fold_reports]

# Summarise across folds. The standard deviation uses NumPy's default ddof=0
# (population form), not the sample form.
mean_macro_f1 = np.asarray(macro_f1_scores).mean()
std_macro_f1 = np.asarray(macro_f1_scores).std()

print("--- Final Aggregated Results (5-Fold CV) for TF-IDF + SVM ---")
print(f"Macro F1-Score = {mean_macro_f1:.4f} ± {std_macro_f1:.4f}")

--- Final Aggregated Results (5-Fold CV) for TF-IDF + SVM ---
Macro F1-Score = 0.6477 ± 0.0192


In [None]:
# Cell 6: Save Results to Google Drive
# Destination file, placed next to the inputs in the project folder.
results_file_path = os.path.join(GDRIVE_PATH, 'results_svm.json')

# Payload: model label, the aggregated macro-F1 statistics, and the full
# per-fold classification reports.
final_results = dict(
    model='TF-IDF + SVM',
    mean_macro_f1=mean_macro_f1,
    std_dev_macro_f1=std_macro_f1,
    reports_per_fold=fold_reports,
)

# Write the payload as indented JSON (overwrites any existing file).
with open(results_file_path, 'w') as results_fh:
    json.dump(final_results, results_fh, indent=4)

print(f"\n✅ Final results for TF-IDF + SVM have been saved to: '{results_file_path}'")


✅ Final results for TF-IDF + SVM have been saved to: '/content/drive/MyDrive/eecsi_revise/results_svm.json'
