# 04 — SVM (Dual Pipeline)

This notebook trains a linear **SVM** (`LinearSVC` on TF-IDF features) on two datasets:
1. **Standard**: Basic cleaning.
2. **Irony-Augmented**: With `[IRONIA]` tags.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import os

# Apply the matplotlib style sheet stored one directory above the current
# working directory; the relative path only resolves when the kernel is
# started from the notebook's own folder.
plt.style.use('../style.mplstyle')

In [2]:
# Print the Python/IPython versions, the versions of the listed packages and
# the machine details so the recorded results can be tied to an environment.
%load_ext watermark
%watermark -v -n -m -p numpy,pandas,sklearn,matplotlib,seaborn,joblib

Python implementation: CPython
Python version       : 3.12.12
IPython version      : 9.10.0

numpy     : 1.26.4
pandas    : 3.0.0
sklearn   : 1.8.0
matplotlib: 3.10.8
seaborn   : 0.13.2
joblib    : 1.5.3

Compiler    : Clang 17.0.0 (clang-1700.6.3.2)
OS          : Darwin
Release     : 25.2.0
Machine     : x86_64
Processor   : i386
CPU cores   : 8
Architecture: 64bit



## Pipeline Function

In [3]:
MODEL_NAME = "SVM"


def run_pipeline(variation_name, input_dir, output_dir,
                 max_features=5000, ngram_range=(1, 2), random_state=42):
    """Train, evaluate and export a TF-IDF + LinearSVC classifier.

    Reads ``train.csv`` and ``test.csv`` from ``input_dir``, fits a
    ``TfidfVectorizer`` on the training ``text_clean`` column, trains a
    class-balanced ``LinearSVC`` against the ``label`` column, prints the
    test accuracy and classification report, and writes the fitted
    classifier and vectorizer to ``output_dir`` with joblib.

    Parameters
    ----------
    variation_name : str
        Label used in the printed banner and accuracy line.
    input_dir : str or path-like
        Directory containing ``train.csv`` and ``test.csv``; both files must
        have ``text_clean`` and ``label`` columns.
    output_dir : str or path-like
        Directory that receives ``model.joblib`` and ``vectorizer.joblib``;
        created if it does not exist.
    max_features : int, default 5000
        Vocabulary size cap passed to ``TfidfVectorizer``.
    ngram_range : tuple of (int, int), default (1, 2)
        N-gram range passed to ``TfidfVectorizer``.
    random_state : int, default 42
        Seed passed to ``LinearSVC``.

    Returns
    -------
    float
        Accuracy of the trained classifier on the test split.

    Raises
    ------
    KeyError
        If either CSV lacks the ``text_clean`` or ``label`` column.
    """
    print(f"\n{'='*20} {MODEL_NAME}: {variation_name} {'='*20}")

    # 1. Load Data
    train_df = pd.read_csv(os.path.join(input_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(input_dir, 'test.csv'))

    # Fail before any fitting, and say which split and column is the problem,
    # instead of surfacing a bare KeyError halfway through the pipeline.
    for split_name, frame in (('train', train_df), ('test', test_df)):
        missing = [col for col in ('text_clean', 'label') if col not in frame.columns]
        if missing:
            raise KeyError(
                f"{split_name}.csv in {input_dir} is missing column(s): {missing}"
            )

    # Missing text values become empty strings so the vectorizer only sees
    # strings. The loaded frames themselves are left unmodified.
    train_text = train_df['text_clean'].fillna('')
    test_text = test_df['text_clean'].fillna('')
    y_train = train_df['label']
    y_test = test_df['label']

    # 2. Vectorize
    # The vocabulary and IDF weights are learned from the training split only;
    # the test split is merely transformed with them.
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    X_train = vectorizer.fit_transform(train_text)
    X_test = vectorizer.transform(test_text)

    # 3. Train
    clf = LinearSVC(class_weight='balanced', random_state=random_state, dual='auto')
    clf.fit(X_train, y_train)

    # 4. Evaluate
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{MODEL_NAME} ({variation_name}) Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # 5. Export
    # Classifier and vectorizer are saved together: the model is only usable
    # with the exact vocabulary it was trained on.
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(clf, os.path.join(output_dir, 'model.joblib'))
    joblib.dump(vectorizer, os.path.join(output_dir, 'vectorizer.joblib'))
    print(f"Artifacts saved to {output_dir}")

    return acc

## Run Both Pipelines

In [4]:
# Run the same pipeline once per dataset variant, keeping each test accuracy
# under its own name for the comparison cell.
acc_standard = run_pipeline(
    variation_name="Standard",
    input_dir="../data/processed/standard",
    output_dir="../models/svm/standard",
)
acc_irony = run_pipeline(
    variation_name="Irony",
    input_dir="../data/processed/irony",
    output_dir="../models/svm/irony",
)




SVM (Standard) Accuracy: 0.8156
              precision    recall  f1-score   support

    NEGATIVE       0.80      0.84      0.82       225
    POSITIVE       0.83      0.79      0.81       225

    accuracy                           0.82       450
   macro avg       0.82      0.82      0.82       450
weighted avg       0.82      0.82      0.82       450



Artifacts saved to ../models/svm/standard



SVM (Irony) Accuracy: 0.8178
              precision    recall  f1-score   support

    NEGATIVE       0.80      0.84      0.82       225
    POSITIVE       0.83      0.80      0.81       225

    accuracy                           0.82       450
   macro avg       0.82      0.82      0.82       450
weighted avg       0.82      0.82      0.82       450

Artifacts saved to ../models/svm/irony


## Comparison

In [5]:
# Accuracy delta of the irony-augmented run relative to the standard run.
# NOTE(review): in the recorded output the gap is +0.0022 on 450 test rows,
# i.e. roughly a single prediction, so read it as indicative only.
diff = acc_irony - acc_standard

summary_lines = [
    "\n=== Final Comparison ===",
    f"Standard: {acc_standard:.4f}",
    f"Irony:    {acc_irony:.4f}",
    f"Impact of Irony features: {diff:+.4f}",
]
print("\n".join(summary_lines))


=== Final Comparison ===
Standard: 0.8156
Irony:    0.8178
Impact of Irony features: +0.0022
