# Support Vector Machine
___

This model is based on:

```Bibtex
@inproceedings{liebeckWhatAirportMining2016,
  title = {What to Do with an Airport? {{Mining}} Arguments in the German Online Participation Project Tempelhofer Feld},
  author = {Liebeck, Matthias and Esau, Katharina and Conrad, Stefan},
  date = {2016},
  doi = {10.18653/v1/w16-2817},
  }
```

Features:
- Unigrams
- L2-normalized POS tag distribution (STTS tagset)
- L2-normalized distribution of dependencies (TIGER annotation scheme)

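Parameters:

- Classifier: scikit-learn `LinearSVC`, no hyperparameter tuning
- Feature scaling: `StandardScaler(with_mean=False)`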

In [4]:
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

from src.features import POSTagDistribution, POSDependencyDistribution
from src.dataset import load_dataset
from config import PROJECT_NAME, DATASET

import wandb

In [5]:
# Model label reported in the experiment-tracking run configuration.
MODEL_NAME = "SVM"
# When True, the run configuration and the test metrics are sent to Weights & Biases;
# when False, the notebook runs without touching wandb.
TRACKING = False

### 0. Load data

In [6]:
# Train/test split from the project loader. The feature frames must expose a
# "Sentence" column, because that is the column the ColumnTransformer selects.
# NOTE(review): labels appear to be boolean (the report shows False/True) — confirm in src.dataset.
X_train, X_test, y_train, y_test = load_dataset()

### 1. Encode features

In [7]:
# Lexical features: a FeatureUnion that currently holds only the unigram counter.
unigram_counter = CountVectorizer()
text_features = FeatureUnion(
    transformer_list=[
        ("unigrams", unigram_counter),
    ]
)

In [8]:
# All three feature extractors read from the same "Sentence" column; any other
# column of the input frame is dropped.
sentence_transformers = [
    ("unigrams", text_features, "Sentence"),
    ("POS_tag", POSTagDistribution(), "Sentence"),
    ("POS_dep", POSDependencyDistribution(), "Sentence"),
]
column_trans = ColumnTransformer(
    transformers=sentence_transformers,
    remainder="drop",
    verbose=True,
)

### 2. Create model

In [9]:
# End-to-end model: feature extraction -> variance scaling -> linear SVM.
pipe = Pipeline(
    steps=[
        # Unigram counts plus the POS tag / POS dependency distributions.
        ("preprocessing", column_trans),
        # with_mean=False scales by variance only; StandardScaler cannot centre
        # sparse input, and CountVectorizer produces a sparse matrix.
        ("scaler", StandardScaler(with_mean=False)),
        # Fixed seed: with the dual solver LinearSVC shuffles the data using its
        # random state, so an unseeded model can give slightly different metrics
        # on every run of the notebook.
        ("classify", LinearSVC(random_state=42)),
    ],
    verbose=True,
)

### 4. Train model

In [10]:
if TRACKING:
    # Run metadata for the tracking dashboard. There is no validation split, and
    # batch_size / learning_rate / epochs are recorded as None for this model.
    run_config = {
        "model": MODEL_NAME,
        "dataset": DATASET,
        "train_data_size": len(X_train),
        "validation_data_size": 0,
        "test_data_size": len(X_test),
        "batch_size": None,
        "learning_rate": None,
        "epochs": None,
    }
    wandb.init(project=PROJECT_NAME, config=run_config)

In [11]:
# Fit all pipeline steps on the training split; the fitted pipeline is the cell output.
pipe.fit(X=X_train, y=y_train)

[ColumnTransformer] ...... (1 of 3) Processing unigrams, total=   0.0s
[ColumnTransformer] ....... (2 of 3) Processing POS_tag, total=  10.1s
[ColumnTransformer] ....... (3 of 3) Processing POS_dep, total=   8.9s
[Pipeline] ..... (step 1 of 3) Processing preprocessing, total=  19.0s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing classify, total=   0.2s




Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('unigrams',
                                                  FeatureUnion(transformer_list=[('unigrams',
                                                                                  CountVectorizer())]),
                                                  'Sentence'),
                                                 ('POS_tag',
                                                  POSTagDistribution(),
                                                  'Sentence'),
                                                 ('POS_dep',
                                                  POSDependencyDistribution(),
                                                  'Sentence')],
                                   verbose=True)),
                ('scaler', StandardScaler(with_mean=False)),
                ('classify', LinearSVC())],
         verbose=True)

### 5. Predict results

In [12]:
# Predicted labels for the held-out test split.
Y_pred = pipe.predict(X=X_test)

### 6. Evaluate results

In [13]:
# classification_report returns preformatted text, so it is printed rather than
# displayed as a bare expression.
report = classification_report(y_test, Y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.69      0.74      0.71       235
        True       0.74      0.69      0.72       259

    accuracy                           0.71       494
   macro avg       0.72      0.72      0.71       494
weighted avg       0.72      0.71      0.71       494



In [14]:
# Scalar test-set metrics, computed in the order F1, recall, precision, accuracy.
f1, recall, precision, accuracy = (
    metric(y_test, Y_pred)
    for metric in (f1_score, recall_score, precision_score, accuracy_score)
)

In [15]:
if TRACKING:
    # Log all test metrics in ONE call: every wandb.log call advances the run's
    # step counter, so separate calls would spread the four values over four
    # different steps instead of recording them together.
    wandb.log(
        {
            "test_f1": f1,
            "test_recall": recall,
            "test_precision": precision,
            "test_accuracy": accuracy,
        }
    )
