# Support Vector Machine
___

This model is based on:

```Bibtex
@inproceedings{liebeckWhatAirportMining2016,
  title = {What to Do with an Airport? {{Mining}} Arguments in the German Online Participation Project Tempelhofer Feld},
  author = {Liebeck, Matthias and Esau, Katharina and Conrad, Stefan},
  date = {2016},
  doi = {10.18653/v1/w16-2817},
  }
```

Features:
- Unigrams
- L2 Normalized POS Tag distribution of STTS
- L2 Normalized POS Tag dependencies TIGER Schema

Parameter:

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

from src.features import POSTagDistribution, POSDependencyDistribution
from src.dataset import load_dataset
from config import CLAIM_LEXICON_PATH, PROJECT_NAME, DATASETS

import wandb

In [38]:
MODEL_NAME = "SVM"
TRACKING = True
FALSE_CLASS_BALANCE = 1.0

dataset = DATASETS["dataset_2014"]

### 0. Load data

In [39]:
X_train, X_test, y_train, y_test = load_dataset(dataset_path=os.path.join(dataset["base_path"], dataset["data"]), false_class_balance=FALSE_CLASS_BALANCE)

### 1. Encode features

In [40]:
text_features = FeatureUnion(transformer_list=[("unigrams", CountVectorizer())])

In [41]:
column_trans = ColumnTransformer(
    [
        ("unigrams", text_features, "Sentence"),
        ("POS_tag", POSTagDistribution(), "Sentence"),
        ("POS_dep", POSDependencyDistribution(), "Sentence"),
    ],
    remainder="drop",
    verbose=True,
)

### 2. Create model

In [42]:
pipe = Pipeline(
    [
        ("preprocessing", column_trans),
        ("scaler", StandardScaler(with_mean=False)),
        ("classify", LinearSVC()),
    ],
    verbose=True,
)

### 4. Train model

In [43]:
if TRACKING:
    wandb.init(project=PROJECT_NAME,
            config={
                "model": MODEL_NAME,
                "setup": str(pipe.get_feature_names_out),
                "dataset": dataset["name"],
                "train_data_size": len(X_train),
                "validation_data_size": 0,
                "test_data_size": len(X_test),
                "batch_size": None,
                "learning_rate": None,
                "epochs": None,
                "false_class_balance": FALSE_CLASS_BALANCE
            })

[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [44]:
pipe.fit(X_train, y_train)

[ColumnTransformer] ...... (1 of 3) Processing unigrams, total=   0.1s
[ColumnTransformer] ....... (2 of 3) Processing POS_tag, total=   7.8s
[ColumnTransformer] ....... (3 of 3) Processing POS_dep, total=   5.8s
[Pipeline] ..... (step 1 of 3) Processing preprocessing, total=  13.7s
[Pipeline] ............ (step 2 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 3 of 3) Processing classify, total=   0.1s




Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('unigrams',
                                                  FeatureUnion(transformer_list=[('unigrams',
                                                                                  CountVectorizer())]),
                                                  'Sentence'),
                                                 ('POS_tag',
                                                  POSTagDistribution(),
                                                  'Sentence'),
                                                 ('POS_dep',
                                                  POSDependencyDistribution(),
                                                  'Sentence')],
                                   verbose=True)),
                ('scaler', StandardScaler(with_mean=False)),
                ('classify', LinearSVC())],
         verbose=True)

### 5. Predict results

In [45]:
Y_pred = pipe.predict(X_test)

### 6. Evaluate results

In [46]:
print(classification_report(y_test, Y_pred))

              precision    recall  f1-score   support

       False       0.75      0.72      0.73       147
        True       0.73      0.76      0.74       147

    accuracy                           0.74       294
   macro avg       0.74      0.74      0.74       294
weighted avg       0.74      0.74      0.74       294



In [47]:
f1 = f1_score(y_test, Y_pred)
recall = recall_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)

In [48]:
if TRACKING:
    wandb.log({'test_f1': f1})
    wandb.log({'test_recall': recall})
    wandb.log({'test_precision': precision})
    wandb.log({'test_accuracy': accuracy})
    wandb.finish()

0,1
test_accuracy,▁
test_f1,▁
test_precision,▁
test_recall,▁

0,1
test_accuracy,0.7381
test_f1,0.74247
test_precision,0.73026
test_recall,0.7551
