In [1]:
from collections import defaultdict

import numpy as np
from datasets import disable_caching, load_from_disk
import nltk
from nltk.tokenize import word_tokenize as tokenizer
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
# Location of the dataset on disk, relative to this notebook.
DATASET_DIR = "../../datasets/ManualDataset"

# Fetch the NLTK "punkt_tab" resource (needed by `word_tokenize`).
nltk.download("punkt_tab")

# Switch off Hugging Face `datasets` caching before loading the saved dataset.
disable_caching()
ds = load_from_disk(dataset_path=DATASET_DIR)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dol28\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [2]:
# Pull the raw text and label columns out of the train and test splits.
train_split = ds["train"]
test_split = ds["test"]

X_train, y_train = train_split["masked_text"], train_split["label"]
X_test, y_test = test_split["masked_text"], test_split["label"]

In [3]:
# Binary bag-of-words features: each vocabulary entry is 1 if the token occurs
# in the document and 0 otherwise. Tokenisation is delegated to NLTK's
# `word_tokenize` (imported as `tokenizer`).
vectorizer = CountVectorizer(
    binary=True,
    tokenizer=tokenizer,
    # A custom tokenizer makes the default `token_pattern` unused; setting it to
    # None avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
)

# Read the raw text straight from `ds` instead of from X_train / X_test, so that
# re-running this cell does not feed already-vectorised matrices back into the
# vectoriser. The vocabulary is fitted on the training split only.
X_train = vectorizer.fit_transform(ds["train"]["masked_text"])
X_test = vectorizer.transform(ds["test"]["masked_text"])



In [4]:
# Hyper-parameter search space for the SVM.
# The grid is split per kernel because `gamma` has no effect on the linear
# kernel: a single combined dict would fit every linear candidate once per
# gamma value (identical models), doubling the linear part of the search.
# GridSearchCV accepts a list of grids and explores their union.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100, 1000],
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': ['scale', 'auto'],
    },
]

In [5]:
# Exhaustive search over `param_grid` with 6-fold cross-validation,
# using all available CPU cores.
grid_search = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=param_grid,
    cv=6,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# Score the best configuration found by the search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class F1 first, then the two aggregate views of the same predictions.
classwise_f1 = f1_score(y_test, y_pred, average=None)
micro_f1 = f1_score(y_test, y_pred, average='micro')
macro_f1 = f1_score(y_test, y_pred, average='macro')

# Print results
print("Micro F1:", round(micro_f1, 4))
print("Macro F1:", round(macro_f1, 4))
print(classwise_f1)

{'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}
Micro F1: 0.5799
Macro F1: 0.456
[0.71657754 0.21138211 0.53140097 0.38297872 0.4375    ]
