## imports

In [15]:
from tqdm import tqdm
from joblib import dump
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertForSequenceClassification, BertTokenizer
from datasets import load_dataset
import jsonlines
import torch
from huggingface_hub import HfApi
from huggingface_hub import notebook_login

## load dataset

In [6]:
# Keep only rows whose category label and text are both truthy (non-empty).
ds = load_dataset("K-Monitor/kmdb_base").filter(lambda row: row['category'] and row['text'])

# Build the parallel label / text lists from the 'train' split.
train_split = ds['train']
labels = [row['category'] for row in train_split]
texts = [row['text'] for row in train_split]

## load model

In [7]:
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [8]:
# The classifier weights and the tokenizer are loaded from two different checkpoints.
# NOTE(review): presumably the classifier was fine-tuned from the hubert-base-cc model,
# so its vocabulary matches this tokenizer — confirm.
classifier_checkpoint = 'K-Monitor/kmdb_classification_hubert'
tokenizer_checkpoint = 'SZTAKI-HLT/hubert-base-cc'

model = BertForSequenceClassification.from_pretrained(classifier_checkpoint)
model = model.to(device)  # move the weights to the selected device
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)



In [9]:
def get_bert_embeddings(texts, batch_size=1):
    """Embed each text as the last-layer hidden state of its first token.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        texts: Iterable of strings to embed. The tokenizer truncates inputs
            to at most 512 tokens.
        batch_size: Number of texts sent through the model per forward pass.
            The default of 1 processes one text at a time, exactly as this
            function did before the parameter existed. Larger values pad
            every batch to its longest member and run fewer forward passes.

    Returns:
        A list with one 1-D NumPy array per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialise the input so generators work too; batching needs len() and slicing.
    texts = list(texts)
    embeddings = []

    model.eval()  # evaluation mode (e.g. dropout disabled) for inference
    with torch.no_grad(), tqdm(total=len(texts)) as progress:
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            output = model(**inputs, output_hidden_states=True)
            # Last hidden layer, token position 0 of every sequence: one row per text.
            cls_embeddings = output.hidden_states[-1][:, 0, :].to('cpu').numpy()
            # Add row by row so each text gets its own 1-D vector. This replaces a
            # bare squeeze(), which would drop any size-1 dimension, not just the batch axis.
            embeddings.extend(cls_embeddings)
            progress.update(len(batch))
    return embeddings

# Embed every text once up front; the recorded run of this cell took about
# 20 minutes for 52,254 texts, so avoid re-running it needlessly.
embeddings = get_bert_embeddings(texts=texts)

  0%|          | 0/52254 [00:00<?, ?it/s]

100%|██████████| 52254/52254 [20:02<00:00, 43.44it/s]


In [10]:
# Hold out 25% of the embedded texts for evaluation; the fixed random_state
# keeps the split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels,
    test_size=0.25,
    random_state=42,
)

## train

In [11]:
# Build a linear-kernel SVM and fit it on the training embeddings in one step
# (fit() returns the estimator itself).
svm_classifier = SVC(kernel='linear').fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
svm_classifier

## evaluate

In [12]:
# Score the fitted SVM on the held-out split.
predictions = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)
# NOTE(review): a previously noted result was "Accuracy: 0.9659369259032455";
# it may differ from the output currently stored with this cell.

Accuracy: 0.9665492957746479


## upload

In [13]:
# Serialise the fitted SVM to a local joblib file; the call stays the cell's
# last expression so its return value is displayed.
dump(value=svm_classifier, filename='svm_classifier_category.joblib')

['svm_classifier_category.joblib']

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required before the upload at the end of this notebook — confirm.
notebook_login()

In [16]:
# Hub API client; used by the upload_file call later in this notebook.
api = HfApi()

In [22]:
# The local file name and the path inside the Hub repository are the same.
artifact_name = "svm_classifier_category.joblib"

# Upload the serialised classifier to the model repository.
api.upload_file(
    path_or_fileobj=artifact_name,
    path_in_repo=artifact_name,
    repo_id="K-Monitor/kmdb_classification_category",
    repo_type="model",
)

svm_classifier_category.joblib: 100%|██████████| 21.2M/21.2M [00:02<00:00, 7.65MB/s]


CommitInfo(commit_url='https://huggingface.co/K-Monitor/kmdb_classification_category/commit/77cbd0d43cf5ab0ee3d734a22bd8a6aca202de80', commit_message='Upload svm_classifier_category.joblib with huggingface_hub', commit_description='', oid='77cbd0d43cf5ab0ee3d734a22bd8a6aca202de80', pr_url=None, pr_revision=None, pr_num=None)