In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# 1. Data

In [2]:
# reading data - BERT
# Load the train/test BERT feature tables (the `*_pca.csv` files) with
# polars and convert each one to a pandas DataFrame.
train_bert, test_bert = (
    pl.read_csv(csv_path).to_pandas()
    for csv_path in (
        'output/bert/train_pca.csv',
        'output/bert/test_pca.csv',
    )
)
# Last expression: display both shapes as a quick sanity check.
(train_bert.shape, test_bert.shape)

((11413, 89), (4024, 89))

In [3]:
# reading data - TF-IDF
# Load the train/test TF-IDF feature tables (the `*_pca.csv` files) with
# polars and convert each one to a pandas DataFrame.
train_tfidf, test_tfidf = (
    pl.read_csv(csv_path).to_pandas()
    for csv_path in (
        'output/tf_idf/train_pca.csv',
        'output/tf_idf/test_pca.csv',
    )
)
# Last expression: display both shapes as a quick sanity check.
(train_tfidf.shape, test_tfidf.shape)

((11413, 1517), (4024, 1517))

In [4]:
# copy to save
# Start the results frame from an independent copy of the test labels; each
# model's predictions are appended to it as an extra column further down.
# NOTE(review): the labels are taken from `test_bert` only, so the TF-IDF
# predictions added later assume `test_tfidf` has the same rows in the same
# order -- TODO confirm.
test = test_bert.loc[:, ['doc_class']].copy()

# 2. SVM

## 2.1 BERT

In [5]:
# parameters
# Hyper-parameters for the SVM trained on the BERT features
# (presumably selected in a separate tuning step -- not shown here).
# Note: in scikit-learn's SVC, `gamma` is only used by the 'rbf', 'poly' and
# 'sigmoid' kernels, so with kernel='linear' it has no effect on the fit.
svm_bert_params = dict(C=0.1, gamma=0.1, kernel='linear')

In [6]:
# build model
# Instantiate the (not yet fitted) SVM for the BERT features from the
# keyword arguments collected in `svm_bert_params`.
svm_bert = SVC(**svm_bert_params)
# Bare last expression: shows the estimator's repr as the cell output.
svm_bert

SVC(C=0.1, gamma=0.1, kernel='linear')

In [7]:
%%time
# fitting
# Train the BERT SVM. Features are every column from position 2 onward; the
# labels come from the `doc_class` column.
# NOTE(review): the positional slice assumes the first two columns are
# non-feature columns (one of them presumably `doc_class`) -- TODO confirm
# against the CSV schema.
svm_bert.fit(
    train_bert.iloc[:, 2:].values,
    train_bert['doc_class'].values,
)

CPU times: user 2.2 s, sys: 16.6 ms, total: 2.22 s
Wall time: 2.22 s


SVC(C=0.1, gamma=0.1, kernel='linear')

In [8]:
# predictions
# Predict a class for every test row and store the result as a new column of
# `test`. The features are the columns from position 2 onward, the same
# slice that was used when fitting.
test['svm_bert'] = svm_bert.predict(test_bert.iloc[:, 2:].values)

## 2.2 TF-IDF

In [9]:
# parameters
# Hyper-parameters for the SVM trained on the TF-IDF features
# (presumably selected in a separate tuning step -- not shown here).
# Note: in scikit-learn's SVC, `gamma` is only used by the 'rbf', 'poly' and
# 'sigmoid' kernels, so with kernel='linear' it has no effect on the fit.
svm_tfidf_params = dict(C=10, gamma=0.1, kernel='linear')

In [10]:
# build model
# Instantiate the (not yet fitted) SVM for the TF-IDF features from the
# keyword arguments collected in `svm_tfidf_params`.
svm_tfidf = SVC(**svm_tfidf_params)
# Bare last expression: shows the estimator's repr as the cell output.
svm_tfidf

SVC(C=10, gamma=0.1, kernel='linear')

In [11]:
%%time
# fitting
# Train the TF-IDF SVM. Features are every column from position 2 onward;
# the labels come from the `doc_class` column.
# NOTE(review): the positional slice assumes the first two columns are
# non-feature columns (one of them presumably `doc_class`) -- TODO confirm
# against the CSV schema.
svm_tfidf.fit(
    train_tfidf.iloc[:, 2:].values,
    train_tfidf['doc_class'].values,
)

CPU times: user 1min 15s, sys: 143 ms, total: 1min 15s
Wall time: 1min 16s


SVC(C=10, gamma=0.1, kernel='linear')

In [12]:
# predictions
# Predict a class for every TF-IDF test row and store the result as a new
# column of `test`. The features are the columns from position 2 onward, the
# same slice that was used when fitting.
# NOTE(review): `test` was copied from `test_bert`, and the predicted array
# is assigned positionally; this assumes `test_tfidf` rows are in the same
# order as `test_bert` -- TODO confirm.
test['svm_tfidf'] = svm_tfidf.predict(test_tfidf.iloc[:, 2:].values)

# 3. KNN

## 3.1 BERT

In [13]:
# parameters
# Hyper-parameters for the k-NN classifier trained on the BERT features
# (presumably selected in a separate tuning step -- not shown here).
# `weights='uniform'` is also scikit-learn's default; it is spelled out here
# for explicitness.
knn_bert_params = dict(n_neighbors=11, weights='uniform', metric='euclidean')

In [14]:
# build model
# Instantiate the (not yet fitted) k-NN classifier for the BERT features
# from the keyword arguments collected in `knn_bert_params`.
knn_bert = KNeighborsClassifier(**knn_bert_params)
# Bare last expression: shows the estimator's repr as the cell output.
knn_bert

KNeighborsClassifier(metric='euclidean', n_neighbors=11)

In [15]:
# fitting
# Fit the BERT k-NN classifier. Features are every column from position 2
# onward; the labels come from the `doc_class` column.
# NOTE(review): the positional slice assumes the first two columns are
# non-feature columns (one of them presumably `doc_class`) -- TODO confirm
# against the CSV schema.
knn_bert.fit(
    train_bert.iloc[:, 2:].values,
    train_bert['doc_class'].values,
)

KNeighborsClassifier(metric='euclidean', n_neighbors=11)

In [16]:
%%time
# predictions
# Predict a class for every test row with the BERT k-NN model and store the
# result as a new column of `test`. The features are the columns from
# position 2 onward, the same slice that was used when fitting.
test['knn_bert'] = knn_bert.predict(test_bert.iloc[:, 2:].values)

CPU times: user 1.22 s, sys: 1.05 s, total: 2.27 s
Wall time: 779 ms


## 3.2 TF-IDF

In [17]:
# parameters
# Hyper-parameters for the k-NN classifier trained on the TF-IDF features
# (presumably selected in a separate tuning step -- not shown here).
# `weights='uniform'` is also scikit-learn's default; it is spelled out here
# for explicitness.
knn_tfidf_params = dict(n_neighbors=14, weights='uniform', metric='cosine')

In [18]:
# build model
# Instantiate the (not yet fitted) k-NN classifier for the TF-IDF features
# from the keyword arguments collected in `knn_tfidf_params`.
knn_tfidf = KNeighborsClassifier(**knn_tfidf_params)
# Bare last expression: shows the estimator's repr as the cell output.
knn_tfidf

KNeighborsClassifier(metric='cosine', n_neighbors=14)

In [19]:
# fitting
# Fit the TF-IDF k-NN classifier. Features are every column from position 2
# onward; the labels come from the `doc_class` column.
# NOTE(review): the positional slice assumes the first two columns are
# non-feature columns (one of them presumably `doc_class`) -- TODO confirm
# against the CSV schema.
knn_tfidf.fit(
    train_tfidf.iloc[:, 2:].values,
    train_tfidf['doc_class'].values,
)

KNeighborsClassifier(metric='cosine', n_neighbors=14)

In [20]:
%%time
# predictions
# Predict a class for every TF-IDF test row with the k-NN model and store
# the result as a new column of `test`. The features are the columns from
# position 2 onward, the same slice that was used when fitting.
# NOTE(review): `test` was copied from `test_bert`, and the predicted array
# is assigned positionally; this assumes `test_tfidf` rows are in the same
# order as `test_bert` -- TODO confirm.
test['knn_tfidf'] = knn_tfidf.predict(test_tfidf.iloc[:, 2:].values)

CPU times: user 5.8 s, sys: 2.66 s, total: 8.46 s
Wall time: 1.45 s


In [22]:
# saving
# Write the true labels plus one prediction column per model to CSV.
# `DataFrame.to_csv` does not create missing parent directories, so make
# sure the destination folder exists first; otherwise a fresh checkout would
# fail here and discard the predictions computed above.
results_path = Path('output/results/results.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file.
test.to_csv(results_path, index=False)