# Illya-BOICHUK-Camp-2025

## Answer

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

def _read_lines(path):
    """Return the text lines of *path*, decoded as UTF-8 with undecodable bytes dropped."""
    # NOTE(review): errors="ignore" discards bytes that are not valid UTF-8
    # without any warning -- confirm the files' actual encoding.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read().splitlines()


# Each line of a file is treated as one sample; the ".neg" file gets label 0
# and the ".pos" file gets label 1.
texts_neg = _read_lines("data/rt-polarity.neg")
texts_pos = _read_lines("data/rt-polarity.pos")

all_texts = texts_neg + texts_pos
all_labels = [0] * len(texts_neg) + [1] * len(texts_pos)
data = pd.DataFrame({"text": all_texts, "label": all_labels})

# Shuffle the rows reproducibly and renumber the index from 0.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# 80/20 train/test split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
    stratify=data["label"],
    random_state=42,
)


### TF-IDF + Logistic Regression

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Unigram + bigram TF-IDF features, with the vocabulary capped at 15000 entries.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=15000)
# The vectorizer is fitted on the training split only; the test split is
# merely transformed with the already-fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

clf_lr = LogisticRegression(max_iter=1000)
clf_lr.fit(X_train_tfidf, y_train)
y_pred_lr = clf_lr.predict(X_test_tfidf)

# Score once on the full test split, then report with four decimals.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_precision = precision_score(y_test, y_pred_lr)
lr_recall = recall_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)

print("TF-IDF + Logistic Regression")
print(f"Accuracy : {lr_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall   : {lr_recall:.4f}")
print(f"F1-score : {lr_f1:.4f}")

TF-IDF + Logistic Regression
Accuracy : 0.7595
Precision: 0.7616
Recall   : 0.7552
F1-score : 0.7584


### CountVectorizer + MultinomialNB

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Raw token counts with CountVectorizer's default settings.
count_vec = CountVectorizer()
X_train_count = count_vec.fit_transform(X_train)  # vocabulary comes from the training split
X_test_count = count_vec.transform(X_test)

clf_nb = MultinomialNB()
clf_nb.fit(X_train_count, y_train)
y_pred_nb = clf_nb.predict(X_test_count)

# Evaluate all four metrics on the full test split in one pass.
nb_accuracy, nb_precision, nb_recall, nb_f1 = [
    scorer(y_test, y_pred_nb)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
]

print("CountVectorizer + MultinomialNB")
print(f"Accuracy : {nb_accuracy:.4f}")
print(f"Precision: {nb_precision:.4f}")
print(f"Recall   : {nb_recall:.4f}")
print(f"F1-score : {nb_f1:.4f}")

CountVectorizer + MultinomialNB
Accuracy : 0.7712
Precision: 0.7867
Recall   : 0.7439
F1-score : 0.7647


### BERT-based classifier (DistilBERT)

In [12]:
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Off-the-shelf sentiment pipeline; no training happens in this notebook.
# NOTE(review): the checkpoint name says it was fine-tuned on SST-2, which may
# share source sentences with this corpus -- confirm there is no overlap with
# the test split before treating the score as out-of-sample.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Only the first 200 rows of the test split are scored here, so these metrics
# are not computed on the same sample as the full-test-set models above.
X_test_bert = X_test.iloc[:200]
y_test_bert = y_test.iloc[:200]

# Map the pipeline's string label onto the 0/1 encoding used by y_test.
bert_outputs = classifier(X_test_bert.tolist())
bert_preds = [int(output['label'] == 'POSITIVE') for output in bert_outputs]

bert_accuracy = accuracy_score(y_test_bert, bert_preds)
bert_precision = precision_score(y_test_bert, bert_preds)
bert_recall = recall_score(y_test_bert, bert_preds)
bert_f1 = f1_score(y_test_bert, bert_preds)

print("BERT-based classifier (DistilBERT):")
print("Accuracy:", bert_accuracy)
print("Precision:", bert_precision)
print("Recall:", bert_recall)
print("F1-score:", bert_f1)


Device set to use cpu


BERT-based classifier (DistilBERT):
Accuracy: 0.895
Precision: 0.9130434782608695
Recall: 0.865979381443299
F1-score: 0.8888888888888888


### SpaCy Word Embeddings + Logistic Regression

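To avoid conflicts between the libraries, I ran this model in a separate environment. For that reason the cell below reloads and re-splits the data, and its metrics are entered by hand in the comparison table.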

In [None]:
import spacy
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def _read_lines(path):
    """Return the text lines of *path*, decoded as UTF-8 with undecodable bytes dropped."""
    # NOTE(review): errors="ignore" discards bytes that are not valid UTF-8
    # without any warning -- confirm the files' actual encoding.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read().splitlines()


# Same loading and splitting recipe (paths, seeds, split ratio) as the first
# cell, repeated because this section runs in its own environment.
texts_neg = _read_lines("data/rt-polarity.neg")
texts_pos = _read_lines("data/rt-polarity.pos")

all_texts = texts_neg + texts_pos
all_labels = [0] * len(texts_neg) + [1] * len(texts_pos)
data = pd.DataFrame({"text": all_texts, "label": all_labels})

# Shuffle the rows reproducibly and renumber the index from 0.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# 80/20 train/test split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["label"],
    test_size=0.2,
    stratify=data["label"],
    random_state=42,
)

nlp = spacy.load("en_core_web_md")


def get_doc_vector(text):
    """Run *text* through the loaded spaCy pipeline and return the document's vector.

    NOTE(review): per the spaCy docs, ``Doc.vector`` defaults to the plain
    average of the token vectors -- confirm for the installed model.
    """
    doc = nlp(text)
    return doc.vector
# Work on a subset: the first 1000 training rows and the first 200 test rows.
X_train_small, y_train_small = X_train.iloc[:1000], y_train.iloc[:1000]
X_test_small, y_test_small = X_test.iloc[:200], y_test.iloc[:200]

# One document vector per text, stacked row-wise into a 2-D feature matrix.
X_train_embed = np.vstack([get_doc_vector(text) for text in X_train_small])
X_test_embed = np.vstack([get_doc_vector(text) for text in X_test_small])

clf_embed = LogisticRegression(max_iter=1000)
clf_embed.fit(X_train_embed, y_train_small)
y_pred_embed = clf_embed.predict(X_test_embed)

# Metrics are computed on the 200-row test subset only.
embed_accuracy = accuracy_score(y_test_small, y_pred_embed)
embed_precision = precision_score(y_test_small, y_pred_embed)
embed_recall = recall_score(y_test_small, y_pred_embed)
embed_f1 = f1_score(y_test_small, y_pred_embed)

print("SpaCy Word Embeddings + Logistic Regression")
print(f"Accuracy : {embed_accuracy:.4f}")
print(f"Precision: {embed_precision:.4f}")
print(f"Recall   : {embed_recall:.4f}")
print(f"F1-score : {embed_f1:.4f}")


SpaCy Word Embeddings + Logistic Regression
Accuracy : 0.6650
Precision: 0.6596
Recall   : 0.6392
F1-score : 0.6492


### Transformer (BART) Zero-shot Classification

In [13]:
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Zero-shot pipeline; the notebook output for this cell reports it running on CPU.
classifier_bart = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=-1,
)
labels = ["positive", "negative"]

# Only the first 20 rows of the test split are scored, so the metrics below
# rest on a very small sample.
X_test_bart = X_test.iloc[:20]
y_test_bart = y_test.iloc[:20].values

# The pipeline is called once per text. The first entry of the returned
# "labels" list is taken as the prediction (presumably the top-ranked
# candidate -- see the transformers docs) and mapped to the 0/1 encoding.
y_pred_bart = [
    int(classifier_bart(text, candidate_labels=labels)["labels"][0] == "positive")
    for text in X_test_bart
]

bart_accuracy = accuracy_score(y_test_bart, y_pred_bart)
bart_precision = precision_score(y_test_bart, y_pred_bart)
bart_recall = recall_score(y_test_bart, y_pred_bart)
bart_f1 = f1_score(y_test_bart, y_pred_bart)

print("Transformer (BART) Zero-shot Classification")
print(f"Accuracy : {bart_accuracy:.4f}")
print(f"Precision: {bart_precision:.4f}")
print(f"Recall   : {bart_recall:.4f}")
print(f"F1-score : {bart_f1:.4f}")


Device set to use cpu


Transformer (BART) Zero-shot Classification
Accuracy : 0.9000
Precision: 0.8571
Recall   : 0.8571
F1-score : 0.8571


### Evaluation of models

In [14]:
# Metric columns of the comparison table, in display order.
metric_scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-score": f1_score,
}

# The spaCy model was run in a separate environment, so its predictions are
# not available here; the values printed in that section are entered by hand.
spacy_reported = {
    "Accuracy": 0.6650,
    "Precision": 0.6596,
    "Recall": 0.6392,
    "F1-score": 0.6492,
}

# (model name, (y_true, y_pred)); None marks the row filled from spacy_reported.
# Note that the rows are scored on different samples: the full test split,
# the 200-row subset, and the 20-row subset.
model_runs = [
    ("TF-IDF + LR", (y_test, y_pred_lr)),
    ("CountVectorizer + NB", (y_test, y_pred_nb)),
    ("BERT Sentiment (DistilBERT)", (y_test_bert, bert_preds)),
    ("spaCy Embeddings + LR", None),
    ("Zero-shot BART", (y_test_bart, y_pred_bart)),
]

table_columns = {"Model": [model_name for model_name, _ in model_runs]}
for metric_name, scorer in metric_scorers.items():
    table_columns[metric_name] = [
        spacy_reported[metric_name] if truth_and_pred is None else scorer(*truth_and_pred)
        for _, truth_and_pred in model_runs
    ]

results = pd.DataFrame(table_columns)
print(results.round(4))

                         Model  Accuracy  Precision  Recall  F1-score
0                  TF-IDF + LR    0.7595     0.7616  0.7552    0.7584
1         CountVectorizer + NB    0.7712     0.7867  0.7439    0.7647
2  BERT Sentiment (DistilBERT)    0.8950     0.9130  0.8660    0.8889
3        spaCy Embeddings + LR    0.6650     0.6596  0.6392    0.6492
4               Zero-shot BART    0.9000     0.8571  0.8571    0.8571


The Zero-shot BART model showed the highest accuracy of 0.9000, with precision, recall and F1-score all equal to 0.8571. This suggests that it can be effective even without any training on this specific dataset, although it was evaluated on only the first 20 test examples, so the estimate is very noisy. The BERT Sentiment model (DistilBERT) also showed very good results on its 200-example test subset (accuracy 0.8950, F1-score 0.8889), which is consistent with the strength of transformer-based approaches in natural language processing tasks.

Among the traditional models, CountVectorizer + Naive Bayes was the best performer, achieving an accuracy of 0.7712 and an F1-score of 0.7647. It outperformed TF-IDF + Logistic Regression, which reached an accuracy of 0.7595 and an F1-score of 0.7584. This may indicate that, for this dataset, simple word-frequency counting is slightly more effective than the weighted TF-IDF transformation, although the two pipelines also differ in the classifier used, so the gap cannot be attributed to the vectorizer alone.

The spaCy Embeddings + Logistic Regression model showed the worst results, with an accuracy of 0.6650 and an F1-score of 0.6492. This may be due to the limited ability of averaged word embeddings to capture complex contextual information compared to transformers, and also to the fact that this model was trained on only the first 1,000 training examples and evaluated on 200 test examples.