### Classification
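This notebook compares three approaches to binary sentiment classification on the Rotten Tomatoes dataset: a task-specific RoBERTa sentiment model, a conventional logistic-regression classifier trained on sentence embeddings, and a generative large language model (LLM), Flan-T5, prompted for an answer.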

In [None]:
import numpy as np
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm import tqdm
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset

In [None]:
# Load the Rotten Tomatoes dataset; later cells read the "text" and "label"
# columns of its "train" and "test" splits.
data = load_dataset("rotten_tomatoes")

In [None]:
# Task-specific sentiment model; the same hub id is used for the tokenizer.
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    # Keep the score of every label: the prediction cell reads the
    # per-label scores rather than a single top prediction.
    # NOTE(review): recent transformers releases deprecate this flag in
    # favour of `top_k=None`, which reportedly returns labels sorted by
    # score -- confirm against the installed version before switching.
    return_all_scores=True,
    # Prefer the first GPU, but fall back to the CPU so the cell still runs
    # on machines without CUDA.
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Run the sentiment pipeline over the test split and reduce each result to a
# binary label: 0 = negative, 1 = positive.
y_pred = []
outputs = pipe(KeyDataset(data["test"], "text"))
for output in tqdm(outputs, total=len(data["test"])):
    # Look scores up by label name instead of list position, so the result
    # does not depend on the order in which the pipeline returns the labels.
    # assumes the model's labels are "negative"/"neutral"/"positive"
    # (any casing) -- TODO confirm against the model config.
    scores = {entry["label"].lower(): entry["score"] for entry in output}
    # Only negative vs. positive is compared; any other label is ignored.
    assignment = np.argmax([scores["negative"], scores["positive"]])
    y_pred.append(assignment)

In [None]:
def evaluate_performance(y_true, y_pred):
    """Print a classification report for binary review predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    The report's classes are displayed as "Negative Review" and
    "Positive Review", in that order.
    """
    print(
        classification_report(
            y_true,
            y_pred,
            target_names=["Negative Review", "Positive Review"],
        )
    )


evaluate_performance(data["test"]["label"], y_pred)

#### Sentence Transformer

In [None]:
# Embed the review text of both splits with the same sentence-embedding
# model, training split first and test split second.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
train_embeddings, test_embeddings = (
    model.encode(data[split]["text"], show_progress_bar=True)
    for split in ("train", "test")
)

In [None]:
# Fit a logistic-regression classifier on the training-split embeddings;
# the fixed random_state keeps the fit reproducible.
train_labels = data["train"]["label"]
clf = LogisticRegression(random_state=42)
clf.fit(X=train_embeddings, y=train_labels)

In [None]:
# Classify the test-split embeddings and report the metrics against the
# test labels.
test_labels = data["test"]["label"]
y_pred = clf.predict(test_embeddings)
evaluate_performance(y_true=test_labels, y_pred=y_pred)

#### Google's Flan (LLM)

In [None]:
# Text-to-text generation pipeline for Flan-T5; this rebinds `pipe`,
# replacing the sentiment pipeline created earlier in the notebook.
pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    # Prefer the first GPU, but fall back to the CPU so the cell still runs
    # on machines without CUDA.
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)

In [None]:
# Instruction prepended to every review before it is sent to the model.
prompt = "Is the following sentence positive or negative? "


def add_prompt(example):
    """Return the "t5" column value: the prompt followed by the review text."""
    return {"t5": prompt + example["text"]}


# Add the prompted "t5" column to every split of the dataset.
data = data.map(add_prompt)

In [None]:
# Generate an answer for every prompted test review and map it to a binary
# label: 0 when the model answers "negative", 1 for anything else.
y_pred = []
outputs = pipe(KeyDataset(data["test"], "t5"))
for output in tqdm(outputs, total=len(data["test"])):
    # Normalise whitespace and casing, and match on the leading word, so
    # answers such as "Negative" or "negative." are not silently counted as
    # positive by an exact string comparison.
    text = output[0]["generated_text"].strip().lower()
    y_pred.append(0 if text.startswith("negative") else 1)

In [None]:
# Report the metrics of the generated answers against the test labels.
evaluate_performance(y_true=data["test"]["label"], y_pred=y_pred)