In [None]:
#install packages
!pip install transformers accelerate datasets tqdm openai

In [2]:
#import packages
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.metrics import classification_report
from datasets import load_dataset
import numpy as np
from tqdm import tqdm

# **The sentiment of movie reviews.**

In [None]:
# Fetch the "rotten_tomatoes" dataset through `datasets.load_dataset`
data = load_dataset(path="rotten_tomatoes")

## Representation models

- Task-specific model

In [None]:
#model selection
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"

#build the model
# top_k=None asks the pipeline for the score of every label; it replaces the
# deprecated return_all_scores=True flag.
generator = pipeline(model=model_path, tokenizer=model_path, top_k=None, device="cuda:0")

#y_predicted data
y_pred = []

#predict
for output in tqdm(generator(KeyDataset(data['test'], 'text')), total=len(data['test'])):
  # Look the scores up by label name instead of list position: with top_k=None
  # the entries are ordered by score, so positions are not stable.
  scores = {item['label'].lower(): item['score'] for item in output}
  # The neutral score is ignored; index 0 = negative, index 1 = positive.
  assignment = np.argmax([scores['negative'], scores['positive']])
  y_pred.append(assignment)

In [5]:
#evaluation function
def evaluation(y_true, y_pred):
  """Compare predicted labels with the true labels.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels, aligned element-wise with y_true.

  Returns:
    The value produced by classification_report(y_true, y_pred).
  """
  return classification_report(y_true, y_pred)

In [None]:
# Score the task-specific model's predictions against the test labels
report = evaluation(y_true=data['test']['label'], y_pred=y_pred)
print(report)

- Embeddings model

In [None]:
!pip install sentence_transformers

In [30]:
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

In [None]:
# Sentence-embedding model used to turn each review into a vector
emb_model_path = "sentence-transformers/all-mpnet-base-v2"
emb_model = SentenceTransformer(emb_model_path)

# Encode the train split first, then the test split, each with a progress bar
train_embadings, test_embadings = (
    emb_model.encode(data[split]['text'], show_progress_bar=True)
    for split in ('train', 'test')
)

In [None]:
# Build a default LogisticRegression and fit it on the train embeddings
# (fit returns the fitted estimator, so construction and fitting are chained)
clf_model = LogisticRegression().fit(train_embadings, data['train']['label'])

# Predict a label for every test embedding
clf_y_pred = clf_model.predict(test_embadings)

# Report how the classifier's predictions compare with the test labels
report = evaluation(y_true=data['test']['label'], y_pred=clf_y_pred)
print(report)

- What if we do not have labeled data?

In [33]:
# One short description per class: row 0 is the negative text, row 1 the positive text
label_embadings = emb_model.encode(
    [
        "A very negative movie review",
        "A positive movie review",
    ]
)

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

In [35]:
# Similarity of every test embedding (rows) to each label embedding (columns)
sim_matrix = cosine_similarity(test_embadings, label_embadings)
# Pick, per row, the column index with the highest similarity
cos_y_pred = sim_matrix.argmax(axis=1)

In [None]:
# Score the similarity-based predictions against the test labels
report = evaluation(y_true=data['test']['label'], y_pred=cos_y_pred)
print(report)

## Generative models

- Flan-T5


In [None]:
# Text-to-text generation pipeline for google/flan-t5-small on device "cuda:0"
pipe = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
    device="cuda:0",
)

In [None]:
# Prefix every example's text with the instruction and store it under "t5"
prompt = "Is the following sentence positive or negative? "
data = data.map(lambda row: {"t5": f"{prompt}{row['text']}"})
data

In [None]:
# Turn each generated answer into an integer label (0 = negative, 1 = positive)
y_pred = []
for output in tqdm(pipe(KeyDataset(data["test"], "t5")),total=len(data["test"])):
  # Normalise case and surrounding whitespace so that "Negative" or "negative "
  # is not silently counted as positive.
  text = output[0]["generated_text"].strip().lower()
  # Any answer other than "negative" falls back to the positive class.
  y_pred.append(0 if text == "negative" else 1)

In [None]:
# Print the report: a bare expression would display the string's repr, with
# literal "\n" escapes instead of line breaks.
report = evaluation(data["test"]["label"], y_pred)
print(report)

- GPT-3.5-turbo

In [7]:
import openai

In [8]:
# Keep the API key out of the notebook: when no api_key argument is given, the
# client reads it from the OPENAI_API_KEY environment variable.
client = openai.OpenAI()

In [20]:
def chatgpt_generation(prompt, document, model="gpt-3.5-turbo-0125"):
  """Send one document to an OpenAI chat model and return its text reply.

  Args:
    prompt: Template text; every "[DOCUMENT]" occurrence is replaced by `document`.
    document: Text inserted into the template.
    model: Name of the chat model passed to the API.

  Returns:
    The first choice's message text with surrounding whitespace removed, or an
    empty string when the response carries no text content.
  """
  messages = [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": prompt.replace("[DOCUMENT]", document)},
  ]
  chat_completion = client.chat.completions.create(messages=messages,
                                                   model=model,
                                                   temperature=0)

  # The content may be None, and a stray newline or space around the answer
  # would defeat length/int based parsing by the caller.
  content = chat_completion.choices[0].message.content
  return (content or "").strip()

In [22]:
# Classification prompt template; "[DOCUMENT]" is the placeholder for the text
# to classify. The value is the same five-line string ending in a newline.
prompt = (
    "Predict whether the following document is a positive\n"
    "or negative movie review:\n"
    "[DOCUMENT]\n"
    "If it is positive return 1 and if it is negative return 0. Do not\n"
    "give any other answers.\n"
)

In [None]:
# Ask the chat model about every test review (one request per review)
predictions = [chatgpt_generation(prompt, doc) for doc in tqdm(data["test"]["text"])]

y_pred = []

for output in predictions:
  # Tolerate a missing reply and whitespace/newlines around the digit.
  answer = (output or "").strip()
  # The prompt asks for a bare "0" or "1". A reply that starts with "0" is
  # negative; anything else (including empty or wordy replies) counts as
  # positive instead of raising in int().
  y_pred.append(0 if answer.startswith("0") else 1)

In [None]:
# Print the report: a bare expression would display the string's repr, with
# literal "\n" escapes instead of line breaks.
report = evaluation(data["test"]["label"], y_pred)
print(report)