## Loading datasets

In [None]:
!pip install datasets

In [24]:
from datasets import load_dataset

In [25]:
data = load_dataset("rotten_tomatoes")

In [26]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [5]:
data["train"][0, -1]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'things really get weird , though not particularly scary : the movie is all portent and no content .'],
 'label': [1, 0]}

# Text Classification Using a Task-Specific Model

In [27]:
from transformers import pipeline

In [7]:
model_path = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

In [8]:
pipe = pipeline(
    model=model_path,
    tokenizer=model_path,
    return_all_scores=True
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [None]:
!pip install --upgrade transformers

In [11]:
import numpy as np
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

In [12]:
y_pred = []

for output in tqdm(pipe(KeyDataset(data['test'], 'text')),
                   total=len(data['test'])):
                   negative_score = output[0]['score']
                   positive_score = output[2]['score']
                   assignment = np.argmax([negative_score, positive_score])
                   y_pred.append(assignment)


100%|██████████| 1066/1066 [03:49<00:00,  4.64it/s]


In [13]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred):
  performance = classification_report(
      y_true, y_pred,
      target_names=['Negative Review', 'Positive Review']
  )
  print(performance)

In [14]:
evaluate_performance(data['test']['label'], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.76      0.88      0.81       533
Positive Review       0.86      0.72      0.78       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066



## Embedding model + Classifier For Classification Task

In [15]:
from sentence_transformers import SentenceTransformer

In [16]:
model= SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

train_embeddings = model.encode(data['train']['text'],
                                show_progress_bar=True)

test_embeddings = model.encode(data['test']['text'],
                               show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/267 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

In [17]:
train_embeddings.shape

(8530, 768)

In [18]:
from sklearn.linear_model import LogisticRegression


clf = LogisticRegression(random_state=42)
clf.fit(train_embeddings, data['train']['label'])

In [19]:
y_pred = clf.predict(test_embeddings)
evaluate_performance(data['test']['label'], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



## What if we didn't have labels?

In [20]:
label_embeddings = model.encode(['A negative review', 'A positive review'])

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

sim_matrix = cosine_similarity(test_embeddings, label_embeddings)
y_pred = np.argmax(sim_matrix, axis=1)

In [22]:
evaluate_performance(data['test']['label'], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.78      0.77      0.78       533
Positive Review       0.77      0.79      0.78       533

       accuracy                           0.78      1066
      macro avg       0.78      0.78      0.78      1066
   weighted avg       0.78      0.78      0.78      1066



This shows how important the embeddings are !!

# Text Classification With Generative Model

In [29]:
# using flan-t5-small

pipe = pipeline(
    'text2text-generation',
    model='google/flan-t5-small'
)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu


In [30]:
prompt = 'Is the following sentence positive or negative ?'
data = data.map(lambda example : {'t5': prompt + example['text']})
data

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

In [31]:
data['train']['t5'][0]

'Is the following sentence positive or negative ?the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'

In [32]:
y_pred = []

for output in tqdm(pipe(KeyDataset(data['test'], 't5')),
                   total=len(data['test'])):
                   text=output[0]['generated_text']
                   y_pred.append(0 if text == 'negative' else 1)

100%|██████████| 1066/1066 [02:14<00:00,  7.95it/s]


In [33]:
evaluate_performance(data['test']['label'], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.82      0.87      0.84       533
Positive Review       0.86      0.80      0.83       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066



This shows how a generative model can be used for required purposes with the help of prompt.