<a href="https://colab.research.google.com/github/kavyaaaa16/Sentiment_analysis/blob/main/electra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers --quiet


In [25]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import get_scheduler
from torch.optim import AdamW


from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [3]:
book_set=pd.read_csv('/content/labeled_book_summaries_roberta.csv')

In [4]:
book_set.isnull().sum()

Unnamed: 0,0
Unnamed: 0.1,0
Unnamed: 0,0
asin,0
helpful,0
rating,0
reviewText,0
reviewTime,0
reviewerID,0
reviewerName,38
summary,0


In [6]:
book_set=book_set.drop(['Unnamed: 0.1','Unnamed: 0','asin', 'helpful','rating','reviewTime','reviewerID','reviewerName','summary','unixReviewTime'], axis='columns')

In [5]:
book_set.rename(columns={'reviewText':'review', 'sentiment':'label'}, inplace=True)

In [7]:
book_set

Unnamed: 0,review,label
0,"Jace Rankin may be short, but he's nothing to ...",Neutral
1,Great short read. I didn't want to put it dow...,Positive
2,I'll start by saying this is the first of four...,Positive
3,Aggie is Angela Lansbury who carries pocketboo...,Positive
4,I did not expect this type of book to be in li...,Positive
...,...,...
11993,Valentine cupid is a vampire- Jena and Ian ano...,Positive
11994,I have read all seven books in this series. Ap...,Positive
11995,This book really just wasn't my cuppa. The si...,Negative
11996,"tried to use it to charge my kindle, it didn't...",Negative


In [8]:
pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [9]:
import re
import emoji

def cleaning(text):
  text=text.lower()
  text=emoji.replace_emoji(text, replace='')
  text=re.sub(r"http\S+", "", text)
  text= re.sub(r"[^a-zA-Z0-9\s]", "", text)
  text=re.sub(r"(.)\1{2,}", r"\1", text)
  text=re.sub(r"\s+", " ", text).strip()
  return text

book_set['review']=book_set['review'].astype(str).apply(cleaning)

In [10]:
valid_labels=['Positive', 'Negative', 'Neutral']
book_set=book_set[book_set['label'].isin(valid_labels)]
book_set=book_set.reset_index(drop=True)

In [11]:
book_set['label'] = book_set['label'].str.lower()


In [12]:
book_set.shape

(11998, 2)

In [13]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

topword_list=stopwords.words('english') #will give a list of stopwords
stop=set(stopwords.words('english'))  #we convert it to set for faster checking
print(stop)

{'few', "we'd", 'after', 'ma', "wouldn't", 'was', 're', "mightn't", 'i', 'out', 'which', 'doing', 'below', "weren't", "you'd", 'an', 'then', "won't", 'who', 'shouldn', "should've", 'hasn', 'your', 'my', 'at', 'been', 'when', 'of', 'from', 'do', "shan't", "hasn't", "needn't", "that'll", 'up', "it'd", "i've", 'himself', "don't", 'shan', 'don', "doesn't", 'ain', 'for', 'm', 'most', 'their', 'through', 'll', 'because', 'off', 'those', 'all', 'will', 'won', "she'll", 't', 'mightn', 'theirs', 'any', 'with', 'mustn', 'had', 'were', 'and', "he'll", 'just', 'down', 'too', 'above', 'some', "they're", 'be', 'this', 'you', 'myself', 'd', "we're", 'where', "didn't", 'as', 'by', 'y', 'to', 'more', 'me', 'such', 'only', 'here', 'him', 'am', 'both', 'very', "you're", 'its', 'has', 'did', 'in', 'against', "we'll", 'what', "it's", 'being', 'have', "aren't", "it'll", 'isn', 'further', "they'd", 'her', 'once', 'themselves', 'now', 'into', 'does', 'aren', 'during', "they'll", 'can', 'we', "you've", 'hers',

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
def remove_stopwords(text):
  words=text.split()
  filtered=[w for w in words if w not in stop]
  return ' '.join(filtered)

book_set['review']=book_set['review'].apply(remove_stopwords)

In [15]:
test_df=pd.read_csv('/content/books_test.csv')

In [16]:
test_df=test_df.drop(['title','author', 'rating','rating_count', 'genre'], axis='columns')


In [None]:
test_df

Unnamed: 0,review,label
0,ive been reading a lot of romance lately thank...,positive
1,this book shows us how to become supercommunic...,positive
2,tldr empire of ai is not the definitive chroni...,negative
3,note from 4172023 so someone reported this for...,negative
4,this is a lovely lovely book of poems nearly e...,positive
...,...,...
194,i rarely write reviews but this book feels lik...,negative
195,i keep switching the rating of this book from ...,positive
196,i absolutely loved rereading pet sematary the ...,positive
197,rating all the stars in the universe i needed ...,positive


In [17]:
test_df['review']=test_df['review'].apply(remove_stopwords)
test_df

Unnamed: 0,review,label
0,ive reading lot romance lately thanks arcs kno...,positive
1,book shows us become supercommunicators superc...,positive
2,tldr empire ai definitive chronicle ai revolut...,negative
3,note 4172023 someone reported unmarked spoiler...,negative
4,lovely lovely book poems nearly every poem end...,positive
...,...,...
194,rarely write reviews book feels like personal ...,negative
195,keep switching rating book 5 4 5 changing opin...,positive
196,absolutely loved rereading pet sematary experi...,positive
197,rating stars universe needed dead agreedyou ne...,positive


In [18]:
# Fix label typos and standardize to lowercase
def clean_label(label):
    label = label.lower()
    if label == 'neagtive':
        return 'negative'
    return label

book_set['label'] = book_set['label'].apply(clean_label)
test_df['label'] = test_df['label'].apply(clean_label)


In [19]:
valid_labels = ['positive', 'negative', 'neutral']
book_set = book_set[book_set['label'].isin(valid_labels)].reset_index(drop=True)
test_df = test_df[test_df['label'].isin(valid_labels)].reset_index(drop=True)

In [20]:
import torch
from transformers import pipeline, ElectraTokenizer, ElectraForSequenceClassification

# 1. Create a pipeline for emotion classification with ELECTRA
classifier = pipeline(
    task="text-classification",
    model="bhadresh-savani/electra-base-emotion",
    torch_dtype=torch.float16,  # use float16 for faster inference if supported
    device=0 if torch.cuda.is_available() else -1  # use GPU if available else CPU
)

# Example inference with the pipeline
result = classifier("This restaurant has amazing food!")
print("Pipeline output:", result)


# 2. Manual tokenization and model inference example

# Load tokenizer and model separately
tokenizer = ElectraTokenizer.from_pretrained("bhadresh-savani/electra-base-emotion")
model = ElectraForSequenceClassification.from_pretrained("bhadresh-savani/electra-base-emotion")

# Sample texts of varying length
texts = [
    "Short text",
    "This is a much longer text that needs padding"
]

# Tokenize with padding and attention masks
inputs = tokenizer(texts, padding=True, return_tensors="pt", truncation=True, max_length=128)

# Forward pass through the model
outputs = model(**inputs)  # model will use attention_mask automatically

# The outputs include logits, use softmax to get probabilities
probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
print("Manual model output probabilities:")
print(probs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/336 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


Pipeline output: [{'label': 'joy', 'score': 0.6666125059127808}]
Manual model output probabilities:
tensor([[0.2741, 0.0489, 0.0343, 0.3409, 0.2461, 0.0557],
        [0.4324, 0.0636, 0.0305, 0.3098, 0.1229, 0.0407]],
       grad_fn=<SoftmaxBackward0>)


In [21]:
import torch
from transformers import pipeline

classifier = pipeline(
    task="text-classification",
    model="bhadresh-savani/electra-base-emotion",
    torch_dtype=torch.float16,
    device=0
)
classifier("This restaurant has amazing food!")

Device set to use cpu


[{'label': 'joy', 'score': 0.6666125059127808}]

In [22]:
# Example of properly handling padding with attention masks
inputs = tokenizer(["Short text", "This is a much longer text that needs padding"],
                padding=True,
                return_tensors="pt")
outputs = model(**inputs)  # automatically uses the attention_mask

In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [27]:
import torch


In [34]:
from transformers import Trainer, TrainingArguments
from datasets import load_metric


# 2. Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    book_set['review'].tolist(),
    book_set['label'].tolist(),
    test_size=0.2,
    stratify=book_set['label'],
    random_state=42
)

# Map labels from strings to ints
label2id = {"positive": 0, "neutral": 1, "negative": 2}

train_labels = [label2id[label] for label in train_labels]
val_labels = [label2id[label] for label in val_labels]

# Now create datasets as usual

# 3. Load tokenizer and model
from transformers import ElectraForSequenceClassification

model_name = "bhadresh-savani/electra-base-emotion"

# Load tokenizer and model once, with ignore_mismatched_sizes=True for new classification head
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    ignore_mismatched_sizes=True
)

# 4. Tokenize function
def tokenize(batch):
    return tokenizer(batch, padding='max_length', truncation=True, max_length=128)

train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=128)
val_encodings = tokenizer(val_texts, padding=True, truncation=True, max_length=128)

# 5. Create Dataset class
class BookReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = BookReviewDataset(train_encodings, train_labels)
val_dataset = BookReviewDataset(val_encodings, val_labels)

# 6. Define metrics (accuracy here, but you can add others)
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# 7. Training arguments
training_args = TrainingArguments(
    output_dir='./electra-bookreview-results',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2
)


# 8. Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 9. Train the model
trainer.train()

# 10. Evaluate the model on validation set
eval_results = trainer.evaluate()
print(f"Validation accuracy: {eval_results['eval_accuracy']:.4f}")

# 11. Save the fine-tuned model and tokenizer
model.save_pretrained('./electra-bookreview-model')
tokenizer.save_pretrained('./electra-bookreview-tokenizer')


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at bhadresh-savani/electra-base-emotion and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([6, 256]) in the checkpoint and torch.Size([3, 256]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
50,1.0577
100,0.9885
150,0.9636
200,0.8986
250,0.8542
300,0.8079
350,0.7612
400,0.7479
450,0.6932
500,0.7009


NameError: name 'np' is not defined

In [36]:
import numpy as np


In [37]:
eval_results = trainer.evaluate()
print(f"Validation accuracy: {eval_results['eval_accuracy']:.4f}")


Validation accuracy: 0.7854


In [119]:
# If you're using a Trainer or custom training loop
model.save_pretrained("electra-sentiment-model")
tokenizer.save_pretrained("electra-sentiment-model")


('electra-sentiment-model/tokenizer_config.json',
 'electra-sentiment-model/special_tokens_map.json',
 'electra-sentiment-model/vocab.txt',
 'electra-sentiment-model/added_tokens.json')

In [90]:
import pandas as pd
df = pd.read_csv('/content/labeled_book_summaries_roberta.csv')
print(df.columns)


Index(['Unnamed: 0.1', 'Unnamed: 0', 'asin', 'helpful', 'rating', 'reviewText',
       'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime',
       'sentiment'],
      dtype='object')
