In [1]:
# Cell 1: Install necessary libraries
!pip install torch torchvision numpy pandas transformers




In [14]:
# Cell 2: Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Load your dataset
df = pd.read_csv(r"D:/mepco/sem_5/Big Data/tcpc/Dataset/extract TV data in Section IV-A/allTV_review_2010.csv")
df = df[['Text', 'Star']]  # We need only text and stars for this task
df.dropna()
df = df.head(1000)


# Preprocess: Map the Star ratings to 0 (negative), 1 (neutral), 2 (positive)
def label_map(star):
    if star <= 2:
        return 0  # Negative
    elif star == 3:
        return 1  # Neutral
    else:
        return 2  # Positive

df['label'] = df['Star'].apply(label_map)
df[['Text', 'Star', 'label']].head()


Unnamed: 0,Text,Star,label
0,"Order this TV without researching it too much,...",3,1
1,I bought this TV 3 weeks ago and I am very hap...,4,2
2,I did a lot of looking around before I made th...,5,2
3,I had been eyeing the LED market for a year in...,5,2
4,When I first looked into buying a new tv i loo...,4,2


In [5]:
df.describe()

Unnamed: 0,Star,label
count,1000.0,1000.0
mean,2.762,0.881
std,1.764219,0.964245
min,1.0,0.0
25%,1.0,0.0
50%,2.0,0.0
75%,5.0,2.0
max,5.0,2.0


In [15]:
# Cell 3: Define Dataset Class
class SentimentDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df['Text'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return input_ids, attention_mask, label


In [5]:
# Cell 4: Initialize tokenizer and create dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = SentimentDataset(df, tokenizer)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset, batch_size=16, shuffle=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')




BERT Model

In [6]:
# Cell 5: Define BERT model
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
bert_model.to(device)
optimizer = optim.AdamW(bert_model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# Cell 6: Training Loop for BERT
epochs = 1
for epoch in range(epochs):
    bert_model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"BERT Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


: 

TextCNN Model

In [5]:
# Cell 7: Define TextCNN model
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv2d(1, 100, (3, embed_dim))
        self.conv2 = nn.Conv2d(1, 100, (4, embed_dim))
        self.conv3 = nn.Conv2d(1, 100, (5, embed_dim))
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(300, num_classes)

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids).unsqueeze(1)  # Add channel dimension
        x1 = nn.functional.relu(self.conv1(x)).squeeze(3)
        x1 = nn.functional.max_pool1d(x1, x1.size(2)).squeeze(2)
        x2 = nn.functional.relu(self.conv2(x)).squeeze(3)
        x2 = nn.functional.max_pool1d(x2, x2.size(2)).squeeze(2)
        x3 = nn.functional.relu(self.conv3(x)).squeeze(3)
        x3 = nn.functional.max_pool1d(x3, x3.size(2)).squeeze(2)
        x = torch.cat((x1, x2, x3), 1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

# Cell 8: Initialize TextCNN model
text_cnn_model = TextCNN(vocab_size=tokenizer.vocab_size, embed_dim=128, num_classes=3).to(device)
optimizer = optim.Adam(text_cnn_model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()


In [6]:
# Cell 9: Training Loop for TextCNN
for epoch in range(epochs):
    text_cnn_model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in train_loader:
        input_ids, labels = input_ids.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = text_cnn_model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"TextCNN Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")


TextCNN Epoch 1/1, Loss: 1.12742928569279


TextRNN Model
8. Define TextRNN (bi-LSTM) Model

In [7]:
# Cell 10: Define TextRNN model
class TextRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(TextRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # Use last hidden state
        logits = self.fc(x)
        return logits

# Cell 11: Initialize TextRNN model
textrnn_model = TextRNN(vocab_size=tokenizer.vocab_size, embed_dim=128, hidden_dim=64, num_classes=3).to(device)
optimizer = optim.Adam(textrnn_model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()


In [8]:
# Cell 12: Training Loop for TextRNN
for epoch in range(epochs):
    textrnn_model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in train_loader:
        input_ids, labels = input_ids.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = textrnn_model(input_ids)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"TextRNN Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")


TextRNN Epoch 1/1, Loss: 1.0599294872511


Evaluation and Comparison
10. Define Evaluation Function

In [None]:
# Cell 13: Evaluation function
def evaluate_model(model, dataloader):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            # Move tensors to device
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            
            # Forward pass with attention_mask (if applicable)
            if 'attention_mask' in model.forward.__code__.co_varnames:
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            else:
                outputs = model(input_ids)
            
            # Check if outputs has 'logits' (for models like BERT); otherwise, use outputs directly
            logits = outputs.logits if hasattr(outputs, "logits") else outputs
            preds = torch.argmax(logits, dim=1)
            
            # Collect predictions and true labels
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Calculate accuracy and classification report
    return accuracy_score(true_labels, predictions), classification_report(true_labels, predictions)


DistilBERT

In [24]:
# Cell 15: Import and Initialize DistilBERT Model and Tokenizer
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
distilbert_model.to(device)

# Cell 16: Initialize DistilBERT DataLoader
distilbert_dataset = SentimentDataset(df, distilbert_tokenizer)
distilbert_train_loader = DataLoader(distilbert_dataset, batch_size=16, shuffle=True)
distilbert_test_loader = DataLoader(distilbert_dataset, batch_size=16, shuffle=False)

# Cell 17: Training Loop for DistilBERT
for epoch in range(epochs):
    distilbert_model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in distilbert_train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = distilbert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"DistilBERT Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(distilbert_train_loader)}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBERT Epoch 1/1, Loss: 1.0612239080762107


RoBERTa Model

In [16]:
# Cell 18: Import and Initialize RoBERTa Model and Tokenizer


roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
roberta_model.to(device)

# Cell 19: Initialize RoBERTa DataLoader
roberta_dataset = SentimentDataset(df, roberta_tokenizer)
roberta_train_loader = DataLoader(roberta_dataset, batch_size=16, shuffle=True)
roberta_test_loader = DataLoader(roberta_dataset, batch_size=16, shuffle=False)

# Cell 20: Training Loop for RoBERTa
for epoch in range(epochs):
    roberta_model.train()
    total_loss = 0
    for input_ids, attention_mask, labels in roberta_train_loader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = roberta_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"RoBERTa Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(roberta_train_loader)}")






ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /roberta-base/resolve/main/tf_model.h5 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002BB1C02E540>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: c08b4e2d-a5d5-40c6-a0a1-04c0924b1c83)')

In [13]:
# Cell 18: Import and Initialize RoBERTa Model and Tokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
num_labels = df['label'].nunique()  
roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_labels)
roberta_model.to(device)

# Cell 19: Initialize RoBERTa DataLoader
# Assuming you have defined SentimentDataset elsewhere
roberta_dataset = SentimentDataset(df, roberta_tokenizer) 
roberta_train_loader = DataLoader(roberta_dataset, batch_size=16, shuffle=True)
roberta_test_loader = DataLoader(roberta_dataset, batch_size=16, shuffle=False)

# Cell 20: Training Loop for RoBERTa
from tqdm import tqdm  # Import tqdm for progress bar

for epoch in range(epochs):
    roberta_model.train()
    total_loss = 0
    
    progress_bar = tqdm(roberta_train_loader, desc=f"Epoch {epoch + 1}/{epochs}") 
    
    for input_ids, attention_mask, labels in progress_bar:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = roberta_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        progress_bar.set_postfix({'loss': loss.item()}) 
    
    print(f"RoBERTa Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(roberta_train_loader)}")

SSLError: (MaxRetryError("HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Max retries exceeded with url: /roberta-base/5bde1d28afb363d0103324efeb5afc8b2b397fe5e04beabb9b1ef355255ade81?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1731296899&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMTI5Njg5OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yb2JlcnRhLWJhc2UvNWJkZTFkMjhhZmIzNjNkMDEwMzMyNGVmZWI1YWZjOGIyYjM5N2ZlNWUwNGJlYWJiOWIxZWYzNTUyNTVhZGU4MT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=g83aC2dUFdfA5mGocB42u1c4bdkJ8jXsuFuLppfIlPfrh1nwuHBL8hQD0VXNRxbogORAGIujY~~o3qkSggu5aOmZ60cwveJE6oIdyNUArwUDD3txqxvDKThaNzNpdMWcB71NtlDZVsb9WIHS484v2zuTbysncz3ptYxVG8u5p2fxtinHjZ24iG14YP0vhlDX1OPOsnpRMmDPSAPXI0xXPZ0nrlCmXjyFCO3pcXjoSm1NnvfMH~6TkXSQVV49ZduDH3FhfwGd-c63L1DPlX4-TXcrB2RhLCJtk50TW5jOuXLKaxYRt7N2L06-fN4AlBFhQtz7gYbyygs7QhqvYYO8xw__&Key-Pair-Id=K3RPWS32NSSJCE (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1000)')))"), '(Request ID: cac3dbe8-3fd5-46b7-950d-b41d64ebcd77)')

11. Evaluate All Models and Print Results

In [14]:
# Cell 14: Evaluate each model and print results
print("Evaluating BERT model:")
bert_accuracy, bert_report = evaluate_model(bert_model, test_loader)
print("Accuracy:", bert_accuracy)
print(bert_report)

print("Evaluating TextCNN model:")
textcnn_accuracy, textcnn_report = evaluate_model(text_cnn_model, test_loader)
print("Accuracy:", textcnn_accuracy)
print(textcnn_report)

print("Evaluating TextRNN model:")
textrnn_accuracy, textrnn_report = evaluate_model(textrnn_model, test_loader)
print("Accuracy:", textrnn_accuracy)
print(textrnn_report)


Evaluating BERT model:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.876
              precision    recall  f1-score   support

           0       0.84      0.97      0.90       531
           1       0.00      0.00      0.00        57
           2       0.93      0.87      0.90       412

    accuracy                           0.88      1000
   macro avg       0.59      0.62      0.60      1000
weighted avg       0.83      0.88      0.85      1000

Evaluating TextCNN model:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.536
              precision    recall  f1-score   support

           0       0.53      1.00      0.70       531
           1       0.00      0.00      0.00        57
           2       1.00      0.01      0.02       412

    accuracy                           0.54      1000
   macro avg       0.51      0.34      0.24      1000
weighted avg       0.70      0.54      0.38      1000

Evaluating TextRNN model:
Accuracy: 0.565
              precision    recall  f1-score   support

           0       0.64      0.52      0.57       531
           1       0.00      0.00      0.00        57
           2       0.51      0.71      0.59       412

    accuracy                           0.56      1000
   macro avg       0.38      0.41      0.39      1000
weighted avg       0.55      0.56      0.55      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
print("Evaluating DistilBERT model:")
distilbert_accuracy, distilbert_report = evaluate_model(distilbert_model, distilbert_test_loader)
print("Accuracy:", distilbert_accuracy)
print(distilbert_report)

Evaluating DistilBERT model:
Accuracy: 0.531
              precision    recall  f1-score   support

           0       0.60      0.55      0.58       531
           1       0.00      0.00      0.00        57
           2       0.46      0.58      0.51       412

    accuracy                           0.53      1000
   macro avg       0.36      0.38      0.36      1000
weighted avg       0.51      0.53      0.52      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Cell 21: Evaluate RoBERTa model
print("Evaluating RoBERTa model:")
roberta_accuracy, roberta_report = evaluate_model(roberta_model, roberta_test_loader)
print("Accuracy:", roberta_accuracy)
print(roberta_report)