In [1]:
import os
import csv
import numpy as np
import pandas as pd
import torch

In [2]:
# Read the training split into a DataFrame and preview its first five rows.
X = pd.read_csv(filepath_or_buffer='train.csv')
X.head(n=5)

Unnamed: 0,Text,Sentiment
0,"Trading on the success of the 1975 hit, this f...",0
1,Is this supposed to be serious? I hope not. Th...,0
2,"So many great actors, so little worth watching...",0
3,This is the type of late-night cable flick usu...,0
4,I remember when I first heard about Jack Frost...,0


In [3]:
# Separate the training frame into inputs (review text) and targets (sentiment).
X_train, y_train = X['Text'], X['Sentiment']

In [4]:
# Read the held-out test split into a DataFrame and preview its first five rows.
x = pd.read_csv(filepath_or_buffer='test.csv')
x.head(n=5)

Unnamed: 0,Text,Sentiment
0,The title creatures wreak havoc at a peaceful ...,0
1,Jim Carrey is back to much the same role that ...,1
2,From the filmmakers who brought us The March o...,1
3,I haven't read a single IMDb comment for this ...,0
4,"Well, this is probably one of the best movies ...",1


In [5]:
# Separate the test frame into inputs (review text) and targets (sentiment).
X_test, y_test = x['Text'], x['Sentiment']

In [6]:
# Materialise the training texts as a plain Python list in a single call
# (replaces the element-by-element append loop; same contents, same order).
train_list = X_train.tolist()

In [7]:
# Materialise the test texts as a plain Python list in a single call
# (replaces the element-by-element append loop; same contents, same order).
test_list = X_test.tolist()

In [8]:
from transformers import DistilBertTokenizerFast

# Load the tokenizer that matches the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# Tokenize both splits with identical settings (train first, then test):
# over-long reviews are truncated and every sequence in a split is padded
# to a common length.
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_list, test_list)
)

In [10]:
# Convert the label Series to plain Python lists in one call each
# (replaces the manual append loops; same values, same order).
train_labels = y_train.tolist()

test_labels = y_test.tolist()

In [11]:
class SentimentAnalysis(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to a
            per-example indexable sequence, as returned by the tokenizer.
        labels: Indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus the label under
        the ``'labels'`` key.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [12]:
# Wrap each split's encodings and labels in the torch Dataset defined above.
train_dataset, test_dataset = (
    SentimentAnalysis(train_encodings, train_labels),
    SentimentAnalysis(test_encodings, test_labels),
)

In [17]:
from torch.utils.data import DataLoader
# transformers.AdamW is deprecated in favour of the PyTorch implementation
# (newer transformers releases no longer ship it), so import it from torch.
from torch.optim import AdamW
from transformers import DistilBertForSequenceClassification

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Seed torch so the freshly initialised classifier head and the shuffled
# batch order are repeatable across Restart & Run All.
torch.manual_seed(42)

# Pre-trained encoder plus a newly initialised classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# eps / weight_decay are set explicitly to the values the former
# transformers.AdamW used by default, so the optimisation recipe is unchanged.
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

# Switch to inference mode (disables dropout) for the evaluation cells below.
model.eval()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [18]:
all_predictions = []
all_labels = []

# Make sure dropout is off even if this cell is run without the training
# cell's final line having executed (idempotent when already in eval mode).
model.eval()

# No shuffling at evaluation time: all_predictions[i] / all_labels[i] then
# correspond to row i of the test set, so predictions can be joined back.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits

        # softmax is monotonic, so argmax over the raw logits picks the
        # same class without the extra normalisation step.
        predicted_labels = torch.argmax(logits, dim=1)

        all_predictions.extend(predicted_labels.cpu().numpy())
        # Labels are only collected, never fed to the model, so they do not
        # need a round-trip through the compute device.
        all_labels.extend(batch['labels'].numpy())

all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

In [19]:
# Show the predicted labels followed by how many there are, one per line.
print(all_predictions, len(all_predictions), sep='\n')

[0 1 1 ... 0 1 1]
25000


In [20]:
# Show the ground-truth labels followed by how many there are, one per line.
print(all_labels, len(all_labels), sep='\n')

[0 1 1 ... 0 0 0]
25000


In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the collected predictions against the ground truth. Every scorer is
# called as scorer(y_true, y_pred), in the order the results are unpacked.
accuracy, precision, recall, f1 = (
    scorer(all_labels, all_predictions)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric on its own line.
for heading, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(heading, score)


Accuracy: 0.91576
Precision: 0.9140376035691523
Recall: 0.91784
F1-score: 0.9159348555005588


In [22]:
# `os` is already imported in the notebook's first cell.
output_dir = "./saved_model/"
os.makedirs(output_dir, exist_ok=True)

# Persist the fine-tuned weights and model config.
model.save_pretrained(output_dir)
# Persist the tokenizer files alongside the model so the directory can be
# reloaded for inference on its own, with the exact vocabulary used here.
tokenizer.save_pretrained(output_dir)

print("Model saved successfully at:", output_dir)

Model saved successfully at: ./saved_model/


In [27]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def predict_single_text(text, max_length=128, *, classifier=None,
                        text_tokenizer=None, target_device=None):
    """Classify one piece of text and return the predicted label index.

    Args:
        text: The text to classify.
        max_length: Maximum number of tokens kept after truncation. Defaults
            to 128; the training cell tokenized with the tokenizer's default
            truncation length, so pass a larger value for long reviews.
        classifier: Model to run. Defaults to the notebook-level ``model``.
        text_tokenizer: Tokenizer to use. Defaults to the notebook-level
            ``tokenizer``.
        target_device: Device the inputs are moved to. Defaults to the
            notebook-level ``device``.

    Returns:
        int: Index of the highest-scoring class (the demo cells treat 1 as
        positive and anything else as negative).
    """
    # Fall back to the notebook globals only when no override is supplied.
    classifier = model if classifier is None else classifier
    text_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
    target_device = device if target_device is None else target_device

    inputs = text_tokenizer(text, return_tensors="pt", max_length=max_length,
                            truncation=True, padding=True)

    input_ids = inputs["input_ids"].to(target_device)
    attention_mask = inputs["attention_mask"].to(target_device)

    # Predict with dropout disabled regardless of the mode the model was left
    # in, then put it back so an in-progress training run is not disturbed.
    was_training = classifier.training
    classifier.eval()
    try:
        with torch.no_grad():
            logits = classifier(input_ids, attention_mask=attention_mask).logits
    finally:
        if was_training:
            classifier.train()

    # argmax over logits equals argmax over softmax probabilities.
    return torch.argmax(logits, dim=1).item()

In [28]:
# Smoke test with a clearly positive review (label 1 is reported as positive).
text_to_predict = "The Movie was very good"
predicted_label = predict_single_text(text_to_predict)
sentiment = "positive" if predicted_label == 1 else "Negative"
print("Predicted label:", sentiment)

Predicted label: positive


In [29]:
# Smoke test with a negative review (any label other than 1 is reported as Negative).
text_to_predict = "The Movie was not so good"
predicted_label = predict_single_text(text_to_predict)
sentiment = "positive" if predicted_label == 1 else "Negative"
print("Predicted label:", sentiment)

Predicted label: Negative
