In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/IMDB Dataset.csv


# Libraries

In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import re

2025-08-30 12:20:40.258731: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756556440.628018      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756556440.738518      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
df = pd.read_csv('/kaggle/input/IMDB Dataset.csv')

# Preprocessing

In [4]:
df = df.drop_duplicates().reset_index(drop=True)

In [5]:
def preprocessing(text:str) -> str:
    # def lower_casing(text:str) -> str:
    #     return text.lower()

    def remove_html_tags(text:str) -> str:
        return re.sub(r"<[^>]+>", "", text)

    def remove_url(text:str) -> str:
        return re.sub(r"https?://\S+|www\.\S+", "", text)
    
    # def remove_punctuation(text:str) -> str:
    #     return text.translate(str.maketrans("", "", string.punctuation))
    
    # STOP_WORDS = set(stopwords.words('english'))
    # def remove_stop_words(text:str) -> str:
    #     return " ".join(word for word in text.split() if word not in STOP_WORDS)
    
    # text = lower_casing(text)
    text = remove_html_tags(text)
    text = remove_url(text)
    # text = remove_punctuation(text)
    # text = remove_stop_words(text)
    return text

In [6]:
df['review'] = df['review'].apply(preprocessing)

In [7]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['sentiment'])

In [8]:
le.classes_

array(['negative', 'positive'], dtype=object)

In [9]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [10]:
encodings = tokenizer(
    df['review'].to_list(),
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors='pt'
)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [13]:
train_texts, val_texts, train_labels, val_labels, train_indices, val_indices = train_test_split(
    df['review'], df['label'], df.index, test_size=0.2, random_state=42
)

train_encodings = {key: val[train_indices] for key, val in encodings.items()}
val_encodings = {key: val[val_indices] for key, val in encodings.items()}

train_dataset = SentimentDataset(train_encodings, train_labels.to_list())
val_dataset = SentimentDataset(val_encodings, val_labels.to_list())

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)
print("PyTorch DataLoaders created.")

PyTorch DataLoaders created.


In [14]:
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(df['label'].unique())
)
model.to(device)
print("Model loaded and moved to device.")

if torch.cuda.device_count() > 1:
  print(f"Using {torch.cuda.device_count()} GPUs for Data Parallel training.")
  model = torch.nn.DataParallel(model)

optimizer = AdamW(model.parameters(), lr=2e-5)

num_training_steps = len(train_dataloader) * 3
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)
print("Optimizer and scheduler configured.")

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and moved to device.
Using 2 GPUs for Data Parallel training.
Optimizer and scheduler configured.


In [16]:
for epoch in range(3):
    print(f"\n--- Epoch {epoch + 1}/{3} ---")
    
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(train_dataloader, desc="Training")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs.loss
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        total_train_loss += loss.item()
        
        # Backward pass: compute gradients.
        loss.backward()
        
        # Update weights.
        optimizer.step()
        
        # Update learning rate.
        scheduler.step()
        
        progress_bar.set_postfix({'loss': loss.item()})

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

    # Validation Phase
    model.eval() # Put the model in evaluation mode.
    total_eval_loss = 0
    total_eval_accuracy = 0
    
    with torch.no_grad(): # Disable gradient calculation for efficiency.
        for batch in tqdm(val_dataloader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            
            loss = outputs.loss
            if torch.cuda.device_count() > 1:
                loss = loss.mean()
            total_eval_loss += loss.item()
            
            # Calculate accuracy
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            total_eval_accuracy += (predictions == labels).sum().item()

    avg_val_loss = total_eval_loss / len(val_dataloader)
    avg_val_accuracy = total_eval_accuracy / len(val_dataset)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {avg_val_accuracy:.4f}")


--- Epoch 1/3 ---


Training:   0%|          | 0/1240 [00:00<?, ?it/s]

Average Training Loss: 0.1769


Validation:   0%|          | 0/310 [00:00<?, ?it/s]

Validation Loss: 0.1438
Validation Accuracy: 0.9482

--- Epoch 2/3 ---


Training:   0%|          | 0/1240 [00:00<?, ?it/s]

Average Training Loss: 0.0960


Validation:   0%|          | 0/310 [00:00<?, ?it/s]

Validation Loss: 0.1451
Validation Accuracy: 0.9509

--- Epoch 3/3 ---


Training:   0%|          | 0/1240 [00:00<?, ?it/s]

Average Training Loss: 0.0530


Validation:   0%|          | 0/310 [00:00<?, ?it/s]

Validation Loss: 0.1677
Validation Accuracy: 0.9509


In [18]:
model.module.save_pretrained('/kaggle/working/roberta-sentiment-analysis')
tokenizer.save_pretrained('/kaggle/working/roberta-sentiment-tokenizer')

('/kaggle/working/roberta-sentiment-tokenizer/tokenizer_config.json',
 '/kaggle/working/roberta-sentiment-tokenizer/special_tokens_map.json',
 '/kaggle/working/roberta-sentiment-tokenizer/vocab.json',
 '/kaggle/working/roberta-sentiment-tokenizer/merges.txt',
 '/kaggle/working/roberta-sentiment-tokenizer/added_tokens.json')