In [None]:
!pip install transformers datasets torch scikit-learn pandas


In [92]:
import os
import pandas as pd
data_dir = '/kaggle/input/imdb15/aclImdb'


# Define the path to the dataset directory
train_data_dir = '/kaggle/input/imdb15/aclImdb/train'
test_data_dir = '/kaggle/input/imdb15/aclImdb/test'

# Load the data
def load_data(data_dir):
    data = {'review': [], 'sentiment': []}
    for label in ['pos', 'neg']:
        labeled_dir = os.path.join(data_dir, label)
        for review_file in os.listdir(labeled_dir):
            if review_file.endswith('.txt'):
                with open(os.path.join(labeled_dir, review_file), 'r', encoding='utf-8') as file:
                    review_text = file.read()
                    data['review'].append(review_text)
                    data['sentiment'].append(1 if label == 'pos' else 0)
    return pd.DataFrame(data)

# Load training data
train_data = load_data(train_data_dir)
print(train_data)
# Load test data
test_data = load_data(test_data_dir)
print(test_data)
# Combine train and test data if needed, or use them separately
imdb_data = pd.concat([train_data, test_data])

imdb_data = imdb_data.sample(n=10000, random_state=42)  # 10,000 samples
# imdb_data = imdb_data.sample(frac=0.1, random_state=42) 

# # Zip the dataset directory
# folder_to_zip = 'aclImdb'  # Directory to zip
# zip_file_name = 'aclImdb.zip'
# shutil.make_archive(zip_file_name.replace('.zip', ''), 'zip', folder_to_zip)

# # Download the zip file
# files.download(zip_file_name)


                                                  review  sentiment
0      This was one of those wonderful rare moments i...          1
1      Have you seen The Graduate? It was hailed as t...          1
2      I don't watch a lot of TV, except for The Offi...          1
3      Kubrick again puts on display his stunning abi...          1
4      First of all, I liked very much the central id...          1
...                                                  ...        ...
24995  The first hour of the movie was boring as hell...          0
24996  A fun concept, but poorly executed. Except for...          0
24997  I honestly don't understand how tripe like thi...          0
24998  This remake of the 1962 orginal film'o the boo...          0
24999  La Sanguisuga Conduce la Danza, or The Bloodsu...          0

[25000 rows x 2 columns]
                                                  review  sentiment
0      I've Seen The Beginning Of The Muppet Movie, B...          1
1      If it had been 

In [93]:
imdb_data

Unnamed: 0,review,sentiment
8553,"For us, an Abbott and Costello movie is someth...",1
9427,"This one and ""Her Pilgrim Soul"" are two of my ...",1
199,Spectacular Horror movie that will give you th...,1
12447,I don't think most of us would tend to apply t...,1
14489,I would like to say something different about ...,0
...,...,...
3567,Probably this is the best film of Clint Eastwo...,1
79,"Reading some of the other comments, I must agr...",1
18707,A young man kills a young woman for no reason....,0
15200,I think its time for Seagal to go quietly into...,0


In [103]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
from sklearn.model_selection import train_test_split

In [94]:
train_data = load_data(train_data_dir)
print("Training Data:")
print(train_data)

Training Data:
                                                  review  sentiment
0      This was one of those wonderful rare moments i...          1
1      Have you seen The Graduate? It was hailed as t...          1
2      I don't watch a lot of TV, except for The Offi...          1
3      Kubrick again puts on display his stunning abi...          1
4      First of all, I liked very much the central id...          1
...                                                  ...        ...
24995  The first hour of the movie was boring as hell...          0
24996  A fun concept, but poorly executed. Except for...          0
24997  I honestly don't understand how tripe like thi...          0
24998  This remake of the 1962 orginal film'o the boo...          0
24999  La Sanguisuga Conduce la Danza, or The Bloodsu...          0

[25000 rows x 2 columns]


In [96]:
imdb_data

Unnamed: 0,review,sentiment
8553,"For us, an Abbott and Costello movie is someth...",1
9427,"This one and ""Her Pilgrim Soul"" are two of my ...",1
199,Spectacular Horror movie that will give you th...,1
12447,I don't think most of us would tend to apply t...,1
14489,I would like to say something different about ...,0
...,...,...
3567,Probably this is the best film of Clint Eastwo...,1
79,"Reading some of the other comments, I must agr...",1
18707,A young man kills a young woman for no reason....,0
15200,I think its time for Seagal to go quietly into...,0


In [107]:
# Prepare data for fine-tuning
texts = imdb_data['review']
labels = imdb_data['sentiment']  # 'sentiment' column contains labels (0 for negative, 1 for positive)

# Use BERT tokenizer
# tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize texts
tokenized_texts = tokenizer(list(texts), padding=True, truncation=True, return_tensors='pt')

# Convert labels to PyTorch tensor
# labels = labels.apply(lambda x: 1 if x == 'positive' else 0)  # Convert labels to numeric values
labels = torch.tensor(labels.values)
print(labels)
# Create TensorDataset
dataset = TensorDataset(tokenized_texts.input_ids, tokenized_texts.attention_mask, labels)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


test_size = int(0.2 * len(dataset))  # 20% for the test set
val_size = int(0.2 * (len(dataset) - test_size))  # 20% of the remaining data for validation
train_size = len(dataset) - val_size - test_size  # The rest is for training

# Split dataset into train, validation, and test sets
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# DataLoader for training, validation, and test sets
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
print("train_dataloader Done")
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)
print("val_dataloader Done")
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)
print("test_dataloader Done")

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 labels: positive, negative



tensor([1, 1, 1,  ..., 0, 0, 1])
cuda
train_dataloader Done
val_dataloader Done
test_dataloader Done


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [109]:
torch.cuda.empty_cache()

In [112]:
import torch
from transformers import AdamW, get_linear_schedule_with_warmup, BertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define the model with dropout (if not already included in your model)
model =BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# model.classifier.dropout = torch.nn.Dropout(p=0.3)  # Assuming you have a dropout layer in your model definition

# Initialize the optimizer with weight decay
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * 2)

# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model.to(device)

epochs = 2  

# Early stopping parameters
early_stopping_patience = 2
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    total_val_loss = 0
    true_labels = []
    predictions = []

    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()
            logits = outputs.logits
            true_labels.extend(labels.cpu().numpy())
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = accuracy_score(true_labels, predictions)
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')

    print(f"Epoch {epoch + 1}/{epochs} - Average Training Loss: {avg_train_loss:.4f} - Validation Loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation Precision: {val_precision:.4f}")
    print(f"Validation Recall: {val_recall:.4f}")
    print(f"Validation F1 Score: {val_f1:.4f}")

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Save the best model
        model.save_pretrained('final_fine_tuned_bert_semantic_model')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            print("Early stopping triggered")
            break

# Save the final model
model.save_pretrained('final_fine_tuned_bert_semantic_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


cuda
Epoch 1/2
Epoch 1/2 - Average Training Loss: 0.7216 - Validation Loss: 0.2421
Validation Accuracy: 0.9056
Validation Precision: 0.8819
Validation Recall: 0.9378
Validation F1 Score: 0.9090
Epoch 2/2
Epoch 2/2 - Average Training Loss: 0.3262 - Validation Loss: 0.2339
Validation Accuracy: 0.9062
Validation Precision: 0.8884
Validation Recall: 0.9303
Validation F1 Score: 0.9089
