<a href="https://colab.research.google.com/github/jyotidabass/Utilizing-Large-Language-Models-in-Natural-Language-Understanding/blob/main/Utilizing_Large_Language_Models_in_Natural_Language_Understanding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Toy sentiment corpus: each entry pairs a short product review with its
# binary label. The label values are taken as given from the original data.
_REVIEWS = [
    ('I love this product!', 1),
    ('This product is terrible.', 0),
    ('I am neutral about this product.', 1),
    ('I hate this product.', 0),
    ('I like this product.', 1),
    ('This product is amazing.', 1),
    ('I am disappointed with this product.', 0),
    ('I am satisfied with this product.', 1),
    ('I would recommend this product.', 1),
    ('I would not recommend this product.', 0),
]
data = {
    'text': [review for review, _ in _REVIEWS],
    'label': [sentiment for _, sentiment in _REVIEWS],
}

# Tabular view of the corpus with 'text' and 'label' columns
df = pd.DataFrame(data)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
train_text, test_text, train_labels, test_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# The split keeps the original row labels on each Series. Renumber the label
# Series from 0 so they can be looked up with the 0..n-1 indices that the
# DataLoader hands to the dataset.
train_labels, test_labels = (
    split.reset_index(drop=True) for split in (train_labels, test_labels)
)

# Pre-trained checkpoint shared by the model and its tokenizer.
# NOTE(review): no num_labels is passed, so the classification head size is
# whatever the library defaults to -- confirm it matches the 0/1 labels above.
CHECKPOINT = 'distilbert-base-uncased'
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# Tokenize each split separately; every split is padded to its own longest
# sequence and returned as PyTorch tensors.
_TOKENIZE_KWARGS = dict(truncation=True, padding=True, return_tensors='pt')
train_encodings = tokenizer(train_text.tolist(), **_TOKENIZE_KWARGS)
test_encodings = tokenizer(test_text.tolist(), **_TOKENIZE_KWARGS)

# Create a custom dataset class
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name
            to an indexable collection whose ``[idx]`` yields the feature for
            example ``idx``.
        labels: Iterable of per-example labels, in the same order as the
            examples in ``encodings``. It is copied into a list on
            construction, so a pandas Series with a non-sequential index
            (e.g. straight out of ``train_test_split``) is handled correctly.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise to a list so that ``self.labels[idx]`` is always a
        # positional lookup. Indexing a pandas Series with ``[idx]`` is
        # label-based and raises KeyError unless the index is exactly 0..n-1.
        self.labels = list(labels)

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus its label under 'labels'."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Wrap the tokenized splits and their labels as datasets
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
test_dataset = SentimentDataset(encodings=test_encodings, labels=test_labels)

# Batch the datasets; only the training batches are shuffled
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False,
)

# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model there before its parameters are handed to the optimizer
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Cross-entropy over the model's logits, optimised with Adam
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

def evaluate(model, loader, criterion, device):
    """Score ``model`` on every batch of ``loader`` without tracking gradients.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` returning
            an object with a ``.logits`` attribute; put into eval mode here.
        loader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'labels' tensors, exposing ``.dataset``.
        criterion: Loss function called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A ``(total_loss, accuracy)`` tuple: the sum of per-batch losses and
        the fraction of examples in ``loader.dataset`` predicted correctly
        (``0.0`` when the dataset is empty).
    """
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels are not passed to the model: the loss comes from
            # `criterion`, so a model-internal loss would be computed and
            # then thrown away.
            logits = model(input_ids, attention_mask=attention_mask).logits
            total_loss += criterion(logits, labels).item()
            _, predicted = torch.max(logits, dim=1)
            correct += (predicted == labels).sum().item()

    num_examples = len(loader.dataset)
    # Guard the division so an empty split reports 0.0 instead of raising.
    accuracy = correct / num_examples if num_examples else 0.0
    return total_loss, accuracy


# Train model
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Only the logits are needed; the loss is computed by `criterion`.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch training losses for this epoch
    print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

    # Track held-out accuracy after every epoch
    test_loss, accuracy = evaluate(model, test_loader, criterion, device)
    print(f'Test Accuracy: {accuracy:.4f}')

# Evaluate model
test_loss, accuracy = evaluate(model, test_loader, criterion, device)
print(f'Test Accuracy: {accuracy:.4f}')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.697117269039154
Test Accuracy: 0.5000
Epoch 2, Loss: 0.7000041007995605
Test Accuracy: 0.5000
Epoch 3, Loss: 0.6830722093582153
Test Accuracy: 0.5000
Epoch 4, Loss: 0.6609206199645996
Test Accuracy: 0.5000
Epoch 5, Loss: 0.6439856290817261
Test Accuracy: 0.5000
Test Accuracy: 0.5000
