In [None]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW



In [100]:
# Toy (text, label) pairs for smoke-testing the pipeline.
# NOTE(review): label 1 appears to mark positive sentiment and 0 negative — confirm.
data = list({
    "I love this product!": 1,
    "This is terrible.": 0,
    "Absolutely fantastic!": 1,
    "Not worth the money.": 0,
    "Really great experience.": 1,
    "Awful, never again.": 0,
}.items())

In [101]:
# Wrap the sample (text, label) pairs in a two-column DataFrame.
# pandas is imported once in the notebook's top import cell, not here.
df = pd.DataFrame(data, columns=["text", "label"])

In [6]:
# Load the tab-separated training file.
# NOTE(review): absolute, machine-specific path — consider a configurable DATA_DIR.
data = pd.read_csv(
    '/Users/jiteshdewangan/Downloads/training.tsv',
    delimiter='\t',
)
# Keep a reproducible 1% random subset of the rows for quick experiments.
df = data.sample(frac=0.01, random_state=50)

In [80]:
# Binary target: 0 when category is 'S', 1 for anything else (missing values included).
df['target'] = df['category'].ne('S').astype('int64')
# Join title and description into one text input. Missing values become '' first,
# because `str + NaN` is NaN and would hand the tokenizer a float instead of text.
df['input'] = df['title'].fillna('') + " " + df['description'].fillna('')

In [81]:
# Full (pre-split) model inputs and targets, taken straight from the DataFrame columns.
x_train, y_train = df['input'], df['target']

In [82]:
# Tokenizer loaded from the 'bert-base-uncased' checkpoint (same name the model cell uses).
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [102]:
# Split directly from the DataFrame columns instead of from `x_train`/`y_train`.
# This cell rebinds those two names to plain lists, so reading from them would make
# a re-run call `.tolist()` on a list and fail; reading from `df` keeps it idempotent.
x_train, x_test, y_train, y_test = train_test_split(
    df['input'].tolist(),
    df['target'].tolist(),
    test_size=0.1,      # hold out 10% for evaluation
    random_state=42,    # fixed seed so the split is reproducible
)

In [103]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Iterable of input strings (list, tuple, pandas Series, ...).
        labels: Iterable of integer class labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding='max_length',
            truncation=True, max_length=..., return_tensors='pt')``; its result
            must expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Copy into lists so indexing is always positional. A pandas Series with a
        # non-default index would otherwise be looked up by label in __getitem__.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (tokenizer output
            with dim 0 squeezed) and ``'label'`` (a ``torch.long`` scalar tensor).

        Raises:
            IndexError: If ``idx`` is past the end of the dataset.
        """
        if idx >= len(self.texts):
            raise IndexError(f"Index {idx} out of bounds for dataset with length {len(self.texts)}")

        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' yields batched tensors; drop dim 0 so the DataLoader
        # can stack individual examples into a batch itself.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [104]:
# Wrap each split in a TextDataset; both share the same tokenizer and the
# dataset's default max_length.
train_dataset = TextDataset(texts=x_train, labels=y_train, tokenizer=tokenizer)
test_dataset = TextDataset(texts=x_test, labels=y_test, tokenizer=tokenizer)

In [105]:
# Reshuffle the training examples every epoch so the model does not see the same
# batches in the same order each time. Evaluation order is irrelevant to accuracy,
# so the test loader stays sequential.
train_loader = DataLoader(train_dataset, batch_size=20, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=20)

In [106]:
# Pretrained BERT encoder with a freshly initialised 2-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Use PyTorch's own AdamW: the `AdamW` class exported by transformers is deprecated.
# eps and weight_decay are pinned to that class's defaults so the optimisation
# behaviour stays the same (torch's defaults are eps=1e-8, weight_decay=1e-2).
# NOTE(review): lr=1e-4 is on the high side for BERT fine-tuning — confirm it is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, eps=1e-6, weight_decay=0.0)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [107]:
def train(model, train_loader, optimizer,device, epochs=3):
    """Fine-tune ``model`` on ``train_loader`` and print the mean loss of every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``.loss`` tensor.
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer over the model's parameters.
        device: Device (e.g. ``"cpu"``) the model and every batch are moved to.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    # The batches are moved to `device` below, so the model must live there too.
    model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            output = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = output.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Report inside the epoch loop so every epoch is logged (and nothing is
        # referenced when epochs == 0). Batches are counted rather than taken from
        # len(), so an empty loader reports nan instead of dividing by zero.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")


In [108]:
# Fine-tune on the CPU for two passes over the training data.
train(model, train_loader, optimizer, device="cpu", epochs=2)

Fetching index: 0
Fetching index: 1
Fetching index: 2
Fetching index: 3
Fetching index: 4
Fetching index: 5
Fetching index: 6
Fetching index: 7
Fetching index: 8
Fetching index: 9
Fetching index: 10
Fetching index: 11
Fetching index: 12
Fetching index: 13
Fetching index: 14
Fetching index: 15
Fetching index: 16
Fetching index: 17
Fetching index: 18
Fetching index: 19
Fetching index: 20
Fetching index: 21
Fetching index: 22
Fetching index: 23
Fetching index: 24
Fetching index: 25
Fetching index: 26
Fetching index: 27
Fetching index: 28
Fetching index: 29
Fetching index: 30
Fetching index: 31
Fetching index: 32
Fetching index: 33
Fetching index: 34
Fetching index: 35
Fetching index: 36
Fetching index: 37
Fetching index: 38
Fetching index: 39
Fetching index: 40
Fetching index: 41
Fetching index: 42
Fetching index: 43
Fetching index: 44
Fetching index: 45
Fetching index: 46
Fetching index: 47
Fetching index: 48
Fetching index: 49
Fetching index: 50
Fetching index: 51
Fetching index: 52
Fet

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x11275ece0>>
Traceback (most recent call last):
  File "/Users/jiteshdewangan/.pyenv/versions/3.10.11/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
def evaluate_model(model, test_loader, device=None):
    """Compute classification accuracy of ``model`` over ``test_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits`` with classes along dim 1.
        test_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        device: Optional device to run on. When given, the model is moved there.
            When omitted, batches are moved to the device of the model's first
            parameter so the inputs match a model that was trained off-CPU.

    Returns:
        The value of ``accuracy_score(true_labels, predictions)``.

    Note:
        The model is left in evaluation mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None
    else:
        model.to(device)

    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=1)
            # Bring results back to the CPU as plain ints for the metric.
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(batch['label'].cpu().tolist())

    return accuracy_score(true_labels, predictions)