In [1]:
%pip install transformers torch pandas scikit-learn



In [2]:
import torch
import pandas as pd
import warnings
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

warnings.filterwarnings("ignore") # Bypass HF warnings

In [3]:
class CopiumDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every access.

    Args:
        texts: Positionally indexable collection of raw strings (e.g. a list).
        labels: Positionally indexable collection of class ids, aligned
            one-to-one with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            padding='max_length', truncation=True, max_length=...)``; its
            result is indexed with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length passed to the tokenizer for padding/truncation.
            Defaults to 128, the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Catch misaligned inputs here instead of as an IndexError mid-epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (both flattened
            to 1-D) and ``'labels'`` (a scalar int64 tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        return {
            # flatten() drops the leading dimension added by return_tensors='pt'
            # so that the DataLoader can stack items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Force int64: torch.tensor would infer a float dtype from a float
            # label, which is not a valid class-index tensor.
            'labels': torch.tensor(label, dtype=torch.long),
        }

In [None]:
import os

# Candidate locations for the curated CSV, tried in order; the first path that
# exists is used.
possible_paths = [
    'copium_dataset.csv',
    '/content/copium_dataset.csv',
    '/content/drive/MyDrive/copium_dataset.csv',
    '../cloud/copium_dataset.csv'
]

csv_path = None
for path in possible_paths:
    if os.path.exists(path):
        csv_path = path
        print(f"Found dataset at: {path}")
        break

if csv_path is None:
    # Show where we looked so a missing file is easy to diagnose.
    print(f"Current directory: {os.getcwd()}")
    print(f"Files here: {os.listdir()}")
    raise FileNotFoundError("copium_dataset.csv not found - run curate_dataset.py first")

# Load the curated dataset
raw_df = pd.read_csv(csv_path)

# Fail early with a readable message if the CSV does not have the columns the
# rest of the notebook reads.
required_columns = ['text', 'class', 'label']
missing_columns = [col for col in required_columns if col not in raw_df.columns]
if missing_columns:
    raise KeyError(f"{csv_path} is missing required column(s): {missing_columns}")

# read_csv turns empty cells into NaN. A NaN text breaks the preview below and
# the tokenizer, and a NaN label turns the whole label column into floats, so
# drop such rows and pin the dtypes before anything else uses them.
df = (
    raw_df
    .dropna(subset=['text', 'label'])
    .astype({'text': str, 'label': int})
    .reset_index(drop=True)
)
dropped_rows = len(raw_df) - len(df)
if dropped_rows:
    print(f"Dropped {dropped_rows} row(s) with missing text or label")

# Labels: 0=Copium, 1=Sarcastic, 2=Sincere, 3=Neutral
print(f"\nDataset size: {len(df)}")
print("\nSamples per class:")
print(df['class'].value_counts())
print("\nLabel distribution:")
print(df['label'].value_counts().sort_index())

# Preview two examples per class, truncated to 80 characters.
print("\n--- Sample texts from each class ---")
for class_name in ['copium', 'sarcastic', 'sincere', 'neutral']:
    print(f"\n{class_name.upper()}:")
    samples = df[df['class'] == class_name]['text'].head(2).tolist()
    for s in samples:
        print(f"  - {s[:80]}..." if len(s) > 80 else f"  - {s}")

texts = df['text'].tolist()
labels = df['label'].tolist()

# Stratified 80/20 split so both partitions keep the same class proportions.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_dataset = CopiumDataset(train_texts, train_labels, tokenizer)
val_dataset = CopiumDataset(val_texts, val_labels, tokenizer)
# Only the training loader shuffles; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

print(f"\nTraining samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

Current directory: /content
Files here: ['.config', 'sample_data']


FileNotFoundError: train-balanced-sarcasm.csv not found

In [8]:
# Tunable training settings, kept together at the top of the cell.
SEED = 42
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5

# Seed torch before the model is built: the classification head is newly
# initialised, and batch shuffling and dropout also draw from torch's RNG.
# (GPU kernels may still introduce small run-to-run differences.)
torch.manual_seed(SEED)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=4)
model.to(device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the dataset-level `labels` list built in the
        # loading cell is not overwritten by a tensor.
        batch_labels = batch['labels'].to(device)
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report progress once per epoch so a diverging run is visible immediately.
    mean_loss = running_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} - mean training loss: {mean_loss:.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Collect predictions and ground-truth labels over the whole validation set.
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().tolist())
        # The targets are only compared on the host, so they are read straight
        # from the batch (and the global `labels` list is left untouched).
        true_labels.extend(batch['labels'].tolist())

# Class names for readability; index i is the name of label id i.
class_names = ['Copium', 'Sarcastic', 'Sincere', 'Neutral']
# Passing the full id list keeps the report and the matrix at one row per
# class even when a class never occurs in the targets or the predictions.
# Without it, target_names would no longer match and row[3] would not exist.
label_ids = list(range(len(class_names)))

print("=== Model Evaluation ===\n")
print(f'Overall Accuracy: {accuracy_score(true_labels, predictions):.4f}')
print(f'Weighted F1 Score: {f1_score(true_labels, predictions, average="weighted"):.4f}')

print("\n=== Classification Report ===\n")
print(classification_report(true_labels, predictions, labels=label_ids, target_names=class_names))

print("\n=== Confusion Matrix ===")
print("(Rows: Actual, Columns: Predicted)")
print(f"{'':>12} " + " ".join(f"{name:>10}" for name in class_names))
cm = confusion_matrix(true_labels, predictions, labels=label_ids)
for i, row in enumerate(cm):
    print(f"{class_names[i]:>12} " + " ".join(f"{count:>10}" for count in row))

Accuracy: 0.0
F1 Score: 0.0


In [None]:
# Write the model and the tokenizer into the same 'copium_model' directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('copium_model')
print("Model saved to 'copium_model/' directory")