In [1]:
%pip install transformers torch pandas scikit-learn



In [2]:
import torch
import pandas as pd
import warnings
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# NOTE: no category/message/module filter is passed, so this ignores *every*
# warning raised in the notebook (pandas, torch, sklearn, ...), not only the
# Hugging Face ones the inline comment mentions.
warnings.filterwarnings("ignore") # Bypass HF warnings

In [3]:
class CopiumDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable sequence of raw strings.
        labels: Indexable sequence of class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', padding='max_length',
            truncation=True, max_length=...)``; its result must expose
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every example is padded/truncated to (default 128,
            the value that was previously hard-coded).
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples (driven by ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return a dict of 1-D tensors.

        Returns:
            dict with ``'input_ids'``, ``'attention_mask'`` (both flattened
            from the tokenizer output) and ``'labels'`` (scalar long tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; drop it so
            # the DataLoader can stack items into (batch, max_length).
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Force an integer class id: without an explicit dtype a float or
            # bool label (e.g. after pandas upcasting) would yield a non-long
            # tensor, which the classification loss does not accept as a
            # class index.
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [None]:
import os
import re

# Candidate locations for the sarcasm CSV, checked in this order.
# (The /content paths look like Colab / mounted-Drive locations - confirm.)
possible_paths = [
    'train-balanced-sarcasm.csv',
    '/content/train-balanced-sarcasm.csv',
    '/content/drive/MyDrive/train-balanced-sarcasm.csv',
    '../cloud/train-balanced-sarcasm.csv',
]

# First candidate that exists on disk, or None when none of them do.
csv_path = next(filter(os.path.exists, possible_paths), None)

if csv_path is not None:
    print(f"Found file at: {csv_path}")
else:
    # Show where we are and what is here to help locate the file, then fail.
    print(f"Current directory: {os.getcwd()}")
    print(f"Files here: {os.listdir()}")
    raise FileNotFoundError("train-balanced-sarcasm.csv not found")

df_sarcasm = pd.read_csv(csv_path)

# Regexes for "copium" phrasing. They are matched against lower-cased comments
# below, which is why every pattern is written in lower case.
copium_patterns = [
    r"\bit'?s? fine\b",
    r"\bi'?m? (not even |totally )?(mad|upset|bothered)",
    r"\bwhatever\b",
    r"\bdidn'?t (even )?(want|need|care)",
    r"\bdoesn'?t (even )?(matter|bother)",
    r"\bi'?m? over it\b",
    r"\bwho cares\b",
    r"\bnot like i (wanted|needed|cared)",
    r"\bi (guess|suppose) (it'?s?|that'?s?) (ok|fine|whatever)",
    r"\bcould be worse\b",
    r"\bat least\b",
    r"\bi'?ll? (just |)be fine\b",
    r"\b(anyway|anyways),? (it'?s?|that'?s?) (ok|fine)",
    r"\bno big deal\b",
    r"\bnot that (great|good|bad) anyway",
    r"\bprobably (for the best|better this way)",
    r"\bi'?m? (totally |)okay with (this|that|it)",
    r"\bthis is fine\b",
    r"\beverything is fine\b",
    r"\b(lol|lmao|haha),? (it'?s?|i'?m?) (fine|ok|okay)",
]

# One alternation so the whole column is scanned in a single pass.
combined_pattern = '|'.join(copium_patterns)

# Missing comments become empty strings so the .str accessor never sees NaN.
df_sarcasm['comment'] = df_sarcasm['comment'].fillna('')
lowered_comments = df_sarcasm['comment'].str.lower()
df_sarcasm['is_copium'] = lowered_comments.str.contains(combined_pattern, regex=True)

# Rows flagged by any pattern form the copium class (label 0).
copium_mask = df_sarcasm['is_copium'] == True
df_copium = df_sarcasm.loc[copium_mask, ['comment']].copy()
df_copium['label'] = 0
print(f"Found {len(df_copium)} potential copium samples")

# Pad the copium class with templated sentences when the regex pass found
# fewer than 2500 rows. 10 templates x 15 subjects gives 150 extra rows, so
# the class can still end up below 2500.
if len(df_copium) < 2500:
    templates = [
        "It's fine, I didn't want {} anyway",
        "Whatever, {} doesn't even matter",
        "I'm not even mad about {}",
        "Who cares about {}? Not me",
        "{} is overrated anyway",
        "I guess {} wasn't meant to be",
        "At least I still have {}",
        "Could be worse than {}",
        "I'm totally okay with {}",
        "Not like I needed {} or anything",
    ]
    subjects = [
        "the promotion", "winning", "that relationship", "the game", "my grade",
        "the job", "their opinion", "being right", "success", "approval",
        "being included", "the prize", "first place", "the match", "my rank",
    ]

    # Every template filled with every subject (template-major order).
    synthetic = [
        {'comment': template.format(subject), 'label': 0}
        for template in templates
        for subject in subjects
    ]

    df_synthetic = pd.DataFrame(synthetic)
    df_copium = pd.concat([df_copium, df_synthetic], ignore_index=True)

# Cap the class at 2500 rows and switch to the shared 'text' column name.
df_copium = df_copium.head(2500).rename(columns={'comment': 'text'})

# Rows the regex pass did NOT flag as copium. Both remaining classes are drawn
# only from these rows so that no comment can receive two different labels.
not_copium = df_sarcasm['is_copium'].eq(False)

# Class 1 - sarcasm: source label 1, first 2500 unflagged rows.
df_sarc = (
    df_sarcasm.loc[not_copium & df_sarcasm['label'].eq(1), ['comment']]
    .head(2500)
    .rename(columns={'comment': 'text'})
    .assign(label=1)
)

# Class 2 - sincere: source label 0, first 2500 unflagged rows.
# Previously this selection ignored `is_copium`, so a comment could appear
# once with label 0 (copium) and again with label 2 (sincere), producing
# contradictory targets and duplicates across the train/validation split.
df_sincere = (
    df_sarcasm.loc[not_copium & df_sarcasm['label'].eq(0), ['comment']]
    .head(2500)
    .rename(columns={'comment': 'text'})
    .assign(label=2)
)

# Stack the three classes, drop incomplete rows, then shuffle with a fixed
# seed so a re-run yields the same row order.
df = (
    pd.concat([df_copium[['text', 'label']], df_sarc, df_sincere], ignore_index=True)
    .dropna()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"\nDataset size: {len(df)}")
print(df['label'].value_counts())
print("\nSample copium texts:")
print(df[df['label'] == 0]['text'].head(5).tolist())

texts = df['text'].tolist()
labels = df['label'].tolist()

# Hold out 20% for validation. `stratify` keeps the class proportions equal in
# both splits (the copium class can be much smaller than the other two), and
# `random_state` makes the split identical on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_dataset = CopiumDataset(train_texts, train_labels, tokenizer)
val_dataset = CopiumDataset(val_texts, val_labels, tokenizer)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

Current directory: /content
Files here: ['.config', 'sample_data']


FileNotFoundError: train-balanced-sarcasm.csv not found

In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Seed torch so the newly initialised classifier head and the shuffling of
# train_loader are repeatable between runs.
torch.manual_seed(42)

# The data-prep cell produces exactly three classes (0 = copium, 1 = sarcasm,
# 2 = sincere). The head was previously built with num_labels=4, leaving an
# output unit that no training example ever targets.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):
    running_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` so the dataset-level `labels` list created in
        # the data-prep cell is not overwritten by a tensor.
        batch_labels = batch['labels'].to(device)
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report progress so a diverging or stuck run is visible before evaluation.
    print(f"Epoch {epoch + 1}/3 - mean training loss: {running_loss / max(len(train_loader), 1):.4f}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Score the fine-tuned model on the validation loader.
model.eval()
predictions = []
true_labels = []

with torch.no_grad():  # inference only, no gradient tracking
    for batch in val_loader:
        labels = batch['labels'].to(device)
        logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits
        # Predicted class = index of the largest logit in each row.
        preds = logits.argmax(dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy}')
weighted_f1 = f1_score(true_labels, predictions, average="weighted")
print(f'F1 Score: {weighted_f1}')

Accuracy: 0.0
F1 Score: 0.0


In [None]:
# Write the model and then the tokenizer into the same directory.
save_dir = 'copium_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print(f"Model saved to '{save_dir}/' directory")