In [1]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("../data/subtask1/train.csv",index_col="id")
#data = data.drop("id",axis=1)

In [3]:
# Check for nan
for i in data.index:
    if(pd.isna(data["text"][i])):
        print(data["text"][i])

nan


In [4]:
for i in data.index:
    if(pd.isna(data["text"][i])):
        data["text"][i] = ""

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["text"][i] = ""


In [5]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)


In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(list(train_df["text"]), truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, return_tensors='pt')



In [7]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels.values, dtype=torch.float32)

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}, self.labels[idx]

    def __len__(self):
        return len(self.labels)

In [8]:
labels = []
for col in data.columns:
    if(col not in ["text","id"]):
        labels.append(col)

labels = sorted(labels)
        

In [9]:
train_dataset = CustomDataset(train_encodings, train_df[labels])
test_dataset = CustomDataset(test_encodings, test_df[labels])
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)


In [10]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(labels))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [12]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()

In [13]:
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        inputs, labels = batch
        inputs = {key: val.to(device) for key, val in inputs.items()}
        labels = labels.to(device)
        outputs = model(**inputs).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

In [None]:
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        inputs = {key: val.to(device) for key, val in inputs.items()}
        labels = labels.to(device)
        outputs = torch.sigmoid(model(**inputs).logits)
        all_preds.extend(outputs.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert predictions to binary format
threshold = 0.5
binary_preds = (np.array(all_preds) > threshold).astype(int)

# Evaluate the model
accuracy = accuracy_score(all_labels, binary_preds)
print(f"Accuracy: {accuracy:.2f}")

classification_report_str = classification_report(all_labels, binary_preds, target_names=train_df.columns[1:])
print("Classification Report:\n", classification_report_str)


In [25]:
torch.save(model,"model1.pt")

In [27]:
import torch.nn as nn
test = torch.load("model1.pt")
type(test)

transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification