In [141]:
import os
import re
import string
import json
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import warnings
from time import time

from subtask_1_2a import check_format, evaluate_h
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

In [84]:
# keras_core/backend/torch/core.py
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(device)

mps


In [44]:
data = pd.read_csv("../training_set.csv")
data = data.dropna()

train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

train_df = train_df.reset_index().drop(columns=["index"])
test_df = test_df.reset_index().drop(columns=["index"])

In [46]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 200
TRAIN_BATCH_SIZE = 2
TEST_BATCH_SIZE = 64
EPOCHS = 1
LEARNING_RATE = 2e-5
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

target_cols = [col for col in data.columns if col not in ["id", "text"]]

In [47]:
class BERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df.text
        self.tokenizer = tokenizer
        self.targets = df[target_cols].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [48]:
train_dataset = BERTDataset(train_df, tokenizer, MAX_LEN)
test_dataset = BERTDataset(test_df, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=True)

In [49]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.roberta = AutoModel.from_pretrained('roberta-base')
#         self.l2 = torch.nn.Dropout(0.3)
        self.fc = torch.nn.Linear(768, data.shape[1] - 2) # Retrieve id, text column
    
    def forward(self, ids, mask, token_type_ids):
        _, features = self.roberta(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
#         output_2 = self.l2(output_1)
        output = self.fc(features)
        return output

model = BERTClass()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTClass(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [50]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = AdamW(params =  model.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)

def train(epoch):
    model.train()
    for i, data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)
        
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if i % 1000 == 0:
            print(f'Epoch: {epoch}, Sect: {i}, Loss:  {loss.item()}')
            torch.save(model, "results/T_bert_subtask1_epoch_" + str(epoch) + "_" + str(i) + ".pt")
            if i == 2000: break
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [85]:
# torch.mps.empty_cache()
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

In [51]:
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Sect: 0, Loss:  0.6865091919898987
Epoch: 0, Sect: 1000, Loss:  0.18705444037914276
Epoch: 0, Sect: 2000, Loss:  0.08826180547475815


In [52]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(test_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [74]:
outputs, targets = validation()

In [77]:

outputs_2 = np.array(outputs) >= 0.5
classification_report(outputs_2, targets)
accuracy = metrics.accuracy_score(targets, outputs_2)
f1_score_micro = metrics.f1_score(targets, outputs_2, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs_2, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.2007142857142857
F1 Score (Micro) = 0.2054134223210975
F1 Score (Macro) = 0.04997636467644879


In [124]:
data_to_predict = pd.read_json("english/en_subtask1_test_unlabeled.json")

In [117]:
def pred(input_text):
    model.eval()

    token_ids = tokenizer.encode(input_text, truncation=True, padding=True, return_tensors="pt").to("mps")
    mask = (token_ids != 0).float()  # Create a mask where token_ids are not zero
    token_type_ids = torch.zeros_like(token_ids)  # For BERT, token type ids are all zeros

    # Forward pass through the model
    output = model(token_ids, mask, token_type_ids)

    output_vec = torch.sigmoid(output[0]).cpu().detach().numpy() >= 0.15

    return np.array(target_cols)[output_vec].tolist()


In [126]:
data_to_predict["labels"] = data_to_predict.apply(lambda row: pred(row["text"]), axis=1)

In [143]:
def to_json(df, file_name):
    data = []

    for index, row in df.iterrows():
        entry = {
            "id": str(row['id']),
            "text": str(row['text']),
            "labels": row['labels']
        }
        data.append(entry)

    with open(file_name, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

In [144]:
to_json(data_to_predict, "subtask_1_2a_example.json")

In [145]:
check_format("subtask_1_2a_example.json")

True