# Demo Code

Demo code that loads the trained model and generates predictions for the test data.

In [None]:
# COLAB
from google.colab import drive
drive.mount('/content/drive')

# COMMON
import numpy as np
import torch

# DATA
import pandas as pd

# PREPROCESS DATA
import re
!pip install contractions
import contractions

# DATASET/LOADER
!pip install datasets
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

# MODEL
from transformers import AutoModelForSequenceClassification

# TEST
from tqdm.auto import tqdm
import torch.nn as nn
import sys
import pickle

Mounted at /content/drive
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24
Collecting datasets


Set the file paths and the configuration variables.

In [None]:
# input / output locations
test_data_path = "/content/drive/MyDrive/nlu/data/test.csv"  # CSV read for inference
trained_model_path = "/content/drive/MyDrive/nlu/result/model/6_model.pt"  # state dict loaded into the model
path_to_save = "./Group_14_C.csv"  # CSV file the predictions are written to

# tokenizer / loader settings
MODEL_CHECKPOINT = "bert-base-cased"
MAX_LENGTH = 256  # passed to the tokenizer as max_length (with truncation enabled)
BATCH_SIZE = 32  # batch size of the test DataLoader

# select the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# define preprocessing function
def preprocess_text(text):
    """Normalise a single raw text value.

    A string is lower-cased, its contractions are expanded
    (ex. don't -> do not) and every run of whitespace is collapsed
    into one space. Anything that is not a string (for example a
    missing value) is replaced by a single space.
    """
    if isinstance(text, str):
        # lower-case first, then expand the contractions
        expanded = contractions.fix(text.lower())

        # squeeze consecutive whitespace characters into one blank
        return re.sub(r'\s+', ' ', expanded, flags=re.I)

    return " "

# tokenizer belonging to the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(records):
    """Tokenize the `text_1` / `text_2` columns of `records` as text pairs.

    Sequences are truncated to MAX_LENGTH tokens and token type ids
    are requested in the tokenizer output.
    """
    first_texts = records['text_1']
    second_texts = records['text_2']
    return tokenizer(
        first_texts,
        second_texts,
        truncation=True,
        return_token_type_ids=True,
        max_length=MAX_LENGTH,
    )

# collate function handed to the DataLoader; padding is done with this tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# own dataset
class PairwiseDataset(torch.utils.data.Dataset):
    """Torch dataset over the tokenized text pairs of a pandas DataFrame.

    The frame is converted to a `datasets.Dataset`, tokenized in batches
    with `preprocess_function`, and the resulting `input_ids`,
    `token_type_ids` and `attention_mask` columns are kept on the instance.
    """

    # tokenizer outputs stored on the instance and returned per sample
    _FIELDS = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, data, train=True):
        """Tokenize `data`; `train` is accepted but not used."""
        encoded = Dataset.from_pandas(data).map(preprocess_function, batched=True)
        for field in self._FIELDS:
            setattr(self, field, encoded[field])

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the sample at `idx` as a dict of (unpadded) tensors."""
        return {
            field: torch.tensor(getattr(self, field)[idx])
            for field in self._FIELDS
        }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# load the test split
test_data = pd.read_csv(test_data_path)

# apply the same text normalisation to both text columns
for column in ('text_1', 'text_2'):
    test_data[column] = test_data[column].apply(preprocess_text)

# tokenize the text pairs
test_dataset = PairwiseDataset(test_data)

# no shuffling, so the predictions keep the row order of the input file
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
# build the classifier (single output logit) and move it to the selected device
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=1).to(device)

# load the fine-tuned weights; map_location remaps the stored tensors onto
# `device`, so a checkpoint saved on a GPU can also be loaded on a CPU-only runtime
state_dict = torch.load(trained_model_path, map_location=device)
model.load_state_dict(state_dict)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [None]:
# generate the predictions and save them in a csv file
def predict(model, test_dataloader, path_to_save):
    """Run `model` over `test_dataloader` and write the labels to a CSV file.

    Each batch is moved to the global `device`, the logits are passed
    through a sigmoid and thresholded at 0.5, and the resulting 0/1 labels
    are written to `path_to_save` in a single `prediction` column
    (without the index).
    """
    model.eval()
    labels = []

    with torch.inference_mode():
        for batch in tqdm(test_dataloader):
            # move the model inputs of this batch to the device
            inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'token_type_ids', 'attention_mask')
            }

            # forward pass; flatten the logits to one value per sample
            logits = model(**inputs).logits
            probabilities = torch.sigmoid(logits.reshape(logits.shape[0]))

            # threshold at 0.5 and collect the labels as plain Python ints
            labels.extend((probabilities > 0.5).long().cpu().tolist())

    pd.DataFrame({"prediction": labels}).to_csv(path_to_save, index=False)


predict(model, test_dataloader, path_to_save)

  0%|          | 0/188 [00:00<?, ?it/s]