# Imports

In [1]:
# general
import torch
from tqdm.notebook import tqdm

# data
from torch.utils.data import DataLoader
from datasets import load_dataset

# model
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator
from sklearn.metrics import classification_report

# custom
from utils import *

  _torch_pytree._register_pytree_node(


# Data

In [2]:
test_A = load_dataset("json", data_files='subtaskA_test.jsonl', split='train')
test_B = load_dataset("json", data_files='subtaskB_test.jsonl', split='train')

In [3]:
test_A

Dataset({
    features: ['text', 'label', 'model', 'source', 'id'],
    num_rows: 11976
})

In [4]:
test_B

Dataset({
    features: ['text', 'model', 'source', 'label', 'id'],
    num_rows: 7103
})

In [5]:
test_A[0]

{'text': '  First passage of stochastic processes under resetting has recently been an\nactive research topic in the field of statistical physics. However, most of\nprevious studies mainly focused on the systems with continuous time and space.\nIn this paper, we study the effect of stochastic resetting on first passage\nproperties of discrete-time absorbing Markov chains, described by a transition\nmatrix $\\brm{Q}$ between transient states and a transition matrix $\\brm{R}$\nfrom transient states to absorbing states. Using a renewal approach, we exactly\nderive the unconditional mean first passage time (MFPT) to either of absorbing\nstates, the splitting probability the and conditional MFPT to each absorbing\nstate. All the quantities can be expressed in terms of a deformed fundamental\nmatrix $\\brm{Z_{\\gamma}}=\\left[\\brm{I}-(1-\\gamma) \\brm{Q} \\right]^{-1}$ and\n$\\brm{R}$, where $\\brm{I}$ is the identity matrix, and $\\gamma$ is the\nresetting probability at each time step. W

In [6]:
test_B[0]

{'text': 'Athletics at the 2006 Commonwealth Games – Men\'s 200 metres Commonwealth Games was held in Delhi, India between 3 and 14 October 2006. The 200 metres event was held on the first day of the Games. Indian sprinter P.V. Sindhu won the most coveted gold medal of the Games. Two other Indian athletes, Sajjad Hiqal and Navendu Khatik, also won medals (silver and bronze) that day. Sindhu\'s win established herself as the highest ranked Indian athlete in the Women\'s 200 meters category as of July 2014. The Indian men fielded a team of six athletes for this event. The team, led by Suresh S myo, included Sai Suresh Reddy, Surya Sai Saiyan Reddy, B S Sailo, C.K. Nayudu, and P.R. Sahoo. Indian athletes finished in seventh, ninth, eleventh, thirteenth and seventeenth place respectively. Suresh Smyo, who was leading the race for the entire duration, came home in ninth place with a timing of 20.32 seconds. Indian medal hopes at the Games ended when Sailo and Sahoo finished outside the meda

In [7]:
def tokenize(data, tokenizer, init_prompt, max_length, labeled=False):
    '''Variation of custom function tokenize() (see utils.py) which also includes init_prompt.'''

    batch_size = len(data['text'])
    init_prompt = init_prompt
    inputs = [f'{init_prompt}. Text: "{text}' for text in data['text']] # input text to be passed to the LM
    if labeled:
      labels = [f'\nLabel: {label}' for label in data['label']]
    else:
      labels = [f'\nLabel: ' for label in data['label']]
    tokenized_inputs = tokenizer(inputs) # tokenized input text
    tokenized_labels = tokenizer(labels) # tokenized labels

    for i in range(batch_size):
        sample_input_ids = tokenized_inputs['input_ids'][i] # input ids of i-th sample
        label_input_ids = tokenized_labels['input_ids'][i] # label ids of i-th sample

        tokenized_inputs['input_ids'][i] = sample_input_ids + [tokenizer.pad_token_id] * (max_length - len(sample_input_ids)) # right padding
        tokenized_inputs['input_ids'][i] = tokenized_inputs['input_ids'][i][:max_length - len(label_input_ids) - 1] + [tokenizer('"')['input_ids'][1]] # truncation
        tokenized_inputs['input_ids'][i] = tokenized_inputs['input_ids'][i] + label_input_ids # adding label
        
        tokenized_labels['input_ids'][i] = [-100] * (max_length - len(label_input_ids)) + label_input_ids
        tokenized_inputs['labels'] = tokenized_labels['input_ids']

        tokenized_inputs['attention_mask'][i] = [1] * max_length
  
    return tokenized_inputs

In [8]:
model_tokenizer_path = 'mistralai/Mistral-7B-v0.1'

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_tokenizer_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [24]:
max_length = 300
init_prompt_A = 'Decide if the following text has been written by a human or by a language model. Write 0 if it has been written by a human. Write 1 if it has been written by a language model'
init_prompt_B = 'Decide if the text has been written by a human or by a language model among: ChatGPT, Cohere, Davinci, Bloomz or Dolly. Write 0 if it has been written by a human. Write 1 if it has been written by ChatGPT. Write 2 if it has been written by Cohere. Write 3 if it has been written by Davinci. Write 4 if it has been written by Bloomz. Write 5 if it has been written by Dolly'

In [12]:
test_A = test_A.map(
    lambda example: tokenize(example, tokenizer, init_prompt_A, max_length, labeled=False),
    batched=True,
    desc="Tokenizing dataset",
)

Tokenizing dataset:   0%|          | 0/11976 [00:00<?, ? examples/s]

In [13]:
test_B = test_B.map(
    lambda example: tokenize(example, tokenizer, init_prompt_B, max_length, labeled=False),
    batched=True,
    desc="Tokenizing dataset",
)

Tokenizing dataset:   0%|          | 0/7103 [00:00<?, ? examples/s]

In [14]:
tokenizer.decode(test_A[0]['input_ids'])

'<s> Decide if the following text has been written by a human (0) or by a language model (1). Text: "  First passage of stochastic processes under resetting has recently been an\nactive research topic in the field of statistical physics. However, most of\nprevious studies mainly focused on the systems with continuous time and space.\nIn this paper, we study the effect of stochastic resetting on first passage\nproperties of discrete-time absorbing Markov chains, described by a transition\nmatrix $\\brm{Q}$ between transient states and a transition matrix $\\brm{R}$\nfrom transient states to absorbing states. Using a renewal approach, we exactly\nderive the unconditional mean first passage time (MFPT) to either of absorbing\nstates, the splitting probability the and conditional MFPT to each absorbing\nstate. All the quantities can be expressed in terms of a deformed fundamental\nmatrix $\\brm{Z_{\\gamma}}=\\left[\\brm{I}-(1-\\gamma) \\brm{Q} \\right]^{-1}$ and\n$\\brm{R}$, where $\\brm{I

In [15]:
tokenizer.decode(test_B[0]['input_ids'])

'<s> Decide if the text has been written by a human (0) or by a language model among: ChatGPT (1), Cohere(2), Davinci (3), Bloomz (4) or Dolly (5). Text: "Athletics at the 2006 Commonwealth Games – Men\'s 200 metres Commonwealth Games was held in Delhi, India between 3 and 14 October 2006. The 200 metres event was held on the first day of the Games. Indian sprinter P.V. Sindhu won the most coveted gold medal of the Games. Two other Indian athletes, Sajjad Hiqal and Navendu Khatik, also won medals (silver and bronze) that day. Sindhu\'s win established herself as the highest ranked Indian athlete in the Women\'s 200 meters category as of July 2014. The Indian men fielded a team of six athletes for this event. The team, led by Suresh S myo, included Sai Suresh Reddy, Surya Sai Saiyan Reddy, B S Sailo, C.K. Nayudu, and P.R. Sahoo. Indian athletes finished in seventh, ninth, eleventh, thirteenth and seventeenth place respectively "<s> \nLabel: '

In [16]:
y_test_A = test_A['label']

In [17]:
y_test_B = test_B['label']

In [18]:
test_A = test_A.remove_columns(['id', 'text', 'source', 'label', 'model'])
test_B = test_B.remove_columns(['id', 'text', 'source', 'label', 'model'])

In [19]:
test_A.set_format('pt', columns=['input_ids', 'attention_mask', 'labels'], output_all_columns=True)
test_B.set_format('pt', columns=['input_ids', 'attention_mask', 'labels'], output_all_columns=True)

In [20]:
batch_size = 8

In [21]:
test_A = DataLoader(
    test_A, 
    shuffle=False, 
    collate_fn=default_data_collator, 
    batch_size=batch_size, 
    pin_memory=True
)

In [22]:
test_B = DataLoader(
    test_B, 
    shuffle=False, 
    collate_fn=default_data_collator, 
    batch_size=batch_size, 
    pin_memory=True
)

# Model

In [23]:
model = AutoModelForCausalLM.from_pretrained(model_tokenizer_path, torch_dtype=torch.float16) # loading model in half-precision

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device);

In [25]:
check_cuda_memory()

Current Tesla V100S-PCIE-32GB memory usage: 13.989/31.739 GiB


# Inference

In [26]:
def inference(dataset):
    outputs = list()
    for batch in tqdm(dataset):
      batch = {k : v.to(device) for k, v in batch.items()} # moving batches to GPU
      batch_outputs = model.generate(**batch, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id) # auto-regressive generation imposing max_new_tokens=1
      decoded_batch_outputs = tokenizer.batch_decode(batch_outputs.detach().cpu().numpy(), skip_special_tokens=True)
      outputs.extend(decoded_batch_outputs)
    return outputs

In [27]:
outputs_A = inference(test_A)

  0%|          | 0/1497 [00:00<?, ?it/s]

In [28]:
outputs_B = inference(test_B)

  0%|          | 0/888 [00:00<?, ?it/s]

In [29]:
y_pred_A = get_labels_from_texts(outputs_A, [0, 1]) # custom function (see utils.py)
y_pred_B = get_labels_from_texts(outputs_B, [0, 1, 2, 3, 4, 5]) 



In [30]:
print(classification_report(y_test_A, y_pred_A, digits=3)) # task A

              precision    recall  f1-score   support

           0      0.518     0.871     0.650      6298
           1      0.418     0.103     0.165      5678

    accuracy                          0.507     11976
   macro avg      0.468     0.487     0.407     11976
weighted avg      0.471     0.507     0.420     11976



In [44]:
print(classification_report(y_test_B, y_pred_B, digits=3)) # task B

              precision    recall  f1-score   support

           0      0.159     0.899     0.271      1198
           1      0.103     0.008     0.015      1148
           2      0.183     0.011     0.021      1155
           3      0.213     0.011     0.021      1189
           4      0.175     0.009     0.017      1242
           5      0.000     0.000     0.000      1171

    accuracy                          0.158      7103
   macro avg      0.139     0.156     0.057      7103
weighted avg      0.140     0.158     0.058      7103

