In [1]:
import ast
import json
import os
import time

import evaluate
import nni
import numpy as np
import openai
import pandas as pd
import torch
from datasets import load_dataset
from datasets import Dataset
from datasets import load_dataset, load_metric, concatenate_datasets
from sklearn.metrics import accuracy_score
from torch.utils.data import ConcatDataset, DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, EvalPrediction
from transformers import AdamW
from transformers.trainer import Trainer
from transformers.training_args import TrainingArguments

In [2]:
class CustomDataCollator(DataCollatorWithPadding):
    """Collate claim/evidence records into a tokenized, padded batch.

    Each element of the incoming batch is expected to be a mapping with the
    keys 'claim', 'evidence' and 'labels'.
    """

    def __call__(self, batch):
        """Tokenize the claim/evidence pairs of `batch` and move tensors to `device`.

        Args:
            batch: list of mappings with 'claim', 'evidence' and 'labels' keys.

        Returns:
            dict with the raw 'claim' and 'evidence' string lists, the
            tokenizer output under 'inputs' (tensors on `device`), and a
            'labels' tensor on `device`.
        """
        claims = [element['claim'] for element in batch]
        evidences = [element['evidence'] for element in batch]
        # Use the tokenizer this collator was constructed with rather than a
        # module-level global, so the instance is self-contained.
        tokenized = self.tokenizer(claims, evidences, return_tensors='pt', padding=True, truncation=True)
        # `device` is still read from module scope (defined in a later cell).
        inputs = {key: value.to(device) for key, value in tokenized.items() if key != "labels"}
        labels = torch.tensor([element['labels'] for element in batch]).to(device)
        return {"claim": claims, "evidence": evidences, "inputs": inputs, "labels": labels}

In [3]:
def chat(message):
    """Send `message` as a single user turn to gpt-3.5-turbo and return the reply text."""
    user_turn = {"role": "user", "content": f"{message}"}
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[user_turn],
    )
    # Only the first choice is used.
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

In [4]:
def train(model, optimizer, train_dataloader, test_dataloader, num_epochs):
    """Train `model` for `num_epochs` epochs and record training misclassifications.

    After every epoch the model is evaluated on `test_dataloader` and the
    accuracy is printed.

    Args:
        model: callable as `model(**inputs, labels=labels)`, returning an
            object with `.logits` and `.loss`.
        optimizer: optimizer stepping the model's parameters.
        train_dataloader: yields dicts with 'inputs', 'labels', 'claim', 'evidence'.
        test_dataloader: yields dicts with 'inputs' and 'labels'.
        num_epochs: number of passes over `train_dataloader`.

    Returns:
        list with one entry per epoch; each entry is a list of
        [claim, evidence, label] for the training examples whose prediction
        (made during the training forward pass) differed from the label.
    """
    misclassified_per_epoch = []
    for _epoch in range(num_epochs):
        model.train()
        running_loss = 0.0  # accumulated but not currently reported
        epoch_errors = []
        for batch in train_dataloader:
            batch_labels = batch["labels"]
            outputs = model(**batch["inputs"], labels=batch_labels)
            predicted = outputs.logits.argmax(dim=1).cpu().numpy()
            expected = batch_labels.cpu().numpy()

            batch_loss = outputs.loss
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

            # Keep only the examples the model got wrong in this batch.
            for claim, evidence, pred, label in zip(batch['claim'], batch['evidence'], predicted, expected):
                if pred != label:
                    epoch_errors.append([claim, evidence, label])
        misclassified_per_epoch.append(epoch_errors)

        # Held-out evaluation for this epoch.
        model.eval()
        eval_preds, eval_labels = [], []
        with torch.no_grad():
            for batch in test_dataloader:
                batch_labels = batch["labels"]
                outputs = model(**batch["inputs"], labels=batch_labels)
                eval_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
                eval_labels.extend(batch_labels.cpu().numpy())
        print(accuracy_score(eval_labels, eval_preds))
    return misclassified_per_epoch

In [5]:
def generateDataset(df, gpt, n_examples=5):
    """Ask `gpt` for new SUPPORTS claim/evidence pairs modelled on rows of `df`.

    Args:
        df: DataFrame with 'claim' and 'evidence' columns; rows are used as
            few-shot examples.
        gpt: callable taking the prompt string and returning the model reply.
        n_examples: maximum number of rows sampled from `df` as examples.
            Fewer are used when `df` is smaller.

    Returns:
        Whatever `gpt` returns for the assembled prompt.

    Raises:
        ValueError: if `df` has no rows to use as examples.
    """
    if len(df) == 0:
        raise ValueError("generateDataset needs at least one example row to build a prompt")
    # Never request more rows than exist; DataFrame.sample raises otherwise.
    support_prompt = df.sample(n=min(n_examples, len(df)))
    # Each example uses the evidence of its own row, with the fields separated
    # by newlines so the model can tell them apart.
    prompt_entries = "".join(
        "\nClaim: " + str(row['claim']) + "\nEvidence: " + str(row['evidence']) + "\nClaim Label: Supports\n"
        for _, row in support_prompt.iterrows()
    )
    prompt = "Can you generate 10 new entries with a claim and evidence pair having a supports label similar to the below entries. Make sure that the entries reflect actual scientific consensus and adhere to the following json format without using any newline characters: {'claim': '', 'evidence': '', 'claim label' : ''}"
    prompt += prompt_entries
    return gpt(prompt)

In [6]:
def generateDatasetRefute(df, gpt, n_examples=5):
    """Ask `gpt` for new REFUTES claim/evidence pairs modelled on rows of `df`.

    Args:
        df: DataFrame with 'claim' and 'evidence' columns; rows are used as
            few-shot examples.
        gpt: callable taking the prompt string and returning the model reply.
        n_examples: maximum number of rows sampled from `df` as examples.
            Fewer are used when `df` is smaller.

    Returns:
        Whatever `gpt` returns for the assembled prompt.

    Raises:
        ValueError: if `df` has no rows to use as examples.
    """
    if len(df) == 0:
        raise ValueError("generateDatasetRefute needs at least one example row to build a prompt")
    # Never request more rows than exist; DataFrame.sample raises otherwise.
    refute_prompt = df.sample(n=min(n_examples, len(df)))
    # Each example uses the evidence of its own row, with the fields separated
    # by newlines so the model can tell them apart.
    prompt_entries = "".join(
        "\nClaim: " + str(row['claim']) + "\nEvidence: " + str(row['evidence']) + "\nClaim Label: REFUTES\n"
        for _, row in refute_prompt.iterrows()
    )
    prompt = "Can you generate 10 new entries with a claim and evidence pair having a refutes label similar to the below entries. Make sure that the entries reflect actual scientific consensus and adhere to the following json format without using any newline characters: {'claim': '', 'evidence': '', 'claim label' : ''}"
    prompt += prompt_entries
    return gpt(prompt)

In [7]:
def _parse_generated_entries(raw_text):
    """Return the claim/evidence/label dicts found in one model reply, one candidate per line."""
    required_keys = ('claim', 'evidence', 'claim label')
    entries = []
    for line in raw_text.split('\n'):
        line = line.strip()
        if not line:
            continue
        try:
            parsed = json.loads(line)
        except ValueError:
            try:
                # The prompt asks for single-quoted dicts, which are Python
                # literals rather than strict JSON.
                parsed = ast.literal_eval(line)
            except Exception:
                # Best effort: free-form model output that does not parse is skipped.
                continue
        # A line may hold one dict, or a list/tuple of dicts (e.g. a trailing comma).
        candidates = parsed if isinstance(parsed, (list, tuple)) else [parsed]
        for candidate in candidates:
            if isinstance(candidate, dict) and all(key in candidate for key in required_keys):
                entries.append(candidate)
    return entries


def _collect_generated_entries(example_df, generate, rounds, pause_seconds):
    """Call `generate(example_df)` `rounds` times, pausing after each call, and parse all replies."""
    if len(example_df) == 0:
        # Nothing to show the model as an example, so skip the API calls.
        return []
    replies = []
    for _ in range(rounds):
        replies.append(generate(example_df))
        time.sleep(pause_seconds)
    entries = []
    for reply in replies:
        entries.extend(_parse_generated_entries(reply))
    return entries


def generateDataForModel(misclassified_data, frequency, generateDataMultiplier, sleep_seconds=10):
    """Generate synthetic training data from frequently misclassified claims.

    Args:
        misclassified_data: list (one item per epoch) of lists of
            [claim, evidence, label] triples. Empty epochs are allowed.
        frequency: minimum number of times a claim must appear across all
            epochs to be used as an example.
        generateDataMultiplier: number of generation requests issued per label.
        sleep_seconds: pause after each generation request.

    Returns:
        dict with the keys 'claim', 'evidence' and 'claim label', each mapping
        to a list of equal length. Labels are 0 for "supports" (any casing)
        and 1 otherwise. The lists are empty when nothing usable was generated.
    """
    # Flatten in plain Python so epochs without misclassifications are harmless.
    rows = [list(entry) for epoch_entries in misclassified_data for entry in epoch_entries]
    misclassified_df = pd.DataFrame(rows, columns=['claim', 'evidence', 'claim_label'])
    # Labels may arrive as ints or strings; compare them as strings.
    misclassified_df['claim_label'] = misclassified_df['claim_label'].astype(str)

    claim_counts = misclassified_df['claim'].value_counts()
    frequent_claims = claim_counts[claim_counts >= frequency].index
    filtered_df = misclassified_df[misclassified_df['claim'].isin(frequent_claims)]

    # Lambdas defer the lookup of the generator functions until they are called.
    generators = (
        ('0', lambda frame: generateDataset(frame, chat)),
        ('1', lambda frame: generateDatasetRefute(frame, chat)),
    )
    generated_entries = []
    for label_value, generate in generators:
        subset_df = filtered_df.loc[filtered_df['claim_label'] == label_value]
        generated_entries.extend(
            _collect_generated_entries(subset_df, generate, generateDataMultiplier, sleep_seconds)
        )

    generated_train_dict = {'claim': [], 'evidence': [], 'claim label': []}
    for entry in generated_entries:
        generated_train_dict['claim'].append(str(entry['claim']))
        generated_train_dict['evidence'].append(str(entry['evidence']))
        # The prompt examples spell the label "Supports" / "REFUTES", so match
        # case-insensitively.
        label_text = str(entry['claim label']).strip().lower()
        generated_train_dict['claim label'].append(0 if label_text.startswith('support') else 1)
    return generated_train_dict

In [8]:
def finetuneModel(model, optimizer, data_dict, num_epochs, frequency, generateDataMultiplier, fineTuneEpochs):
    """Alternate between training and generating data from the training mistakes.

    Args:
        model, optimizer: passed through to `train`.
        data_dict: column dict used to build the initial dataset.
        num_epochs: epochs per call to `train`.
        frequency, generateDataMultiplier: passed through to `generateDataForModel`.
        fineTuneEpochs: number of train-then-generate rounds.

    Returns:
        None. The model is updated in place by `train`.
    """
    # NOTE(review): test_size=0.8 holds out 80% of the data for evaluation - confirm this is intended.
    splits = Dataset.from_dict(data_dict).train_test_split(test_size=0.8)
    loader_options = {"batch_size": 32, "collate_fn": data_collator}
    train_dataloader = DataLoader(splits['train'], shuffle=True, **loader_options)
    test_dataloader = DataLoader(splits['test'], shuffle=False, **loader_options)

    for _round in range(fineTuneEpochs):
        misclassified_data = train(model, optimizer, train_dataloader, test_dataloader, num_epochs)
        data_dict = generateDataForModel(misclassified_data, frequency, generateDataMultiplier)
        # The next round trains on the generated examples only; the evaluation
        # loader stays the same throughout.
        generated = Dataset.from_dict(data_dict).rename_column("claim label", 'labels')
        train_dataloader = DataLoader(generated, shuffle=True, **loader_options)

In [9]:
dataset = load_dataset("climate_fever")
# Read the key from the environment so it is never stored in the notebook;
# falls back to an empty string when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [10]:
# Materialise the 'test' split as a DataFrame and preview its first rows.
df = pd.DataFrame(dataset['test'])
df.head()

Unnamed: 0,claim_id,claim,claim_label,evidences
0,0,Global warming is driving polar bears toward e...,0,[{'evidence_id': 'Extinction risk from global ...
1,5,The sun has gone into ‘lockdown’ which could c...,0,"[{'evidence_id': 'Famine:386', 'evidence_label..."
2,6,The polar bear population has been growing.,1,"[{'evidence_id': 'Polar bear:1332', 'evidence_..."
3,9,Ironic' study finds more CO2 has slightly cool...,1,"[{'evidence_id': 'Atmosphere of Mars:131', 'ev..."
4,10,Human additions of CO2 are in the margin of er...,1,[{'evidence_id': 'Carbon dioxide in Earth's at...


In [11]:
# Pair every claim with the first of its evidence entries whose label equals
# the claim's own label; claims without such an entry are skipped.
evidences_list = []
claims_list = []
labels = []
for _, row in df.iterrows():
    label = row['claim_label']
    for evidence in row['evidences']:
        if evidence['evidence_label'] == label:
            evidences_list.append(evidence['evidence'])
            claims_list.append(row['claim'])
            labels.append(label)
            break

In [12]:
# All tensors and the model are placed on the CPU.
device = torch.device("cpu")
# Sequence-classification head with three output classes on top of DistilBERT.
# NOTE(review): presumably three classes because the kept labels take three values - confirm against the data.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
# Last expression of the cell: moves the model and displays its repr.
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
# AdamW over all model parameters with a learning rate of 1e-5.
optimizer = AdamW(model.parameters(), lr=1e-5)
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')



In [14]:
# Collator passed as collate_fn to the DataLoaders built in finetuneModel.
data_collator = CustomDataCollator(tokenizer=tokenizer)

In [15]:
# Column-oriented view of the paired data, in the layout Dataset.from_dict expects.
data_dict = dict(claim=claims_list, evidence=evidences_list, labels=labels)

In [None]:
# Run the full train-then-generate loop with named hyperparameters.
finetuneModel(
    model,
    optimizer,
    data_dict,
    num_epochs=5,
    frequency=2,
    generateDataMultiplier=30,
    fineTuneEpochs=3,
)