# This takes the test set of claims, and for each one we save:
- whether the claim is climate related
- top 5 retrieved sentences
- top 5 reranked sentences (for each of the two rerankers)

In [1]:
import json
import os
import pickle

import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


### Loading Models

In [2]:
# Model id handed to the text-classification pipeline that flags climate-related claims.
CLIMATE_CLAIM_DETECTOR_MODEL_NAME = "climatebert/distilroberta-base-climate-detector"
# SentenceTransformer model name used to embed each claim for nearest-neighbour retrieval.
DENSE_MODEL_NAME = "all-mpnet-base-v2"
# Sequence-classification checkpoints for the two rerankers whose results are saved side by side.
RERANKER_MODEL_NAME_1 = "iestynmullinor/roberta-reranker-fever-better"
RERANKER_MODEL_NAME_2 = "iestynmullinor/climatebert-rereranker-f-cf-ipcc"

In [3]:
# Text-classification pipeline; check_for_climate_change_claim treats a "yes" label as climate related.
claim_detector = pipeline("text-classification", model=CLIMATE_CLAIM_DETECTOR_MODEL_NAME)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Encoder for incoming claims; the model name matches the one in the pre-computed embeddings file name.
dense_model = SentenceTransformer(DENSE_MODEL_NAME)

In [5]:

# First reranker: tokenizer and sequence-classification model loaded from the same checkpoint.
reranker_tokenizer_1 = AutoTokenizer.from_pretrained(RERANKER_MODEL_NAME_1)
reranker_model_1 = AutoModelForSequenceClassification.from_pretrained(RERANKER_MODEL_NAME_1)

In [6]:
# Second reranker: tokenizer and sequence-classification model loaded from the same checkpoint.
reranker_tokenizer_2 = AutoTokenizer.from_pretrained(RERANKER_MODEL_NAME_2)
reranker_model_2 = AutoModelForSequenceClassification.from_pretrained(RERANKER_MODEL_NAME_2)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Read in claims from csv file

In [7]:
# Load the claims to evaluate: only the "claims" column of the CSV is used, as a plain Python list.
twitter_claims_path = "data/twitter_claims.csv"
twitter_claims = pd.read_csv(twitter_claims_path)["claims"].tolist()

### Read in embeddings for all-mpnet-base-v2

In [8]:
# Corpus sentences: the JSON file holds (sentence, section) pairs; only the sentence text is kept.
with open('/home/iestyn/honsProject/sentence_similarity/data/sentence_section_pairs.json', 'r', encoding='utf-8') as f:
    SENTENCES = [sentence for (sentence, _section) in json.load(f)]

# Pre-computed embeddings for the corpus.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file you produced yourself.
with open('/home/iestyn/honsProject/sentence_similarity/model_evaluation/model_embeddings/MODEL_all-mpnet-base-v2_EMBEDDINGS.pkl', 'rb') as f:
    EMBEDDINGS = pickle.load(f)

print(f"number of embeddings: {len(EMBEDDINGS)}")
print(f"number of sentences: {len(SENTENCES)}")

# Neighbour indices returned by `nn` are used to index SENTENCES, so the two
# collections must have the same length; fail here rather than retrieve the wrong text later.
if len(EMBEDDINGS) != len(SENTENCES):
    raise ValueError(
        f"embeddings ({len(EMBEDDINGS)}) and sentences ({len(SENTENCES)}) are not aligned"
    )

# Cosine-distance index over the embeddings; each query returns 30 neighbours.
nn = NearestNeighbors(n_neighbors=30, metric='cosine')
nn.fit(EMBEDDINGS)


number of embeddings: 16004
number of sentences: 16004


### Initialise Results Dictionary

In [9]:
# Maps each claim text to a dict of its results; filled by the per-claim loop and then written to JSON.
output_dictionary = {}

### Create functions to get results for claim

In [10]:
def check_for_climate_change_claim(claim):
    """Return True when the detector's first prediction for `claim` is labelled "yes", else False."""
    predictions = claim_detector(claim)
    return predictions[0]["label"] == "yes"

In [11]:
def get_30_nearest_neighbours(claim):
    """Embed `claim` with the dense model and return the corpus sentences at the neighbour indices found by `nn`."""
    query_vector = dense_model.encode(claim)
    # kneighbors expects a batch of queries; the distances are not needed here.
    _, neighbour_indices = nn.kneighbors([query_vector])
    return [SENTENCES[idx] for idx in neighbour_indices[0]]


In [12]:
def rerank_evidence(claim_sentence, evidence_sentences, reranker, tokenizer, top_k=5):
    """Rerank candidate evidence sentences for a claim with a sequence-classification model.

    Each (claim, evidence) pair is scored on its own. A sentence is kept only when
    the model's highest logit is at index 0, and the kept sentences are ordered by
    that logit, highest first.

    Args:
        claim_sentence: The claim text.
        evidence_sentences: Iterable of candidate evidence sentences.
        reranker: Callable model accepting the tokenizer's output as keyword
            arguments and returning an object with a `logits` tensor.
        tokenizer: Callable that tokenizes a (claim, evidence) pair.
        top_k: Maximum number of sentences to return (default 5).

    Returns:
        A list of at most `top_k` evidence sentences, best first. It is shorter
        (possibly empty) when fewer sentences are predicted as class 0.
    """
    scored_sentences = []

    # Inference only: disabling autograd avoids building a graph for every forward pass.
    with torch.no_grad():
        for evidence_sentence in evidence_sentences:
            tokenized_input = tokenizer(claim_sentence, evidence_sentence, padding='max_length', max_length=256, truncation=True, return_tensors="pt")
            logits = reranker(**tokenized_input).logits

            # Keep only sentences whose predicted class is 0.
            # NOTE(review): class 0 is presumably the "relevant" label -- confirm against the checkpoints.
            if logits.argmax().item() == 0:
                # The score is the raw winning logit (not a softmax probability),
                # stored as a plain float so the list holds no tensors.
                scored_sentences.append((evidence_sentence, logits.max().item()))

    # Highest score first; the sort is stable, so ties keep retrieval order.
    scored_sentences.sort(key=lambda pair: pair[1], reverse=True)

    # Slicing already copes with fewer than top_k entries.
    return [sentence for sentence, _score in scored_sentences[:top_k]]

### Get results for each claim

In [13]:
# For every claim: run the climate detector, retrieve candidates, then rerank them with both rerankers.
for claim in tqdm(twitter_claims):
    is_climate_claim = check_for_climate_change_claim(claim)
    candidates = get_30_nearest_neighbours(claim)

    # Both rerankers score the same candidate list so their outputs are directly comparable.
    reranked_by_model_1 = rerank_evidence(claim, candidates, reranker_model_1, reranker_tokenizer_1)
    reranked_by_model_2 = rerank_evidence(claim, candidates, reranker_model_2, reranker_tokenizer_2)

    output_dictionary[claim] = {
        "climate_claim_detected": is_climate_claim,
        "top_5_retrieved": candidates[:5],
        "reranked_RoBERTa-FEVER": reranked_by_model_1,
        "reranked_ClimateBert-f-cf-ipcc": reranked_by_model_2,
    }

100%|██████████| 50/50 [07:29<00:00,  8.99s/it]


### Save output dictionary as JSON

In [14]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is pinned
# to UTF-8 instead of relying on the platform default.
with open('twitter_claims_results.json', 'w', encoding='utf-8') as f:
    json.dump(output_dictionary, f, indent=4, ensure_ascii=False)