# In [1]:
import pandas as pd
import json
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
from Data_Preprocess_Cleaning import spacy_cleaner

# Evidence cut-off: a timeline entry is kept as evidence only when its
# class-1 probability is strictly greater than this value.
threshold1 = 0.01
# Verdict cut-off: a class-1 probability at or above this value yields
# "SUPPORTS", anything below yields "REFUTES".
threshold2 = 0.629

# Base checkpoints the tokenizers are loaded from.
model_checkpoint_distilbert = "distilbert-base-uncased"
model_checkpoint_bert = "bert-base-uncased"

# Local fine-tuned checkpoints the two classifiers are loaded from.
model1_path = './models/evidence_identification_model/model.epoch00-loss1.22'
model2_path = './models/evidence_classification_model/model.epoch02-loss0.93'

# Tokenizers: tokenizer1 goes with model1, tokenizer2 with model2.
tokenizer1 = AutoTokenizer.from_pretrained(model_checkpoint_distilbert)
tokenizer2 = AutoTokenizer.from_pretrained(model_checkpoint_bert)

# Both classifiers are loaded with a two-class head.
model1 = TFAutoModelForSequenceClassification.from_pretrained(model1_path, num_labels=2)
model2 = TFAutoModelForSequenceClassification.from_pretrained(model2_path, num_labels=2)

def predict_proba(rumor, timeline, max_length, debug=0, model=None, tokenizer=None):
    """Return the softmax probability of class index 1 for a text pair.

    Args:
        rumor: First text of the pair passed to the tokenizer.
        timeline: Second text of the pair passed to the tokenizer.
        max_length: Maximum encoded length; longer inputs are truncated.
        debug: Unused. Kept so existing positional/keyword callers still work.
        model: Sequence-classification model used for scoring. Defaults to
            the module-level ``model1``.
        tokenizer: Tokenizer used to encode the pair; it should match
            ``model``. Defaults to the module-level ``tokenizer1``.

    Returns:
        The class-1 probability of the first row of the encoded batch, as
        produced by ``tf.nn.softmax(...).numpy()``.
    """
    # Fall back to the first-stage model/tokenizer so that calls written
    # against the original four-argument signature behave exactly as before.
    if model is None:
        model = model1
    if tokenizer is None:
        tokenizer = tokenizer1

    encoded = tokenizer(
        rumor,
        timeline,
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
        padding=True,
    )
    logits = model(**encoded)["logits"]
    # Softmax over the class axis; [0] selects the first row, [1] class 1.
    return tf.nn.softmax(logits, axis=1).numpy()[0][1]

def _pair_class1_proba(model, tokenizer, text_a, text_b, max_length):
    """Return the softmax probability of class index 1 for one text pair."""
    encoded = tokenizer(
        text_a,
        text_b,
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
        padding=True,
    )
    logits = model(**encoded)["logits"]
    return tf.nn.softmax(logits, axis=1).numpy()[0][1]


def inference_pipe(fname, split):
    """Run evidence retrieval and rumor verification over a JSON-lines file.

    Each non-blank line of ``fname`` is parsed as a JSON object with the keys
    ``id``, ``rumor`` and ``timeline`` (a list of entries whose items 0, 1
    and 2 are read as username, timeline id and text), plus ``label`` when
    ``split`` is not ``'test'``.

    For every rumor, each timeline text is scored with ``predict_proba``.
    Entries scoring above ``threshold1`` (at most five, best first) become
    the evidence. If none qualify the verdict is "NOT ENOUGH INFO" and the
    five best-scoring entries are reported as evidence instead. Otherwise the
    joined evidence is scored with ``model2``/``tokenizer2`` and the verdict
    is "SUPPORTS" when that score is >= ``threshold2``, else "REFUTES".

    Args:
        fname: Path of the JSON-lines input file.
        split: Split name; it is embedded in the output file names, and
            ``'test'`` means the input carries no ``label`` field.

    Returns:
        None. Four files are written under ``./results`` (created when
        missing): two CSVs, a tab-separated evidence file and a JSON-lines
        verification file.
    """
    import os

    top_k = 5
    evidence_columns = ['id', 'Dummy', 'timeline_id', 'rank', 'predictions', 'Team']
    shown_columns = ['username', 'timeline_id', 'timelines']

    evidence_rows = []
    verification_rows = []

    with open(fname, 'rb') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)

            print(data['id'], data.keys())

            rumor_id = data['id']
            rumor = data['rumor']
            # Clean the rumor once per record rather than once per timeline
            # entry (assumes spacy_cleaner is a pure text transformation).
            cleaned_rumor = spacy_cleaner(rumor)

            username = [entry[0] for entry in data['timeline']]
            timeline_id = [entry[1] for entry in data['timeline']]
            timelines = [entry[2] for entry in data['timeline']]

            predictions = [
                predict_proba(cleaned_rumor, spacy_cleaner(timeline), max_length=192)
                for timeline in timelines
            ]

            tdf = pd.DataFrame()
            tdf['username'] = username
            tdf['timeline_id'] = timeline_id
            tdf['timelines'] = timelines
            tdf['predictions'] = predictions
            tdf['id'] = rumor_id
            tdf['rumor'] = rumor

            ranked = tdf.sort_values(by=['predictions'], ascending=False).reset_index(drop=True)

            # Fallback evidence, reported when nothing clears threshold1:
            # the best-scoring entries regardless of score ("" if none).
            if ranked.shape[0] == 0:
                predicted_evidence = ""
            else:
                predicted_evidence = ranked.iloc[0:top_k][shown_columns].values.tolist()

            # .copy() gives an independent frame, so the column assignments
            # below do not write into a slice of ``ranked``.
            selected = ranked[ranked['predictions'] > threshold1].iloc[0:top_k].copy()

            if selected.shape[0] == 0:
                predictions_final = "NOT ENOUGH INFO"
            else:
                # ``ranked`` has a fresh 0..n-1 index in descending score
                # order, so index + 1 is the 1-based rank.
                selected['rank'] = selected.index + 1
                selected['Dummy'] = 'A0'
                selected['Team'] = 'KNK'

                evidence_rows.extend(selected[evidence_columns].values.tolist())

                predicted_evidence = selected[shown_columns].values.tolist()

                evidence = " ".join(selected['timelines'].values.tolist())

                # The verdict is scored with the second-stage classifier
                # (model2/tokenizer2) rather than the evidence-identification
                # model used for ranking above.
                support_proba = _pair_class1_proba(
                    model2, tokenizer2, cleaned_rumor, spacy_cleaner(evidence), 512
                )
                predictions_final = "SUPPORTS" if support_proba >= threshold2 else "REFUTES"

            if split == 'test':
                verification_rows.append([rumor_id, predictions_final, rumor, predicted_evidence])
            else:
                verification_rows.append(
                    [rumor_id, predictions_final, rumor, data['label'], predicted_evidence]
                )

    # Make sure the output directory exists before writing any result file.
    os.makedirs('./results', exist_ok=True)

    Evidence_retrieval_output = pd.DataFrame(evidence_rows, columns=evidence_columns)
    Evidence_retrieval_output.to_csv('./results/Evidence_retrieval_output_' + split + '.csv', index=False)

    if split == 'test':
        verification_columns = ['id', 'predicted_label', 'claim', 'predicted_evidence']
    else:
        verification_columns = ['id', 'predicted_label', 'claim', 'label', 'predicted_evidence']
    Rumor_Verification_output = pd.DataFrame(verification_rows, columns=verification_columns)
    Rumor_Verification_output.to_csv('./results/Rumor_Verification_output_' + split + '.csv', index=False)

    # Same evidence table again, tab-separated with no header or index.
    Evidence_retrieval_output.to_csv(
        "./results/KGAT_zeroShot_evidence_English_" + split + ".txt",
        sep="\t",
        index=False,
        header=False,
    )

    # One JSON object per line for the verification results.
    with open('./results/KGAT_zeroShot_verification_English_' + split + '.json', 'w', encoding='utf-8') as output_file:
        for record in Rumor_Verification_output.to_dict(orient='records'):
            json.dump(record, output_file)
            output_file.write("\n")
    print("Processing Done!!!")

# Run the pipeline on the dev split first, then on the test split.
for _input_path, _split_name in (
    ('./data/original_data/English_dev.json', 'dev'),
    ('./data/original_data/English_test.json', 'test'),
):
    inference_pipe(_input_path, _split_name)

# Captured notebook output (not source code; likely the tail of a tqdm import warning):
#   from .autonotebook import tqdm as notebook_tqdm
