## Setting up notebook

In [1]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

#from fairseq.data.data_utils import collate_tokens

from transformers import GPT2Tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM

from sentence_transformers import SentenceTransformer, util
from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup

In [2]:
import re
import sys
import random
from random import choices
import os
from itertools import chain
from string import punctuation
import time
import argparse
from pathlib import Path
import csv
import json

import traceback
import gc
from enum import Enum 

import numpy as np
import pandas as pd
from statistics import mean

import matplotlib.pyplot as plt


# from trl.gpt2 import GPT2HeadWithValueModel, respond_to_batch
# from trl.ppo import PPOTrainer
#from trl.core import build_bert_batch_from_txt

from IPython.core.display import Markdown,display, HTML, Latex
import qgrid

from verisci.covid import AbstractRetriever, RationaleSelector, LabelPredictor
from verisci.evaluate.lib.data import GoldDataset

from GPUtil import showUtilization as gpu_usage
import wandb

from tqdm.notebook import tqdm
tqdm.pandas()
import pickle

In [3]:
from datetime import datetime
cur_date_time = datetime.today().strftime('%Y_%m_%d_%H_%M')
loc_target_root = '../../dfs_generated/paraphrased/paws/'
#log_dir = project_opt_location+'logs/'
project_name = 'separate_t5_for_majority_tech_term_mlnli'
version = 'v1'

loc_project_opt_location = loc_target_root+project_name+'/'+version+'/'
log_dir = loc_project_opt_location+'logs/'
log_file_dir_name = log_dir+'log_all.log'
Path(log_dir).mkdir(parents=True, exist_ok=True)

import logging
  
#Create and configure logger
logging.basicConfig(filename=log_file_dir_name,
                    level=logging.INFO,
                    format='%(message)s',
                    filemode='w')

log_file_fine_tune_callback = log_dir+'log_call_back.log'

In [4]:
import neptune.new as neptune

nep_run = neptune.init(
    project="ratulalahy/scifact-paraphrase-T5-evo",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI2NWQwMGIyZi1mNzM5LTRiMjEtOTg2MC1mNTc4ODRiMWU2ZGYifQ==",
    tags=['separate_t5_for_majority', 'tech_term_2', 'mlnli'],
    source_files=["T5_filter_tech_term_no_threshold_filter_first_separate_t5_for_majority.ipynb"],#["**/*.ipynb", "*.yaml"]

)  # your credentials

https://app.neptune.ai/ratulalahy/scifact-paraphrase-T5-evo/e/SCIF3-110
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [5]:
# import wandb
# wandb.login()
# wandb.init(project="Scifact_paraphrase_T5_per_evo_general_parasci_sup_to_ref_2_ft_0.0.1", entity="qratulalahy")
# from pytorch_lightning.loggers import WandbLogger

# wandb_logger = WandbLogger()

In [6]:
from notify_run import Notify
notify = Notify()
notify.register()

In [7]:
pd.set_option("display.precision", 2)

In [8]:
torch.set_default_tensor_type(torch.cuda.FloatTensor)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [9]:
class ParaphraseTargetDirection(Enum):
    org_support_to_gen_refute = 0
    org_refute_to_gen_support = 1
    both_majority_and_inverse_majority = 2
    
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(37)    

In [10]:
PARAPHRASE_PROJECT_SETTINGS={
    'file_and_dirs': {
        'file_paraphrased_no_tune_all_model_full' : '../../dfs_generated/paraphrased/t5_no_fine_tune_generated_claim_all_model_df_full_1.pkl', # can be deleted
        'file_org_claims_by_scifact' : '../../dfs_generated/scifact/org_claim_ext_label_roberta_large_fever.pkl',
    },
    'config_scifact' : {
        'cls_model_name': '../../scifact/model/label_roberta_large_fever_scifact',
        'rationale_model_name': '../../scifact/model/rationale_roberta_large_fever_scifact',
        'loc_gold_ds_corpus' : '../../scifact/data/corpus.jsonl', 
        'loc_gold_ds_train' : '../../scifact/data/claims_train.jsonl', 
        'loc_gold_ds_dev' : '../../scifact/data/claims_dev.jsonl', 

    },
    
    
    'paraphrase_model' :
    {
        'list_potential_paraphrase_models' : 
            [
                {'model_name' : 'parasci_base_no_fine_tune' , 'model_path_or_url' : 'HelloRusk/t5-base-parasci', 'is_selected' : False},
                {'model_name' : 'parrot_base_no_fine_tune' , 'model_path_or_url' : 'prithivida/parrot_paraphraser_on_T5', 'is_selected' : False},
                {'model_name' : 'parrot_div_base_no_fine_tune' , 'model_path_or_url' : 'prithivida/parrot_paraphraser_on_T5', 'is_selected' : False},
                {'model_name' : 'pegasus_base_no_fine_tune' , 'model_path_or_url' : 'tuner007/pegasus_paraphrase', 'is_selected' : False},
                {'model_name' : 'paws_base_no_fine_tune' , 'model_path_or_url' : 'Vamsi/T5_Paraphrase_Paws', 'is_selected' : True},
                {'model_name' : 'tapaco_base_no_fine_tune' , 'model_path_or_url' : 'hetpandya/t5-base-tapaco', 'is_selected' : False},
                {'model_name' : 'sci_five_pubmed' , 'model_path_or_url' : 'razent/SciFive-large-Pubmed_PMC', 'is_selected' : False}
            ],
        't5_paraphrase_model_params':
        {
            'max_length':256,
            'do_sample':True,
            'top_k':50,
            'top_p': 0.99,
            'repetition_penalty':3.5,
            'early_stopping':True,
            'num_return_sequences':10
        }
    },
    'entailment_model':
    {
        'model_path' : 'pytorch/fairseq',
        'model_name' : 'roberta.large.mnli',
    },
    'labels_multi_nli' :
    {
        0: 'contradiction', 
        1 : 'neutral', 
        2 : 'entailment'
    },
    
    'run_settings':
    {
        'PARAPHRASE_FT_TRAIN_SPLIT' : 0.1,
        'PARAPHRASE_FT_DATASET_DIRECTION' : ParaphraseTargetDirection.both_majority_and_inverse_majority,#ParaphraseTargetDirection.org_support_to_gen_refute,#ParaphraseTargetDirection.org_support_to_gen_refute,#ParaphraseTargetDirection.org_refute_to_gen_support,
        'NUM_OF_EPOCH_REQ_FT' : 2,
        'FILTER_BY' : 'TECH_TERMS',
        'SIMILARITY_THRESHOLD' : -100
        #'CUR_MODEL_NAME_PATHS' : (lambda: [_x['model_path_or_url'] for  _x in PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['list_potential_paraphrase_models'] if _x['is_selected'] == True])(),
    },
}

CUR_NO_OF_EPOCH_FT = 0

In [11]:
loc_df_scispacy_sentence_word_unq_ner_abr_filtered ='../../dfs_generated/linguistic/df_scispacy_sentence_word_unq_ner_abr_cust_2.pkl'

##  Load paraphrased dataset for selected `paraphrase_models`

In [12]:
def get_paraphrased_dataframe_all_model_no_fine_tuned():
    return pd.read_pickle(PARAPHRASE_PROJECT_SETTINGS['file_and_dirs']['file_paraphrased_no_tune_all_model_full'])

In [13]:
def get_paraphrased_dataframe_selected_models(df_all_model_paraphrased, list_model_names):
    return df_all_model_paraphrased[df_all_model_paraphrased['model'].isin(list_model_names)]

## Prepare dataset for fine tuning

In [14]:
def get_dataframes_by_majority_org_claim(df_all_paraphrased_org_claim):
    df_all_paraphrased_org_success = df_all_paraphrased_org_claim[df_all_paraphrased_org_claim['org_comment'] == 'success']
    
    # Select claims with majority
    df_paraphrased_org_support_major = df_all_paraphrased_org_success[
        df_all_paraphrased_org_success['org_count_support'] > df_all_paraphrased_org_success['org_count_refute']
    ]

    
    df_paraphrased_org_refute_major = df_all_paraphrased_org_success[
        df_all_paraphrased_org_success['org_count_support'] < df_all_paraphrased_org_success['org_count_refute']
    ]
    
    return df_paraphrased_org_support_major, df_paraphrased_org_refute_major, df_all_paraphrased_org_success


In [15]:
tmp_html_tag = ''

In [16]:
def report_dataframes_by_majority_org_claim(df_paraphrased_org_support_major, df_paraphrased_org_refute_major, df_all_paraphrased_org_success):
    count_org_claim_support_major = len(df_paraphrased_org_support_major['org_claim'].unique())
    count_org_claim_refute_major = len(df_paraphrased_org_refute_major['org_claim'].unique())
    count_successful_org_claim = len(df_all_paraphrased_org_success['org_claim'].unique())
    ## Report majority
    tmp_html_tag = ''
    tmp_html_tag += '<h3 style="color:#0080ff">' + 'Original Claim Stat for current SciFact model' +'</h3>'
    tmp_html_tag += '<h4 style="color:#0080ff">'+'# of unique successful claim with Support majority : '+str(count_org_claim_support_major)+'</h4>'
    tmp_html_tag += '<h4 style="color:#0080ff">'+'# of unique successful claim with Refute majority : '+str(count_org_claim_refute_major)+'</h4>'
    tmp_html_tag += '<h4 style="color:#0080ff">'+'# of unique successful claim : '+str(count_successful_org_claim)+'</h4>'
    display(HTML(tmp_html_tag))

In [17]:
def get_df_succesfully_attacked_claim(df_paraphrased_support_major, df_paraphrased_refute_major):
    df_org_refute_gen_support = df_paraphrased_refute_major[
    df_paraphrased_refute_major['gen_count_support'] > df_paraphrased_refute_major['gen_count_refute']
    ]

    df_org_support_gen_refute = df_paraphrased_support_major[
        df_paraphrased_support_major['gen_count_support'] < df_paraphrased_support_major['gen_count_refute']
    ]
    
    return df_org_support_gen_refute, df_org_refute_gen_support



In [18]:
def get_df_succesfully_attacked_claim_refute_major(df_paraphrased_refute_major):
    df_org_refute_gen_support = df_paraphrased_refute_major[
    df_paraphrased_refute_major['gen_count_support'] > df_paraphrased_refute_major['gen_count_refute']
    ]
    return df_org_refute_gen_support
        
def get_df_succesfully_attacked_claim_support_major(df_paraphrased_support_major):
    df_org_support_gen_refute = df_paraphrased_support_major[
        df_paraphrased_support_major['gen_count_support'] < df_paraphrased_support_major['gen_count_refute']
    ]
    return df_org_support_gen_refute

In [19]:
def report_df_succesfully_attacked_claim(df_org_support_gen_refute, df_org_refute_gen_support, cur_epoch):
    
    tmp_html_tag = ''
    tmp_html_tag += '<h3 style="color:#0080ff">' + 'Succesfully attacked Claim Stat ' +'</h3>'
    tmp_html_tag += '<h4 style="color:#0080ff">'+'# of toal org refute to gen support : '+str(len(df_org_refute_gen_support))+'</h4>'
    tmp_html_tag += '<h4 style="color:#0080ff">'+'# of total org support to gen refute : '+str(len(df_org_support_gen_refute))+'</h4>'
    tmp_html_tag += '<h4 style="color:#0080ff">'+'# of unique org refute to support : '+str(len(df_org_refute_gen_support['org_claim'].unique()))+'</h4>'
    tmp_html_tag += '<h4 style="color:#0080ff">'+'# of unique org support to refute : '+str(len(df_org_support_gen_refute['org_claim'].unique()))+'</h4>'
    display(HTML(tmp_html_tag))
    
    dict_no_ft_org_gen_count = {
    '# of toal org refute to gen support': len(df_org_refute_gen_support),
    '# of total org support to gen refute' : len(df_org_support_gen_refute), 
    '# of unique org refute to support' : len(df_org_refute_gen_support['org_claim'].unique()),
    '# of unique org support to refute' : len(df_org_support_gen_refute['org_claim'].unique()),
    }

    nep_run['no_ft_org_gen_count'] = dict_no_ft_org_gen_count

In [20]:
'org SUP gen REF'.lower().replace(' ', '_')

'org_sup_gen_ref'

In [21]:
def report_df_filter(df_cur_analyze_filter, name_analyzed_df ,cur_epoch):
    
    name_analyzed_df_formatted = name_analyzed_df.lower().replace(' ', '_')
    num_mlnli_ent_org_gen = len(df_cur_analyze_filter[df_cur_analyze_filter['mlnli_label_org_gen'] == 'entailment'])
    num_mlnli_ent_gen_org = len(df_cur_analyze_filter[df_cur_analyze_filter['mlnli_label_gen_org'] == 'entailment'])
    num_mlnli_ent_both = len(df_cur_analyze_filter[
        (df_cur_analyze_filter['mlnli_label_org_gen'] == 'entailment') &
        (df_cur_analyze_filter['mlnli_label_gen_org'] == 'entailment')])
    num_passed_ner_abr_filter_ic = len(df_cur_analyze_filter[df_cur_analyze_filter['passed_ner_abr_filter_ic'] == True])
    
    num_both_ent_ner_passed = len(df_cur_analyze_filter[
        (df_cur_analyze_filter['mlnli_label_org_gen'] == 'entailment') &
        (df_cur_analyze_filter['mlnli_label_gen_org'] == 'entailment') &
        (df_cur_analyze_filter['passed_ner_abr_filter_ic'] == True)
    ])
    #unique 
    num_unique_mlnli_ent_org_gen = len(df_cur_analyze_filter[df_cur_analyze_filter['mlnli_label_org_gen'] == 'entailment']['org_claim'].unique())
    
    num_unique_mlnli_ent_both = len(df_cur_analyze_filter[
        (df_cur_analyze_filter['mlnli_label_org_gen'] == 'entailment') &
        (df_cur_analyze_filter['mlnli_label_gen_org'] == 'entailment')]['org_claim'].unique())
    
    num_unique_both_ent_ner_passed = len(df_cur_analyze_filter[
        (df_cur_analyze_filter['mlnli_label_org_gen'] == 'entailment') &
        (df_cur_analyze_filter['mlnli_label_gen_org'] == 'entailment') &
        (df_cur_analyze_filter['passed_ner_abr_filter_ic'] == True)
    ]['org_claim'].unique())
    
    tmp_html_tag = ''
    tmp_html_tag += '<h3 style="color:#004f11">' + 'Filtered for '+name_analyzed_df+' ' +'</h3>'
    tmp_html_tag += '<h4 style="color:#004f11">'+'# mlnli_ent_org_gen : '+str(num_mlnli_ent_org_gen)+'</h4>'
    tmp_html_tag += '<h4 style="color:#004f11">'+'# mlnli_ent_gen_org : '+str(num_mlnli_ent_gen_org)+'</h4>'
    tmp_html_tag += '<h4 style="color:#004f11">'+'# mlnli_ent_both : '+str(num_mlnli_ent_both)+'</h4>'
    tmp_html_tag += '<h4 style="color:#004f11">'+'# passed_ner_abr_filter_ic : '+str(num_passed_ner_abr_filter_ic)+'</h4>'
    tmp_html_tag += '<h4 style="color:#004f11">'+'# both_ent_ner_passed : '+str(num_both_ent_ner_passed)+'</h4>'
    
    tmp_html_tag += '<h4 style="color:#7700a6">'+'# unique_mlnli_ent_org_gen : '+str(num_unique_mlnli_ent_org_gen)+'</h4>'
    tmp_html_tag += '<h4 style="color:#7700a6">'+'# unique_mlnli_ent_both : '+str(num_unique_mlnli_ent_both)+'</h4>'
    tmp_html_tag += '<h4 style="color:#7700a6">'+'# unique_both_ent_ner_passed : '+str(num_unique_both_ent_ner_passed)+'</h4>'
    display(HTML(tmp_html_tag))
    
    dict_no_ft_org_gen_count = {
    name_analyzed_df_formatted+'_mlnli_ent_org_gen': num_mlnli_ent_org_gen,
    name_analyzed_df_formatted+'_mlnli_ent_gen_org' : num_mlnli_ent_gen_org, 
    name_analyzed_df_formatted+'_mlnli_ent_both' : num_mlnli_ent_both,
    name_analyzed_df_formatted+'_passed_ner_abr_filter_ic' : num_passed_ner_abr_filter_ic,
    name_analyzed_df_formatted+'_both_ent_ner_passed' : num_both_ent_ner_passed,
    name_analyzed_df_formatted+'_unique_mlnli_ent_org_gen' : num_unique_mlnli_ent_org_gen,
    name_analyzed_df_formatted+'_unique_mlnli_ent_both' : num_unique_mlnli_ent_both,
    name_analyzed_df_formatted+'_unique_both_ent_ner_passed' : num_unique_both_ent_ner_passed,
    }

    nep_run['no_ft_org_gen_count'] = dict_no_ft_org_gen_count

## Load Abbraviation NER dataframe

In [22]:
df_scispacy_sentence_word_unq_ner_abr_filtered = pd.read_pickle(loc_df_scispacy_sentence_word_unq_ner_abr_filtered)

In [23]:
df_scispacy_sentence_word_unq_ner_abr_filtered['ner_text_stripped'] = df_scispacy_sentence_word_unq_ner_abr_filtered['ner_text'].apply(lambda x: re.sub('[^a-z]+', ' ', x.lower()))

In [24]:
df_scispacy_sentence_word_unq_ner_abr_filtered

Unnamed: 0,ner_text,ner_label,ner_model,claim,start_char,end_char,org_label,list_rationales,data_source,term_in_claim,ner_text_stripped
0,UK,GENE_OR_GENE_PRODUCT,en_ner_bionlp13cg_md,1 in 5 million in UK have abnormal PrP positiv...,18,20,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,UK,uk
1,PrP,GENE_OR_GENE_PRODUCT,en_ner_bionlp13cg_md,1 in 5 million in UK have abnormal PrP positiv...,35,38,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,PrP,prp
2,genomes,SO,en_ner_craft_md,"1,000 genomes project enables mapping of genet...",6,13,SUPPORTS,"[In conclusion, uncommon or rare genetic varia...",dev,genomes,genomes
3,genetic sequence,SO,en_ner_craft_md,"1,000 genomes project enables mapping of genet...",41,57,SUPPORTS,"[In conclusion, uncommon or rare genetic varia...",dev,genetic sequence,genetic sequence
4,variants,SO,en_ner_craft_md,"1,000 genomes project enables mapping of genet...",139,147,SUPPORTS,"[In conclusion, uncommon or rare genetic varia...",dev,variants,variants
...,...,...,...,...,...,...,...,...,...,...,...
1727,β-sheet,PROTEIN,en_ner_jnlpba_md,β-sheet opening occurs during pleurotolysin po...,0,7,SUPPORTS,[The major conformational changes in PlyB are ...,train,β-sheet,sheet
1728,pleurotolysin,PROTEIN,en_ner_jnlpba_md,β-sheet opening occurs during pleurotolysin po...,30,48,SUPPORTS,[The major conformational changes in PlyB are ...,train,pleurotolysin pore,pleurotolysin
1729,cRCT,EXPERIMENT,custom,Cost effectiveness evaluations based on cRCT d...,40,44,REFUTES,[CONCLUSIONS The published cost-effectiveness ...,train,cRCT,crct
1730,embryonic stem cells (ESCs),TAXON,custom,Androgenetic haploid mouse embryonic stem cell...,27,54,SUPPORTS,[Our results demonstrate that AG-haESCs can be...,train,embryonic stem cells (ESCs),embryonic stem cells escs


## Fine Tuning

### Setting up Fine tuner

In [25]:
# train_fine_tune, val_fine_tune = train_test_split(df_paraphrase_fine_tune_dataset[['org_claim', 'gen_claim']], 
#                                                   test_size=paraphraser_split_size)

def get_train_test_dataset(df_to_be_splitted, split_size): 
    df_train, df_val = train_test_split(df_to_be_splitted, 
                                                      test_size=split_size)
    df_train.reset_index(drop=True, inplace=True)
    df_val.reset_index(drop=True, inplace=True)
    return df_train, df_val

In [26]:
class ParaphraseDataset(Dataset):
    def __init__(self, tokenizer, target_dataframe, max_len=512, truncation=True):
        #self.path = os.path.join(data_dir, type_path + '.csv')

        self.source_column = "org_claim"
        self.target_column = "gen_claim"
        self.data = target_dataframe#pd.read_csv(self.path)
        #print(self.data)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _build(self):
        for idx in range(len(self.data)):
            input_, target = self.data.loc[idx, self.source_column], self.data.loc[idx, self.target_column]

            input_ = "paraphrase: "+ input_ + ' </s>'
            target = target + " </s>"

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [27]:
def get_dataset(tokenizer, target_dataframe, args):
    return ParaphraseDataset(tokenizer=tokenizer, target_dataframe = target_dataframe,  max_len=args.max_seq_length)

In [28]:
class T5FineTuner(pl.LightningModule):
    def __init__(self,hparams):
        # Calling the super constructer
        super(T5FineTuner,self).__init__()

        self.hparams.update(vars(hparams))

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
        
    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None):        
        return self.model(input_ids, attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                labels=labels,)
     
    def is_logger(self):
        return True
        #return self.trainer.proc_rank <= 0        
    
    def _step(self, batch):
        labels = batch["target_ids"]
        labels[labels[:, :] == self.tokenizer.pad_token_id] = -100 #########

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs[0]

        return loss    
    
    def training_step(self, batch, batch_idx):
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}    
    
    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return None#{"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}    
    
    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        return {"val_loss": loss}    
    
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}    
    
    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]    
    
#     def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None, using_native_amp=None):
#         if self.trainer.use_tpu:
#             print('why tpu!')
#             xm.optimizer_step(optimizer)
#         else:
#             print('here!')
#             optimizer.step()
#         optimizer.zero_grad()
#         self.lr_scheduler.step()    
     
    def optimizer_step(self, epoch=None, batch_idx=None, optimizer=None, optimizer_idx=None,
                       optimizer_closure=None, on_tpu=None, using_native_amp=None, using_lbfgs=None):
        optimizer.step(closure=optimizer_closure) # remove 'closure=optimizer_closure' here
        optimizer.zero_grad()
        self.lr_scheduler.step()
    
    
    
    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict        
    
    def train_dataloader(self):
        train_dataset = get_dataset(tokenizer=self.tokenizer, target_dataframe=self.hparams.df_train, args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
        t_total = (
            (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
            // self.hparams.gradient_accumulation_steps
            * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = get_dataset(tokenizer=self.tokenizer, target_dataframe=self.hparams.df_val, args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [29]:

logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer, pl_module):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        logger.info("***** Test results *****")

        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            output_test_results_file = log_file_fine_tune_callback
            with open(output_test_results_file, "w") as writer:
                for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                        logger.info("{} = {}\n".format(key, str(metrics[key])))
                        writer.write("{} = {}\n".format(key, str(metrics[key])))

In [30]:
# Hyper parameters

class FineTuneHyperParams:
    def __init__(self,model_name_path, num_train_epochs, df_train, df_val, df_train_val):
        self.args_dict_fine_tune = dict(
            #data_dir='./tmp_data/', # path for data files
            #output_dir='./tmp_data/', # path to save the checkpoints
            #temp_train_file_name = 'train.csv',
            #temp_validation_file_name = 'val.csv',
            #temp_train_val_file_name = 'all.csv',
            df_train = df_train,
            df_val = df_val,
            df_train_val = df_train_val,
            model_name_or_path= model_name_path,#'HelloRusk/t5-base-parasci',
            tokenizer_name_or_path= model_name_path,#'HelloRusk/t5-base-parasci',
            max_seq_length=512,
            learning_rate=3e-4,
            weight_decay=0.0,
            adam_epsilon=1e-8,
            warmup_steps=0,
            train_batch_size=4,
            eval_batch_size=4,
            num_train_epochs=num_train_epochs,
            gradient_accumulation_steps=16,
            n_gpu=1,
            early_stop_callback=False,
            fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
            opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
            max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
            seed=37,
        )

        self.args_fine_tune_ns = argparse.Namespace(**self.args_dict_fine_tune)

        self.checkpoint_callback_fine_tune = pl.callbacks.ModelCheckpoint(
            dirpath="checkpoints",
            filename="best-checkpoint",
            save_top_k=5,
            verbose=True,
            monitor="val_loss",
            mode="min")

        self.train_params_fine_tune = dict(
            accumulate_grad_batches=self.args_fine_tune_ns.gradient_accumulation_steps,
            gpus=self.args_fine_tune_ns.n_gpu,
            max_epochs=self.args_fine_tune_ns.num_train_epochs,
            #early_stop_callback=False, #
            precision=32,
            #amp_level=self.args_fine_tune_ns.opt_level, #
            gradient_clip_val=self.args_fine_tune_ns.max_grad_norm,
            #logger=wandb_logger,
            callbacks=[self.checkpoint_callback_fine_tune, LoggingCallback()],
        )

## Scifact Functinos

### Load data

In [31]:
def get_claim_label_from_jsonl(dataset_jsonl):
    claim_label_list_train = []


    for cur_claim in dataset_jsonl:
        claim_txt = cur_claim.claim

        for doc_id, evidence in cur_claim.evidence.items():

            ev_doc = cur_claim.release.corpus.get_document(doc_id)

            claim_label = evidence.label.name

            tmp_dic = {"claim" : claim_txt, "label" : claim_label}

            claim_label_list_train.append(tmp_dic)
    return claim_label_list_train

In [32]:
def get_claim_label_evidence_from_jsonl(dataset_jsonl, source):
    claim_label_list_train = []


    for cur_claim in dataset_jsonl:
        claim_txt = cur_claim.claim

        for doc_id, evidence in cur_claim.evidence.items():

            ev_doc = claim_train.release.corpus.get_document(doc_id)

            claim_label = evidence.label.name
            
            list_rationales = []
            for i, sents in enumerate(evidence.rationales):
                list_rationales = [sent for i, sent in enumerate(ev_doc.sentences) if i in sents]

            tmp_dic = {"claim" : claim_txt, "label" : claim_label, "list_rationales" :list_rationales, "source" :source}

            claim_label_list_train.append(tmp_dic)
    return claim_label_list_train

In [33]:
ds_train = GoldDataset(PARAPHRASE_PROJECT_SETTINGS['config_scifact']['loc_gold_ds_corpus'],
                       PARAPHRASE_PROJECT_SETTINGS['config_scifact']['loc_gold_ds_train'])
claim_train = ds_train.get_claim(39)
claim_train.pretty_print()

dic_train = get_claim_label_evidence_from_jsonl(ds_train, source = "train")

Example 39: A diminished ovarian reserve does not solely indicate infertility in an a priori non-infertile population.

Evidence sets:

####################

13497630: SUPPORTS
Set 0:
	- After adjusting for age, body mass index, race, current smoking status, and recent hormonal contraceptive use, women with low AMH values (<0.7 ng/mL [n = 84]) did not have a significantly different predicted probability of conceiving by 6 cycles of attempt (65%; 95% CI, 50%-75%) compared with women (n = 579) with normal values (62%; 95% CI, 57%-66%) or by 12 cycles of attempt (84% [95% CI, 70%-91%] vs 75% [95% CI, 70%-79%], respectively).
Set 1:
	- Women with high serum FSH values (>10 mIU/mL [n = 83]) did not have a significantly different predicted probability of conceiving after 6 cycles of attempt (63%; 95% CI, 50%-73%) compared with women (n = 654) with normal values (62%; 95% CI, 57%-66%) or after 12 cycles of attempt (82% [95% CI, 70%-89%] vs 75% [95% CI, 70%-78%], respectively).
Set 2:
	- Women

In [34]:
ds_valid = GoldDataset(PARAPHRASE_PROJECT_SETTINGS['config_scifact']['loc_gold_ds_corpus'],
                       PARAPHRASE_PROJECT_SETTINGS['config_scifact']['loc_gold_ds_dev'])
claim_valid = ds_valid.get_claim(42)
claim_valid.pretty_print()

dic_valid = get_claim_label_evidence_from_jsonl(ds_valid, source = "dev")

Example 42: A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.

Evidence sets:

####################

18174210: REFUTES
Set 0:
	- Individuals homozygous for alpha(+)-thalassaemia have microcytosis and an increased erythrocyte count.
	- We estimated that the haematological profile in children homozygous for alpha(+)-thalassaemia reduces the risk of SMA during acute malaria compared to children of normal genotype (relative risk 0.52; 95% confidence interval [CI] 0.24-1.12, p = 0.09).   

Set 1:
	- CONCLUSIONS The increased erythrocyte count and microcytosis in children homozygous for alpha(+)-thalassaemia may contribute substantially to their protection against SMA.


In [35]:
df_claim_evid_label = pd.concat([pd.DataFrame(dic_train), pd.DataFrame(dic_valid)], ignore_index=True)

#df_claim_evid_label

In [36]:
df_claim_evid_label

Unnamed: 0,claim,label,list_rationales,source
0,1 in 5 million in UK have abnormal PrP positiv...,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train
1,32% of liver transplantation programs required...,SUPPORTS,[Policies requiring discontinuation of methado...,train
2,40mg/day dosage of folic acid and 2mg/day dosa...,SUPPORTS,[CONCLUSION Treatment with high doses of folic...,train
3,76-85% of people with severe mental disorder r...,SUPPORTS,[Although disorder severity was correlated wit...,train
4,A T helper 2 cell (Th2) environment impedes di...,REFUTES,"[Thus, in Lyn(-/-) mice, basophils and IgE aut...",train
...,...,...,...,...
768,Women with a higher birth weight are more like...,SUPPORTS,[Increased risk of breast cancer was noted wit...,dev
769,Women with a higher birth weight are more like...,SUPPORTS,[RESULTS We found that heavier birth weights w...,dev
770,aPKCz causes tumour enhancement by affecting g...,REFUTES,"[Taken together, this demonstrates that PKCζ i...",dev
771,cSMAC formation enhances weak ligand signalling.,SUPPORTS,[This conclusion was supported by experiments ...,dev


### Scifact Model

In [37]:
class ArgsScifact:
    def __init__(self, claim):
        self.claim = claim
        self.report_file = "../../scifact/results/covid/report" #not needed
        self.n_documents = 100
        self.rationale_selection_method = "topk"
        self.output_format = "markdown"
        self.rationale_threshold = 0.5
        self.label_threshold = 0.5
        self.keep_nei = False
        self.full_abstract = True
        self.verbose = True
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        
        ##
class PretrainedModelsForScifact:
    def __init__(self, args):
        if args.device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(args.device)
            
        #self.rationale_selection_model = '/home/qudratealahyratu/research/nlp/fact_checking/my_work/scifact/model/rationale_roberta_large_scifact'
        self.rationale_selection_model = PARAPHRASE_PROJECT_SETTINGS['config_scifact']['rationale_model_name']
        self.label_prediction_model = PARAPHRASE_PROJECT_SETTINGS['config_scifact']['cls_model_name']
        self.abstract_retriever = AbstractRetriever()
        self.rationale_selector = RationaleSelector(self.rationale_selection_model,
                                               args.rationale_selection_method,
                                               args.rationale_threshold,
                                               self.device)
        self.label_predictor = LabelPredictor(self.label_prediction_model,
                                         args.keep_nei,
                                         args.label_threshold,
                                         self.device)


In [38]:
args_sci = ArgsScifact("")

pretrained_models_config = PretrainedModelsForScifact(args_sci)

Some weights of the model checkpoint at ../../scifact/model/rationale_roberta_large_fever_scifact were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at ../../scifact/model/label_roberta_large_fever_scifact were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSe

In [39]:
log_failed_claim = []
def inference(args, pretraind_models_config):

    try:
#         if args.verbose:
#         print("Retrieving abstracts. inference > ", args.claim)
        results = pretraind_models_config.abstract_retriever(args.claim, k=args.n_documents)
        if len(results) == 0:
            temp_dic = {'failed_in' : 'abstract retrival', 'claim': args.claim}
            log_failed_claim.append(temp_dic)
            return []
        
        #print("abstract_retriever >> ", results)

#         if args.verbose:
#             print("Selecting rationales. inference > ", args.claim)
        results = pretraind_models_config.rationale_selector(args.claim, results)
        if len(results) == 0:
            temp_dic = {'failed_in' : 'Rationale selection', 'claim': args.claim}
            log_failed_claim.append(temp_dic)
            return []
        
#         if args.verbose:
#             print("Label predictions. inference > ", args.claim)
        results = pretraind_models_config.label_predictor(args.claim, results)

        if len(results) == 0:
            temp_dic = {'failed_in' : 'Label Prediction', 'claim': args.claim}
            log_failed_claim.append(temp_dic)
            return []
        
        results.sort(key=lambda r: r['label_confidence'], reverse=True)
        return results
    except Exception as e:
        print("Exception :: Inference cant retrive info for >> ", args.claim)
        print(sys.exc_info()[0])
        print(traceback.format_exc())
        temp_dic = {'failed_in' : sys.exc_info()[0], 'claim': args.claim}
        log_failed_claim.append(temp_dic)
        return []


In [40]:
def write_result(result, full_abstract):
    all_msg = ""
    all_msg = f"#### [{result['title']}]({result['url']}) \n"
    #print(msg, file=f)
    #all_msg = all_msg+msg
    ev_scores = [f"{x:0.2f}" for x in result["evidence_confidence"]]
    ev_scores = ", ".join(ev_scores)
    if result['label'].lower() == "support":
        msg = f"🟩 **Decision** : {result['label']} (score={result['label_confidence']:0.2f}, evidence scores={ev_scores})\n"
    elif result['label'].lower() == "refute":
        msg = f"🟥 **Decision** : {result['label']} (score={result['label_confidence']:0.2f}, evidence scores={ev_scores})\n"
    else:
        msg = f"⏺ **Decision** : {result['label']} (score={result['label_confidence']:0.2f}, evidence scores={ev_scores})\n"
    #print(msg, file=f)
    all_msg = all_msg+msg 
    
    for i, line in enumerate(result["abstract"]):
        # If we're showing the full abstract, show evidence in green.
        if full_abstract:
            if result['label'].lower() == "support":
                msg = (f"- <span style='color:green'>{line}</span>"
                       if i in result["evidence"]
                       else f"- {line}")
            elif result['label'].lower() == "refute":
                msg = (f"- <span style='color:red'>{line}</span>"
                       if i in result["evidence"]
                       else f"- {line}")                
            #print(msg, file=f)
            all_msg = all_msg+msg + " \n"
        else:
            if i in result["evidence"]:
                msg = f"- {line}"
                #print(msg, file=f)
                all_msg = all_msg+msg + " \n" 
    
    #print(file=f)
    #print(40 * "-", file=f)
    #print(file=f)
    all_msg = all_msg+msg 
    return all_msg + "\n"

In [41]:
def export(args, results):
    all_msg = ""
    claim = args.claim
    #report_file = args.report_file
    #f = open(f"{report_file}.md", "w")
    msg = f"### Claim \n > **{claim}** \n "
    #print(msg, file=f)
    #print(file=f)
    all_msg = all_msg +msg
    
    #support_confs = [], refute_confs = []
    confs = []
    for result in results:
        if result['label'].lower() == "support":
            tmp_dic = {'label' : 'Support', 'label_confidence' : result["label_confidence"], "no_of_evidence" : len(result['evidence_confidence'])}
            confs.append(tmp_dic)
        elif result['label'].lower() == "refute":
            tmp_dic = {'label' : 'Refute', 'label_confidence' : -result["label_confidence"], "no_of_evidence" : len(result['evidence_confidence'])}
            confs.append(tmp_dic)
        
    
    tpm_df = pd.DataFrame(confs)
    #HTML(tpm_df.style.bar(align='mid', color=['#d65f5f', '#5fba7d']))
    display(HTML(tpm_df.style.bar(subset=["label_confidence"], align='mid', color=['#ffa1a1', '#bfffcf']).render()))
    
    msg = "### Evidence \n "
    all_msg = all_msg +msg
    for result in results:
        cur_msg = write_result(result, args.full_abstract)
        all_msg = all_msg +cur_msg+"\n"

    return all_msg

In [42]:
claim_to_check = "ART substantially reduces infectiveness of HIV-positive people."#df_claim_evid_label.iloc[18, :]["claim"]
args_sci = ArgsScifact(claim_to_check)

#pretrained_models_config = pretrained_models_for_scifact(args_sci)

results_raw = inference(args_sci, pretrained_models_config)

if results_raw!= []:
    result_md = export(args_sci, results_raw)
    #result_md = export(args_sci, results_raw)
    display(Markdown(result_md))



Unnamed: 0,label,label_confidence,no_of_evidence
0,Support,0.69,3
1,Support,0.64,3
2,Support,0.54,3


### Claim 
 > **ART substantially reduces infectiveness of HIV-positive people.** 
 ### Evidence 
 #### [Autonomous Targeting of Infectious Superspreaders Using Engineered Transmissible Therapies](https://api.semanticscholar.org/10.1371/journal.pcbi.1002015) 
🟩 **Decision** : SUPPORT (score=0.69, evidence scores=0.11, 0.06, 0.01)
- Infectious disease treatments, both pharmaceutical and vaccine, face three universal challenges: the difficulty of targeting treatments to high-risk ‘superspreader’ populations who drive the great majority of disease spread, behavioral barriers in the host population (such as poor compliance and risk disinhibition), and the evolution of pathogen resistance. 
- Here, we describe a proposed intervention that would overcome these challenges by capitalizing upon Therapeutic Interfering Particles (TIPs) that are engineered to replicate conditionally in the presence of the pathogen and spread between individuals — analogous to ‘transmissible immunization’ that occurs with live-attenuated vaccines (but without the potential for reversion to virulence). 
- Building on analyses of HIV field data from sub-Saharan Africa, we construct a multi-scale model, beginning at the single-cell level, to predict the effect of TIPs on individual patient viral loads and ultimately population-level disease prevalence. 
- <span style='color:green'>Our results show that a TIP, engineered with properties based on a recent HIV gene-therapy trial, could stably lower HIV/AIDS prevalence by ∼30-fold within 50 years and could complement current therapies.</span> 
- <span style='color:green'>In contrast, optimistic antiretroviral therapy or vaccination campaigns alone could only lower HIV/AIDS prevalence by <2-fold over 50 years.</span> 
- The TIP's efficacy arises from its exploitation of the same risk factors as the pathogen, allowing it to autonomously penetrate superspreader populations, maintain efficacy despite behavioral disinhibition, and limit viral resistance. 
- <span style='color:green'>While demonstrated here for HIV, the TIP concept could apply broadly to many viral infectious diseases and would represent a new paradigm for disease control, away from pathogen eradication but toward robust disease suppression.</span> 
- <span style='color:green'>While demonstrated here for HIV, the TIP concept could apply broadly to many viral infectious diseases and would represent a new paradigm for disease control, away from pathogen eradication but toward robust disease suppression.</span>

#### [HIV: Biology to Treatment](https://api.semanticscholar.org/10.1007/978-981-32-9898-9_7) 
🟩 **Decision** : SUPPORT (score=0.64, evidence scores=0.67, 0.01, 0.37)
- AIDS is one of the most dreaded diseases of the twenty-first century caused by human immunodeficiency virus (HIV). 
- <span style='color:green'>Recently, there are reports which show decline in new infections due to better access to anti-retroviral drugs.</span> 
- <span style='color:green'>Still on a daily basis, ~2356 new HIV infections are being reported globally.</span> 
- New treatments and anti-HIV drugs are being continuously developed with the aim to control and cure AIDS. 
- The anti-HIV drugs that are in use usually target HIV entry and replication inside the host cells. 
- <span style='color:green'>However, these drugs are only partially effective in slowing the rate of HIV replication.</span> 
- Nevertheless, the virus manages to replicate at much slower rates even when anti-retroviral treatment is ongoing. 
- The HIV seropositives who are on anti-retroviral treatment for long periods of time are now developing different kinds of other complications including neuroAIDS. 
- The latest development in HIV therapy is a novel kind of bone marrow transplantation from donors who have a homozygous mutation in CCR5 gene. 
- The latest development in HIV therapy is a novel kind of bone marrow transplantation from donors who have a homozygous mutation in CCR5 gene.

#### [Human Immunodeficiency Virus-Associated Diarrhea: Still an Issue in the Era of Antiretroviral Therapy](https://api.semanticscholar.org/10.1007/s10620-015-3615-y) 
🟩 **Decision** : SUPPORT (score=0.54, evidence scores=0.01, 0.99, 0.02)
- <span style='color:green'>Over half of patients with human immunodeficiency virus (HIV) experience diarrhea that contributes negatively to quality of life and adherence to antiretroviral therapy (ART).</span> 
- Opportunistic infectious agents that cause diarrhea in patients with HIV span the array of protozoa, fungi, viruses, and bacteria. 
- <span style='color:green'>With global use of ART, the incidence of diarrhea because of opportunistic infections has decreased; however, the incidence of noninfectious diarrhea has increased.</span> 
- <span style='color:green'>The etiology of noninfectious diarrhea in patients with HIV is multifactorial and includes ART-associated diarrhea and gastrointestinal damage related to HIV infection (i.e., HIV enteropathy).</span> 
- A basic algorithm for the diagnosis of diarrhea in patients with HIV includes physical examination, a review of medical history, assessment of HIV viral load and CD4+ T cell count, stool microbiologic assessment, and endoscopic evaluation, if needed. 
- For patients with negative diagnostic results, the diagnosis of noninfectious diarrhea may be considered. 
- Pharmacologic options for the treatment of noninfectious diarrhea are primarily supportive; however, the use of many unapproved agents is based on unstudied and anecdotal information. 
- In addition, these agents can be associated with treatment-limiting adverse events (AEs), such as drug–drug interactions with ART regimens, abuse liability, and additional gastrointestinal AEs. 
- Currently, crofelemer, an antisecretory agent, is the only therapy approved in the USA for the symptomatic relief of noninfectious diarrhea in patients with HIV on ART. 
- Currently, crofelemer, an antisecretory agent, is the only therapy approved in the USA for the symptomatic relief of noninfectious diarrhea in patients with HIV on ART.



In [43]:
#print(pretraind_models_config)

## Filter

### Tech term

In [44]:
#https://stackoverflow.com/questions/29996079/match-a-whole-word-in-a-string-using-dynamic-regex
def filter_and_replace_tech_term_paraphrased_claim(claim_paraphrased, claim_original):
    #claim_para_trimmed = re.sub('[^a-z]+', ' ', claim_paraphrased.lower())
    df_cur_sentence_word_unq_ner_abr_filtered = df_scispacy_sentence_word_unq_ner_abr_filtered[
        df_scispacy_sentence_word_unq_ner_abr_filtered['claim'] == claim_original
    ]
    for cur_term_row in df_cur_sentence_word_unq_ner_abr_filtered.itertuples(index=False):
        cur_term_row_formatted = r'(?<!\S){}(?!\S)'.format(re.escape(cur_term_row.ner_text))
        res_num = re.findall(cur_term_row_formatted, claim_paraphrased)
        if res_num == []:
            return False
        
    return True

### Entailment

In [45]:
model_neg_checker_roberta = torch.hub.load(PARAPHRASE_PROJECT_SETTINGS['entailment_model']['model_path'], 
                                           PARAPHRASE_PROJECT_SETTINGS['entailment_model']['model_name'])

model_neg_checker_roberta.to(device)
#model_neg_checker_roberta.cuda()

model_neg_checker_roberta.eval() 

Using cache found in /home/qudratealahyratu/.cache/torch/hub/pytorch_fairseq_main


RobertaHubInterface(
  (model): RobertaModel(
    (encoder): RobertaEncoder(
      (sentence_encoder): TransformerEncoder(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(50265, 1024, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(514, 1024, padding_idx=1)
        (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0): TransformerEncoderLayerBase(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (dr

In [46]:
def get_mlnli_label(org_claim, gen_claim):    
    tokens_sentences_org_gen = model_neg_checker_roberta.encode(org_claim, gen_claim)
    logprobs_sentences_org_gen = model_neg_checker_roberta.predict('mnli', tokens_sentences_org_gen)      
    cal_val_mlnli_org_gen = logprobs_sentences_org_gen.argmax(dim=1).item()
    cal_label_mlnli_org_gen = PARAPHRASE_PROJECT_SETTINGS['labels_multi_nli'][cal_val_mlnli_org_gen]
    
    tokens_sentences_gen_org = model_neg_checker_roberta.encode(gen_claim, org_claim)
    logprobs_sentences_gen_org = model_neg_checker_roberta.predict('mnli', tokens_sentences_gen_org)      
    cal_val_mlnli_gen_org = logprobs_sentences_gen_org.argmax(dim=1).item()
    cal_label_mlnli_gen_org = PARAPHRASE_PROJECT_SETTINGS['labels_multi_nli'][cal_val_mlnli_gen_org]    
#     return {'val_mlnli_org_gen' : cal_val_mlnli_org_gen, 
#             'label_mlnli_org_gen': cal_label_mlnli_org_gen, 
#             'val_mlnli_gen_org': cal_val_mlnli_gen_org, 
#             'label_mlnli_gen_org': cal_label_mlnli_gen_org}
    
    return pd.Series([cal_val_mlnli_org_gen, cal_label_mlnli_org_gen, cal_val_mlnli_gen_org, cal_label_mlnli_gen_org])

## Apply Finetuned Model

In [47]:
def get_t5_gen_sentences(org_sentence, model_t5, tokenizer_t5):
    text =  "paraphrase: " + org_sentence + " </s>"

    encoding = tokenizer_t5.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    #PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']
    outputs = []
    if type(model_t5) == T5ForConditionalGeneration:
        outputs = model_t5.generate(
            input_ids=input_ids, attention_mask=attention_masks,
            max_length=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['max_length'],
            do_sample=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['do_sample'],
            top_k=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['top_k'],
            top_p=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['top_p'],
            repetition_penalty=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['repetition_penalty'],
            early_stopping=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['early_stopping'],
            num_return_sequences=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['num_return_sequences']
        )

    else:
        outputs = model_t5.model.generate(
            input_ids=input_ids, attention_mask=attention_masks,
            max_length=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['max_length'],
            do_sample=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['do_sample'],
            top_k=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['top_k'],
            top_p=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['top_p'],
            repetition_penalty=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['repetition_penalty'],
            early_stopping=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['early_stopping'],
            num_return_sequences=PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['t5_paraphrase_model_params']['num_return_sequences']
        )
        
    gen_sentences_t5 = []
    for output in outputs:
        line = tokenizer_t5.decode(output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
        gen_sentences_t5.append(line)
    
    #print('gen_sentences_t5_tapaco >> ', gen_sentences_t5_tapaco)
    return list(set(gen_sentences_t5))#[:5]

In [48]:
def get_stat_of_original_claim(row_org_claim):
    claim =  row_org_claim["org_claim"]
    logging.info("#### \n\n>>> Original claim >>> ")
    logging.info(claim)

    args_sci = ArgsScifact(claim)
    dic_info = {}
    
    dic_info["org_claim"] = row_org_claim["org_claim"]
    dic_info["ground_label"] = row_org_claim["ground_label"]
    dic_info["ground_list_rationales"] = row_org_claim["ground_list_rationales"]
    dic_info["source"] = row_org_claim["source"]
    dic_info["org_count_support"] = 0
    dic_info["org_count_refute"] = 0
    dic_info["org_list_supported_ids"] = []
    dic_info["org_list_refuted_ids"] = []
    dic_info["org_list_supported_confidence"] = []
    dic_info["org_list_refuted_confidence"] = []
    dic_info["org_list_supported_confidence_mean"] = 0.0
    dic_info["org_list_refuted_confidence_mean"] = 0.0
    dic_info["org_comment"] = ""
    
    try:
        results_raw_org = inference(args_sci, pretrained_models_config)  
        
        if results_raw_org == []:
            dic_info["org_comment"] = "no result"
            
        else:
            list_supported_ids = [cur_result['id'] for cur_result in results_raw_org if cur_result['label'] == 'SUPPORT']
            list_refuted_ids = [cur_result['id'] for cur_result in results_raw_org if cur_result['label'] == 'REFUTE']
            list_supported_label_confidence =  [cur_result['label_confidence'] for cur_result in results_raw_org if cur_result['label'] == 'SUPPORT']
            list_refuted_label_confidence =  [cur_result['label_confidence'] for cur_result in results_raw_org if cur_result['label'] == 'REFUTE']
            
            
            dic_info["org_count_support"] = len(list_supported_ids)
            dic_info["org_count_refute"] = len(list_refuted_ids)
            dic_info["org_list_supported_ids"] = list_supported_ids
            dic_info["org_list_refuted_ids"] = list_refuted_ids
            dic_info["org_list_supported_confidence"] = list_supported_label_confidence
            dic_info["org_list_refuted_confidence"] = list_refuted_label_confidence
            if len(list_supported_label_confidence) > 0:
                dic_info["org_list_supported_confidence_mean"] = mean(list_supported_label_confidence)
            if len(list_refuted_label_confidence) > 0:
                dic_info["org_list_refuted_confidence_mean"] = mean(list_refuted_label_confidence)
            dic_info["org_comment"] = "success"
            
    except Exception as e:
        dic_info["org_comment"] = "exception : "+e
        logging.info(">>> Exception original claim >>> ") 
        logging.info(claim) 
        logging.info(e)
    
    finally:
        return dic_info

In [49]:
def get_results_by_gen_claim(gen_claim, dic_original_claim_info):
    logging.info(":: Generated claim :: ")
    logging.info(gen_claim)
    #print(gen_claim)
    
    args_gen = ArgsScifact(gen_claim)
    gen_dic_info = {}
    
    #gen_dic_info["gen_claim"] = gen_claim
    gen_dic_info["gen_count_support"] = 0
    gen_dic_info["gen_count_refute"] = 0
    gen_dic_info["gen_list_supported_ids"] = []
    gen_dic_info["gen_list_refuted_ids"] = []
    gen_dic_info["gen_list_supported_confidence"] = []
    gen_dic_info["gen_list_refuted_confidence"] = []
    gen_dic_info["gen_list_supported_confidence_mean"] = 0.0
    gen_dic_info["gen_list_refuted_confidence_mean"] = 0.0
    gen_dic_info["gen_comment"] = ""    
    
    gen_dic_info["common_all"] = 0
    gen_dic_info["common_support_refute"] = 0
    gen_dic_info["common_refute_support"] = 0
    gen_dic_info["common_support_support"] = 0
    gen_dic_info["common_refute_refute"] = 0
    
    try:
        results_raw_gen = inference(args_gen, pretrained_models_config)  

        if results_raw_gen == []:
            gen_dic_info["gen_comment"] = "no result"
            
        else:
            list_supported_ids = [cur_result['id'] for cur_result in results_raw_gen if cur_result['label'] == 'SUPPORT']
            list_refuted_ids = [cur_result['id'] for cur_result in results_raw_gen if cur_result['label'] == 'REFUTE']
            list_supported_label_confidence =  [cur_result['label_confidence'] for cur_result in results_raw_gen if cur_result['label'] == 'SUPPORT']
            list_refuted_label_confidence =  [cur_result['label_confidence'] for cur_result in results_raw_gen if cur_result['label'] == 'REFUTE']
            
            
            gen_dic_info["gen_count_support"] = len(list_supported_ids)
            gen_dic_info["gen_count_refute"] = len(list_refuted_ids)
            gen_dic_info["gen_list_supported_ids"] = list_supported_ids
            gen_dic_info["gen_list_refuted_ids"] = list_refuted_ids
            gen_dic_info["gen_list_supported_confidence"] = list_supported_label_confidence
            gen_dic_info["gen_list_refuted_confidence"] = list_refuted_label_confidence
            if len(list_supported_label_confidence) > 0 :
                gen_dic_info["gen_list_supported_confidence_mean"] = mean(list_supported_label_confidence)
            if len(list_refuted_label_confidence) > 0:
                gen_dic_info["gen_list_refuted_confidence_mean"] = mean(list_refuted_label_confidence)
            gen_dic_info["gen_comment"] = "success"      
            
            
            common_all = (set(gen_dic_info["gen_list_supported_ids"]) | set(gen_dic_info["gen_list_refuted_ids"])) & \
                (set(dic_original_claim_info["org_list_supported_ids"]) | set(dic_original_claim_info["org_list_refuted_ids"]))
            
            common_support_refute = set(dic_original_claim_info["org_list_supported_ids"]) & set(gen_dic_info["gen_list_refuted_ids"])
            common_refute_support = set(dic_original_claim_info["org_list_refuted_ids"]) & set(gen_dic_info["gen_list_supported_ids"])
            common_support_support = set(dic_original_claim_info["org_list_supported_ids"]) & set(gen_dic_info["gen_list_supported_ids"])
            common_refute_refute = set(dic_original_claim_info["org_list_refuted_ids"]) & set(gen_dic_info["gen_list_refuted_ids"])
            
            gen_dic_info["common_all"] = len(common_all)
            gen_dic_info["common_support_refute"] = len(common_support_refute)
            gen_dic_info["common_refute_support"] = len(common_refute_support)
            gen_dic_info["common_support_support"] = len(common_support_support)
            gen_dic_info["common_refute_refute"] = len(common_refute_refute)
            
            gen_dic_info["gen_comment"] = "success" 
            
    except Exception as e:
        dic_info["gen_comment"] = "exception : "+e
        logging.info(">>> Exception gen claim >>> ") 
        logging.info(claim) 
        logging.info(e)
        
    finally:
        return gen_dic_info           

In [50]:
def get_paraphrased_sentence_no_ft_with_detail_stat(df_dataset_to_be_paraphrased, model_t5, tokenizer_t5, model_name_t5):
    '''
    args:
    df_dataset_to_be_paraphrased : Dataset, those were successfully retrived by current scifact model
    '''
    list_results_fine_tuned = []
    dic_key_sentence_info = model_name_t5+"_sentences_info"
    
    for index_df, cur_row in tqdm(df_dataset_to_be_paraphrased.iloc[:,:].iterrows(), total=len(df_dataset_to_be_paraphrased)):
        cur_res = {}

        dic_info_org_claim = get_stat_of_original_claim(cur_row)
        cur_res["org_claim_info"] = dic_info_org_claim    
        #print(cur_res)
        
        try:
            list_paraphrased_claims = get_t5_gen_sentences(org_sentence = cur_row["org_claim"], 
                                                          model_t5 = model_t5, tokenizer_t5 = tokenizer_t5)#get_t5_gen_sentences(cur_row["org_claim"])
            
            
            list_dic_paraphrased_info = []
            for cur_paraphrased_sent in list_paraphrased_claims_with_sim_threshold:
                cur_dic_paraphraased_claim_info = get_results_by_gen_claim(cur_paraphrased_sent, dic_info_org_claim)
                cur_dic_paraphraased_claim_info["model"] = model_name_t5
                
#                 cur_dic_paraphraased_claim_info['passed_ner_abr_filter_ic'] = filter_and_replace_tech_term_paraphrased_claim(cur_paraphrased_sent, 
#                                                                                                                              dic_info_org_claim['org_claim'])
#                 dict_mlnli_labels = get_mlnli_label(dic_info_org_claim['org_claim'], cur_paraphrased_sent)
#                 cur_dic_paraphraased_claim_info.update(dict_mlnli_labels)
                
                list_dic_paraphrased_info.append(cur_dic_paraphraased_claim_info)                 
                

            cur_res[dic_key_sentence_info] = list_dic_paraphrased_info
        except Exception as e:
            logging.info(">>> Exception genereted claim >>> ")
            logging.info(cur_row["org_claim"])
            logging.info(e)     
            print('exc : ', e)
        #print(cur_res)
        list_results_fine_tuned.append(cur_res)
            
    #print(len(list_results_fine_tuned))
    ## Formatting dataframe
    result_as_dict = []
    for cur_claim in list_results_fine_tuned:
        #print(cur_claim.keys())
        for cur_gen_paraphrased_claim in cur_claim[dic_key_sentence_info]:
            cur_merged_dict = {**cur_claim["org_claim_info"], **cur_gen_paraphrased_claim}
            result_as_dict.append(cur_merged_dict)
            #print('cur_merged_dict : ', cur_merged_dict)
    #print(len(result_as_dict))
    return pd.DataFrame(result_as_dict)

In [51]:
def get_paraphrased_sentence_with_detail_stat(df_dataset_to_be_paraphrased, model_t5, tokenizer_t5, model_name_t5):
    '''
    args:
    df_dataset_to_be_paraphrased : Dataset, those were successfully retrived by current scifact model
    '''
    list_results_fine_tuned = []
    dic_key_sentence_info = model_name_t5+"_sentences_info"
    
    for index_df, cur_row in tqdm(df_dataset_to_be_paraphrased.iloc[:,:].iterrows(), total=len(df_dataset_to_be_paraphrased)):
        cur_res = {}

        dic_info_org_claim = get_stat_of_original_claim(cur_row)
        cur_res["org_claim_info"] = dic_info_org_claim    
        #print(cur_res)
        
        try:
            list_paraphrased_claims = get_t5_gen_sentences(org_sentence = cur_row["org_claim"], 
                                                          model_t5 = model_t5, tokenizer_t5 = tokenizer_t5)#get_t5_gen_sentences(cur_row["org_claim"])
            
            list_paraphrased_claims_with_sim_threshold = []
            for cur_paraphrased_sent in list_paraphrased_claims:
                ## Enable if need to measure similarity score                
                #                 cur_similarity_score = get_sentence_similarity_score(model_sim_diltillroberta_base,
                #                                                                     cur_row['org_claim'], 
                #                                                                     cur_paraphrased_sent)
                cur_similarity_score = 1.0
                if cur_similarity_score >= PARAPHRASE_PROJECT_SETTINGS['run_settings']['SIMILARITY_THRESHOLD']:
                    list_paraphrased_claims_with_sim_threshold.append(cur_paraphrased_sent)
            #Filter paraphrased sentences with tech terms
                
            #for cur_paraphrased_sent in list_paraphrased_claims_with_sim_threshold:
                
            
            list_dic_paraphrased_info = []
            for cur_paraphrased_sent in list_paraphrased_claims_with_sim_threshold:
                cur_dic_paraphraased_claim_info = get_results_by_gen_claim(cur_paraphrased_sent, dic_info_org_claim)
                cur_dic_paraphraased_claim_info["model"] = model_name_t5
                
#                 cur_dic_paraphraased_claim_info['passed_ner_abr_filter_ic'] = filter_and_replace_tech_term_paraphrased_claim(cur_paraphrased_sent, 
#                                                                                                                              dic_info_org_claim['org_claim'])
#                 dict_mlnli_labels = get_mlnli_label(dic_info_org_claim['org_claim'], cur_paraphrased_sent)
#                 cur_dic_paraphraased_claim_info.update(dict_mlnli_labels)
                
                list_dic_paraphrased_info.append(cur_dic_paraphraased_claim_info)                 
                
            cur_res[dic_key_sentence_info] = list_dic_paraphrased_info      
            
        except Exception as e:
            logging.info(">>> Exception genereted claim >>> ")
            logging.info(cur_row["org_claim"])
            logging.info(e)     
            print('exc : ', e)
        #print(cur_res)
        list_results_fine_tuned.append(cur_res)
            
    #print(len(list_results_fine_tuned))
    ## Formatting dataframe
    result_as_dict = []
    for cur_claim in list_results_fine_tuned:
        #print(cur_claim.keys())
        for cur_gen_paraphrased_claim in cur_claim[dic_key_sentence_info]:
            cur_merged_dict = {**cur_claim["org_claim_info"], **cur_gen_paraphrased_claim}
            result_as_dict.append(cur_merged_dict)
            #print('cur_merged_dict : ', cur_merged_dict)
    #print(len(result_as_dict))
    return pd.DataFrame(result_as_dict)

In [52]:
df_org_claims_by_scifact = pd.read_pickle(PARAPHRASE_PROJECT_SETTINGS['file_and_dirs']['file_org_claims_by_scifact'])

### Filter first approach

In [53]:
def get_paraphrased_sentence_no_ft(df_dataset_to_be_paraphrased, model_t5, tokenizer_t5, model_name_t5):
    '''
    args:
    df_dataset_to_be_paraphrased : Dataset, those were successfully retrived by current scifact model
    '''
    list_results_fine_tuned = []
    dic_key_sentence_info = model_name_t5+"_sentences_info"
    
    for index_df, cur_row in df_dataset_to_be_paraphrased.iloc[:,:].iterrows():
        try:
            list_paraphrased_claims = get_t5_gen_sentences(org_sentence = cur_row["org_claim"], 
                                                          model_t5 = model_t5, tokenizer_t5 = tokenizer_t5)#get_t5_gen_sentences(cur_row["org_claim"])
            
            for cur_paraphrased_sent in list_paraphrased_claims:
                cur_tmp_dict = {'gen_claim' : cur_paraphrased_sent,
                                               'model_paraphrase' : model_name_t5}
                dict_cur_row = cur_row.to_dict()
                cur_tmp_dict.update(dict_cur_row)
                list_results_fine_tuned.append(cur_tmp_dict)
                
                
        except Exception as e:
            logging.info(">>> Exception genereted claim >>> ")
            logging.info(cur_row["org_claim"])
            logging.info(e)     
            print('exc : ', e)      
            
    return pd.DataFrame(list_results_fine_tuned)

In [54]:
def get_paraphrased_sentence_stat_no_ft(cur_row_org_detail_paraphrased_sent):
    dict_cur_org_claim = cur_row_org_detail_paraphrased_sent.to_dict()
    cur_paraphrased_sent = cur_row_org_detail_paraphrased_sent['gen_claim']
    dict_all_results = get_results_by_gen_claim(cur_paraphrased_sent, dict_cur_org_claim)
    return pd.Series(dict_all_results)
    

In [55]:
def get_paraphrased_sentence_with_ft(df_dataset_to_be_paraphrased, model_t5, tokenizer_t5, model_name_t5):
    '''
    args:
    df_dataset_to_be_paraphrased : Dataset, those were successfully retrived by current scifact model
    '''
    list_results_fine_tuned = []
    dic_key_sentence_info = model_name_t5+"_sentences_info"
    
    for index_df, cur_row in df_dataset_to_be_paraphrased.iloc[:,:].iterrows():
        try:
            list_paraphrased_claims = get_t5_gen_sentences(org_sentence = cur_row["org_claim"], 
                                                          model_t5 = model_t5, tokenizer_t5 = tokenizer_t5)#get_t5_gen_sentences(cur_row["org_claim"])
            
            for cur_paraphrased_sent in list_paraphrased_claims:
                cur_tmp_dict = {'gen_claim' : cur_paraphrased_sent,
                                               'model_paraphrase' : model_name_t5}
                dict_cur_row = cur_row.to_dict()
                cur_tmp_dict.update(dict_cur_row)
                list_results_fine_tuned.append(cur_tmp_dict)
                
                
        except Exception as e:
            logging.info(">>> Exception genereted claim >>> ")
            logging.info(cur_row["org_claim"])
            logging.info(e)     
            print('exc : ', e)      
            
    return pd.DataFrame(list_results_fine_tuned)

In [56]:
#df_org_support_major[10:, :].progress_apply(lambda x: get_paraphrased_sentence_stat_no_ft(x), axis=1)

In [57]:
def get_paraphrased_sentence(df_dataset_to_be_paraphrased, model_t5, tokenizer_t5, model_name_t5):
    '''
    args:
    df_dataset_to_be_paraphrased : Dataset, those were successfully retrived by current scifact model
    '''
    list_results_fine_tuned = []
    dic_key_sentence_info = model_name_t5+"_sentences_info"
    
    for index_df, cur_row in tqdm(df_dataset_to_be_paraphrased.iloc[:,:].iterrows(), total=len(df_dataset_to_be_paraphrased)):
        cur_res = {}

        dic_info_org_claim = get_stat_of_original_claim(cur_row)
        cur_res["org_claim_info"] = dic_info_org_claim    
        #print(cur_res)
        
        try:
            list_paraphrased_claims = get_t5_gen_sentences(org_sentence = cur_row["org_claim"], 
                                                          model_t5 = model_t5, tokenizer_t5 = tokenizer_t5)#get_t5_gen_sentences(cur_row["org_claim"])
            
            list_paraphrased_claims_with_sim_threshold = []
            for cur_paraphrased_sent in list_paraphrased_claims:
                ## Enable if need to measure similarity score                
                #                 cur_similarity_score = get_sentence_similarity_score(model_sim_diltillroberta_base,
                #                                                                     cur_row['org_claim'], 
                #                                                                     cur_paraphrased_sent)
                cur_similarity_score = 1.0
                if cur_similarity_score >= PARAPHRASE_PROJECT_SETTINGS['run_settings']['SIMILARITY_THRESHOLD']:
                    list_paraphrased_claims_with_sim_threshold.append(cur_paraphrased_sent)
            #Filter paraphrased sentences with tech terms
                
            #for cur_paraphrased_sent in list_paraphrased_claims_with_sim_threshold:
                
            
            list_dic_paraphrased_info = []
            for cur_paraphrased_sent in list_paraphrased_claims_with_sim_threshold:
                cur_dic_paraphraased_claim_info = get_results_by_gen_claim(cur_paraphrased_sent, dic_info_org_claim)
                cur_dic_paraphraased_claim_info["model"] = model_name_t5
                
#                 cur_dic_paraphraased_claim_info['passed_ner_abr_filter_ic'] = filter_and_replace_tech_term_paraphrased_claim(cur_paraphrased_sent, 
#                                                                                                                              dic_info_org_claim['org_claim'])
#                 dict_mlnli_labels = get_mlnli_label(dic_info_org_claim['org_claim'], cur_paraphrased_sent)
#                 cur_dic_paraphraased_claim_info.update(dict_mlnli_labels)
                
                list_dic_paraphrased_info.append(cur_dic_paraphraased_claim_info)                 
                
            cur_res[dic_key_sentence_info] = list_dic_paraphrased_info      
            
        except Exception as e:
            logging.info(">>> Exception genereted claim >>> ")
            logging.info(cur_row["org_claim"])
            logging.info(e)     
            print('exc : ', e)
        #print(cur_res)
        list_results_fine_tuned.append(cur_res)
            
    #print(len(list_results_fine_tuned))
    ## Formatting dataframe
    result_as_dict = []
    for cur_claim in list_results_fine_tuned:
        #print(cur_claim.keys())
        for cur_gen_paraphrased_claim in cur_claim[dic_key_sentence_info]:
            cur_merged_dict = {**cur_claim["org_claim_info"], **cur_gen_paraphrased_claim}
            result_as_dict.append(cur_merged_dict)
            #print('cur_merged_dict : ', cur_merged_dict)
    #print(len(result_as_dict))
    return pd.DataFrame(result_as_dict)

In [58]:
df_org_claims_by_scifact = pd.read_pickle(PARAPHRASE_PROJECT_SETTINGS['file_and_dirs']['file_org_claims_by_scifact'])
df_org_claims_by_scifact_support_major, df_org_claims_by_scifact_refute_major, df_org_claims_by_scifact_sci_success = get_dataframes_by_majority_org_claim(df_org_claims_by_scifact)
df_org_claims_by_scifact_majority = pd.concat([df_org_claims_by_scifact_support_major, df_org_claims_by_scifact_refute_major], ignore_index=True)

In [59]:
df_org_claims_by_scifact_support_major.to_csv(log_dir+'df_org_claims_by_scifact_support_major.csv')
df_org_claims_by_scifact_refute_major.to_csv(log_dir+'df_org_claims_by_scifact_refute_major.csv')

print(df_org_claims_by_scifact_support_major.shape)
print(df_org_claims_by_scifact_refute_major.shape)

(204, 13)
(118, 13)


In [60]:
log_dir

'../../dfs_generated/paraphrased/paws/separate_t5_for_majority_tech_term_mlnli/v1/logs/'

In [61]:
# Load experiment setup
paraphrase_model_path_url = [_x['model_path_or_url'] for  _x in PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['list_potential_paraphrase_models'] if _x['is_selected'] == True]
paraphrase_model_path_url = paraphrase_model_path_url[0]
list_paraphrase_model_names = [_x['model_name'] for  _x in PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['list_potential_paraphrase_models'] if _x['is_selected'] == True]
paraphrase_model_name = list_paraphrase_model_names[0]


In [62]:
# Load no-tuned scifact model
model_t5_not_fine_tuned = AutoModelForSeq2SeqLM.from_pretrained(paraphrase_model_path_url)
tokenizer_t5_not_fine_tuned = AutoTokenizer.from_pretrained(paraphrase_model_path_url)  
_ = model_t5_not_fine_tuned.to(device)


logging.info('### PARAPHRASE WITH NO-FINE-TUNED MODEL -> ###')
#remove duplicates from the original dataset
df_org_claims_by_scifact = df_org_claims_by_scifact.drop_duplicates('org_claim', keep='first')
df_org_claims_by_scifact.to_csv(log_dir+'df_paraphrased_org_majority_unique.csv')
#get paraphrased sentences from no-tuned model
df_paraphrased_selected_model_full = get_paraphrased_sentence_no_ft(df_dataset_to_be_paraphrased = df_org_claims_by_scifact_majority, 
                                                                               model_t5 = model_t5_not_fine_tuned, 
                                                                               tokenizer_t5 = tokenizer_t5_not_fine_tuned, 
                                                                               model_name_t5 = paraphrase_model_name)

model_t5_not_fine_tuned = model_t5_not_fine_tuned.cpu()    
del model_t5_not_fine_tuned

## Pass fillter
logging.info('### FILTER ALL WITH NO-FINE-TUNED MODEL -> ###')
# Filter for entailment check
df_paraphrased_selected_model_full[['mlnli_val_org_gen', 'mlnli_label_org_gen', 'mlnli_val_gen_org', 'mlnli_label_gen_org']] = df_paraphrased_selected_model_full.apply(lambda cur_row : get_mlnli_label (cur_row['org_claim'], cur_row['gen_claim']), axis=1)

#Filter for tech terms check
df_paraphrased_selected_model_full['passed_ner_abr_filter_ic'] = df_paraphrased_selected_model_full.apply(lambda x: filter_and_replace_tech_term_paraphrased_claim(x['gen_claim'], x['org_claim']), axis=1)

report_df_filter(df_paraphrased_selected_model_full, 'both dataset' ,0)
# pass all valids to scifact

df_paraphrased_filtered = df_paraphrased_selected_model_full[
    (df_paraphrased_selected_model_full['passed_ner_abr_filter_ic'] == True) &
    (df_paraphrased_selected_model_full['mlnli_label_org_gen'] == 'entailment') &
    (df_paraphrased_selected_model_full['mlnli_label_gen_org'] == 'entailment')
    
]

#Check majority
logging.info('### GET SUPPORT or REFUTE MAJORITY FILTERED ORG WITH NO-FINE-TUNED MODEL -> ###')
df_org_support_major_filtered, df_org_refute_major_filtered, df_all_cur_model_filtered = get_dataframes_by_majority_org_claim(df_paraphrased_filtered)
report_dataframes_by_majority_org_claim(df_org_support_major_filtered, df_org_refute_major_filtered, df_all_cur_model_filtered)


df_org_support_major_paraphrased_stat = df_org_support_major_filtered.iloc[:, :].progress_apply(lambda x: get_paraphrased_sentence_stat_no_ft(x), axis=1, result_type="expand")
df_org_refute_major_paraphrased_stat = df_org_refute_major_filtered.iloc[:, :].progress_apply(lambda x: get_paraphrased_sentence_stat_no_ft(x), axis=1, result_type="expand")

df_org_support_major_paraphrased_stat = pd.concat([df_org_support_major_filtered, df_org_support_major_paraphrased_stat], axis='columns')
df_org_refute_major_paraphrased_stat = pd.concat([df_org_refute_major_filtered, df_org_refute_major_paraphrased_stat], axis='columns')


df_org_support_major_paraphrased_stat.to_csv(log_dir+'df_org_support_major_paraphrased_stat.csv')
df_org_refute_major_paraphrased_stat.to_csv(log_dir+'df_org_refute_major_paraphrased_stat.csv')
# Get successfullt attacked claims after paraphrased
logging.info('### GET SUCCESSFULL ATTACK WITH NO-FINE-TUNED MODEL -> ###')
df_org_support_gen_refute_stat, df_org_refute_gen_support_stat = get_df_succesfully_attacked_claim(df_org_support_major_paraphrased_stat, df_org_refute_major_paraphrased_stat)
##report_df_succesfully_attacked_claim(df_org_support_gen_refute_stat, df_org_refute_gen_support_stat, cur_epoch = CUR_NO_OF_EPOCH_FT)

##report_df_filter(df_org_support_gen_refute_stat, 'org SUP gen REF' ,CUR_NO_OF_EPOCH_FT)
##report_df_filter(df_org_refute_gen_support_stat, 'org REF gen SUP' ,CUR_NO_OF_EPOCH_FT)

df_org_support_gen_refute_stat['attack_type'] = 'org_sup_to_gen_ref'
df_org_refute_gen_support_stat['attack_type'] = 'org_ref_to_gen_sup'

df_org_support_gen_refute_stat.to_csv(log_dir+'df_org_support_gen_refute_stat.csv')
df_org_refute_gen_support_stat.to_csv(log_dir+'df_org_refute_gen_support_stat.csv')
#report_df_succesfully_attacked_claim(df_org_support_gen_refute_stat, df_org_refute_gen_support_stat, cur_epoch = CUR_NO_OF_EPOCH_FT)

df_successful_filter_attacked = pd.concat([df_org_support_gen_refute_stat, df_org_refute_gen_support_stat], ignore_index=True)

fle_dataframe_to_save = paraphrase_model_name+'_'+str(PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION'])+'_'+str(CUR_NO_OF_EPOCH_FT)+'_FT'

with open(loc_project_opt_location+fle_dataframe_to_save+'_concat_prev.pkl', 'wb') as fp:
    pickle.dump(df_successful_filter_attacked, fp)



  0%|          | 0/556 [00:00<?, ?it/s]



  0%|          | 0/388 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
 
while(True):
#     df_fine_tuning_dataset = None    
#     ## Select dataset for fine-tuning ::either support major or refute major
#     if PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION'] == ParaphraseTargetDirection.org_support_to_gen_refute:
#         df_fine_tuning_dataset = df_successful_filter_attacked[df_successful_filter_attacked['attack_type'] == 'org_sup_to_gen_ref']
#     elif PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION'] == ParaphraseTargetDirection.org_refute_to_gen_support:
#         df_fine_tuning_dataset = df_successful_filter_attacked[df_successful_filter_attacked['attack_type'] == 'org_ref_to_gen_sup']
#     else:
#         raise ValueError('Select a direction of fine tuning dataset')

######################################## SUP INPUT #####################################
    df_org_refute_gen_support_basic = df_org_refute_gen_support_stat[[ 'org_claim','gen_claim','attack_type']]
    df_org_refute_gen_support_basic_inv = df_org_refute_gen_support_stat[['gen_claim', 'org_claim', 'attack_type']].rename(columns={'gen_claim': 'org_claim', 'org_claim': 'gen_claim'})
    df_org_refute_gen_support_basic_inv['attack_type'] = 'org_ref_gen_sup_inv'
    
    df_org_support_gen_refute_basic = df_org_support_gen_refute_stat[[ 'org_claim','gen_claim','attack_type']]
    df_org_support_gen_refute_basic_inv = df_org_support_gen_refute_stat[['gen_claim', 'org_claim', 'attack_type']].rename(columns={'gen_claim': 'org_claim', 'org_claim': 'gen_claim'})
    df_org_support_gen_refute_basic_inv['attack_type'] = 'org_sup_gen_ref_inv'
       
    df_fine_tuning_dataset_sup_to_ref = pd.concat([df_org_support_gen_refute_basic, df_org_refute_gen_support_basic_inv], ignore_index=True)
    df_fine_tuning_dataset_ref_to_sup = pd.concat([df_org_refute_gen_support_basic, df_org_support_gen_refute_basic_inv], ignore_index=True)
    
    CUR_NO_OF_EPOCH_FT += 1   
    
    notify.send(str(CUR_NO_OF_EPOCH_FT))
    logging.info('### FINE TUNING MODEL with SUP MAJOR-> ###')
    ## Train model with fine-tuning dataset
    df_fine_tuning_dataset_sup_to_ref.reset_index(drop=True, inplace=True)
    df_fine_tuning_dataset_sup_to_ref.to_csv(log_dir+'df_fine_tuning_dataset_sup_to_ref_'+str(CUR_NO_OF_EPOCH_FT)+'.csv')
    train_split_size = PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_TRAIN_SPLIT']
    df_train_fine_tune_sup_to_ref, df_validate_fine_tune_sup_to_ref = get_train_test_dataset(df_fine_tuning_dataset_sup_to_ref, train_split_size)

    num_train_epochs = PARAPHRASE_PROJECT_SETTINGS['run_settings']['NUM_OF_EPOCH_REQ_FT']
    fineTuneHyperParam_sup_to_ref = FineTuneHyperParams(model_name_path = paraphrase_model_path_url, 
                                             num_train_epochs = num_train_epochs, df_train = df_train_fine_tune_sup_to_ref, 
                                             df_val = df_validate_fine_tune_sup_to_ref, df_train_val = df_fine_tuning_dataset_sup_to_ref)

    model_t5_fine_tuned_sup_to_ref = T5FineTuner(fineTuneHyperParam_sup_to_ref.args_fine_tune_ns)
    trainer_model_t5_fine_tune_sup_to_ref = pl.Trainer(**fineTuneHyperParam_sup_to_ref.train_params_fine_tune)
    trainer_model_t5_fine_tune_sup_to_ref.fit(model_t5_fine_tuned_sup_to_ref)        


    # ask model to generate paraphrase
    tokenizer_t5 = AutoTokenizer.from_pretrained(paraphrase_model_path_url)  
    _ = trainer_model_t5_fine_tune_sup_to_ref.model.to(device)

    # Ask fine-tuned model to paraphrase
    df_paraphrased_sup_major = get_paraphrased_sentence_with_ft(df_dataset_to_be_paraphrased = df_org_claims_by_scifact_support_major, 
                                                                               model_t5 = model_t5_fine_tuned_sup_to_ref, 
                                                                               tokenizer_t5 = tokenizer_t5, 
                                                                               model_name_t5 = paraphrase_model_name)


    # Filter for entailment check
    df_paraphrased_sup_major[['mlnli_val_org_gen', 'mlnli_label_org_gen', 'mlnli_val_gen_org', 'mlnli_label_gen_org']] = df_paraphrased_sup_major.apply(lambda cur_row : get_mlnli_label (cur_row['org_claim'], cur_row['gen_claim']), axis=1)

    #Filter for tech terms check
    df_paraphrased_sup_major['passed_ner_abr_filter_ic'] = df_paraphrased_sup_major.apply(lambda x: filter_and_replace_tech_term_paraphrased_claim(x['gen_claim'], x['org_claim']), axis=1)

    #report_df_filter(df_paraphrased_selected_model_full, 'both dataset' ,0)

    df_paraphrased_sup_major_filtered = df_paraphrased_sup_major[
        (df_paraphrased_sup_major['passed_ner_abr_filter_ic'] == True) &
        (df_paraphrased_sup_major['mlnli_label_org_gen'] == 'entailment') &
        (df_paraphrased_sup_major['mlnli_label_gen_org'] == 'entailment')
    ]

    #Check majority
    logging.info('### GET SUPPORT or REFUTE MAJORITY FILTERED ORG WITH FINE-TUNED MODEL -> ###')
    df_org_support_major_filtered, df_org_refute_major_filtered, df_all_success_filtered = get_dataframes_by_majority_org_claim(df_paraphrased_sup_major_filtered)
    #report_dataframes_by_majority_org_claim(df_org_support_major, df_org_refute_major, df_all_cur_model_filtered)

    logging.info('### GET DETAIL and STAT of GEN WITH FINE-TUNED MODEL -> ###')
    df_org_support_major_paraphrased_stat = df_org_support_major_filtered.iloc[:, :].progress_apply(lambda x: get_paraphrased_sentence_stat_no_ft(x), axis=1, result_type="expand")
#    #df_org_refute_major_paraphrased_stat = df_org_refute_major.iloc[:, :].progress_apply(lambda x: get_paraphrased_sentence_stat_no_ft(x), axis=1, result_type="expand")


    df_org_support_major_paraphrased_stat = pd.concat([df_paraphrased_sup_major_filtered, df_org_support_major_paraphrased_stat], axis='columns')
    #df_org_empty_refute_major_paraphrased_stat = pd.DataFrame(columns = df_org_support_major_paraphrased_stat.columns)
#    #df_org_refute_major_paraphrased_stat = pd.concat([df_org_refute_major, df_org_refute_major_paraphrased_stat], axis='columns')

#report
    # Get successfullt attacked claims after paraphrased
    logging.info('### GET SUCCESSFULL ATTACK WITH FINE-TUNED MODEL -> ###')
    df_org_support_gen_refute_stat= get_df_succesfully_attacked_claim_support_major(df_org_support_major_paraphrased_stat)
#    report_df_succesfully_attacked_claim(df_org_support_gen_refute_stat, df_org_refute_gen_support_stat, cur_epoch = CUR_NO_OF_EPOCH_FT)

    #report_df_filter(df_org_support_gen_refute_stat, 'org SUP gen REF' ,CUR_NO_OF_EPOCH_FT)
    #report_df_filter(df_org_refute_gen_support_stat, 'org REF gen SUP' ,CUR_NO_OF_EPOCH_FT)

    df_org_support_gen_refute_stat['attack_type'] = 'org_sup_to_gen_ref'
    df_org_support_gen_refute_stat.to_csv(log_dir+'df_org_support_gen_refute_stat_'+str(CUR_NO_OF_EPOCH_FT)+'.csv')
#    #df_org_refute_gen_support_stat['attack_type'] = 'org_ref_to_gen_sup'
    model_t5_fine_tuned_sup_to_ref = model_t5_fine_tuned_sup_to_ref.cpu()    
    del model_t5_fine_tuned_sup_to_ref
    
################################ refute major ####################################

    df_fine_tuning_dataset_ref_to_sup.reset_index(drop=True, inplace=True)
    df_fine_tuning_dataset_ref_to_sup.to_csv(log_dir+'df_fine_tuning_dataset_ref_to_sup_'+str(CUR_NO_OF_EPOCH_FT)+'.csv')
    train_split_size = PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_TRAIN_SPLIT']
    df_train_fine_tune_ref_to_sup, df_validate_fine_tune_ref_to_sup = get_train_test_dataset(df_fine_tuning_dataset_ref_to_sup, train_split_size)

    num_train_epochs = PARAPHRASE_PROJECT_SETTINGS['run_settings']['NUM_OF_EPOCH_REQ_FT']
    fineTuneHyperParam_ref_to_sup = FineTuneHyperParams(model_name_path = paraphrase_model_path_url, 
                                             num_train_epochs = num_train_epochs, df_train = df_train_fine_tune_ref_to_sup, 
                                             df_val = df_validate_fine_tune_ref_to_sup, df_train_val = df_fine_tuning_dataset_ref_to_sup)

    model_t5_fine_tuned_ref_to_sup = T5FineTuner(fineTuneHyperParam_ref_to_sup.args_fine_tune_ns)
    trainer_model_t5_fine_tune_ref_to_sup = pl.Trainer(**fineTuneHyperParam_ref_to_sup.train_params_fine_tune)
    trainer_model_t5_fine_tune_ref_to_sup.fit(model_t5_fine_tuned_ref_to_sup)        


    # ask model to generate paraphrase
    tokenizer_t5 = AutoTokenizer.from_pretrained(paraphrase_model_path_url)  
    _ = trainer_model_t5_fine_tune_ref_to_sup.model.to(device)

    # Ask fine-tuned model to paraphrase
    df_paraphrased_ref_major = get_paraphrased_sentence_with_ft(df_dataset_to_be_paraphrased = df_org_claims_by_scifact_refute_major, 
                                                                               model_t5 = model_t5_fine_tuned_ref_to_sup, 
                                                                               tokenizer_t5 = tokenizer_t5, 
                                                                               model_name_t5 = paraphrase_model_name)


    # Filter for entailment check
    df_paraphrased_ref_major[['mlnli_val_org_gen', 'mlnli_label_org_gen', 'mlnli_val_gen_org', 'mlnli_label_gen_org']] = df_paraphrased_ref_major.apply(lambda cur_row : get_mlnli_label (cur_row['org_claim'], cur_row['gen_claim']), axis=1)

    #Filter for tech terms check
    df_paraphrased_ref_major['passed_ner_abr_filter_ic'] = df_paraphrased_ref_major.apply(lambda x: filter_and_replace_tech_term_paraphrased_claim(x['gen_claim'], x['org_claim']), axis=1)

    #report_df_filter(df_paraphrased_selected_model_full, 'both dataset' ,0)

    df_paraphrased_ref_major_filtered = df_paraphrased_ref_major[
        (df_paraphrased_ref_major['passed_ner_abr_filter_ic'] == True) &
        (df_paraphrased_ref_major['mlnli_label_org_gen'] == 'entailment') &
        (df_paraphrased_ref_major['mlnli_label_gen_org'] == 'entailment')
    ]

    #Check majority
    logging.info('### GET REFUTE or SUPPORT MAJORITY FILTERED ORG WITH FINE-TUNED MODEL -> ###')
    df_org_support_major_filtered, df_org_refute_major_filtered, df_all_success_filtered = get_dataframes_by_majority_org_claim(df_paraphrased_ref_major_filtered)
    #report_dataframes_by_majority_org_claim(df_org_support_major, df_org_refute_major, df_all_cur_model_filtered)

    logging.info('### GET DETAIL and STAT of GEN WITH FINE-TUNED MODEL -> ###')
    #df_org_support_major_paraphrased_stat = df_org_support_major_filtered.iloc[:, :].progress_apply(lambda x: get_paraphrased_sentence_stat_no_ft(x), axis=1, result_type="expand")
    df_org_refute_major_paraphrased_stat = df_org_refute_major_filtered.iloc[:, :].progress_apply(lambda x: get_paraphrased_sentence_stat_no_ft(x), axis=1, result_type="expand")


    df_org_refute_major_paraphrased_stat = pd.concat([df_org_refute_major_filtered, df_org_refute_major_paraphrased_stat], axis='columns')
    #df_org_empty_support_major_paraphrased_stat = pd.DataFrame(columns = df_org_refute_major_paraphrased_stat.columns)
#    #df_org_refute_major_paraphrased_stat = pd.concat([df_org_refute_major, df_org_refute_major_paraphrased_stat], axis='columns')

#report
    # Get successfullt attacked claims after paraphrased
    logging.info('### GET SUCCESSFULL ATTACK WITH FINE-TUNED MODEL -> ###')
    df_org_refute_gen_support_stat = get_df_succesfully_attacked_claim_refute_major(df_org_refute_major_paraphrased_stat)
#    report_df_succesfully_attacked_claim(df_org_support_gen_refute_stat, df_org_refute_gen_support_stat, cur_epoch = CUR_NO_OF_EPOCH_FT)

    #report_df_filter(df_org_support_gen_refute_stat, 'org SUP gen REF' ,CUR_NO_OF_EPOCH_FT)
    #report_df_filter(df_org_refute_gen_support_stat, 'org REF gen SUP' ,CUR_NO_OF_EPOCH_FT)

    #df_org_support_gen_refute_stat['attack_type'] = 'org_sup_to_gen_ref'
    df_org_refute_gen_support_stat['attack_type'] = 'org_ref_to_gen_sup'
    df_org_refute_gen_support_stat.to_csv(log_dir+'df_org_refute_gen_support_stat_'+str(CUR_NO_OF_EPOCH_FT)+'.csv')
    model_t5_fine_tuned_ref_to_sup = model_t5_fine_tuned_ref_to_sup.cpu()    
    del model_t5_fine_tuned_ref_to_sup

    #report_df_succesfully_attacked_claim(df_org_support_gen_refute_stat, df_org_refute_gen_support_stat, cur_epoch = CUR_NO_OF_EPOCH_FT)

    df_successful_filter_attacked = pd.concat([df_org_support_gen_refute_stat, df_org_refute_gen_support_stat], ignore_index=True)

    fle_dataframe_to_save = paraphrase_model_name+'_'+str(PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION'])+'_'+str(CUR_NO_OF_EPOCH_FT)+'_FT'

    #fle_dataframe_to_save = paraphrase_model_name+'_'+str(PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION'])+'_'+str(CUR_NO_OF_EPOCH_FT)+'_FT'

    with open(loc_project_opt_location+fle_dataframe_to_save+'_concat_prev.pkl', 'wb') as fp:
        pickle.dump(df_successful_filter_attacked, fp)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  "When using `Trainer(accumulate_grad_batches != 1)` and overriding"
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/549 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/340 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/540 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/343 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/510 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/380 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/546 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/376 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/513 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/359 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/564 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/388 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/521 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/387 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/558 [00:00<?, ?it/s]

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 0: val_loss was not in top 5


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1: val_loss was not in top 5


  0%|          | 0/348 [00:00<?, ?it/s]

In [67]:
df_org_support_gen_refute_stat[['org_claim', 'gen_claim', 'attack_type']]
#df_org_refute_gen_support_stat

Unnamed: 0,org_claim,gen_claim,attack_type
66,Adult tissue-resident macrophages are seeded b...,Adult tissue-resident macrophages are cultivat...,org_sup_to_gen_ref
67,Adult tissue-resident macrophages are seeded b...,Adult tissue-resident macrophages are planted ...,org_sup_to_gen_ref
160,Bariatric surgery leads to negative outcomes i...,The bariatric surgery leads to negative outcom...,org_sup_to_gen_ref
371,Genetic deficiency of mast cells leads to decr...,Genetic deficiency of mast cells leads to decr...,org_sup_to_gen_ref
400,Having a main partner improves HIV outcomes.,One main partner improves HIV outcomes.,org_sup_to_gen_ref
409,Having a main partner worsens HIV outcomes.,Having a principal partner worsens HIV outcomes.,org_sup_to_gen_ref
410,Having a main partner worsens HIV outcomes.,Having a main partner worsens HIV health outco...,org_sup_to_gen_ref
411,Having a main partner worsens HIV outcomes.,Being a main partner worsens HIV results.,org_sup_to_gen_ref
412,Having a main partner worsens HIV outcomes.,A main partner worsens HIV outcomes.,org_sup_to_gen_ref
414,Having a main partner worsens HIV outcomes.,Having a principal partner worsens the HIV out...,org_sup_to_gen_ref


In [71]:
df_org_refute_gen_support_stat[[ 'org_claim','gen_claim','attack_type']]

Unnamed: 0,org_claim,gen_claim,attack_type
1647,76-85% of people with severe mental disorder r...,76-85% of people with severe mental disorder h...,org_ref_to_gen_sup
1649,76-85% of people with severe mental disorder r...,"In low and middle income countries, 76-85% of ...",org_ref_to_gen_sup
1678,Antimicrobial agents are less effective due to...,Antimicrobial agents are less effective due to...,org_ref_to_gen_sup
1703,Autophagy deficiency in the liver increases vu...,Autophagy deficit in the liver increases vulne...,org_ref_to_gen_sup
1730,Birth-weight is negatively associated with bre...,Birth weight is associated with breast cancer ...,org_ref_to_gen_sup
1732,Birth-weight is negatively associated with bre...,A negative association with breast cancer is b...,org_ref_to_gen_sup
1915,General exercise therapy is more effective tha...,Exercise therapy is more effective at reducing...,org_ref_to_gen_sup
1981,MafA phosphorylation enhances its ubiquitination.,MafA phosphorylation increases its ubiquitina...,org_ref_to_gen_sup
2132,Risk-adjusted mortality rates are similar in t...,Risk-adjusteable mortality rates are similar i...,org_ref_to_gen_sup
2139,Sepsis related mortality has remained stable b...,"Between 2009-2014, mortality due to Sepsis rem...",org_ref_to_gen_sup


In [73]:
df_org_refute_gen_support_stat[['gen_claim', 'org_claim', 'attack_type']].rename(columns={'gen_claim': 'org_claim', 'org_claim': 'gen_claim'})

Unnamed: 0,org_claim,gen_claim,attack_type
1647,76-85% of people with severe mental disorder h...,76-85% of people with severe mental disorder r...,org_ref_to_gen_sup
1649,"In low and middle income countries, 76-85% of ...",76-85% of people with severe mental disorder r...,org_ref_to_gen_sup
1678,Antimicrobial agents are less effective due to...,Antimicrobial agents are less effective due to...,org_ref_to_gen_sup
1703,Autophagy deficit in the liver increases vulne...,Autophagy deficiency in the liver increases vu...,org_ref_to_gen_sup
1730,Birth weight is associated with breast cancer ...,Birth-weight is negatively associated with bre...,org_ref_to_gen_sup
1732,A negative association with breast cancer is b...,Birth-weight is negatively associated with bre...,org_ref_to_gen_sup
1915,Exercise therapy is more effective at reducing...,General exercise therapy is more effective tha...,org_ref_to_gen_sup
1981,MafA phosphorylation increases its ubiquitina...,MafA phosphorylation enhances its ubiquitination.,org_ref_to_gen_sup
2132,Risk-adjusteable mortality rates are similar i...,Risk-adjusted mortality rates are similar in t...,org_ref_to_gen_sup
2139,"Between 2009-2014, mortality due to Sepsis rem...",Sepsis related mortality has remained stable b...,org_ref_to_gen_sup


In [None]:
df_org_claims_by_scifact_majority_original = df_org_claims_by_scifact_majority.copy()

In [None]:


df_org_claims_by_scifact_majority = df_org_claims_by_scifact_majority_original.iloc[200:230, :]

In [None]:
# get all model's no_fine_tuned dataset
#df_paraphrased_all_model_full = get_paraphrased_dataframe_all_model_no_fine_tuned()

#Filter and select dataset only for the selected model

paraphrase_model_path_url = [_x['model_path_or_url'] for  _x in PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['list_potential_paraphrase_models'] if _x['is_selected'] == True]
paraphrase_model_path_url = paraphrase_model_path_url[0]
list_paraphrase_model_names = [_x['model_name'] for  _x in PARAPHRASE_PROJECT_SETTINGS['paraphrase_model']['list_potential_paraphrase_models'] if _x['is_selected'] == True]
paraphrase_model_name = list_paraphrase_model_names[0]
# df_paraphrased_selected_model_full = get_paraphrased_dataframe_selected_models(df_paraphrased_all_model_full, 
#                                                                                       list_paraphrase_model_names)

#df_org_claims_by_scifact= df_org_claims_by_scifact.iloc[:50, :].copy()
df_org_support_major, df_org_refute_major, df_all_cur_model_org_success= get_dataframes_by_majority_org_claim(df_org_claims_by_scifact)
report_dataframes_by_majority_org_claim(df_org_support_major, df_org_refute_major, df_all_cur_model_org_success)

df_all_cur_model_org_success = df_all_cur_model_org_success.drop_duplicates('org_claim', keep='first') 

model_t5_not_fine_tuned = AutoModelForSeq2SeqLM.from_pretrained(paraphrase_model_path_url)
tokenizer_t5_not_fine_tuned = AutoTokenizer.from_pretrained(paraphrase_model_path_url)  
_ = model_t5_not_fine_tuned.to(device)
                                                                
df_paraphrased_selected_model_full = get_paraphrased_sentence_with_detail_stat(df_dataset_to_be_paraphrased = df_all_cur_model_org_success, 
                                                                               model_t5 = model_t5_not_fine_tuned, 
                                                                               tokenizer_t5 = tokenizer_t5_not_fine_tuned, 
                                                                               model_name_t5 = paraphrase_model_name)

df_paraphrased_selected_model_full['passed_ner_abr_filter_ic'] = df_paraphrased_selected_model_full.apply(lambda x: filter_and_replace_tech_term_paraphrased_claim(x['gen_claim'], x['org_claim']), axis=1)


df_paraphrased_selected_model_full[['mlnli_val_org_gen', 'mlnli_label_org_gen', 'mlnli_val_gen_org', 'mlnli_label_gen_org']] = df_paraphrased_selected_model_full.apply(lambda cur_row : get_mlnli_label (cur_row['org_claim'], cur_row['gen_claim']), axis=1)


fle_dataframe_to_save = paraphrase_model_name+'_'+str(PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION'])+'_'+str(CUR_NO_OF_EPOCH_FT)+'_FT'

with open('../../dfs_generated/paraphrased/paws/tech_term_ner_mlnli/'+fle_dataframe_to_save+'_concat_prev.pkl', 'wb') as fp:
    pickle.dump(df_paraphrased_selected_model_full, fp)


#df_paraphrased_selected_model_full = pd.read_pickle('../../dfs_generated/paraphrased/paws/cumulative_tech_term_ner_ic/paws_base_no_fine_tune_ParaphraseTargetDirection.org_refute_to_gen_support_0_FT_concat_prev.pkl')


model_t5_not_fine_tuned = model_t5_not_fine_tuned.cpu()    
del model_t5_not_fine_tuned

#df_all_cur_model_org_success = pd.DataFrame(df_all_cur_model_org_success['columns_for_org_claim'].unique(), columns = ['org_claim'])
#split dataframe of support major and refute major
while(True):
    # Get paraphrased sentences with majority of Support or refute.
    df_paraphrased_org_support_major, df_paraphrased_org_refute_major, df_all_paraphrased_cur_model_org_success= get_dataframes_by_majority_org_claim(df_paraphrased_selected_model_full)
    report_dataframes_by_majority_org_claim(df_paraphrased_org_support_major, df_paraphrased_org_refute_major, df_all_paraphrased_cur_model_org_success)


    # Get successfullt attacked claims after paraphrased
    df_org_support_gen_refute, df_org_refute_gen_support = get_df_succesfully_attacked_claim(df_paraphrased_org_support_major, df_paraphrased_org_refute_major)
    report_df_succesfully_attacked_claim(df_org_support_gen_refute, df_org_refute_gen_support, cur_epoch = CUR_NO_OF_EPOCH_FT)

    report_df_filter(df_org_support_gen_refute, 'org SUP gen REF' ,CUR_NO_OF_EPOCH_FT)
    report_df_filter(df_org_refute_gen_support, 'org REF gen SUP' ,CUR_NO_OF_EPOCH_FT)
    
    
    df_fine_tuning_dataset = None

    ## Select dataset for fine-tuning ::either support major or refute major
    if PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION'] == ParaphraseTargetDirection.org_support_to_gen_refute:
        df_fine_tuning_dataset = df_org_support_gen_refute.copy()
    elif PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION'] == ParaphraseTargetDirection.org_refute_to_gen_support:
        df_fine_tuning_dataset = df_org_refute_gen_support.copy()
    else:
        raise ValueError('Select a direction of fine tuning dataset')

    ##Filter tech terms
    df_fine_tuning_dataset = df_fine_tuning_dataset[df_fine_tuning_dataset['passed_ner_abr_filter_ic'] == True]
    
    df_fine_tuning_dataset = df_fine_tuning_dataset[(df_fine_tuning_dataset['mlnli_label_org_gen'] == 'entailment') &
                                       (df_fine_tuning_dataset['mlnli_label_gen_org'] == 'entailment')]
    
    ## Train model with fine-tuning dataset
    df_fine_tuning_dataset.reset_index(drop=True, inplace=True)
    train_split_size = PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_TRAIN_SPLIT']
    df_train_fine_tune, df_validate_fine_tune = get_train_test_dataset(df_fine_tuning_dataset, train_split_size)

    num_train_epochs = PARAPHRASE_PROJECT_SETTINGS['run_settings']['NUM_OF_EPOCH_REQ_FT']
    fineTuneHyperParam = FineTuneHyperParams(model_name_path = paraphrase_model_path_url, 
                                             num_train_epochs = num_train_epochs, df_train = df_train_fine_tune, 
                                             df_val = df_validate_fine_tune, df_train_val = df_fine_tuning_dataset)

    model_t5_fine_tuned = T5FineTuner(fineTuneHyperParam.args_fine_tune_ns)
    trainer_model_t5_fine_tune = pl.Trainer(**fineTuneHyperParam.train_params_fine_tune)
    trainer_model_t5_fine_tune.fit(model_t5_fine_tuned)

    CUR_NO_OF_EPOCH_FT += 1
    tokenizer_t5 = AutoTokenizer.from_pretrained(paraphrase_model_path_url)  
    _ = trainer_model_t5_fine_tune.model.to(device)
    ## Ask fine-tuned model to paraphrase

    ## Evaluate paraphrased dataset
    df_paraphrased_fine_tuned = get_paraphrased_sentence_with_detail_stat(df_dataset_to_be_paraphrased = df_all_cur_model_org_success.iloc[:, :], 
                                              model_t5 = model_t5_fine_tuned, 
                                              tokenizer_t5 = tokenizer_t5, 
                                              model_name_t5 = paraphrase_model_name)

    
    df_paraphrased_fine_tuned['passed_ner_abr_filter_ic'] = df_paraphrased_fine_tuned.apply(lambda x: filter_and_replace_tech_term_paraphrased_claim(x['gen_claim'], x['org_claim']), axis=1)


    df_paraphrased_fine_tuned[['mlnli_val_org_gen', 'mlnli_label_org_gen', 'mlnli_val_gen_org', 'mlnli_label_gen_org']] = df_paraphrased_fine_tuned.apply(lambda cur_row : get_mlnli_label (cur_row['org_claim'], cur_row['gen_claim']), axis=1)

    fle_dataframe_to_save = paraphrase_model_name+'_'+str(PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION'])+'_'+str(CUR_NO_OF_EPOCH_FT)+'_FT'
    #list_paraphrase_model_names[0]+'_'+PARAPHRASE_PROJECT_SETTINGS['run_settings']['PARAPHRASE_FT_DATASET_DIRECTION']+'_'+NUM_OF_EPOCH_REQ_FT
    with open('../../dfs_generated/paraphrased/paws/tech_term_ner_mlnli/'+fle_dataframe_to_save+'_concat_prev.pkl', 'wb') as fp:
        pickle.dump(df_paraphrased_fine_tuned, fp)
    
    print('>> before epoch '+str(CUR_NO_OF_EPOCH_FT)+' size was '+str(len(df_paraphrased_selected_model_full)))
    df_paraphrased_selected_model_full = pd.concat([df_paraphrased_selected_model_full, df_paraphrased_fine_tuned], ignore_index=True)
    print('>> after epoch '+str(CUR_NO_OF_EPOCH_FT)+' size was '+str(len(df_paraphrased_selected_model_full)))

In [None]:
report_dataframes_by_majority_org_claim(df_paraphrased_org_support_major, df_paraphrased_org_refute_major, df_all_paraphrased_cur_model_org_success)

In [None]:
report_df_succesfully_attacked_claim(df_org_support_gen_refute, df_org_refute_gen_support, cur_epoch = CUR_NO_OF_EPOCH_FT)

In [None]:
x = df_org_support_gen_refute.progress_apply(lambda x: filter_and_replace_tech_term_paraphrased_claim(x['gen_claim'], x['org_claim']), axis=1)

In [None]:
#https://stackoverflow.com/questions/29996079/match-a-whole-word-in-a-string-using-dynamic-regex
def filter_and_replace_tech_term_paraphrased_claim(claim_paraphrased, claim_original):
    print('\n>>')
    print(claim_paraphrased)
    print(claim_original)
    #claim_para_trimmed = re.sub('[^a-z]+', ' ', claim_paraphrased.lower())
    df_cur_sentence_word_unq_ner_abr_filtered = df_scispacy_sentence_word_unq_ner_abr_filtered[
        df_scispacy_sentence_word_unq_ner_abr_filtered['claim'] == claim_original
    ]
    for cur_term_row in df_cur_sentence_word_unq_ner_abr_filtered.itertuples(index=False):
        cur_term_row_formatted = r'(?<!\S){}(?!\S)'.format(re.escape(cur_term_row.ner_text))
        res_num = re.findall(cur_term_row_formatted, claim_paraphrased)
        if res_num == []:
            print(cur_term_row_formatted.casefold())
            return False
        
    return True

In [None]:
len(x[x == True])

In [None]:
cur_term_row = 'HIV'
claim_paraphrased = 'HIV Having a main partner worsens HIVd outcomes HIV .'
cur_term_row_formatted = r'(?<!\w){}(?!\w)'.format(re.escape(cur_term_row.))
re.findall(cur_term_row_formatted, claim_paraphrased)

In [None]:
x

In [None]:
cur_term_row_formatted

In [None]:
claim_paraphrased.lower().strip()

In [None]:
re.escape(cur_term_row).lower()

In [None]:
 filter_and_replace_tech_term_paraphrased_claim

In [None]:
print(df_org_support_gen_refute[df_org_support_gen_refute['mlnli_label_org_gen'] == 'entailment'].shape)
print(df_org_support_gen_refute[
    (df_org_support_gen_refute['mlnli_label_org_gen'] == 'entailment') &
    (df_org_support_gen_refute['mlnli_label_gen_org'] == 'entailment')
].shape)
print(df_org_support_gen_refute[df_org_support_gen_refute['passed_ner_abr_filter_ic'] == True].shape)

In [None]:
df_org_support_gen_refute[df_org_support_gen_refute['passed_ner_abr_filter_ic'] == False][['org_claim', 'gen_claim']].values

In [None]:
print(df_org_refute_gen_support[df_org_refute_gen_support['mlnli_label_org_gen'] == 'entailment'].shape)


print(df_org_refute_gen_support[
    (df_org_refute_gen_support['mlnli_label_org_gen'] == 'entailment') &
    (df_org_refute_gen_support['mlnli_label_gen_org'] == 'entailment')
].shape)

print(df_org_refute_gen_support[df_org_refute_gen_support['passed_ner_abr_filter_ic'] == True].shape)

In [None]:
len(df_org_refute_gen_support[df_org_refute_gen_support['mlnli_label_org_gen'] == 'entailment']['org_claim'].unique())

In [None]:
'IL-1β'.lower() == 'IL-1β'.casefold()


In [None]:
report_df_filter(df_org_support_gen_refute, 'org SUP gen REF' ,CUR_NO_OF_EPOCH_FT)

## No fine tuned model

In [None]:
import transformers
import pytorch_lightning

In [None]:
print(transformers.__version__)
print(pytorch_lightning.__version__)

In [None]:
df_fine_tuning_dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("razent/SciFive-large-Pubmed_PMC")  
model = AutoModelForSeq2SeqLM.from_pretrained("razent/SciFive-large-Pubmed_PMC")


In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
model_t5 = T5ForConditionalGeneration.from_pretrained('Vamsi/T5_Paraphrase_Paws')
tokenizer_t5 = T5Tokenizer.from_pretrained('razent/SciFive-large-Pubmed_PMC')       
_ = model_t5.to(device)

In [None]:
org_sentence = 'BCL-2 promotes the apoptotic effects of c-Myc.'
#'A T helper 2 cell (Th2) environment impedes disease development in patients with systemic lupus erythematosus (SLE)'

In [None]:
text =  "paraphrase: " + org_sentence 

encoding = tokenizer_t5.encode_plus(text,pad_to_max_length=True, return_tensors="pt")
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)


outputs = model_t5.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    max_length=256,
    do_sample=True,
    top_k=50,
    top_p=0.99,
    repetition_penalty=3.5,
    early_stopping=True,
    num_return_sequences=10
)

gen_sentences_t5 = []
for output in outputs:
    line = tokenizer_t5.decode(output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    gen_sentences_t5.append(line)

In [None]:
set(gen_sentences_t5)

In [None]:
set(gen_sentences_t5)

In [None]:
import re

In [None]:
target_claim = 'BCL-2 promotes the apoptosis effects of c-myce a4pt gene.'
org_term = 'c-Myce A4PT Gene'
term_simple_form = org_term.lower()
#pattern = re.compile(r'\b{}\b'.format(term_simple_form), re.IGNORECASE)
regex_to_search = r'\b(?=\w)' + re.escape(term_simple_form) + r'\b(?!\w)'
pattern = re.compile(r'\b{}\b'.format(regex_to_search), re.IGNORECASE)
s2=pattern.sub(org_term,target_claim)


In [None]:
pattern

In [None]:
s2

https://blog.devgenius.io/different-ways-to-replace-occurences-of-a-substring-in-python-strings-2911b1f7bf86

https://betterprogramming.pub/5-ways-to-find-the-index-of-a-substring-in-python-13d5293fc76d

In [None]:
!conda env list

In [None]:
!/home/qudratealahyratu/anaconda3/envs/scifact/bin/pip install -U spacy[cuda113]

In [None]:
import thinc
thinc.__version__

In [None]:
!pip help uninstall

In [None]:
import pandas as pd
df_paraphrased_selected_model_full = pd.read_pickle('../../dfs_generated/paraphrased/paws/cumulative_tech_term_ner_ic/paws_base_no_fine_tune_ParaphraseTargetDirection.org_refute_to_gen_support_0_FT_concat_prev.pkl')

In [None]:
df_paraphrased_selected_model_full[]

In [None]:
df_tmp = pd.read_pickle('../../dfs_generated/paraphrased/paws/cumulative_tech_term_ner_ic/paws_base_no_fine_tune_ParaphraseTargetDirection.org_support_to_gen_refute_0_FT_concat_prev.pkl')

In [None]:
df_tmp.columns

In [None]:
df_tmp['passed_ner_abr_filter_ic'].value_counts()

In [None]:
df_tmp['passed_ner_abr_filter_ic'] = df_tmp.progress_apply(lambda x: filter_and_replace_tech_term_paraphrased_claim(x['gen_claim'], x['org_claim']), axis=1)

In [None]:
HTML(df_tmp[df_tmp['passed_ner_abr_filter_ic'] == True][['org_claim', 'gen_claim']].to_html())

In [None]:
pd.read_pickle('../../dfs_generated/paraphrased/t5_no_fine_tune_generated_claim_all_model_df_full_1.pkl').columns

In [None]:
df_tmp = pd.read_pickle('../../dfs_generated/paraphrased/t5_no_fine_tune_generated_claim_all_model_df_full_1.pkl')[['org_claim', 'ground_label', 'ground_list_rationales', 'source',
       'org_count_support', 'org_count_refute', 'org_list_supported_ids',
       'org_list_refuted_ids', 'org_list_supported_confidence',
       'org_list_refuted_confidence', 'org_list_supported_confidence_mean',
       'org_list_refuted_confidence_mean', 'org_comment']]

In [None]:
df_tmp = df_tmp.drop_duplicates('org_claim', keep='first')
df_tmp.reset_index(drop=True, inplace = True)

In [None]:
df_tmp.to_pickle('../../dfs_generated/scifact/org_claim_ext_roberta_roberta.pkl')