# In [None]:
import pandas as pd
import json
import datetime
from pandas.io.json import json_normalize
import requests
import bs4
import re
from tqdm import tqdm
from bs4 import BeautifulSoup
import nltk



# Fetch the Wikipedia page for a claim and collect negative evidence sentences from it.
# Real revisions are fetched by revision id; synthetic revisions (no revision id) use the current page.
def query_claims(revision_id, page, gold_evidence, revision_type, timeout=30):
    """Collect negative-evidence sentences from a Wikipedia page.

    The page is downloaded, every ``<p>`` inside the ``mw-content-text``
    element is split into sentences with NLTK, bracketed/parenthesised spans
    are stripped from each sentence, and the first nine sentences that are at
    least five characters long and do not match ``gold_evidence`` (according
    to ``check_evidence``) are returned.

    Args:
        revision_id: Revision id used for the ``oldid`` query parameter when
            ``revision_type`` is ``'real'``; ignored otherwise.
        page: Page title inserted into the URL.
        gold_evidence: Evidence sentence that must not be returned as a
            negative example.
        revision_type: ``'real'`` to fetch a specific revision, anything else
            to fetch the current version of the page.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the run forever.

    Returns:
        A list of nine sentences on success. On any failure (request error,
        HTTP error status, missing content element, fewer than nine usable
        sentences) a list of ten ``None`` values, which callers detect by
        inspecting the first element.
    """
    # Failure sentinel; its shape is kept as-is because callers test
    # ``result[0]`` against None.
    failure = [None] * 10

    # NOTE(review): ``page`` is interpolated without URL-escaping, so a title
    # containing '&', '?' or '#' would yield a different URL than intended --
    # confirm the dataset titles are URL-safe.
    if revision_type == 'real':
        url = "https://en.wikipedia.org/w/index.php?title=" + str(page) + "&oldid=" + str(revision_id)
    else:
        url = "https://en.wikipedia.org/wiki/" + str(page)

    try:
        response = requests.get(url, timeout=timeout)
        # Error pages (404, 429, 5xx) must not be mined for sentences.
        response.raise_for_status()
    except requests.RequestException:
        return failure

    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find(id="mw-content-text")
    if content is None:
        # Not an article page; there is nothing to extract.
        return failure

    negative_evidences = []
    for paragraph in content.find_all('p'):
        for sentence in nltk.tokenize.sent_tokenize(paragraph.get_text()):
            # Drop citation markers such as "[1]" and parenthesised asides.
            candidate = re.sub(r"[\(\[].*?[\)\]]", "", sentence)
            if len(candidate) < 5 or check_evidence(candidate, gold_evidence):
                continue
            negative_evidences.append(candidate)
            if len(negative_evidences) == 9:
                return negative_evidences

    # The page did not contain enough usable sentences.
    return failure
            

# Return True when a candidate negative example is actually the gold evidence, so it can be skipped.
def check_evidence(neg_example, gold_evidence):
    """Return True when both sentences are the same after normalisation.

    Normalisation removes every span that starts with ``(`` or ``[`` and ends
    at the next ``)`` or ``]``, then drops all non-word characters (including
    whitespace) and lower-cases what is left. Two sentences that differ only
    in punctuation, spacing, case or bracketed asides therefore compare equal.

    Args:
        neg_example: Candidate negative sentence.
        gold_evidence: Reference evidence sentence.

    Returns:
        True if the normalised strings are identical, False otherwise.
    """
    def _normalize(text):
        # Raw strings keep the backslashes as regex escapes rather than
        # (invalid) Python string escapes.
        without_brackets = re.sub(r"[\(\[].*?[\)\]]", "", text)
        return re.sub(r"\W", "", without_brackets).lower()

    return _normalize(neg_example) == _normalize(gold_evidence)
    
#function to add negative examples to original evidence and format evidences into dataframe that can be used for training
def format_examples(df):
    """Combine gold evidence rows with scraped negative examples.

    For every row of ``df`` the function calls ``query_claims`` to obtain
    negative sentences. Each returned sentence becomes a new row with the
    same id and claim and label ``0``. Rows for which ``query_claims``
    reports failure (first element is ``None``) are removed from the output
    altogether.

    Args:
        df: DataFrame with the columns ``unique_id``, ``claim``, ``evidence``,
            ``label``, ``wiki_revision_id``, ``page`` and ``revision_type``.

    Returns:
        A DataFrame with the columns ``id``, ``claim``, ``evidence`` and
        ``label`` holding gold and negative rows, sorted by ``id``.
    """
    final_df = df[['unique_id', 'claim', 'evidence', 'label']].copy()
    final_df = final_df.rename(columns={'unique_id': 'id'})
    neg_examples_list_dict = []
    failed_ids = []

    # Walk the needed columns in lockstep instead of building a row Series
    # with ``df.iloc`` for every single field.
    rows = zip(df['unique_id'], df['claim'], df['evidence'],
               df['wiki_revision_id'], df['page'], df['revision_type'])
    for unique_id, claim, evidence, revision_id, page, revision_type in tqdm(rows, total=len(df)):
        neg_eg = query_claims(revision_id, page, evidence, revision_type)
        if not neg_eg or neg_eg[0] is None:
            # No negatives available: remember the id and drop it later.
            failed_ids.append(unique_id)
            continue
        for neg in neg_eg:
            neg_examples_list_dict.append(
                {'id': unique_id, 'claim': claim, 'evidence': neg, 'label': 0}
            )

    # Filter once instead of re-filtering the frame for every failure.
    if failed_ids:
        final_df = final_df[~final_df['id'].isin(failed_ids)]

    neg_df = pd.DataFrame(neg_examples_list_dict)
    result = pd.concat([final_df, neg_df])
    # A stable sort keeps each gold row ahead of its negatives, and the
    # negatives in the order they were collected.
    result_sort = result.sort_values('id', kind='mergesort')

    return result_sort
    
# Read a JSON-lines file into a pandas DataFrame, drop NOT ENOUGH INFO (NEI) rows, and encode SUPPORTS/REFUTES as 1/2.
def read_tsv(file):
    """Load a JSON-lines dataset and encode its labels numerically.

    Despite the name, the input is read with ``pd.read_json(..., lines=True)``.
    Rows labelled ``NOT ENOUGH INFO`` are discarded; ``SUPPORTS`` becomes ``1``
    and ``REFUTES`` becomes ``2``. Any other label value is left untouched.

    Args:
        file: Path (or file-like object) accepted by ``pd.read_json``.

    Returns:
        The filtered DataFrame with the re-coded ``label`` column.
    """
    df_data = pd.read_json(file, lines=True)
    # Explicit copy: the assignments below must not target a frame derived
    # from boolean indexing (avoids SettingWithCopyWarning).
    df_data = df_data[df_data.label != 'NOT ENOUGH INFO'].copy()
    # Keep the column object-typed so integer codes can be stored whatever
    # string dtype pandas inferred while reading.
    df_data["label"] = df_data["label"].astype(object)
    for label_name, label_code in (("SUPPORTS", 1), ("REFUTES", 2)):
        df_data.loc[df_data.label == label_name, "label"] = label_code
    return df_data
    
# Training split: load, add negative examples, index by id, write TSV.
train_set = read_tsv('train.jsonl')
train_df = format_examples(train_set).set_index('id')
train_df.to_csv('train.tsv', sep="\t")

# Development split.
dev_set = read_tsv('dev.jsonl')
dev_df = format_examples(dev_set).set_index('id')
dev_df.to_csv('dev.tsv', sep="\t")

# Test split.
test_set = read_tsv('test.jsonl')
test_df = format_examples(test_set).set_index('id')
test_df.to_csv('test.tsv', sep="\t")