This file contains our system for the FEVER task

# COMP90042 Project 2019: Automatic Fact Verification

Team: ***Halation***

Group Members: ***Zhouhui Wu*** & ***Dongsheng Xie***

## 0. Dependency

In [0]:
# Install the third-party packages and spaCy models used by the cells below.
# nltk, keras, spacy and allennlp are imported in the preprocessing cell;
# tensorflow is not imported directly here (presumably the Keras backend).
!pip3 install nltk
!pip3 install tensorflow
!pip3 install keras
!pip3 install spacy
# Both spaCy models are loaded later through spacy.load(...).
!python3 -m spacy download en_vectors_web_lg
!python3 -m spacy download en_core_web_sm
!pip3 install allennlp

## 1. Wiki Document Preprocess

In [0]:
import os,json,sys,math
import nltk
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from collections import Counter
import spacy
import random
import allennlp
from allennlp.predictors.predictor import Predictor
from scipy import spatial
import unicodedata
import numpy as np
import keras

class InvertedIndex:
    """Inverted index over a fixed document collection (reused from homework 1).

    For every term id the index keeps two parallel posting lists: the ids of
    the documents containing the term and the term's frequency in each of
    those documents.

    Args:
        vocab: mapping from term to integer term id.
        doc_term_freqs: sequence with one term -> frequency mapping per
            document; the position in the sequence is the document id.
    """

    def __init__(self, vocab, doc_term_freqs):
        n_terms = len(vocab)
        self.vocab = vocab
        self.doc_len = [0] * len(doc_term_freqs)
        self.doc_term_freqs = [[] for _ in range(n_terms)]
        self.doc_ids = [[] for _ in range(n_terms)]
        self.doc_freqs = [0] * n_terms
        self.total_num_docs = 0
        self.max_doc_len = 0
        for docid, term_freqs in enumerate(doc_term_freqs):
            length = sum(term_freqs.values())
            if length > self.max_doc_len:
                self.max_doc_len = length
            self.doc_len[docid] = length
            self.total_num_docs += 1
            for term, freq in term_freqs.items():
                slot = vocab[term]
                self.doc_ids[slot].append(docid)
                self.doc_term_freqs[slot].append(freq)
                self.doc_freqs[slot] += 1

    def num_terms(self):
        """Return the number of posting lists (one per vocabulary entry)."""
        return len(self.doc_ids)

    def num_docs(self):
        """Return the number of indexed documents."""
        return self.total_num_docs

    def docids(self, term):
        """Return the ids of the documents that contain ``term``."""
        return self.doc_ids[self.vocab[term]]

    def freqs(self, term):
        """Return the per-document frequencies of ``term``, parallel to ``docids``."""
        return self.doc_term_freqs[self.vocab[term]]

    def f_t(self, term):
        """Return the number of documents that contain ``term``."""
        return self.doc_freqs[self.vocab[term]]

    def space_in_bytes(self):
        """Estimate the posting-list size, assuming 8 bytes per stored integer."""
        n_docids = sum(len(postings) for postings in self.doc_ids)
        n_freqs = sum(len(postings) for postings in self.doc_term_freqs)
        return n_docids * 8 + n_freqs * 8

def get_index(wiki_documents):
    """Build an inverted index over the wiki page titles.

    Each title is converted to readable text with ``get_raw_word``, tokenised
    with NLTK, and every token is lemmatised and lower-cased before indexing.

    Args:
        wiki_documents: iterable of wiki titles (iterating a dict yields its keys).

    Returns:
        ``(index, wiki_titles)``: the ``InvertedIndex`` and the list of
        original titles, where list position equals document id.
    """
    wiki_titles = list(wiki_documents)
    readable_titles = [get_raw_word(title) for title in wiki_titles]

    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    vocab = {}
    doc_term_freqs = []
    for readable in readable_titles:
        terms = [
            lemmatizer.lemmatize(token).lower()
            for token in nltk.tokenize.word_tokenize(readable)
        ]
        for term in terms:
            # Term ids are handed out in order of first appearance.
            vocab.setdefault(term, len(vocab))
        doc_term_freqs.append(Counter(terms))

    return InvertedIndex(vocab, doc_term_freqs), wiki_titles

def query_tfidf(query, index, k=5):
    """Rank documents against ``query`` with a length-normalised TF-IDF score.

    Each query term found in the index contributes
    ``log(1 + f_dt) * log(N / f_t)`` to every document containing it; the
    summed score is then multiplied by ``1 / sqrt(doc_len)``.

    Args:
        query: iterable of already-normalised terms.
        index: object exposing ``vocab``, ``num_docs()``, ``docids()``,
            ``freqs()``, ``f_t()`` and ``doc_len`` (see ``InvertedIndex``).
        k: number of top-scoring documents to return.

    Returns:
        List of ``(docid, score)`` pairs, best first, at most ``k`` long.
    """
    total_docs = index.num_docs()
    scores = Counter()
    for term in query:
        if term not in index.vocab:
            continue
        doc_freq = index.f_t(term)
        # docids and freqs are parallel posting lists for the term.
        for docid, term_freq in zip(index.docids(term), index.freqs(term)):
            scores[docid] += math.log(1 + term_freq) * math.log(total_docs / doc_freq)
    for docid in scores:
        scores[docid] = (1 / math.sqrt(index.doc_len[docid])) * scores[docid]
    return scores.most_common(k)

def file_name(file_dir):
    """Return the names of all ``.txt`` files found under ``file_dir``.

    The directory tree is walked recursively; only bare file names (no
    directory part) are returned.
    """
    txt_files = []
    for _root, _dirs, files in os.walk(file_dir):
        txt_files.extend(
            str(name) for name in files if os.path.splitext(name)[1] == '.txt'
        )
    return txt_files

def wiki_title_preprocess():
    """Map every wiki page title to the place where its page starts.

    Scans each ``.txt`` file under ``wiki-pages-text`` and records, for the
    first line on which a title appears, the file name and 0-based line number.

    Returns:
        dict mapping title (the first space-separated token of a line) to
        ``(file_name, line_number)``.
    """
    documents = {}
    for name in file_name('wiki-pages-text'):
        # ``with`` guarantees the handle is released even if reading fails.
        with open('wiki-pages-text/' + name, 'r') as file:
            for line_number, line in enumerate(file):
                title = line.split(' ')[0]
                # Keep only the first occurrence: it marks the start of the page.
                if title not in documents:
                    documents[title] = (name, line_number)
    return documents

def get_wiki_document(filename,location):
    """Read every numbered sentence of the wiki page that starts at ``location``.

    Args:
        filename: file name inside ``wiki-pages-text``.
        location: 0-based line number of the first line of the page.

    Returns:
        dict mapping sentence number to the sentence's token list. Lines
        whose second token is not a number are ignored.
    """
    document = {}
    title = None
    with open('wiki-pages-text/' + filename, 'r') as file:
        for line_number, line in enumerate(file):
            if line_number < location:
                continue
            tokens = line.split(' ')
            if line_number == location:
                title = tokens[0]
            elif tokens[0] != title:
                # A different title means the page has ended.
                break
            # Guard against lines that hold a title and nothing else.
            if len(tokens) > 1 and tokens[1].isdigit():
                sentence_number = int(tokens[1])
                tokens[-1] = tokens[-1].replace('\n', '')
                document[sentence_number] = tokens[2:]
    return document

def get_wiki_sentence(filename,location,number):
    """Return one sentence of a wiki page as readable text.

    Args:
        filename: file name inside ``wiki-pages-text``.
        location: 0-based line number of the first line of the page.
        number: sentence number to fetch.

    Returns:
        The sentence rendered by ``get_raw_sentence``; the rendering of an
        empty token list if the page has no such sentence.
    """
    sentence = []
    title = None
    with open('wiki-pages-text/' + filename, 'r') as file:
        for line_number, line in enumerate(file):
            if line_number < location:
                continue
            tokens = line.split(' ')
            if line_number == location:
                title = tokens[0]
            elif tokens[0] != title:
                # A different title means the page has ended.
                break
            # Guard against lines that hold a title and nothing else.
            if len(tokens) > 1 and tokens[1].isdigit() and int(tokens[1]) == number:
                tokens[-1] = tokens[-1].replace('\n', '')
                sentence = tokens[2:]
                break
    return get_raw_sentence(sentence)

def get_wiki_first_sentence(filename,location):
    """Return the line at ``location`` as readable text plus its sentence number.

    Args:
        filename: file name inside ``wiki-pages-text``.
        location: 0-based line number of the first line of the page.

    Returns:
        ``(sentence, number)``. When the second token of the line is not a
        number, ``number`` is 0 and everything after the title is used as
        the sentence. ``('', 0)`` if the file has no such line.
    """
    sentence = ''
    number = 0
    with open('wiki-pages-text/' + filename, 'r') as file:
        for line_number, line in enumerate(file):
            if line_number < location:
                continue
            tokens = line.split(' ')
            # Decide before stripping the newline so a bare "title N" line is
            # still treated as text, and guard against title-only lines.
            numbered = len(tokens) > 1 and tokens[1].isdigit()
            tokens[-1] = tokens[-1].replace('\n', '')
            if numbered:
                number = int(tokens[1])
                content = tokens[2:]
            else:
                number = 0
                content = tokens[1:]
            sentence = get_raw_sentence(content)
            # Only the line at ``location`` matters; stop reading the file.
            break
    return sentence,number

def get_raw_sentence(sentence):
    """Turn a list of wiki tokens into a readable, space-separated string.

    Every token is NFC-normalised. ``_`` becomes a space, the bracket
    placeholders (``-LRB-``, ``-RRB-``, ``-LCB-``, ``-RCB-``, ``-LSB-``,
    ``-RSB-``) become the bracket followed by a space, a lone newline token
    is dropped, and any other token is emitted followed by a space.
    """
    replacements = {
        '_': ' ',
        '-LRB-': '( ',
        '-RRB-': ') ',
        '-LCB-': '{ ',
        '-RCB-': '} ',
        '-LSB-': '[ ',
        '-RSB-': '] ',
    }
    pieces = []
    for token in sentence:
        token = unicodedata.normalize('NFC', token)
        if token == '\n':
            continue
        pieces.append(replacements.get(token, token + ' '))
    return ''.join(pieces)

def get_raw_word(word):
    """Convert a wiki title to readable text.

    Underscores become spaces, bracket placeholders become the matching
    bracket characters, and the result is NFC-normalised.
    """
    substitutions = (
        ('_', ' '),
        ('-LRB-', '('),
        ('-RRB-', ')'),
        ('-LCB-', '{'),
        ('-RCB-', '}'),
        ('-LSB-', '['),
        ('-RSB-', ']'),
    )
    raw_word = word
    for placeholder, replacement in substitutions:
        raw_word = raw_word.replace(placeholder, replacement)
    return unicodedata.normalize('NFC', raw_word)

def get_entities_by_spacy(query):
    """Return the set of entity texts that the pipeline finds in ``query``.

    NOTE(review): relies on a module-level ``nlp`` callable that is not a
    parameter of this function — confirm it is defined before this is called.
    """
    return {entity.text for entity in nlp(query).ents}

def get_entities_by_allen_nlp(query,ner):
    """Extract lower-cased entity strings from ``query`` with the given tagger.

    Consecutive words are merged into one entity while ``sametag`` says the
    next tag matches the tag of the group's first word.

    Args:
        query: sentence to tag.
        ner: object whose ``predict(sentence=...)`` returns a mapping with
            parallel ``'words'`` and ``'tags'`` lists.

    Returns:
        Set of lower-cased entity strings; words tagged ``'O'`` are skipped.
    """
    results = ner.predict(sentence=query)
    words = results['words']
    tags = results['tags']
    total = len(words)

    entities = set()
    start = 0
    while start < total:
        tag = tags[start]
        end = start
        # Grow the group while the following tag matches the group's first tag.
        while end + 1 < total and sametag(tag, tags[end + 1]) and tag != 'O':
            end += 1
        if tag != 'O':
            # NOTE(review): when a group reaches the last word of the sentence,
            # the group's first word is also added on its own.
            if end == total - 1:
                entities.add(words[start].lower())
            entities.add(' '.join(words[start:end + 1]).lower())
        start = end + 1
    return entities

def sametag(tag1,tag2):
    """Return True when both tags are longer than one character and agree
    on everything after their first character (e.g. ``B-PER`` / ``L-PER``)."""
    return len(tag1) > 1 and len(tag2) > 1 and tag1[1:] == tag2[1:]

def get_relevant_document_entity(query,ner,alias_wiki,index,wiki_titles):
    """Collect candidate wiki titles for every entity found in ``query``.

    An entity that is a key of ``alias_wiki`` contributes all titles stored
    under it; any other entity falls back to the top 3 TF-IDF title matches.

    Returns:
        Set of wiki titles.
    """
    documents = set()
    for entity in get_entities_by_allen_nlp(query, ner):
        if entity in alias_wiki:
            matches = alias_wiki[entity]
        else:
            matches = get_relevant_document_tf_idf(entity, index, wiki_titles, 3)
        documents.update(matches)
    return documents

def get_alias_dictionaries(wiki):
    """Build a lookup from readable aliases to the wiki titles they refer to.

    Every title is registered under its readable form and that form
    lower-cased. If the readable form contains both ``(`` and ``)``, the text
    before the first ``(`` (right-stripped) is registered as well, again in
    original and lower case.

    Args:
        wiki: iterable of wiki titles.

    Returns:
        dict mapping alias to a list of titles. A title is appended once per
        registration, so it can appear twice under an alias that equals its
        own lower-cased form.
    """
    alias_wiki = {}
    for processed_title in wiki:
        raw_title = get_raw_word(processed_title)
        aliases = [raw_title, raw_title.lower()]
        if '(' in raw_title and ')' in raw_title:
            # Drop a parenthesised qualifier: "Mercury (planet)" -> "Mercury".
            base = raw_title[:raw_title.index('(')].rstrip()
            aliases += [base, base.lower()]
        for alias in aliases:
            alias_wiki.setdefault(alias, []).append(processed_title)
    return alias_wiki

def get_relevant_document_tf_idf(query,index,wiki_titles,num):
    """Return the titles of the ``num`` best TF-IDF matches for ``query``.

    The query is tokenised with NLTK and every token is lemmatised and
    lower-cased before it is scored with ``query_tfidf``.

    Args:
        query: free text.
        index: title index accepted by ``query_tfidf``.
        wiki_titles: sequence mapping document id to wiki title.
        num: maximum number of matches.

    Returns:
        Set of wiki titles.
    """
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    processed_query = [
        lemmatizer.lemmatize(token).lower()
        for token in nltk.tokenize.word_tokenize(query)
    ]
    ranked = query_tfidf(processed_query, index, k=num)
    return {wiki_titles[match[0]] for match in ranked}

def get_relevant_document_part(query,wiki,alias_wiki,nlp):
    """Retrieve candidate titles from the text before and after the first verb.

    The query is split at its first token tagged ``VERB``. Both halves are
    used for a TF-IDF title lookup (top 3 each) and an exact alias lookup.

    NOTE(review): ``index`` and ``wiki_titles`` are read from module scope,
    not passed in — confirm they are defined before this is called.

    Args:
        query: claim text.
        wiki: unused here.
        alias_wiki: alias -> titles mapping.
        nlp: callable returning a sliceable sequence of tokens that expose
            ``pos_`` and ``text``.

    Returns:
        Set of wiki titles; empty when there is no verb or nothing follows it.
    """
    doc_query = nlp(query)
    split_at = -1
    for position, token in enumerate(doc_query):
        if token.pos_ == 'VERB':
            split_at = position
            break

    # Compare token index with token count (not with the character length of
    # the query string) so a sentence-final verb is rejected.
    if split_at == -1 or split_at + 1 >= len(doc_query):
        return set()

    # Join without a trailing space; otherwise the alias lookup can never hit.
    before = ' '.join(token.text for token in doc_query[0:split_at])
    after = ' '.join(token.text for token in doc_query[split_at + 1:])
    return (
        get_relevant_document_tf_idf(before, index, wiki_titles, 3)
        | get_relevant_document_tf_idf(after, index, wiki_titles, 3)
        | get_relevant_document_part_entity(alias_wiki, before)
        | get_relevant_document_part_entity(alias_wiki, after)
    )

def get_relevant_document_part_entity(alias_wiki,entity):
    """Look up ``entity`` in the alias dictionary, as given and lower-cased.

    Returns:
        Set with every title stored under either spelling; empty if neither
        spelling is a key of ``alias_wiki``.
    """
    documents = set()
    for key in (entity, entity.lower()):
        if key in alias_wiki:
            documents.update(alias_wiki[key])
    return documents

def get_relevant_document_verb(query,wiki,alias_wiki,wiki_titles,index,nlp):
    """Retrieve titles via TF-IDF on the text before and after a verb.

    The first token tagged ``VERB`` that is not the last token splits the
    query; each side contributes its top 2 TF-IDF title matches.

    Args:
        query: claim text.
        wiki, alias_wiki: unused here.
        wiki_titles: sequence mapping document id to wiki title.
        index: title index accepted by ``get_relevant_document_tf_idf``.
        nlp: callable returning a sliceable sequence of tokens that expose
            ``pos_`` and ``text``.

    Returns:
        Set of wiki titles; empty when no suitable verb is found.
    """
    doc_query = nlp(query)
    last = len(doc_query) - 1
    for i, token in enumerate(doc_query):
        if token.pos_ == 'VERB' and i < last:
            # Split on token boundaries. ``i`` is a token index, so slicing
            # the raw query string with it would cut at the wrong character.
            part1 = ' '.join(t.text for t in doc_query[:i])
            part2 = ' '.join(t.text for t in doc_query[i + 1:])
            documents_tf_idf_1 = get_relevant_document_tf_idf(part1, index, wiki_titles, 2)
            documents_tf_idf_2 = get_relevant_document_tf_idf(part2, index, wiki_titles, 2)
            return documents_tf_idf_1 | documents_tf_idf_2
    return set()
                
            

def get_retrieval_documents(claim,wiki,alias_wiki,wiki_titles,index,ner,nlp,parser):
    """Return the candidate wiki titles for ``claim``.

    The NFC-normalised claim is matched two ways: by its named entities and
    by a top-5 TF-IDF lookup against the title index. The two result sets are
    unioned. ``wiki``, ``nlp`` and ``parser`` are accepted but not used; the
    part- and verb-based strategies that would use them are disabled.
    """
    normalized = unicodedata.normalize('NFC', claim)
    by_entity = get_relevant_document_entity(normalized, ner, alias_wiki, index, wiki_titles)
    by_tf_idf = get_relevant_document_tf_idf(normalized, index, wiki_titles, 5)
    return by_entity | by_tf_idf

def allen_nlp_ner():
    """Load and return the AllenNLP NER predictor from its hosted model archive."""
    model_archive = "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz"
    return Predictor.from_path(model_archive)

def save_wiki(wiki):
    """Write the title -> (file name, line number) map to ``wiki.txt``.

    One tab-separated line per title: ``title<TAB>file_name<TAB>line_number``.
    """
    # ``with`` closes the file even if an entry cannot be formatted.
    with open('wiki.txt', 'w') as file:
        for title, location in wiki.items():
            file.write(title + '\t' + location[0] + '\t' + str(location[1]) + '\n')

def save_alias_wiki(alias_wiki):
    """Write the alias -> titles map to ``alias_wiki.txt``.

    One tab-separated line per alias: the alias followed by each of its titles.
    """
    # ``with`` closes the file even if writing fails part-way.
    with open('alias_wiki.txt', 'w') as file:
        for alias, titles in alias_wiki.items():
            file.write(alias + ''.join('\t' + title for title in titles) + '\n')

def load_wiki():
    """Read ``wiki.txt`` back into a title -> (file name, line number) dict.

    Expects lines of the form ``title<TAB>file_name<TAB>line_number``. Lines
    with fewer than three fields (for example blank lines) are skipped.
    """
    wiki = {}
    with open('wiki.txt', 'r') as file:
        for line in file:
            tokens = line.rstrip('\n').split('\t')
            if len(tokens) < 3:
                continue
            wiki[tokens[0]] = (tokens[1], int(tokens[2]))
    return wiki

def load_alias_wiki():
    """Read ``alias_wiki.txt`` back into an alias -> list-of-titles dict.

    Expects tab-separated lines: the alias followed by its titles. Blank
    lines are skipped so they do not create a spurious newline key.
    """
    alias_wiki = {}
    with open('alias_wiki.txt', 'r') as file:
        for line in file:
            line = line.rstrip('\n')
            if not line:
                continue
            tokens = line.split('\t')
            alias_wiki[tokens[0]] = tokens[1:]
    return alias_wiki



In [0]:
# Build the shared retrieval resources once; the cells below reuse these globals.
ner=allen_nlp_ner()
# title -> (file name, line number) of the first line of each wiki page
wiki=wiki_title_preprocess()
# readable alias -> list of wiki titles
alias_wiki=get_alias_dictionaries(wiki)
print('wiki data processed')
# inverted index over the titles, plus the docid -> title list
index,wiki_titles=get_index(wiki)

## 2. Document Selection

In [None]:
def _parse_title_set(text):
    """Parse the ``str()`` form of a set of titles back into a set.

    ``ast.literal_eval`` copes with titles that contain quotes, which slicing
    the representation by hand cannot. ``'set()'`` (the text of an empty set)
    is handled explicitly.
    """
    import ast

    text = text.strip()
    if text == 'set()':
        return set()
    return set(ast.literal_eval(text))


def test_on_document_retrieval(wiki,_test_file,_output_file):
    """Expand retrieved documents into sentences and report document recall.

    Reads the gold data from ``_test_file`` and the per-claim retrieval
    results from ``result.txt`` (lines of ``claim_id<TAB>str(set_of_titles)``).
    For every claim, all sentences of every retrieved page are written to
    ``_output_file`` as JSON with the label fixed to ``'SUPPORTS'``. Finally
    prints the share of distinct gold documents that were retrieved.

    Args:
        wiki: title -> (file name, line number) map.
        _test_file: path of the gold JSON file.
        _output_file: path of the JSON file to write.
    """
    with open(_test_file,'r') as perfect_file:
        perfect=json.load(perfect_file)

    retrieved={}
    with open('result.txt','r') as retrieval_file:
        for line in retrieval_file:
            items=line.split('\t')
            retrieved[items[0]]=_parse_title_set(items[1])

    all_documents=set()
    find_documents=set()
    output={}
    for data in perfect:
        all_documents.update(evidence[0] for evidence in perfect[data]['evidence'])
        evidence_out=[]
        for title in retrieved[data]:
            filename,location=wiki[title]
            sentences=get_wiki_document(filename,location)
            evidence_out.extend((title,number,sentences[number]) for number in sentences)
            find_documents.add(title)
        output[data]={'evidence':evidence_out,'label':'SUPPORTS'}

    find_documents&=all_documents
    with open(_output_file,'w') as output_file:
        output_file.write(json.dumps(output))

    # Avoid dividing by zero when the gold data lists no evidence documents.
    recall=len(find_documents)/len(all_documents) if all_documents else 0.0
    print('real recall: '+str(recall))
        
        
def document_retrieval(wiki,alias_wiki,index,wiki_titles,ner,_test_file,parser=None):
    """Run document retrieval on a labelled set and report perfect-recall rate.

    For every claim in ``_test_file`` the retrieved titles are written to
    ``result.txt`` as ``claim_id<TAB>str(set_of_titles)``. Claims whose gold
    documents were not all retrieved are logged to ``wrong.txt``. Prints the
    share of claims for which every gold document was retrieved.

    Args:
        wiki, alias_wiki, index, wiki_titles, ner: retrieval resources passed
            through to ``get_retrieval_documents``.
        _test_file: path of the labelled JSON file.
        parser: optional extra pipeline, passed through to
            ``get_retrieval_documents``. Accepted so that callers may supply
            it as a seventh positional argument.
    """
    nlp = spacy.load('en_core_web_sm')
    with open(_test_file, 'r') as file:
        training_data=json.load(file)

    perfect_retrieval_instance=0
    all_instance=0
    with open('result.txt','w') as result, open('wrong.txt','w') as wrong:
        for i,data in enumerate(training_data):
            claim=training_data[data]['claim']
            correct_documents={document[0] for document in training_data[data]['evidence']}
            # get_retrieval_documents takes eight arguments, parser being the last.
            retrieval_documents=get_retrieval_documents(claim,wiki,alias_wiki,wiki_titles,index,ner,nlp,parser)
            if correct_documents<=retrieval_documents:
                perfect_retrieval_instance+=1
            else:
                wrong.write(claim+'\t'+str(retrieval_documents)+'\t'+str(correct_documents)+'\n')
            result.write(data+'\t'+str(retrieval_documents)+'\n')
            all_instance+=1

            if i%100==0:
                print(i)

    # Avoid dividing by zero on an empty input file.
    found=perfect_retrieval_instance/all_instance if all_instance else 0.0
    print(found)
    
def test_on_document_retrieval_test(wiki,_test_file,_output_file):
    """Expand the retrieved documents of the unlabelled set into sentences.

    Reads the claims from ``_test_file`` and the retrieval results from
    ``test_result.txt`` (lines of ``claim_id`` followed by tab-separated
    titles). For every claim, all sentences of every retrieved page are
    written to ``_output_file`` as JSON with the label fixed to ``'SUPPORTS'``.

    Args:
        wiki: title -> (file name, line number) map.
        _test_file: path of the claims JSON file.
        _output_file: path of the JSON file to write.
    """
    with open(_test_file,'r') as perfect_file:
        perfect=json.load(perfect_file)

    retrieved={}
    with open('test_result.txt','r') as retrieval_file:
        for line in retrieval_file:
            # Strip the newline before splitting: a claim with no retrieved
            # titles would otherwise be stored under an id ending in '\n'.
            items=line.rstrip('\n').split('\t')
            retrieved[items[0]]=set(items[1:])

    output={}
    for j,data in enumerate(perfect):
        if j%100==0:
            print(j)
        evidence_out=[]
        for title in retrieved[data]:
            filename,location=wiki[title]
            sentences=get_wiki_document(filename,location)
            evidence_out.extend((title,number,sentences[number]) for number in sentences)
        output[data]={
            'claim':perfect[data]['claim'],
            'evidence':evidence_out,
            'label':'SUPPORTS',
        }

    with open(_output_file,'w') as output_file:
        output_file.write(json.dumps(output))
        
        
def document_retrieval_test(wiki,alias_wiki,index,wiki_titles,ner,_test_file,parser):
    """Run document retrieval on the unlabelled set.

    For every claim in ``_test_file`` one line is written to
    ``test_result.txt``: the claim id followed by a tab and title for each
    retrieved document.

    Args:
        wiki, alias_wiki, index, wiki_titles, ner, parser: retrieval resources
            passed through to ``get_retrieval_documents``.
        _test_file: path of the claims JSON file.
    """
    nlp = spacy.load('en_vectors_web_lg')
    with open(_test_file, 'r') as file:
        training_data=json.load(file)

    # ``with`` closes the result file even if retrieval fails part-way.
    with open('test_result.txt','w') as result:
        for i,data in enumerate(training_data):
            claim=training_data[data]['claim']
            retrieval_documents=get_retrieval_documents(claim,wiki,alias_wiki,wiki_titles,index,ner,nlp,parser)
            documents_str=''.join('\t'+doc for doc in retrieval_documents)
            result.write(data+documents_str+'\n')
            if i%100==0:
                print(i)


In [0]:
# document selection validation
# Retrieve documents for the dev set (written to result.txt), then expand
# them into sentences and print document recall.
# NOTE(review): document_retrieval is called with seven positional arguments
# here — confirm its signature accepts the trailing parser.
parser = spacy.load('en_core_web_sm')
document_retrieval(wiki,alias_wiki,index,wiki_titles,ner,'devset.json',parser)
test_on_document_retrieval(wiki,'devset.json','document_selection_output.json')

In [None]:
# document selection test
# Retrieve documents for the unlabelled test set (written to test_result.txt),
# then expand them into sentences for the sentence-selection stage.
parser = spacy.load('en_core_web_sm')
document_retrieval_test(wiki,alias_wiki,index,wiki_titles,ner,'test-unlabelled.json',parser)
test_on_document_retrieval_test(wiki,'test-unlabelled.json','document_selection_output_test.json')

## 3. Sentence Selection

### 3.1 Selection Part 1: Using Sentence Embeddings

In [0]:
def sentence_selection_step(_test_file,_input_file,_output_file,k):
    """Keep the ``k`` candidate sentences most similar to each claim.

    Candidate sentences come from the document-selection output. Pages whose
    title contains ``-LRB-disambiguation-RRB-`` are dropped, pronouns in the
    remaining sentences are replaced by the readable page title, and the
    sentences are ranked with ``sentence_selection``.

    Args:
        _test_file: path of the claims JSON file.
        _input_file: path of the document-selection JSON output.
        _output_file: path of the JSON file to write.
        k: number of sentences to keep per claim.
    """
    with open(_test_file,'r') as test_file:
        test=json.load(test_file)
    with open(_input_file,'r') as last_step_result:
        _result=json.load(last_step_result)
    nlp=spacy.load('en_vectors_web_lg')

    sentences={}
    for data in _result:
        candidates={}
        for document,number,sentence in _result[data]['evidence']:
            if '-LRB-disambiguation-RRB-' in document:
                continue
            if (document,number) not in candidates:
                sentence=replace_all(sentence,get_raw_word(document))
                candidates[(document,number)]=get_raw_sentence(sentence)
        sentences[data]=candidates

    results={}
    for i,data in enumerate(test):
        claim=test[data]['claim']
        all_sentences=sentences[data]
        best_sentences=sentence_selection(all_sentences,claim,nlp,k)
        results[data]={
            'claim':claim,
            'evidence':[
                (document,location,all_sentences[(document,location)])
                for document,location in best_sentences
            ],
            'label':'SUPPORTS',
        }
        if i%100==0:
            print(i)

    with open(_output_file,'w') as output_file:
        output_file.write(json.dumps(results))

              
    
def sentence_selection(all_sentences,query,nlp,k):
    """Pick the ``k`` sentences whose vectors are closest to the query vector.

    A sentence scores 0 when its page title contains a non-alphabetic token
    tagged ``NUM`` whose text does not occur in ``query``. Every other
    sentence scores the cosine similarity between its vector and the query's.

    Args:
        all_sentences: dict mapping ``(document, sentence_number)`` to text.
        query: claim text.
        nlp: callable returning an iterable of tokens (``pos_``, ``is_alpha``,
            ``text``) that also exposes a ``vector`` attribute.
        k: number of sentences to return.

    Returns:
        dict with the ``k`` best-scoring keys of ``all_sentences`` and their text.
    """
    query_vector = nlp(query).vector
    scores = Counter()
    for key in all_sentences:
        document = key[0]
        title_tokens = nlp(get_raw_word(document))
        number_missing = any(
            token.pos_ == 'NUM' and not token.is_alpha and token.text not in query
            for token in title_tokens
        )
        if number_missing:
            scores[key] = 0
            continue
        sentence_vector = nlp(all_sentences[key]).vector
        scores[key] = 1 - spatial.distance.cosine(query_vector, sentence_vector)
    return {key: all_sentences[key] for key, _score in scores.most_common(k)}

def replace_all(sentence,title):
    """Return a copy of ``sentence`` with listed pronouns replaced by ``title``.

    Only exact matches of the listed spellings are replaced; all other words
    are kept as they are.
    """
    pronouns = (
        'It', 'it', 'He', 'he', 'She', 'she', 'They', 'they', 'Them', 'them',
        'Her', 'her', 'His', 'his', 'Its', 'its', 'Their', 'their',
    )
    return [title if word in pronouns else word for word in sentence]

def test_on_sentence_selection(_input_file,_test_file,_output_file):
    """Score selected sentences against gold evidence and write a scorer file.

    Claims labelled ``'NOT ENOUGH INFO'`` are skipped. Prints the share of
    remaining claims for which all gold evidence was selected, then the mean
    per-claim evidence recall. Finally writes every prediction to
    ``_output_file`` as ``{'label': 'SUPPORTS', 'evidence': [(doc, number), ...]}``.

    Args:
        _input_file: path of the sentence-selection JSON output.
        _test_file: path of the gold JSON file.
        _output_file: path of the JSON file to write.
    """
    with open(_input_file,'r') as input_file:
        file=json.load(input_file)
    with open(_test_file,'r') as perfect:
        _perfect=json.load(perfect)

    perfect_instance=0
    total=0
    rec=0.0
    for data in _perfect:
        if _perfect[data]['label']=='NOT ENOUGH INFO':
            continue
        total+=1
        pe={(evidence[0],evidence[1]) for evidence in _perfect[data]['evidence']}
        fe={(evidence[0],evidence[1]) for evidence in file[data]['evidence']}
        found=pe & fe
        rec+=float(len(found))/len(pe)
        if found==pe:
            perfect_instance+=1

    # Avoid dividing by zero when no claim carries evidence.
    print(perfect_instance/total if total else 0.0)
    print(rec/total if total else 0.0)

    _validate={
        data:{
            'label':'SUPPORTS',
            'evidence':[(i[0],i[1]) for i in file[data]['evidence']],
        }
        for data in file
    }
    with open(_output_file,'w') as output:
        output.write(json.dumps(_validate))
  
    
def complete(guess,perfect,_output_file):
    """Fill in missing predictions so that every reference claim has an entry.

    Every claim id in the reference file that is absent from the prediction
    file is given the label ``'NOT ENOUGH INFO'`` with no evidence.

    Args:
        guess: path of the prediction JSON file.
        perfect: path of the reference JSON file.
        _output_file: path of the completed prediction file to write.
    """
    # Read both inputs with ``with`` so their handles are always closed.
    with open(guess,'r') as _predict:
        predict=json.load(_predict)
    with open(perfect,'r') as _devset:
        devset=json.load(_devset)

    for claim_id in devset:
        if claim_id not in predict:
            predict[claim_id]={'label':'NOT ENOUGH INFO','evidence':[]}

    with open(_output_file,'w') as output:
        output.write(json.dumps(predict))

In [0]:
# sentence selection pt 1. validation
# Keep the 50 most similar sentences per dev claim, score them against the
# gold evidence, then add NOT ENOUGH INFO entries for claims with no prediction.
sentence_selection_step('devset.json','document_selection_output.json','sentence_selection_output.json',50)
test_on_sentence_selection('sentence_selection_output.json','devset.json','sentence_selection_output_validate.json')
complete('sentence_selection_output_validate.json','devset.json','o_0.json')

In [0]:
# sentence selection pt 1. test
# Same selection (top 50 sentences per claim) on the unlabelled test set.
sentence_selection_step('test-unlabelled.json','document_selection_output_test.json','sentence_selection_output_test.json',50)

### 3.2 Selection Part 2: Using Key words and Phrases

In [0]:
def sentence_further_selection_entity_step(_test_file,_input_file,_output_file):
    """Narrow the candidate sentences down to at most two per claim.

    A sentence is discarded unless it contains every number found in the
    claim. The remaining sentences are ranked by how many key words (as
    produced by ``get_entity_noun``) they share with the claim. If the best
    sentence strictly beats the runner-up only the best is kept; otherwise
    the top two are kept.

    Args:
        _test_file: path of the claims JSON file.
        _input_file: path of the previous sentence-selection JSON output.
        _output_file: path of the JSON file to write.
    """
    with open(_test_file,'r') as test_file:
        test=json.load(test_file)
    with open(_input_file,'r') as last_step_result:
        _result=json.load(last_step_result)
    nlp=spacy.load('en_core_web_sm')
    ner=allen_nlp_ner()

    cache={}  # (document, number) -> key words and numbers of that sentence
    sentences={}
    for i,data in enumerate(_result):
        if i%100==0:
            print(i)
        claim=test[data]['claim']
        words_claim,numbers_claim=get_entity_noun(claim,nlp,ner)
        evi_dic={}
        similarities=Counter()
        for document,number,sentence in _result[data]['evidence']:
            if '-LRB-disambiguation-RRB-' in document:
                continue
            key=(document,number)
            evi_dic[key]=sentence
            if key not in cache:
                cache[key]=get_entity_noun(sentence,nlp,ner)
            words_sentence,numbers_sentence=cache[key]
            # Every number mentioned in the claim must occur in the sentence.
            if any(a_number not in numbers_sentence for a_number in numbers_claim):
                continue
            similarities[key]=len(words_sentence & words_claim)

        best_sentences=similarities.most_common(2)
        if len(best_sentences)==2 and best_sentences[0][1]>best_sentences[1][1]:
            best_sentences=best_sentences[:1]
        sentences[data]={
            'claim':claim,
            'evidence':[
                (key[0],key[1],evi_dic[key]) for key,_score in best_sentences
            ],
        }

    with open(_output_file,'w') as output_file:
        output_file.write(json.dumps(sentences))
    
def get_words(sentence,nlp):
    """Return the lemmas and the numbers that occur in ``sentence``.

    Args:
        sentence: text to analyse.
        nlp: callable returning an iterable of tokens exposing ``lemma_``
            and ``pos_``.

    Returns:
        ``(words, numbers)``: the set of all token lemmas, and the set of
        float values of tokens tagged ``NUM`` whose lemma consists of digits.
    """
    words = set()
    numbers = set()
    for token in nlp(sentence):
        lemma = token.lemma_
        words.add(lemma)
        if token.pos_ == 'NUM' and lemma.isdigit() and try_float(lemma):
            numbers.add(float(lemma))
    return words,numbers
    
def get_entity_noun(sentence,nlp,ner):
    """Collect the key-word lemmas and the numbers of ``sentence``.

    Key words are gathered from the entity texts, the lemmas of nouns, the
    lemmas of numeric tokens and the lemmas of every token before the first
    verb. That text is run through ``nlp`` again and the lemmas of the result
    form the returned word set.

    Args:
        sentence: text to analyse.
        nlp: callable returning an iterable of tokens (``lemma_``, ``pos_``)
            that also exposes ``ents``.
        ner: unused here.

    Returns:
        ``(entity_noun, numbers)``: set of lemmas and set of float values.
    """
    doc_sentence = nlp(sentence)
    numbers = set()
    pieces = [ent.text for ent in doc_sentence.ents]
    leading = []  # lemmas of the tokens that precede the first verb
    seen_verb = False
    for token in doc_sentence:
        lemma = token.lemma_
        if token.pos_ == 'VERB':
            # Record that the first verb has been reached. The previous code
            # compared with ``==`` here and never recorded it, so every token
            # of the sentence was treated as preceding the verb.
            seen_verb = True
        if not seen_verb:
            leading.append(lemma)
        if token.pos_ == 'NOUN':
            pieces.append(lemma)
        if token.pos_ == 'NUM' and lemma.isdigit() and try_float(lemma):
            number = float(lemma)
            if number not in numbers:
                numbers.add(number)
                pieces.append(lemma)
    temp = ''.join(piece + ' ' for piece in pieces + leading)
    entity_noun = {token.lemma_ for token in nlp(temp)}
    return entity_noun,numbers

  
def try_float(number):
    """Return True if ``number`` converts to ``float``.

    The exact string ``'NaN'`` is rejected, as is any value for which
    ``float`` raises ``ValueError``.
    """
    if number == 'NaN':
        return False
    try:
        float(number)
    except ValueError:
        return False
    return True


In [0]:
# sentence selection pt 2. validation
# Narrow the 50 candidates per dev claim down to at most two, score them,
# fill in missing claims, then run the scoring script on the result.
sentence_further_selection_entity_step('devset.json','sentence_selection_output.json','sentence_further_selection_output_50.json')
test_on_sentence_selection('sentence_further_selection_output_50.json','devset.json','sentence_further_selection_output_validate.json')
complete('sentence_further_selection_output_validate.json','devset.json','o_f.json')
!python3 score.py devset.json o_f.json

In [0]:
# sentence selection pt 2. test
# Same narrowing step on the unlabelled test set.
sentence_further_selection_entity_step('test-unlabelled.json','sentence_selection_output_test.json','sentence_further_selection_output_test.json')

## 4. Inference

### 4.1 NLI Model Training

In [0]:
# The three scripts below live outside this notebook and run in this order.
# generate FEVER dataset for NLI training
!python3 fever_set.py
# train on SNLI dataset
!python3 train_nli_fever.py
# fine-tune on FEVER dataset
# (presumably produces the da_fever.h5 model loaded in section 4.2 — confirm)
!python3 da_fever.py

### 4.2 Testing FEVER Dataset

In [0]:
def probability_file(name):
    """Parse an NLI output file into per-claim probability lists.

    Each line is expected to be
    ``claim_id<TAB>title<TAB>sentence_number<TAB>[p0 p1 p2]``, where the last
    field is a bracketed, whitespace-separated list of floats.

    Args:
        name: path of the NLI output file.

    Returns:
        dict mapping claim id to a dict that maps ``(title, sentence_number)``
        to the list of parsed probabilities.
    """
    claims={}
    with open(name,'r') as file:
        for line in file:
            items=line.split('\t')
            _id=items[0]
            title=items[1]
            number=int(items[2])
            # Strip whitespace and brackets rather than slicing fixed offsets,
            # so a final line without a newline does not lose its last digit.
            values=items[3].strip().strip('[]').split()
            claims.setdefault(_id,{})[(title,number)]=[float(value) for value in values]
    return claims

  
def performance_predict(_test_file):
    """Print sentence-level accuracy figures for the NLI output.

    Reads predictions from ``nli_output_fever.txt`` and gold data from
    ``_test_file``. The predicted class is the arg-max of the probability
    list: 0 counts as correct for gold evidence of a ``SUPPORTS`` claim,
    1 for gold evidence of a ``REFUTES`` claim, and any other class for a
    sentence that is not gold evidence.

    Prints, in order: overall accuracy; the raw counts
    ``(not_related_error, not_related, related_error, related, total)``;
    the shares of non-evidence and evidence sentences; the error rates
    within each of those two groups.
    """
    def ratio(numerator, denominator):
        # Avoid dividing by zero when a group is empty.
        return numerator/denominator if denominator else 0.0

    devset=probability_file('nli_output_fever.txt')
    # The original opened this file for writing without writing to it; the
    # (empty) file is still created so nothing that expects it breaks.
    with open('nli_performance.txt','w'):
        pass
    with open(_test_file,'r') as _perfect:
        perfect=json.load(_perfect)

    _all=0
    correct=0
    not_related_error=0
    not_related=0
    related=0
    related_error=0
    for data in devset:
        label=perfect[data]['label']
        pe={(k[0],k[1]) for k in perfect[data]['evidence']}
        for item in devset[data]:
            predict=np.argmax(devset[data][item])
            is_evidence=item in pe

            if predict==0:
                hit=label=='SUPPORTS' and is_evidence
            elif predict==1:
                hit=label=='REFUTES' and is_evidence
            else:
                hit=not is_evidence
            if hit:
                correct+=1

            if is_evidence:
                related+=1
                if (label=='SUPPORTS' and predict!=0) or (label=='REFUTES' and predict!=1):
                    related_error+=1
            else:
                not_related+=1
                if predict!=2:
                    not_related_error+=1

        _all+=len(devset[data])
    print(ratio(correct,_all))
    print(not_related_error,not_related,related_error,related,_all)
    print(ratio(not_related,_all),ratio(related,_all))
    print(ratio(not_related_error,not_related),ratio(related_error,related))
    
def calculate_probability_fever_nil(_input_file,_nil_output_file):
    """Score every (claim, candidate sentence) pair with the saved NLI model.

    Args:
        _input_file: path of a JSON file mapping claim id to an object with a
            ``'claim'`` string and an ``'evidence'`` list of
            ``[document, location, sentence_text]`` triples.
        _nil_output_file: path of the text file to write. One line is written
            per pair: ``claim_id<TAB>document<TAB>location<TAB>probabilities``,
            where the last field is ``str()`` of the model output for the pair.

    Returns:
        None. When the input contains no candidate pairs an empty output file
        is written and the model is not loaded.
    """
    with open(_input_file, 'r') as sentences_file:
        sentences = json.load(sentences_file)

    # Flatten the nested structure into three parallel lists so that the
    # i-th prediction can be mapped back to its claim and sentence.
    claims = []
    supporting_sentences = []
    info = []
    for data, entry in sentences.items():
        claim = entry['claim']
        for document, location, evidence_sentence in entry['evidence']:
            claims.append(claim)
            supporting_sentences.append(evidence_sentence)
            info.append((data, document, location))

    if not info:
        # Nothing to score: produce an empty result file instead of feeding
        # empty arrays to the model.
        with open(_nil_output_file, 'w'):
            pass
        return

    predictor = keras.models.load_model('da_fever.h5')
    nlp = spacy.load('en_vectors_web_lg')
    # create_dataset also returns an embedding matrix; only the two id
    # arrays are passed to the model here.
    _, text_vectors, hypothesis_vectors = create_dataset(
        nlp, supporting_sentences, claims, 100, 50, norm_vectors=True)
    probabilities = predictor.predict(
        [np.array(text_vectors), np.array(hypothesis_vectors)])

    # The output is opened only once the predictions exist, so a failure
    # above does not leave a truncated file behind.
    with open(_nil_output_file, 'w') as nli_output:
        for (data, document, location), probability in zip(info, probabilities):
            nli_output.write(
                data + '\t' + document + '\t' + str(location) + '\t'
                + str(probability) + '\n')

def final_prediction(_input_file,_output_file):
    """Aggregate per-sentence NLI probabilities into one verdict per claim.

    For every claim, each candidate sentence votes for the class whose
    probability is strictly greater than the other two (index 0 = SUPPORTS,
    1 = REFUTES, 2 = NOT ENOUGH INFO). Sentences voting NOT ENOUGH INFO,
    sentences with a tie for the top class, and sentences from pages whose
    title contains ``-LRB-disambiguation-RRB-`` are ignored. If no sentence
    votes, the claim is labelled NOT ENOUGH INFO with empty evidence.
    Otherwise the label is SUPPORTS when the summed SUPPORTS probability of
    the supporting votes exceeds the summed REFUTES probability of the
    refuting votes, and REFUTES if not; the evidence is the list of
    ``(title, sentence_number)`` pairs that voted for the chosen label.

    Args:
        _input_file: probability file readable by ``probability_file``.
        _output_file: path of the JSON file to write; it maps claim id to
            ``{'label': ..., 'evidence': [[title, sentence_number], ...]}``.

    Returns:
        None.
    """
    devset = probability_file(_input_file)
    all_result = {}
    for data, result in devset.items():
        # (title, number) -> 'SUPPORTS' / 'REFUTES' for sentences that vote.
        prediction = {}
        for (title, number), probabilities in result.items():
            if '-LRB-disambiguation-RRB-' in title:
                continue
            p_support = probabilities[0]
            p_refute = probabilities[1]
            p_nei = probabilities[2]
            # Strict comparisons: a sentence only votes when one class is
            # the unique maximum.
            if p_support > p_refute and p_support > p_nei:
                prediction[(title, number)] = 'SUPPORTS'
            elif p_refute > p_support and p_refute > p_nei:
                prediction[(title, number)] = 'REFUTES'

        if not prediction:
            final_result = {'label': 'NOT ENOUGH INFO', 'evidence': []}
        else:
            exp_support = sum(result[key][0]
                              for key, verdict in prediction.items()
                              if verdict == 'SUPPORTS')
            exp_refute = sum(result[key][1]
                             for key, verdict in prediction.items()
                             if verdict == 'REFUTES')
            # Equal totals fall through to REFUTES.
            label = 'SUPPORTS' if exp_support > exp_refute else 'REFUTES'
            final_result = {
                'label': label,
                'evidence': [key for key, verdict in prediction.items()
                             if verdict == label],
            }
        all_result[data] = final_result

    with open(_output_file, 'w') as file:
        json.dump(all_result, file)

    
def create_dataset(nlp, texts, hypotheses, num_oov, max_length, norm_vectors = True):
    """Turn sentences into padded word-id arrays plus an embedding table.

    Adapted from the spaCy Decomposable Attention example notebook:
    https://github.com/explosion/spaCy/blob/master/examples/notebooks/Decompositional%20Attention.ipynb

    Args:
        nlp: callable pipeline that tokenises a string; its ``vocab`` is
            iterated for lexemes and read for ``vectors_length``.
        texts: list of text (premise) sentences.
        hypotheses: list of hypothesis sentences.
        num_oov: number of random rows reserved for tokens without a vector.
        max_length: fixed length of every id array (zero-padded / clipped).
        norm_vectors: when equal to ``True`` lexeme vectors are divided by
            their norm before being stored.

    Returns:
        ``(table, text_ids, hypothesis_ids)``: the float32 embedding table and
        one integer array of ids per input list.
    """
    all_sentences = texts + hypotheses
    dim = nlp.vocab.vectors_length

    # Ids are lexeme rank + 1, which keeps id 0 free for padding; the random
    # rows for vector-less tokens start right after the last lexeme row.
    first_oov_row = max(lex.rank for lex in nlp.vocab) + 2

    random_rows = np.random.normal(size=(num_oov, dim))
    random_rows = random_rows / random_rows.sum(axis=1, keepdims=True)

    table = np.zeros((first_oov_row + num_oov, dim), dtype='float32')
    table[first_oov_row:, ] = random_rows

    for lex in nlp.vocab:
        if not lex.has_vector or not (lex.vector_norm > 0):
            continue
        if norm_vectors == True:
            table[lex.rank + 1] = lex.vector / lex.vector_norm
        else:
            table[lex.rank + 1] = lex.vector

    padded_rows = []
    for sentence in all_sentences:
        ids = []
        for position, token in enumerate(nlp(sentence)):
            # Tokens that claim a vector but have zero norm are dropped.
            if token.has_vector and token.vector_norm == 0:
                continue
            if position > max_length:
                break
            if not token.has_vector:
                # No vector: map the token onto one of the random rows.
                ids.append(token.rank % num_oov + first_oov_row)
            else:
                ids.append(token.rank + 1)

        row = np.zeros((max_length), dtype='int')
        kept = ids[:max_length]
        row[:len(kept)] = kept
        padded_rows.append(row)

    boundary = len(texts)
    text_ids = np.array(padded_rows[:boundary])
    hypothesis_ids = np.array(padded_rows[boundary:])
    return table, text_ids, hypothesis_ids

def get_vectors(support_sentence,claim):
    """Wrap a supporting sentence and a claim in one-element lists.

    Returns:
        ``([support_sentence], [claim])``.
    """
    return [support_sentence], [claim]

# Class index of each FEVER label, in the same order in which
# final_prediction reads the per-sentence probabilities.
LABELS = {
    label: index
    for index, label in enumerate(('SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO'))
}

In [0]:
# Inference on the validation (dev) set.
# Step 1: score every (claim, candidate sentence) pair from the selected-sentence
# file with the NLI model and write one line of class probabilities per pair.
calculate_probability_fever_nil('sentence_further_selection_output_50.json','nli_output_fever.txt')
# Step 2: aggregate the per-sentence probabilities into one label and evidence
# list per claim, saved as JSON.
final_prediction('nli_output_fever.txt','o_2.json')
# Step 3: complete() is defined elsewhere in this notebook; it is given the
# predictions and devset.json and writes o_3.json (presumably filling in claims
# missing from the predictions -- confirm against its definition).
complete('o_2.json','devset.json','o_3.json')
!python score.py devset.json o_3.json

In [0]:
# Inference on the unlabelled test set: the same three steps as for validation.
# Step 1: write the NLI class probabilities of every (claim, candidate sentence) pair.
calculate_probability_fever_nil('sentence_further_selection_output_test.json','nli_output_fever_test.txt')
# Step 2: aggregate them into one label and evidence list per claim.
final_prediction('nli_output_fever_test.txt','t_2.json')
# Step 3: complete() is defined elsewhere in this notebook; it is given the
# predictions and test-unlabelled.json and writes testoutput.json.
complete('t_2.json','test-unlabelled.json','testoutput.json')