# HeroX Numerical Fact Checking System. University of Sheffield

Pre-requisites:
 * Gradle
 * Java jdk8
 * Python 3
  * jnius
  * fuzzywuzzy
  * sklearn
  * urllib3
 

In [6]:
import sys
import os
import re

# Set the path manually to include the sources location, so that the
# project packages imported in the following cells can be resolved.
if 'src/' not in sys.path:
    sys.path.append('src/')


In [7]:
# Load the Java classpath for Stanford CoreNLP using Gradle. This will also
# install it if missing.
from subprocess import run,PIPE
if 'CLASSPATH' not in os.environ:
    # Only invoke Gradle when the generated classpath file is not there yet.
    if not (os.path.exists('build') and os.path.exists('build/classpath.txt')):
        print("Generating classpath")
        r=run(["./gradlew", "writeClasspath"],stdout=PIPE, stderr=PIPE, universal_newlines=True)
        print(r.stdout)
        print(r.stderr)

    print("Loading classpath")
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open('build/classpath.txt','r') as classpath_file:
        os.environ['CLASSPATH'] = classpath_file.read()
    print("Done")

In [18]:
from classifier.LogisticRegressionClassifier import LogisticRegressionClassifier
from classifier.features.generate_features import FeatureGenerator, num, is_num
from distant_supervision.utterance_detection import f_threshold_match
from factchecking.question import Question
from tabular.filtering import load_collection

In [9]:
# Build the feature generator and produce the training data.
# NOTE(review): presumably Xs are feature vectors and ys their labels —
# confirm against FeatureGenerator.generate_training.
fg = FeatureGenerator()
Xs,ys = fg.generate_training()

Done: 0.0
Search for "Exxon Mobil" "Market Value"
Query already executed
Done: 6.25
Search for "Unaccompanied children" "claimed asylum"
Query already executed
Done: 12.5
Search for "Hamas" "Founded"
Query already executed
Done: 18.75
Search for "United States" "Average Temperature"
Query already executed
Done: 25.0
Search for "United States" "Life expectancy"
Query already executed
Done: 31.25
Search for "United States" "Number of abortions"
Query already executed
Done: 37.5
Search for "United States" "Abortion Rate per 1,000 births"
Query already executed
Done: 43.75
Search for "United States Teenagers" "Percentage Enrolled in education"
Query already executed
Done: 50.0
Search for "United States Teenagers" "Enrolled in education"
Query already executed
Done: 56.25
Search for "America" "bee colonies 201"
Query already executed
Done: 62.5
Search for "United States" "Financial Intermediary Funds 2016"
Query already executed
Done: 68.75
Search for "United States" "Homocides by firearm"


In [10]:
# Create the logistic-regression classifier and train it on Xs / ys.
classifier = LogisticRegressionClassifier()
classifier.train(Xs,ys)


Training classifier
Trained


In [12]:
# Load the "herox" table collection; fact_check passes it to
# fg.generate_test as the source of candidate tuples.
tables = load_collection("herox")

register table herox/1.csv
register table herox/2.csv
register table herox/3.csv
register table herox/4.csv
register table herox/5.csv
register table herox/8.csv
register table herox/9.csv
register table herox/10.csv
register table herox/11.csv
register table herox/12.csv
register table herox/13.csv
register table herox/14.csv


In [31]:
def fact_check(q):
    """Check a numerical claim against the loaded tables and print the verdict.

    Builds a ``Question`` of type "NUM" from *q*, asks the module-level
    feature generator ``fg`` for candidate tuples from ``tables`` and lets
    the module-level ``classifier`` label each candidate. For every
    candidate predicted as 1 the candidate is printed as "Possible Match";
    it is additionally reported as

    * "Threshold Match to 5%" when one of the question's numbers matches the
      candidate value according to ``f_threshold_match(..., 0.05)``;
    * "Exact Match" when one of the question's dates equals the candidate
      value.

    Finally the question text and the overall boolean outcome are printed,
    or a "No supporting information" message when there are no candidates.

    :param q: the claim text to check.
    :return: None; all results are written to stdout.
    """
    question = Question(text=q, type="NUM")
    candidates,q_features = fg.generate_test(tables,question)

    q_match = False

    if len(candidates)>0:
        q_predicted = classifier.predict(q_features)

        # The question's dates as strings; this does not depend on the
        # candidate, so it is built once rather than once per candidate.
        question_dates = {str(d) for d in question.dates}

        for i, candidate in enumerate(candidates):
            # candidate[1] is the dict carrying the 'date' and 'value' fields.
            fields = candidate[1]

            # When both the candidate and the question carry dates, discard
            # candidates that share no date with the question.
            if len(fields['date']) and len(question.dates):
                if not set(fields['date']).intersection(question_dates):
                    continue

            if not is_num(fields['value']):
                continue

            if q_predicted[i] != 1:
                continue

            print(str(candidate) + "\t\t" + "Possible Match")

            # Parse the candidate value once for both comparisons below.
            value = num(fields['value'])

            if value is not None:
                for number in question.numbers:
                    if f_threshold_match(number, value, 0.05):
                        print(str(candidate) + "\t\t" + "Threshold Match to 5%")
                        q_match = True

            # The value itself may be a year/date mentioned in the question.
            for date in question.dates:
                if date == value:
                    print(str(candidate) + "\t\t" + "Exact Match")
                    q_match = True

        print(question.text)
        print(q_match)

    else:
        print(question.text)
        print("No supporting information can be found in the knowledge base")
    print("\n\n")
    
# Example claim checked against the loaded tables.
fact_check("Around 90,000 unaccompanied children claimed asylum in the EU in 2015")

unaccompanied children
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applican

In [20]:
# Re-run of the same example claim.
fact_check("Around 90,000 unaccompanied children claimed asylum in the EU in 2015")

unaccompanied children
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applican

In [102]:
import csv
import numpy as np
import re
from stanford.corenlpy import * 

def read_table(filename,base="data/WikiTableQuestions"):
    """Read a tab-separated table file and split it into header and rows.

    Any ".csv" in *filename* is replaced by ".tsv" before the file is opened
    relative to *base*.

    Header detection runs ``csv.Sniffer().has_header`` on the first line of
    the file only.
    NOTE(review): with a one-line sample the sniffer has no data rows to
    compare against, so the outcome may not reflect the rest of the file —
    confirm this is intended before widening the sample.

    :param filename: table path relative to *base* (".csv" or ".tsv" name).
    :param base: directory that contains the table files.
    :return: dict with "header" (list of column names, empty when no header
        was detected) and "rows" (list of rows, each a list of strings).
    :raises csv.Error: if the sniffer cannot determine a delimiter from the
        first line (e.g. for an empty file).
    """
    filename = filename.replace(".csv",".tsv")

    # newline='' is what the csv module expects for file objects: it lets the
    # reader do its own newline handling, so line breaks embedded in quoted
    # fields are preserved instead of being translated.
    with open(base+"/"+filename,"r",encoding='utf-8',newline='') as table:
        has_header = csv.Sniffer().has_header(table.readline())
        table.seek(0)
        rows = list(csv.reader(table, delimiter="\t"))

    header = []
    if has_header and rows:
        # The first parsed record is the header; the rest are data rows.
        header = rows.pop(0)
    return {"header": header, "rows":rows}

def table_nes(table):
    """Return the cell texts of the columns that mostly contain named entities.

    Every column of ``table['rows']`` is joined with ". " into one text and
    annotated with the shared NER pipeline. Each resulting sentence is
    treated as one cell. A sentence counts as a named-entity cell when at
    least one of its tokens carries a tag other than 'O', 'NUMBER' or
    'NUMERIC'. A column's cell texts are kept when the number of such cells
    is at least half the number of collected cell texts.

    NOTE(review): this assumes the pipeline splits the joined text back into
    one sentence per cell — confirm for cells that themselves contain
    sentence-ending punctuation.

    :param table: dict with a 'rows' entry (list of rows, each a list of str).
    :return: flat list of cell texts from all retained columns.
    """
    ne_tokens = []
    for column in transpose(table['rows']):
        doc = Annotation(". ".join(column))
        SharedNERPipeline().getInstance().annotate(doc)

        # Fetch the sentence list once instead of once per loop iteration.
        sentences = doc.get(CoreAnnotations.SentencesAnnotation)

        num_ne_cell = 0
        tokens = []
        for cell in range(sentences.size()):
            cell_tokens = sentences.get(cell).get(CoreAnnotations.TokensAnnotation)

            words = []
            has_ne = False
            for i in range(cell_tokens.size()):
                corelabel = cell_tokens.get(i)
                ne = corelabel.get(CoreAnnotations.NamedEntityTagAnnotation)

                words.append(corelabel.get(CoreAnnotations.TextAnnotation))
                if ne not in ['O','NUMBER','NUMERIC']:
                    has_ne = True

            # The last token of each sentence is dropped — presumably the "."
            # separator added by the join above.
            # NOTE(review): the final cell has no trailing separator, so its
            # last real token may be lost here — confirm.
            if len(words) > 1:
                tokens.append(" ".join(words[:-1]))

            if has_ne:
                num_ne_cell += 1

        if len(tokens) > 0 and num_ne_cell >= len(tokens)/2:
            ne_tokens.extend(tokens)

    return ne_tokens


def number_tuples(table):
    """Extract entity / relation / value tuples from a table.

    The header cells and the columns are annotated with the shared NER
    pipeline to decide which columns hold entities, dates and numbers.
    Two table layouts are handled:

    * regular tables: every (entity column, number column) pair yields one
      tuple per row, with the number column's header as the relation and the
      row's date-column values (if any) under 'date';
    * "header series" tables, where at least half of the header cells contain
      a YEAR/DATE token: every date-like header column yields tuples whose
      'date' is the header text, whose 'value' is the cell in that column and
      whose 'relation' is taken from the remaining non-entity columns.

    :param table: dict with 'header' (list of str) and 'rows' (list of rows).
    :return: list of dicts with keys 'entity', 'relation', 'value' and,
        when available, 'date'.
    """
    header = table['header']
    rows = table['rows']

    ret_tokens = []  # unused
    entity_col = []
    date_col = []
    number_col = []

    col_id = 0

    # Column-major view of the rows: table_trans[c][r] is the cell in row r, column c.
    table_trans = transpose(rows)

    # Indices of header cells that contain at least one YEAR/DATE token.
    hnums = set()
    hidx = 0
    for h in header:
        doc = Annotation(h)
        SharedNERPipeline().getInstance().annotate(doc)

        for s in range(doc.get(CoreAnnotations.SentencesAnnotation).size()):
            c = doc.get(CoreAnnotations.SentencesAnnotation).get(s)
            for i in range(c.get(CoreAnnotations.TokensAnnotation).size()):
                corelabel = c.get(CoreAnnotations.TokensAnnotation).get(i)
                ne = corelabel.get(CoreAnnotations.NamedEntityTagAnnotation)

                if ne in ['YEAR', 'DATE']:
                    hnums.add(hidx)
        hidx += 1

    # The table is a "header series" when at least half of the header cells are date-like.
    hseries = False
    if (len(hnums) >= len(header) / 2):
        hseries = True


    # Per-column counters, indexed in the same order as table_trans.
    num_nes = []
    num_dates = []
    num_numbers = []

    all_tokens = []
    for col in table_trans:

        # Join the cells so the pipeline sees the column as a sequence of
        # sentences (NOTE(review): assumes one sentence per cell — confirm).
        text = ". ".join(col)
        doc = Annotation(text)
        SharedNERPipeline().getInstance().annotate(doc)

        num_ne_cell = 0
        num_date_cell = 0
        num_number_cell = 0

        tokens = []
        for cell in range(doc.get(CoreAnnotations.SentencesAnnotation).size()):
            # NOTE: `col` is rebound here from the column's cell list to the current sentence.
            col = doc.get(CoreAnnotations.SentencesAnnotation).get(cell)

            words = []
            col_ne_tags = []  # unused
            has_ne = False
            has_date = False
            has_number = False
            for i in range(col.get(CoreAnnotations.TokensAnnotation).size()):
                corelabel = col.get(CoreAnnotations.TokensAnnotation).get(i)
                ne = corelabel.get(CoreAnnotations.NamedEntityTagAnnotation)

                words.append(corelabel.get(CoreAnnotations.TextAnnotation))
                # Any tag outside the plain/number/date set counts as a named entity.
                if ne not in ['O', 'NUMBER', 'NUMERIC', 'DATE', 'YEAR']:
                    has_ne = True

                if ne in ['YEAR', 'DATE']:
                    has_date = True

                if ne in ['NUMBER', 'NUMERIC', 'PERCENTAGE', 'ORDINAL']:
                    has_number = True

            # The last token is dropped — presumably the "." separator from the join above.
            # A sentence with a single token therefore contributes an empty string.
            if len(words) > 0:
                tokens.append(" ".join(words[:-1]))


            if has_ne:
                num_ne_cell += 1

            if has_date:
                num_date_cell += 1

            if has_number:
                num_number_cell += 1

        num_nes.append(num_ne_cell)
        num_dates.append(num_date_cell)
        num_numbers.append(num_number_cell)
        all_tokens.append(tokens)



    # Entity columns: non-empty columns where at least half of the cells contain a named entity.
    col_id = 0
    for col in table_trans:
        if num_nes[col_id] >= len(all_tokens[col_id]) / 2 and len(all_tokens[col_id]) > 0:
            entity_col.append(col_id)
        col_id += 1


    # Date and number columns. Date detection only runs when an entity column exists.
    col_id = 0
    for col in table_trans:
        if len(entity_col) and num_dates[col_id] >= len(all_tokens[col_id]) / 2 and len(all_tokens[col_id]) >0:
            # A date-like column strictly between the first and last entity
            # columns is treated as a number column rather than a date column.
            if col_id < max(entity_col) and  col_id > min(entity_col):
                number_col.append(col_id)
            else:
                date_col.append(col_id)

        # NOTE(review): unlike the checks above there is no non-empty guard
        # here, and a column that was already appended just above can be
        # appended to number_col a second time — confirm this is intended.
        if num_numbers[col_id] >= len(all_tokens[col_id]) / 2:
            number_col.append(col_id)
        col_id += 1

    tuples = []


    if not hseries:
        # Regular layout: pair every entity column with every number column.
        for col in entity_col:
            for col1 in number_col:
                # Skip columns that were classified as both entity and number.
                if col1 in entity_col: 
                    continue
                if col in number_col:
                    continue

                # extra[i] becomes the list of date-column values for row i.
                extra = []
                if len(date_col) > 0:
                    for dc in date_col:
                        extra.append(table_trans[dc])
                    extra = transpose(extra)
                # TODO entity/relation/value

                for i in range(len(rows)):
                    t = dict()
                    t['relation'] = header[col1]
                    t['value'] = table_trans[col1][i]
                    t['entity'] = table_trans[col][i]

                    if len(extra):
                        t['date'] = extra[i]
                    tuples.append(t)

    else:
        # Header-series layout: the date-like header cells label the value columns.

        extra = []
        hnums = list(hnums)

        # Header indices that are not date-like.
        nh = (set(range(len(header))).difference(hnums))
        tr = []  # unused
        for col in nh:
            if col not in entity_col:
                extra.append(table_trans[col])
        el = len(extra)  # unused
        # extra[i] becomes the list of non-date, non-entity cells of row i.
        extra = transpose(extra)

        for hnum in hnums:
            for ecol in entity_col:
                # One (header text, cell value, extra cells of the row) triple per row.
                l = list(zip([header[hnum]] * len(rows), table_trans[hnum], extra))
                i=0
                for t in l:
                    # t[2:][0] is t[2]: each extra cell of the row is used as a relation name.
                    for rel in t[2:][0]:
                        t2 = dict()

                        t2['entity'] = table_trans[ecol][i]
                        t2['relation'] = rel
                        # t[0:1] holds only the header text, so 'date' is a one-element list.
                        for item in t[0:1]:
                            if 'date' not in t2:
                                t2['date'] = []
                            t2['date'].append(item)

                        t2['value'] = t[1]

                        tuples.append(t2)
                    i+=1

    return tuples


def transpose(l):
    """Return the rows of *l* turned into columns, as a list of lists.

    Columns are truncated to the length of the shortest row, as with ``zip``.
    """
    return [list(column) for column in zip(*l)]


# Print the tuples extracted from table number `b` of the herox tables.
b = 5
for tnum in range(b,b+1):
    print(tnum)
    t = read_table("herox/"+str(tnum)+".csv")
    tuples = number_tuples(t)

    # The loop variable is deliberately not called `tuple`: at module level
    # that would shadow the builtin for the rest of the session.
    for extracted in tuples:
        print(extracted)

5
{'entity': 'Aruba', 'date': ['1960'], 'relation': 'ABW', 'value': '65.56936585'}
{'entity': 'Aruba', 'date': ['1960'], 'relation': 'Life expectancy at birth', 'value': '65.56936585'}
{'entity': 'Andorra', 'date': ['1960'], 'relation': 'AND', 'value': ''}
{'entity': 'Andorra', 'date': ['1960'], 'relation': 'Life expectancy at birth', 'value': ''}
{'entity': 'Afghanistan', 'date': ['1960'], 'relation': 'AFG', 'value': '32.3285122'}
{'entity': 'Afghanistan', 'date': ['1960'], 'relation': 'Life expectancy at birth', 'value': '32.3285122'}
{'entity': 'Angola', 'date': ['1960'], 'relation': 'AGO', 'value': '32.98482927'}
{'entity': 'Angola', 'date': ['1960'], 'relation': 'Life expectancy at birth', 'value': '32.98482927'}
{'entity': 'Albania', 'date': ['1960'], 'relation': 'ALB', 'value': '62.25436585'}
{'entity': 'Albania', 'date': ['1960'], 'relation': 'Life expectancy at birth', 'value': '62.25436585'}
{'entity': 'Arab World', 'date': ['1960'], 'relation': 'ARB', 'value': '46.8500222'}
