# HeroX Numerical Fact Checking System. Univeristy of Sheffield

Pre-requisites:
 * Gradle
 * Java jdk8
 * Python 3
  * jnius
  * fuzzywuzzy
  * sklearn
  * urllib3
 

## Common setup

Import required dependencies and download/install Stanford CoreNLP

In [6]:
import sys
import os
import re

#Set path manually to incldue sources location
if 'src/' not in sys.path:
    sys.path.append('src/')


In [7]:
#Load Java classpath for stanford corenlp using gradle. this will also install it if missing
from subprocess import run,PIPE
if 'CLASSPATH' not in os.environ:
    if not (os.path.exists('build') and os.path.exists('build/classpath.txt')):
        print("Generating classpath")
        r=run(["./gradlew", "writeClasspath"],stdout=PIPE, stderr=PIPE, universal_newlines=True)
        print(r.stdout)
        print(r.stderr)
              
    print("Loading classpath")
    os.environ['CLASSPATH'] = open('build/classpath.txt','r').read()
    print("Done")

## Data Generation

### Generate Search Engine Queries From Tables

In [112]:
from distant_supervision.query_generation import generate_queries
from tabular.table_reader import read_table, number_tuples
from wikitablequestions.dataset_reader import load_instances

world = "herox"
all_instances = []
all_instances.extend(load_instances(world))
table_files = []

done = 0
for instance in all_instances:
    table_files.append(instance['table'])

table_files = set(table_files)

with open("data/distant_supervision/queries_"+world +".txt", "w+") as file:
    for table_file in table_files:
        done += 1
        print("Parsing " + str(done) +"/"+str(len(table_files)) + "\t\t\t" + table_file)
        table = number_tuples(read_table(table_file))
        tuples = generate_queries(table)


        for tuple in tuples:
            file.write(table_file + "\t" + tuple + "\n")
        file.flush()
        os.fsync(file.fileno())

Parsing 1/12			herox/13.csv


KeyError: 1

## Fact Checking

### Training
Load Modules for fact checking, generate the features and train our classifier from our training data

In [18]:
from classifier.LogisticRegressionClassifier import LogisticRegressionClassifier
from classifier.features.generate_features import FeatureGenerator, num, is_num
from distant_supervision.utterance_detection import f_threshold_match
from factchecking.question import Question
from tabular.filtering import load_collection

In [9]:
fg = FeatureGenerator()
Xs,ys = fg.generate_training()

Done: 0.0
Search for "Exxon Mobil" "Market Value"
Query already executed
Done: 6.25
Search for "Unaccompanied children" "claimed asylum"
Query already executed
Done: 12.5
Search for "Hamas" "Founded"
Query already executed
Done: 18.75
Search for "United States" "Average Temperature"
Query already executed
Done: 25.0
Search for "United States" "Life expectancy"
Query already executed
Done: 31.25
Search for "United States" "Number of abortions"
Query already executed
Done: 37.5
Search for "United States" "Abortion Rate per 1,000 births"
Query already executed
Done: 43.75
Search for "United States Teenagers" "Percentage Enrolled in education"
Query already executed
Done: 50.0
Search for "United States Teenagers" "Enrolled in education"
Query already executed
Done: 56.25
Search for "America" "bee colonies 201"
Query already executed
Done: 62.5
Search for "United States" "Financial Intermediary Funds 2016"
Query already executed
Done: 68.75
Search for "United States" "Homocides by firearm"


In [108]:
classifier = LogisticRegressionClassifier()
classifier.train(Xs,ys)

Training classifier
Trained


### Runtime

Load the source data

In [12]:
tables = load_collection("herox")

register table herox/1.csv
register table herox/2.csv
register table herox/3.csv
register table herox/4.csv
register table herox/5.csv
register table herox/8.csv
register table herox/9.csv
register table herox/10.csv
register table herox/11.csv
register table herox/12.csv
register table herox/13.csv
register table herox/14.csv


Define the fact checking function

In [31]:
def fact_check(q):
    question = Question(text=q, type="NUM")
    tuples,q_features = fg.generate_test(tables,question)

    q_match = False

    if len(tuples)>0:
    
        
        
        q_predicted = classifier.predict(q_features)

        for i in range(len(tuples)):
            tuple = tuples[i]
            
            skip = False
            if len(tuple[1]['date']) and len(question.dates):
                for date in question.dates:
                    dstrs = set()
                    for d in question.dates:
                        dstrs.add(str(d))
                    if not len(set(tuple[1]['date']).intersection(dstrs)):
                        skip = True
                        
            if skip:
                continue
    

            if is_num(tuple[1]['value']):
                prediction = q_predicted[i]
                features = q_features[i]

                if prediction == 1:
                    print(str(tuple) + "\t\t" + ("Possible Match" if prediction else "No match"))
                    for number in question.numbers:
                        value = num(tuple[1]['value'])

                        if value is None:
                            continue

                        if f_threshold_match(number, value, 0.05):
                            print(str(tuple) + "\t\t" + "Threshold Match to 5%")
                            q_match = True

                    for number in question.dates:
                        value = num(tuple[1]['value'])
                        if number == value:
                            print(str(tuple) + "\t\t" + "Exact Match")
                            q_match = True
        print(question.text)
        print(q_match)

    else:
        print(question.text)
        print("No supporting information can be found in the knowledge base")
    print("\n\n")

unaccompanied children
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applican

# Fact checking

In [20]:
fact_check("Around 90,000 unaccompanied children claimed asylum in the EU in 2015")

unaccompanied children
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applicants considered to be unaccompanied minors']
['Asylum applican