# HeroX Numerical Fact Checking System. University of Sheffield

Pre-requisites:
 * Gradle
 * Java jdk8
 * Python 3
  * numpy
  * jnius
  * fuzzywuzzy
  * sklearn
  * urllib3
 

## Common setup

Import required dependencies and download/install Stanford CoreNLP

In [None]:
import sys
import os
import re

# Set the path manually to include the sources location, so that the project
# modules under src/ can be imported by the cells below.
if 'src/' not in sys.path:
    sys.path.append('src/')


If the following step fails, run `gradlew writeClasspath` in a terminal in this folder, then try again.

In [None]:
# Load the Java classpath for Stanford CoreNLP using Gradle. This will also install it if it is missing.
from subprocess import run,PIPE
# Populate the CLASSPATH environment variable from build/classpath.txt,
# generating that file with the Gradle `writeClasspath` task when it is absent.
# Nothing is done if CLASSPATH is already set.
if 'CLASSPATH' not in os.environ:
    # Checking the file alone is sufficient: it cannot exist without build/.
    if not os.path.exists('build/classpath.txt'):
        print("Generating classpath")
        r = run(["./gradlew", "writeClasspath"], stdout=PIPE, stderr=PIPE, universal_newlines=True)
        print(r.stdout)
        print(r.stderr)

        # Fail with an actionable message instead of a bare "No such file"
        # from open() below when Gradle did not produce the file.
        if not os.path.exists('build/classpath.txt'):
            raise FileNotFoundError(
                "build/classpath.txt was not generated (gradlew exit code "
                + str(r.returncode)
                + "). Run `gradlew writeClasspath` in a terminal in this folder, then try again."
            )

    print("Loading classpath")
    # Use a context manager so the file handle is closed deterministically.
    with open('build/classpath.txt', 'r') as classpath_file:
        os.environ['CLASSPATH'] = classpath_file.read()
    print("Done")

## [Optional] Data Generation

### Generate Search Engine Queries From Tables

Run this step whenever a new table is added.

In [None]:
from distant_supervision.query_generation import generate_queries
from tabular.table_reader import read_table, number_tuples
from wikitablequestions.dataset_reader import load_instances

# Generate the search-engine queries for every table referenced by the
# instances of `world` and write them to
# data/distant_supervision/queries_<world>.txt, one "<table file>\t<query>"
# entry per line. The step is skipped when that file already exists.
# NOTE(review): an interrupted run leaves a partial file behind, which the
# existence check then treats as complete — delete the file to regenerate.
print("Generating Queries")
world = "herox"
queries_path = "data/distant_supervision/queries_"+world +".txt"
if os.path.exists(queries_path):
    print("Already done. No need to run again")
else:
    # Several instances can refer to the same table; process each table once.
    table_files = {instance['table'] for instance in load_instances(world)}

    with open(queries_path, "w+") as file:
        for done, table_file in enumerate(table_files, start=1):
            print("Parsing " + str(done) +"/"+str(len(table_files)) + "\t\t\t" + table_file)
            table = number_tuples(read_table(table_file))

            # The loop variable is deliberately not called `tuple`: at cell
            # level that would rebind the builtin for every later cell.
            for query in generate_queries(table):
                file.write(table_file + "\t" + query + "\n")

            # Force this table's queries to disk before moving to the next one.
            file.flush()
            os.fsync(file.fileno())
print("Finished")

## Download all web pages for queries

We include the web pages with this submission because downloading them takes considerable time. This script will not re-download web pages it already has.

In [None]:
import sys

from distant_supervision.clean_html import get_text
from distant_supervision.search import Search


# For every generated query, run the search and fetch the text of each result
# URL. Queries whose second quoted term is purely numeric are skipped.
# The return value of get_text is discarded; presumably the call caches the
# page so later steps can reuse it — confirm in distant_supervision.clean_html.
world = "herox"

# Read all queries up front so the file is not held open during the downloads.
with open("data/distant_supervision/queries_"+world+".txt", "r") as file:
    lines = file.readlines()

num_qs = len(lines)
for done, line in enumerate(lines, start=1):
    # Each line is tab-separated; the search string is the third field.
    query = line.replace("\n"," ").strip().split("\t")
    search = query[2]

    if search.split("\" \"")[1].replace("\"","").isnumeric():
        print("skipped")
        print (query)
        continue

    try:
        for url in Search.instance().search(search):
            get_text(url)
    except Exception as err:
        # Best effort: one failing query must not abort the whole run, but the
        # failure is reported. Catching Exception rather than using a bare
        # `except` keeps Ctrl-C (KeyboardInterrupt) working.
        print("Failed to download results for " + search + ": " + repr(err), file=sys.stderr)
    print(str(100*done/num_qs) + "%")

### Feature Generation

For each of the downloaded web pages, parse the page and identify matches between the values in our tables and the data given in the web page. This only needs to be run once and will remember if it has been run before.

In [None]:
from run.ds_generate_positive_features_for_query import precompute_features

# Precompute the distant-supervision features for the "herox" world.
precompute_features("herox")

## Fact Checking

### Training
Load Modules for fact checking, generate the features and train our classifier from our training data

In [None]:
from classifier.LogisticRegressionClassifier import LogisticRegressionClassifier
from classifier.features.generate_features import FeatureGenerator, num, is_num
from distant_supervision.utterance_detection import f_threshold_match
from factchecking.question import Question
from tabular.filtering import load_collection

In [None]:
# Create the feature generator and build the training data. `fg` is reused
# later by fact_check to generate features for each claim.
# Xs / ys are presumably the feature vectors and their labels — confirm
# against FeatureGenerator.generate_training.
fg = FeatureGenerator()
Xs,ys = fg.generate_training()

In [None]:
# Train the classifier on the generated training data. `classifier` is reused
# later by fact_check to score candidate table entries.
classifier = LogisticRegressionClassifier()
classifier.train(Xs,ys)

### Runtime

Load the source data

In [None]:
# Load the "herox" table collection; fact_check searches these tables.
tables = load_collection("herox")

Define the fact checking function

In [None]:
def fact_check(q):
    """Check a numerical claim against the loaded tables and print the outcome.

    The claim is wrapped in a ``Question`` and candidate table entries are
    generated with the module-level ``fg`` and ``tables``. Each candidate is
    scored with the module-level ``classifier``. A candidate the classifier
    labels 1 is reported as a possible match. It then supports the claim when
    a number in the claim is a threshold match (0.05) for its value, or when
    a date in the claim equals its value.

    Args:
        q: Text of the claim to check.

    Returns:
        None. All results are printed: the matching candidates, the claim
        text, and ``True``/``False`` for whether any supporting match was
        found, or a notice when no candidates exist at all.
    """
    question = Question(text=q, type="NUM")
    tuples, q_features = fg.generate_test(tables, question)

    if not len(tuples) > 0:
        print(question.text)
        print("No supporting information can be found in the knowledge base")
        print("\n\n")
        return

    q_predicted = classifier.predict(q_features)

    # String forms of the dates in the claim, built once for all candidates.
    question_dates = {str(d) for d in question.dates}

    q_match = False
    for i, candidate in enumerate(tuples):
        fields = candidate[1]

        # When both the claim and the candidate carry dates, they must share
        # at least one; otherwise the candidate is about a different period.
        # NOTE(review): set(fields['date']) yields single characters if the
        # entry is a plain string — confirm it is a collection of date strings.
        if 'date' in fields.keys() and question_dates:
            if not len(set(fields['date']).intersection(question_dates)):
                continue

        if not is_num(fields['value']):
            continue

        # Only candidates the classifier labels as 1 are considered further.
        if q_predicted[i] != 1:
            continue

        print(str(candidate) + "\t\t" + "Possible Match")

        # Parse the candidate's value once instead of once per claim number.
        value = num(fields['value'])

        if value is not None:
            for number in question.numbers:
                if f_threshold_match(number, value, 0.05):
                    print(str(candidate) + "\t\t" + "Threshold Match to 5%")
                    q_match = True

        for date in question.dates:
            if date == value:
                print(str(candidate) + "\t\t" + "Exact Match")
                q_match = True

    print(question.text)
    print(q_match)
    print("\n\n")

# Fact checking

In [None]:
# Example: check a claim containing both a number (22250) and a year (2015).
fact_check("Around 22250 unaccompanied children claimed asylum in Germany in 2015")