# HeroX Numerical Fact Checking System. University of Sheffield

Pre-requisites:
 * Gradle
 * Java jdk8
 * Python 3
  * numpy
  * jnius
  * fuzzywuzzy
  * sklearn
  * urllib3
 

## Common setup

Import required dependencies and download/install Stanford CoreNLP

In [1]:
import sys
import os
import re

# Set the path manually to include the sources location, so modules under src/ can be imported.
# The membership test keeps the entry from being appended again when the cell is re-run.
if 'src/' not in sys.path:
    sys.path.append('src/')


If the following step fails, run `gradlew writeClasspath` in a terminal in this folder, then try again.

In [2]:
# Load the Java classpath for Stanford CoreNLP using Gradle. This will also install it if missing.
from subprocess import run, PIPE

if 'CLASSPATH' not in os.environ:
    # Checking the file is enough: it cannot exist unless the 'build' directory does.
    if not os.path.exists('build/classpath.txt'):
        print("Generating classpath")
        r = run(["./gradlew", "writeClasspath"], stdout=PIPE, stderr=PIPE, universal_newlines=True)
        print(r.stdout)
        print(r.stderr)

    print("Loading classpath")
    # Use a context manager so the file handle is closed, and strip surrounding
    # whitespace so a trailing newline does not become part of the last classpath entry.
    with open('build/classpath.txt', 'r') as classpath_file:
        os.environ['CLASSPATH'] = classpath_file.read().strip()
    print("Done")

Loading classpath
Done


## [Optional] Data Generation

### Generate Search Engine Queries From Tables

Run if a new table is added

In [None]:
from distant_supervision.query_generation import generate_queries
from tabular.table_reader import read_table, number_tuples
from wikitablequestions.dataset_reader import load_instances

# Generate one search query per table tuple and write them, tab-separated with the
# originating table file, to data/distant_supervision/queries_<world>.txt.
print("Generating Queries")
world = "herox"
queries_path = "data/distant_supervision/queries_" + world + ".txt"
if os.path.exists(queries_path):
    print("Already done. No need to run again")
else:
    # Collect the distinct table files referenced by the loaded instances.
    table_files = {instance['table'] for instance in load_instances(world)}

    with open(queries_path, "w+") as file:
        for done, table_file in enumerate(table_files, start=1):
            print("Parsing " + str(done) + "/" + str(len(table_files)) + "\t\t\t" + table_file)
            table = number_tuples(read_table(table_file))

            # The loop variable is deliberately not called `tuple`: at notebook top level
            # that would rebind the builtin for every later cell.
            for generated_query in generate_queries(table):
                file.write(table_file + "\t" + generated_query + "\n")

            # Push the queries of each table to disk before moving on to the next one.
            file.flush()
            os.fsync(file.fileno())
print("Finished")

## Download all web pages for queries

We include the web pages with this submission as downloading the web pages takes considerable time. This script will not re-download web pages it already has

In [None]:
import sys

from distant_supervision.clean_html import get_text
from distant_supervision.search import Search


# Run every stored query through the search engine and fetch the text of each result URL.
world = "herox"

with open("data/distant_supervision/queries_"+world+".txt", "r") as file:
    lines = file.readlines()

num_qs = len(lines)
for done, line in enumerate(lines, start=1):
    query = line.replace("\n", " ").strip().split("\t")

    # The third tab-separated field holds the search string.
    search = query[2]

    # Skip queries whose second quoted term is purely numeric.
    if search.split("\" \"")[1].replace("\"", "").isnumeric():
        print("skipped")
        print(query)
    else:
        try:
            urls = Search.instance().search(search)

            for url in urls:
                # Called for its side effect only; the returned text is not used here.
                get_text(url)
        except Exception as err:
            # Best effort: one failing query or page must not abort the whole crawl,
            # but report it instead of hiding it. Unlike a bare `except`, this still
            # lets KeyboardInterrupt stop a long-running download.
            print("Failed to download results for query " + search + ": " + str(err))
        print(str(100*done/num_qs) + "%")

### Feature Generation

For each of the downloaded web pages, parse the page and identify matches between the values in our tables and the data given in the web page. This only needs to be run once and will remember if it has been run before.

In [None]:
from run.ds_generate_positive_features_for_query import precompute_features

# Precompute the features for the collection named "emnlp".
# NOTE(review): the query-generation and download cells use the world "herox" — confirm "emnlp" is intended here.
precompute_features("emnlp")



Done 1 out of 10625
Search for "Barbados" GDP (current US$)  2012
Query already executed
Done URL 1 out of 50
 
Looking in document for values similar to 4332141067
https://www.bing.com/cr?IG=A7362BC59D6B43288271323AA77D8E1C&CID=33B946C9F5416F5834104F39F4A66ED2&rd=1&h=oG7kld5RbInomlQ7EtWICQ3PW05C4Jgbr9DeVdgvvIw&v=1&r=https%3a%2f%2fen.wikipedia.org%2fwiki%2fEconomy_of_Barbados&p=DevEx,5132.1
Done URL 2 out of 50
 
Looking in document for values similar to 4332141067
http://www.bing.com/cr?IG=A7362BC59D6B43288271323AA77D8E1C&CID=33B946C9F5416F5834104F39F4A66ED2&rd=1&h=uolkG39uS4CM-mAIVBqaLsqV6i41opiQRb8f6jyKbio&v=1&r=http%3a%2f%2fdata.worldbank.org%2fcountry%2fbarbados&p=DevEx,5145.1
No meaningful text in this document
Done URL 3 out of 50
 
Looking in document for values similar to 4332141067
http://www.bing.com/cr?IG=A7362BC59D6B43288271323AA77D8E1C&CID=33B946C9F5416F5834104F39F4A66ED2&rd=1&h=xHeDJ4f_6UqhWdAkGiv5cfQrPJWhvQAN2e75A_XNce4&v=1&r=http%3a%2f%2fwww.heritage.org%2findex%2fcoun

## Fact Checking

### Training
Load Modules for fact checking, generate the features and train our classifier from our training data

In [3]:
from classifier.Classifier import Classifier
from classifier.LogisticRegressionClassifier import LogisticRegressionClassifier
from classifier.features.generate_features import FeatureGenerator, num, is_num
from distant_supervision.utterance_detection import f_threshold_match
from factchecking.question import Question
from tabular.filtering import load_collection



In [8]:
# Create the feature generator (reused later for test-time features) and build the
# training inputs Xs and targets ys for the "herox" world; both are passed to classifier.train below.
fg = FeatureGenerator()
Xs,ys = fg.generate_training("herox")

Done: 0.0


IndexError: list index out of range

In [None]:

from sklearn.linear_model import LogisticRegression
class LogisticRegressionClassifier(Classifier):
    """Classifier backed by an L1-regularised scikit-learn LogisticRegression (C=0.78)."""

    def train(self, Xs, ys):
        """Fit the logistic regression model on features Xs and labels ys.

        The fitted model is stored on ``self.lr``.
        """
        print("Training classifier 3")
        # The solver is pinned explicitly: penalty='l1' is not supported by the
        # 'lbfgs' solver that newer scikit-learn releases use by default, while
        # 'liblinear' supports it and was the default in older releases.
        self.lr = LogisticRegression(penalty='l1', C=0.78, solver='liblinear')
        self.lr.fit(Xs, ys)
        print("Trained")

    def predict(self, q_features):
        """Return a ``(labels, probabilities)`` pair for the rows of q_features.

        ``labels`` comes from ``predict`` and ``probabilities`` from
        ``predict_proba`` of the fitted model; ``train`` must be called first.
        """
        ys = (self.lr.predict(q_features), self.lr.predict_proba(q_features))
        return ys


# Instantiate the classifier and train it on the generated training data (Xs, ys).
classifier = LogisticRegressionClassifier()
classifier.train(Xs,ys)

### Runtime

Load the source data

In [None]:
# Load the table collection named "emnlp"; fact_check passes it to fg.generate_test.
# NOTE(review): training above uses the world "herox" — confirm "emnlp" is the intended collection.
tables = load_collection("emnlp")

Define the fact checking function

In [None]:
def fact_check(q):
    """Check the numeric claim in the sentence ``q`` against the loaded tables.

    The sentence is parsed into a Question, candidate tuples and their features
    are generated from ``tables``, and the trained ``classifier`` labels each
    candidate. For every candidate labelled 1 the stored value is compared with
    the numbers in the claim (5% relative threshold) and with the dates in the
    claim (exact equality). Results are printed; nothing is returned.

    Relies on the notebook globals ``fg``, ``tables`` and ``classifier``.
    """
    question = Question(text=q, type="NUM")
    question.parse()
    tuples, q_features = fg.generate_test(tables, question)
    q_match = False

    if len(tuples) > 0:
        q_predicted = classifier.predict(q_features)

        # The question's dates do not change per candidate, so build the set once.
        question_dates = {str(d) for d in question.dates}

        for i, candidate in enumerate(tuples):
            fields = candidate[1]

            # A dated candidate is only relevant if it shares a date with the question.
            if 'date' in fields.keys() and len(question.dates):
                if not set(fields['date']).intersection(question_dates):
                    continue

            if not is_num(fields['value']):
                continue

            if q_predicted[0][i] != 1:
                continue

            print(str(candidate) + "\t\t" + "Possible Match")
            value = num(fields['value'])

            if value is not None:
                for number in question.numbers:
                    if f_threshold_match(number, value, 0.05):
                        print(str(candidate) + "\t\t" + "Threshold Match to 5%")
                        q_match = True

            # Dates mentioned in the claim are compared with the value exactly.
            for date in question.dates:
                if date == value:
                    print(str(candidate) + "\t\t" + "Exact Match")
                    q_match = True
        print(question.text)
        print(q_match)

    else:
        print(question.text)
        print("No supporting information can be found in the knowledge base")
    print("\n\n")
    
# Try the checker on a sample claim that mentions a number and a year.
fact_check("Germany had a population of 80 million people residing there in 2015")

# Fact checking

In [None]:
# Try the checker on a second sample claim (the sentence is passed through verbatim).
fact_check("Germany's GDP is growth is 10")

In [None]:
import csv
base = "/Users/james/Dropbox/Fact Checking/james"
rels = []
from distant_supervision.normalisation import normalise,normalise_keep_nos
from collections import defaultdict
import re 
import pickle
import ast

from fuzzywuzzy import process


def normalise(text):
    """Normalise text by dropping bracket tokens, punctuation, case and digits.

    Bracket placeholders (LRB/LSB/RRB/RSB in the spellings listed below) are
    removed, every non-word character becomes a space, the text is lower-cased
    and each digit is replaced by the letter ``D``.

    This definition rebinds the ``normalise`` name imported from
    ``distant_supervision.normalisation`` earlier in this cell.
    """
    # Order is kept from the original implementation: the bare lower-case tokens
    # are removed first, so "-lrb-" is reduced to "--" before the dashed
    # lower-case forms are tried (the dashes then become spaces below).
    bracket_tokens = (
        "lrb", "lsb", "rrb", "rsb",
        "-lrb-", "-lsb-", "-rrb-", "-rsb-",
        # Previously "-LSB-" was listed twice and "-RSB-" was never removed.
        "-LRB-", "-LSB-", "-RRB-", "-RSB-",
    )
    for token in bracket_tokens:
        text = text.replace(token, "")

    text = re.sub(r'[^\w]', ' ', text)
    # Lower-case before masking digits so the inserted 'D' stays upper-case.
    return re.sub(r'[0-9]', 'D', text.lower())

def _dates_conflict(fields, question_dates):
    """Return True when a dated tuple shares no date with the question's dates."""
    if 'date' not in fields.keys() or not len(question_dates):
        return False
    wanted = {str(d) for d in question_dates}
    return not set(fields['date']).intersection(wanted)


def fact_check_and_test(q, rel):
    """Score how well the classifier recovers the relation ``rel`` for question ``q``.

    ``q`` is an already constructed question object (it must expose ``dates``
    and whatever ``fg.generate_test`` reads). Relies on the notebook globals
    ``fg``, ``tables`` and ``classifier``.

    Returns a 4-tuple ``(status, total_gt, total_match, total_geq)``:

    * ``status == 1``  -- a candidate with relation ``rel`` was labelled 1.
      ``total_match`` counts the candidates of *other* relations labelled 1,
      ``total_gt`` / ``total_geq`` those whose positive-class probability is
      greater than / at least that of the ``rel`` candidate.
    * ``status == 0``  -- ``rel`` is present among the numeric tuples but was
      not labelled 1; the counts are 0.
    * ``status == -1`` -- no usable candidate exists, or ``rel`` does not occur
      among the numeric tuples; the counts are 0.
    """
    question = q
    tuples, q_features = fg.generate_test(tables, question)

    # Keep one candidate per entity/relation pair; a later tuple replaces an earlier one.
    matches = dict()
    for i, candidate in enumerate(tuples):
        fields = candidate[1]
        if _dates_conflict(fields, question.dates) or not is_num(fields['value']):
            continue
        matches[fields['entity'] + "-----" + fields['relation']] = (candidate, q_features[i])

    if not matches:
        # Same arity as every other return (this path used to return a 3-tuple).
        return (-1, 0, 0, 0)

    # Classify each candidate once and reuse the result for both passes below.
    scored = []
    for candidate, features in matches.values():
        labels, probabilities = classifier.predict([features])
        scored.append((candidate[1]['relation'], labels[0], probabilities))

    p_match = 0.0
    found_match = False
    for relation, label, probabilities in scored:
        if label == 1 and relation == rel:
            # If several candidates qualify, the last one wins.
            p_match = probabilities[0][1]
            found_match = True

    if found_match:
        total_geq = 0
        total_gt = 0
        total_match = 0
        for relation, label, probabilities in scored:
            if label == 1 and relation != rel:
                p_other = probabilities[0][1]
                if p_other > p_match:
                    total_gt += 1
                if p_other >= p_match:
                    total_geq += 1
                total_match += 1

        print("matched - ")
        print(total_gt)
        print(total_geq)
        print(total_match)
        return (1, total_gt, total_match, total_geq)

    # Only tuples with a numeric value count as evidence that the relation is
    # available (the original `pass` here was a no-op instead of a `continue`).
    numeric_relations = {t[1]['relation'] for t in tuples if is_num(t[1]['value'])}

    if rel not in numeric_relations:
        return (-1, 0, 0, 0)
    return (0, 0, 0, 0)







# Read every .tsv annotation file under `base` and collect the rows marked 'y'.
# Two layouts are handled: 12-column rows (two flag columns) and 11-column rows (one flag column).
for filename in os.listdir(base):
    if not filename.endswith(".tsv"):
        continue

    with open(base+"/"+filename, encoding="ISO-8859-1") as tsv:
        for line in tsv:
            row = line.split("\t")
            width = len(row)

            if width == 12 and len(row[5].strip()) > 0:
                # Either of the first two columns may carry the 'y' flag.
                flags = (row[0].lower().strip(), row[1].lower().strip())
                if 'y' in flags:
                    rels.append({
                        "claim": row[2],
                        "relation": row[5],
                        "entity": row[3],
                        "num": row[9],
                        "parsed": row[8],
                    })
            elif width == 11:
                flagged = row[0].lower().strip() == 'y'
                has_relation = len(row[4].strip()) > 0
                if flagged and has_relation:
                    rels.append({
                        "claim": row[1],
                        "relation": row[4],
                        "entity": row[2],
                        "num": row[8],
                        "parsed": row[7],
                    })
           
                 
    


# Maps the relation identifiers used in the annotation files to the
# column/indicator names used in the tables.
property_names = {
    'fertility_rate': "Fertility rate, total (births per woman)",
    'gdp_growth_rate': "GDP growth (annual %)",
    'gdp_nominal': "GDP (current US$)",
    'gdp_nominal_per_capita': "GDP per capita (current US$)",
    'gni_per_capita_in_ppp_dollars': "GNI per capita, PPP (current international $)",
    'life_expectancy': "Life expectancy at birth, total (years)",
    'cpi_inflation_rate': "Inflation, consumer prices (annual %)",
    'consumer_price_index': "Consumer price index (2010 = 100)",
    'diesel_price_liter': "Pump price for diesel fuel (US$ per liter)",
    'gni_in_ppp_dollars': "GNI (current US$)",
    'population_growth_rate': "Population growth (annual %)",
    'population': "Population, total",
    'prevalence_of_undernourisment': "Prevalence of undernourishment (% of population)",
    'renewable_freshwater_per_capita': "Renewable internal freshwater resources per capita (cubic meters)",
    'health_expenditure_as_percent_of_gdp': "Health expenditure, total (% of GDP)",
    'internet_users_percent_population': "Internet users (per 100 people)",
}

# Per-relation counters filled by the evaluation loop further down; missing keys start at 0.
# tested:  claims whose result status was not -1
# results: claims whose result status was 1
# pr:      sum of result[3] over status-1 claims where result[1] == 0
tested = defaultdict(int)
results = defaultdict(int)
pr = defaultdict(int)

# Sums of result[1], result[2] and result[3] respectively over status-1 claims.
num_better = defaultdict(int)
num_total = defaultdict(int)
num_better_or_equal = defaultdict(int)

# Number of annotated claims collected so far.
print(len(rels))


# Capture the text inside <location ...>...</location> and <number ...>...</number> tags of a claim.
claim_loc = re.compile(r'<location[^>]*>([^<]+)</location>')
claim_num = re.compile(r'<number[^>]*>([^<]+)</number>')


class NewQuestion():
    """Lightweight question object built directly from an annotated claim.

    It carries the attributes ``text``, ``nes``, ``numbers``, ``dates`` and
    ``nps`` and a no-op ``parse``; instances are passed to
    ``fact_check_and_test`` in place of a parsed question.
    """
    def __init__(self,text,entity,number):
        """Store the claim text, one entity and one number (converted with ``num``).

        ``dates`` and ``nps`` start out as empty sets.
        """
        self.text = text
        self.nes = {entity}
        self.numbers = {num(number)}
        self.dates = set()
        self.nps = set()
    def parse(self):
        """Do nothing; all fields are already set by the constructor."""
        pass
        
# Build one (NewQuestion, annotation) pair per annotated claim.
qs = []

for rel in rels:
    # When the claim contains a <location> tag, take the number from its first <number> tag.
    # NOTE(review): this raises IndexError if a claim has a <location> tag but no <number> tag.
    if len(claim_loc.findall(rel['claim'])) > 0:
        rel['num'] = claim_num.findall(rel['claim'])[0]

    # Character offsets of the entity inside the claim (str.index raises ValueError if absent).
    start_claim_idx = rel['claim'].index(rel['entity'])
    end_claim_idx = start_claim_idx + len(rel['entity'])


    # Character offsets of the number inside the claim.
    start_num_idx = rel['claim'].index(rel['num'])
    end_num_idx = start_num_idx + len(rel['num'])

    # Text between the entity and the number when the entity comes first;
    # otherwise the text from the start of the number to the end of the entity.
    span = ""
    if end_claim_idx < start_num_idx:
        span = (rel['claim'][end_claim_idx:start_num_idx])
    else:
        span =(rel['claim'][start_num_idx:end_claim_idx])

    # Strip any markup tags from the span.
    span = re.sub('<[^<]+?>', '', span)
    #print(normalise(span).split())

    spanwords = span.split()

    # Drop one pair of surrounding double quotes before evaluating the literal.
    if(rel['parsed'][0]=="\""):
        rel['parsed'] = rel['parsed'][1:-1]

    # literal_eval only accepts Python literals, so the column is not executed as code.
    dep_parse = ast.literal_eval(rel['parsed'])

    # Collect the words of the parsed pattern, leaving out the slot placeholders.
    tokens = []
    for token in dep_parse:

        for w in (token.replace("*extend*","").split("+")):
            # Keep the part before '~' and remove double quotes.
            we = w.split("~")[0].replace("\"","")

            if "," in we:
                for t in we.split(","):
                    if not(t  == "NUMBER_SLOT" or t == "LOCATION_SLOT"):
                        tokens.append(t)
            elif not(we  == "NUMBER_SLOT" or we == "LOCATION_SLOT"):
                tokens.append(we)
    # Replace the DATE / PERCENT / MISC placeholders with concrete stand-in text.
    tokens = " ".join(tokens).replace("DATE","1000").replace("PERCENT","10").replace("MISC","$")
    tokens += " "
    tokens += " ".join(spanwords).replace("DATE","1000").replace("PERCENT","10").replace("MISC","$")

    q = NewQuestion(rel['claim'],rel['entity'],rel['num'])
    # NOTE(review): `tokens` and `words` are not read again in the visible code — confirm whether they are still needed.
    words = normalise_keep_nos(q.text).split()

    qs.append((q,rel))

    
def _print_summary():
    """Print one line of accumulated counters for every relation tested so far."""
    for key in tested.keys():
        # tested[key] is at least 1 for every key present, so the ratio is well defined.
        print(key + " " + str(results[key]) + " " + str(num_better[key]) + " " + str(num_better_or_equal[key]) + " " + str(num_total[key]) + " " + str(pr[key]) + " " + str(tested[key]) + " " + str(results[key] / tested[key]))


# Evaluate every question and accumulate per-relation statistics.
done = 0
for done, (q, rel) in enumerate(qs, start=1):
    # `rel` is bound to the current item before printing; previously the prints
    # ran first and therefore showed the annotation of the previous item.
    print(rel['claim'])
    print(rel['relation'])

    result = fact_check_and_test(q, property_names[rel['relation']])

    if result[0] == 1:
        results[rel['relation']] += 1
        if result[1] == 0:
            pr[rel['relation']] += result[3]
        num_better[rel['relation']] += result[1]
        num_better_or_equal[rel['relation']] += result[3]
        num_total[rel['relation']] += result[2]

    # Status -1 means the claim could not be tested at all.
    if result[0] != -1:
        tested[rel['relation']] += 1

    print(result)
    print("done" + str(done) )
    print("")

    # Intermediate report every 5000 questions.
    if done%5000 == 0:
        _print_summary()

print("Done")
_print_summary()
