# HeroX Numerical Fact Checking System. University of Sheffield

Pre-requisites:
 * Gradle
 * Java jdk8
 * Python 3
  * jnius (PyPI package `pyjnius`)
  * fuzzywuzzy
  * sklearn (PyPI package `scikit-learn`)
  * urllib3
 

## Common setup

Import required dependencies and download/install Stanford CoreNLP

In [1]:
import sys
import os
import re

# Make the project sources importable: put their directory on the module
# search path, but only if it is not already listed there.
if not any(entry == 'src/' for entry in sys.path):
    sys.path.append('src/')


In [2]:
# Load the Java classpath for Stanford CoreNLP using Gradle; this will also install it if missing.
from subprocess import run, PIPE

# File the Gradle `writeClasspath` task is expected to produce.
CLASSPATH_FILE = 'build/classpath.txt'

if 'CLASSPATH' not in os.environ:
    # Checking the file alone is enough: it cannot exist without its parent directory.
    if not os.path.exists(CLASSPATH_FILE):
        print("Generating classpath")
        gradle_run = run(["./gradlew", "writeClasspath"], stdout=PIPE, stderr=PIPE, universal_newlines=True)
        print(gradle_run.stdout)
        print(gradle_run.stderr)
        if gradle_run.returncode != 0:
            # Surface the failure explicitly; otherwise the only symptom is the
            # FileNotFoundError raised by open() below.
            print("Gradle exited with status " + str(gradle_run.returncode))

    print("Loading classpath")
    # Context manager so the file handle is closed instead of being leaked.
    with open(CLASSPATH_FILE, 'r') as classpath_file:
        os.environ['CLASSPATH'] = classpath_file.read()
    print("Done")

Loading classpath
Done


## Data Generation

### Generate Search Engine Queries From Tables

In [5]:
from distant_supervision.query_generation import generate_queries
from tabular.table_reader import read_table, number_tuples
from wikitablequestions.dataset_reader import load_instances

print("Generating Queries")
world = "herox"
queries_path = "data/distant_supervision/queries_" + world + ".txt"

# The existence of the queries file is the "already generated" marker.
if os.path.exists(queries_path):
    print("Already done. No need to run again")
else:
    # De-duplicated table files referenced by the instances of this world.
    table_files = {instance['table'] for instance in load_instances(world)}

    # Write to a side file and move it into place only after every table has been
    # processed. Writing the final path directly would leave a truncated file behind
    # if generation failed midway, and the next run would then report "Already done".
    partial_path = queries_path + ".partial"
    with open(partial_path, "w") as file:
        for done, table_file in enumerate(table_files, start=1):
            print("Parsing " + str(done) + "/" + str(len(table_files)) + "\t\t\t" + table_file)
            table = number_tuples(read_table(table_file))

            # One output line per generated query: "<table file>\t<query>".
            # (Named `query`, not `tuple`, to avoid shadowing the builtin in the notebook.)
            for query in generate_queries(table):
                file.write(table_file + "\t" + query + "\n")

            # Push each table's queries to disk before starting the next table.
            file.flush()
            os.fsync(file.fileno())
    os.replace(partial_path, queries_path)
print("Finished")

Generating Queries
Already done. No need to run again
Finished


## Download all web pages for queries

In [None]:
import sys

from distant_supervision.clean_html import get_text
from distant_supervision.search import Search


world = "herox"

# Read every query up front so the file is not held open during the downloads.
with open("data/distant_supervision/queries_"+world+".txt", "r") as file:
    lines = file.readlines()

num_qs = len(lines)
for done, line in enumerate(lines, start=1):
    # Tab-separated fields; the search string is the third field.
    query = line.replace("\n", " ").strip().split("\t")
    search = query[2]

    # Skip queries whose second quoted term is purely numeric.
    if search.split("\" \"")[1].replace("\"", "").isnumeric():
        print("skipped")
        print(query)
    else:
        try:
            # get_text is called only for its effect; its return value is not used here.
            for url in Search.instance().search(search):
                get_text(url)
        except Exception as error:
            # Best effort: one failing query must not stop the crawl. Catching
            # Exception rather than using a bare `except` keeps KeyboardInterrupt
            # working, so this long-running cell can still be interrupted.
            print("Failed to download results for " + search + ": " + repr(error), file=sys.stderr)
        print(str(100*done/num_qs) + "%")

Search for "America" "Number of honey producing bee colonies" 2013
Query already executed
0.0035239806885858267%
Search for "America" "Number of honey producing bee colonies" 2006
Query already executed
https://www.bing.com/cr?IG=F9338EB0B81F41DF91166058EC895A83&CID=28124D299BB56D8519C844C29A526C5F&rd=1&h=KPVJZ3tbruO5W1iCVCbR4m_-GkBt-34O3aaTWIJYWGI&v=1&r=https%3a%2f%2fwww.reddit.com%2fr%2fworldnews%2fcomments%2f35yfyj%2fhoneybees_dying_situation_unheard_of%2f&p=DevEx,5333.1




0.0070479613771716534%
Search for "America" "Number of honey producing bee colonies" 2003
Query already executed
0.01057194206575748%
Search for "America" "Number of honey producing bee colonies" 2011
Query already executed
http://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=_U8MWc1Ap4eFodbKJoRlxagL8hTgvtimjHbmC-YBlFA&v=1&r=http%3a%2f%2fwww.startribune.com%2fnation-had-more-bees-and-honey-last-year-usda-says%2f116961113%2f&p=DevEx,5110.1
http://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=72tw_nNy2PQKBYL2CexP-7aRhKrUJsXL5911qhj2ck8&v=1&r=http%3a%2f%2fescholarship.org%2fuc%2fitem%2f8pp7r3bj.pdf&p=DevEx,5122.1
https://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=9_EDTqNhxdgAq1-fENnvvR-04C3cS9boLXPB05SU_sk&v=1&r=https%3a%2f%2fwww.scribd.com%2fdocument%2f268914519%2fGlobal-Bee-Colony-Disorder-and-Threats-Insect-Pollinators&p=DevEx,5134.1




http://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=RCN0fTV3zWFDqyqDv-jtKBO85pUX7vy640uqERYdj58&v=1&r=http%3a%2f%2fhiveharvest.com%2fblog%2f&p=DevEx,5147.1
https://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=URRJvErT5_ootr4bQl7POee1oMSFI8P1BUxpagTQsrg&v=1&r=https%3a%2f%2fwww.nass.usda.gov%2fData_and_Statistics%2fSpecial_Tabulations%2fRequest_a_Tabulation%2fdata-lab-records4.php&p=DevEx,5159.1




http://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=99G_BRoEgUCKtBrv1U3GRfuHDRxk7u2cJlVI1CA-Qmk&v=1&r=http%3a%2f%2fwww.sustainalytics.com%2feu-pesticide-ban-and-potential-impact-chemicals-industry&p=DevEx,5172.1
http://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=qnjE5XzKZpEA7hVWAcPzg9ampFwvW4pocbMpjjC6IOc&v=1&r=http%3a%2f%2fwww.thedailymeal.com%2fhog-wild-and-produce-free-statistical-breakdown-our-food-landscape&p=DevEx,5184.1
https://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=VhhdjciyIKtMJEs6qboqoop7Ql49QPUr57WoiEe7D68&v=1&r=https%3a%2f%2fissuu.com%2fbendeines%2fdocs%2fthesis_pages_small&p=DevEx,5197.1




http://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=KJ-dSeMe8dPJ3p7MmRj-6cmg5x910lTk_6Iwr2Suxj4&v=1&r=http%3a%2f%2fwww.bremnerinvestments.com%2fU.S.%2bQueen%2bBees%2bWork%2bOvertime%2bto%2bSave%2bHives&p=DevEx,5208.1
https://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=SDu8m3GSZUOxH2l3aSdvncDREqJ-B4MVYedeTgaCSF8&v=1&r=https%3a%2f%2fwildcatpsychology.wordpress.com%2fresearch-methods%2fawesome-headlines%2f&p=DevEx,5221.1




https://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=KPVJZ3tbruO5W1iCVCbR4m_-GkBt-34O3aaTWIJYWGI&v=1&r=https%3a%2f%2fwww.reddit.com%2fr%2fworldnews%2fcomments%2f35yfyj%2fhoneybees_dying_situation_unheard_of%2f&p=DevEx,5234.1




http://www.bing.com/cr?IG=4D4ACD8956F242F0B7631F796E4B4810&CID=36E29FF58701604E0C06961E86E66193&rd=1&h=LrzspxZMYkGHmAmDh0SU5_wIgLaB2OPOhs5Bk6MSkDY&v=1&r=http%3a%2f%2fsociologyadventures.blogspot.com%2f&p=DevEx,5245.1
0.014095922754343307%
Search for "America" "Number of honey producing bee colonies" 2012
New Query




http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=99G_BRoEgUCKtBrv1U3GRfuHDRxk7u2cJlVI1CA-Qmk&v=1&r=http%3a%2f%2fwww.sustainalytics.com%2feu-pesticide-ban-and-potential-impact-chemicals-industry&p=DevEx,5072.1
http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=c8tstJuN9iiu2V5C57_L6Qu60Qv3LpoQOBcdudibPzw&v=1&r=http%3a%2f%2fusda.mannlib.cornell.edu%2fMannUsda%2fviewDocumentInfo.do%3fdocumentID%3d1191&p=DevEx,5085.1
http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=1zCXnIy558IfW2PAG64Uyi1PHE-3JossGOPaH4x4mDA&v=1&r=http%3a%2f%2fwjon.com%2fhoney-production-drops-in-minnesota%2f&p=DevEx,5099.1
https://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=w_l1Fiq8Kn5jLyuzvM857DpTr4d1c3xapJHGfMRetqU&v=1&r=https%3a%2f%2fwww.washingtonpost.com%2fnews%2fwonk%2fwp%2f2015%2f07%2f23%2fcall-off-the-bee-p



http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=5cZsqwB5fp8VRNw2wHwa1elG-JEHqrkDD3KGd24LCe8&v=1&r=http%3a%2f%2ffox6now.com%2f2012%2f04%2f05%2fnot-so-busy-bees-honey-production-drops-in-wisconsin-in-2011%2f&p=DevEx,5128.1
https://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=9_EDTqNhxdgAq1-fENnvvR-04C3cS9boLXPB05SU_sk&v=1&r=https%3a%2f%2fwww.scribd.com%2fdocument%2f268914519%2fGlobal-Bee-Colony-Disorder-and-Threats-Insect-Pollinators&p=DevEx,5141.1




http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=7PKDvPs1i0gkmDSkUmx4eZF2FmL5FNBVPYYTFK27j64&v=1&r=http%3a%2f%2fwww.slideshare.net%2fChiman%2fglobal-bee-colonydisorderandthreatsinsectpollinators&p=DevEx,5156.1
https://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=URRJvErT5_ootr4bQl7POee1oMSFI8P1BUxpagTQsrg&v=1&r=https%3a%2f%2fwww.nass.usda.gov%2fData_and_Statistics%2fSpecial_Tabulations%2fRequest_a_Tabulation%2fdata-lab-records4.php&p=DevEx,5168.1




http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=w2h4b0cfybdyW5lmlmzqB99BgNMKBrT8nnSGAN9kG6Y&v=1&r=http%3a%2f%2fdocuments.mx%2fdocuments%2fglobal-bee-colony-disorder-and-threats-insect-pollinators.html&p=DevEx,5183.1
http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=PPF8ndrIpuFUhLrrnNIExok8Ynu4bx8Up929RkZB9q8&v=1&r=http%3a%2f%2fnjshea.com%2f&p=DevEx,5197.1
https://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=VhhdjciyIKtMJEs6qboqoop7Ql49QPUr57WoiEe7D68&v=1&r=https%3a%2f%2fissuu.com%2fbendeines%2fdocs%2fthesis_pages_small&p=DevEx,5210.1




http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=gmQ4yWHlz4GMh6OnYyO8amj_2a8Y4DOtq0DOi-Nsi-I&v=1&r=http%3a%2f%2fdocplayer.net%2f387050-Global-honey-bee-colony-disorders-and-other-threats-to-insect-pollinators.html&p=DevEx,5222.1
http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=_95SuScLlM7XyaaugJOfVLRfawfkCln7lnmCTFtKyAM&v=1&r=http%3a%2f%2fwww.mike-tanner.co.nz%2f04behaviour.html&p=DevEx,5235.1
http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=KJ-dSeMe8dPJ3p7MmRj-6cmg5x910lTk_6Iwr2Suxj4&v=1&r=http%3a%2f%2fwww.bremnerinvestments.com%2fU.S.%2bQueen%2bBees%2bWork%2bOvertime%2bto%2bSave%2bHives&p=DevEx,5246.1
http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=0mFD13f_ifrDaIDAxkojNgTRbBMcUTNcGSr_K59D1Hw&v=1&r=http%3a%2f%2fsociologyinfocus.com%2fauthor%2fnathan%2fpage%2f7%2f&p=DevEx,



http://www.bing.com/cr?IG=2DBBFD9216514AEA80AB626FFFE7BD69&CID=1D28F9D7A5C264073EE0F03CA42565A8&rd=1&h=LrzspxZMYkGHmAmDh0SU5_wIgLaB2OPOhs5Bk6MSkDY&v=1&r=http%3a%2f%2fsociologyadventures.blogspot.com%2f&p=DevEx,5295.1
0.017619903442929134%
Search for "America" "Number of honey producing bee colonies" 2010
New Query




http://www.bing.com/cr?IG=A627A3B8043748018DA452F1CEBA365D&CID=1B3D1E9FD9586F1734CB1774D8BF6ED9&rd=1&h=FiNvnCqzVqr2sHxk60r1daucaH-EVA9Lgh3bkZUcNfA&v=1&r=http%3a%2f%2fwww.perc.org%2farticles%2feveryone-calm-down-there-no-bee-pocalypse&p=DevEx,5076.1
http://www.bing.com/cr?IG=A627A3B8043748018DA452F1CEBA365D&CID=1B3D1E9FD9586F1734CB1774D8BF6ED9&rd=1&h=c8tstJuN9iiu2V5C57_L6Qu60Qv3LpoQOBcdudibPzw&v=1&r=http%3a%2f%2fusda.mannlib.cornell.edu%2fMannUsda%2fviewDocumentInfo.do%3fdocumentID%3d1191&p=DevEx,5089.1
http://www.bing.com/cr?IG=A627A3B8043748018DA452F1CEBA365D&CID=1B3D1E9FD9586F1734CB1774D8BF6ED9&rd=1&h=72tw_nNy2PQKBYL2CexP-7aRhKrUJsXL5911qhj2ck8&v=1&r=http%3a%2f%2fescholarship.org%2fuc%2fitem%2f8pp7r3bj.pdf&p=DevEx,5101.1
http://www.bing.com/cr?IG=A627A3B8043748018DA452F1CEBA365D&CID=1B3D1E9FD9586F1734CB1774D8BF6ED9&rd=1&h=99G_BRoEgUCKtBrv1U3GRfuHDRxk7u2cJlVI1CA-Qmk&v=1&r=http%3a%2f%2fwww.sustainalytics.com%2feu-pesticide-ban-and-potential-impact-chemicals-industry&p=DevEx,5113.1
http



http://www.bing.com/cr?IG=A627A3B8043748018DA452F1CEBA365D&CID=1B3D1E9FD9586F1734CB1774D8BF6ED9&rd=1&h=RCN0fTV3zWFDqyqDv-jtKBO85pUX7vy640uqERYdj58&v=1&r=http%3a%2f%2fhiveharvest.com%2fblog%2f&p=DevEx,5139.1
http://www.bing.com/cr?IG=A627A3B8043748018DA452F1CEBA365D&CID=1B3D1E9FD9586F1734CB1774D8BF6ED9&rd=1&h=PPF8ndrIpuFUhLrrnNIExok8Ynu4bx8Up929RkZB9q8&v=1&r=http%3a%2f%2fnjshea.com%2f&p=DevEx,5153.1
http://www.bing.com/cr?IG=A627A3B8043748018DA452F1CEBA365D&CID=1B3D1E9FD9586F1734CB1774D8BF6ED9&rd=1&h=7PKDvPs1i0gkmDSkUmx4eZF2FmL5FNBVPYYTFK27j64&v=1&r=http%3a%2f%2fwww.slideshare.net%2fChiman%2fglobal-bee-colonydisorderandthreatsinsectpollinators&p=DevEx,5168.1
http://www.bing.com/cr?IG=A627A3B8043748018DA452F1CEBA365D&CID=1B3D1E9FD9586F1734CB1774D8BF6ED9&rd=1&h=_U8MWc1Ap4eFodbKJoRlxagL8hTgvtimjHbmC-YBlFA&v=1&r=http%3a%2f%2fwww.startribune.com%2fnation-had-more-bees-and-honey-last-year-usda-says%2f116961113%2f&p=DevEx,5181.1


## Fact Checking

### Training
Load Modules for fact checking, generate the features and train our classifier from our training data

In [None]:
from classifier.LogisticRegressionClassifier import LogisticRegressionClassifier
from classifier.features.generate_features import FeatureGenerator, num, is_num
from distant_supervision.utterance_detection import f_threshold_match
from factchecking.question import Question
from tabular.filtering import load_collection

In [None]:
# Create the feature generator and build the training data. Xs/ys are presumably the
# feature vectors and their labels (they are handed to classifier.train) - confirm.
# `fg` is also used later by fact_check() to generate features for a claim.
fg = FeatureGenerator()
Xs,ys = fg.generate_training()

In [None]:
# Train the classifier on the generated training data; fact_check() later calls
# classifier.predict() on the features of each claim.
classifier = LogisticRegressionClassifier()
classifier.train(Xs,ys)

### Runtime

Load the source data

In [None]:
# Load the "herox" collection; fact_check() reads it through the global `tables`.
tables = load_collection("herox")

Define the fact checking function

In [None]:
def fact_check(q):
    """Check a numerical claim against the loaded tables and print the outcome.

    Relies on the notebook globals ``fg`` (feature generator), ``tables``
    (source data) and ``classifier`` (trained model).

    For every candidate returned by ``fg.generate_test`` the function:
      * skips it when both the candidate and the question carry dates and
        they have none in common;
      * skips it when its value is not numeric (``is_num``);
      * when the classifier predicts 1, prints it as a "Possible Match" and
        compares its value with the numbers in the question
        (``f_threshold_match`` with 0.05) and with the dates in the question
        (equality).

    Finally prints the claim text followed by ``True``/``False`` (whether any
    value matched), or a notice when no candidate was found at all.

    Args:
        q: The claim text to check.

    Returns:
        None. All results are printed.
    """
    question = Question(text=q, type="NUM")
    tuples, q_features = fg.generate_test(tables, question)

    if len(tuples) > 0:
        q_match = False
        q_predicted = classifier.predict(q_features)

        # The question's dates as strings, built once; they are compared with
        # the candidate's dates via set intersection below.
        question_dates = {str(d) for d in question.dates}

        for i, candidate in enumerate(tuples):
            entry = candidate[1]

            # Date filter: only applies when both sides actually have dates.
            if len(entry['date']) and len(question.dates):
                if not set(entry['date']).intersection(question_dates):
                    continue

            if not is_num(entry['value']):
                continue

            if q_predicted[i] == 1:
                print(str(candidate) + "\t\t" + "Possible Match")

                # The parsed value does not depend on the loops below, so parse it once.
                value = num(entry['value'])

                if value is not None:
                    for number in question.numbers:
                        if f_threshold_match(number, value, 0.05):
                            print(str(candidate) + "\t\t" + "Threshold Match to 5%")
                            q_match = True

                # A date in the claim may itself be the value stored in the table.
                for date in question.dates:
                    if date == value:
                        print(str(candidate) + "\t\t" + "Exact Match")
                        q_match = True

        print(question.text)
        print(q_match)

    else:
        print(question.text)
        print("No supporting information can be found in the knowledge base")
    print("\n\n")

# Fact checking

In [None]:
# Example: check a claim that mentions a quantity (90,000) and a year (2015).
fact_check("Around 90,000 unaccompanied children claimed asylum in the EU in 2015")