# Numerical Fact Checking System. University of Sheffield

Pre-requisites:
 * Gradle
 * Java jdk8
 * Python 3
  * numpy
  * jnius
  * fuzzywuzzy
  * sklearn
  * urllib3
 

## Configuration
This defines the collection of tables that is used to populate the knowledge base.

In [1]:
# Name of the table collection to use; it is passed to
# FeatureGenerator.generate_training() and load_collection() further down.
world = "eacl-demo"

## Common setup

Import required dependencies and download/install Stanford CoreNLP

In [2]:
import sys
import os
import re

# Set the path manually to include the sources location.
# The membership check keeps the entry from being appended again when this
# cell is re-run in the same kernel.
if 'src/' not in sys.path:
    sys.path.append('src/')

If the following step fails, run `./gradlew writeClasspath` in a terminal in this folder, then try again.

In [3]:
# Load the Java classpath for Stanford CoreNLP using Gradle. The Gradle task
# will also install CoreNLP if it is missing.
from subprocess import run, PIPE

classpath_file = 'build/classpath.txt'

# Respect a CLASSPATH that is already set in the environment.
if 'CLASSPATH' not in os.environ:
    if not os.path.exists(classpath_file):
        print("Generating classpath")
        r = run(["./gradlew", "writeClasspath"], stdout=PIPE, stderr=PIPE, universal_newlines=True)
        print(r.stdout)
        print(r.stderr)
        # Fail with an actionable message instead of a bare error from open().
        if not os.path.exists(classpath_file):
            raise FileNotFoundError(
                "Gradle did not produce " + classpath_file
                + " (exit code " + str(r.returncode) + "); "
                "run `./gradlew writeClasspath` in a terminal and retry"
            )

    print("Loading classpath")
    # Close the file deterministically and drop surrounding whitespace so a
    # trailing newline does not become part of the last classpath entry.
    with open(classpath_file, 'r') as classpath_handle:
        os.environ['CLASSPATH'] = classpath_handle.read().strip()
    print("Done")

Loading classpath
Done


## Fact Checking

### Training
Load the modules for fact checking, generate the features, and train our classifier on the training data.

In [4]:
from classifier.Classifier import Classifier
from classifier.LogisticRegressionClassifier import LogisticRegressionClassifier
from classifier.features.generate_features import FeatureGenerator, num, is_num
from distant_supervision.utterance_detection import f_threshold_match
from factchecking.question import Question
from tabular.filtering import load_collection



In [10]:
# Build the feature generator and produce the training data for the configured
# world. Xs and ys are passed to classifier.train() below; fg is reused by
# fact_check() to featurise new claims.
fg = FeatureGenerator()
Xs,ys = fg.generate_training(world)

Done: 0.0
Search for "Estonia" Asylum applicants considered to be unaccompanied minors  2011
Query already executed
Done: 0.38461538461538464
Search for "Italy" Asylum applicants considered to be unaccompanied minors  2009
Query already executed
Done: 0.7692307692307693
Search for "Luxembourg" Asylum applicants considered to be unaccompanied minors  2013
Query already executed
Done: 1.1538461538461537
Search for "Latvia" Asylum applicants considered to be unaccompanied minors  2010
Query already executed
Done: 1.5384615384615385
Search for "Lithuania" Asylum applicants considered to be unaccompanied minors  2010
Query already executed
Done: 1.9230769230769231
Search for "Ireland" Asylum applicants considered to be unaccompanied minors  2015
Query already executed
Done: 2.3076923076923075
Search for "Norway" Asylum applicants considered to be unaccompanied minors  2013
Query already executed
Done: 2.6923076923076925
Search for "Liechtenstein" Asylum applicants considered to be unaccompa

In [6]:
from sklearn.linear_model import LogisticRegression
class LogisticRegressionClassifier(Classifier):
    """Logistic-regression classifier with an L1 penalty.

    NOTE: this definition shadows the ``LogisticRegressionClassifier`` that is
    imported from ``classifier.LogisticRegressionClassifier`` earlier in the
    notebook; code after this cell uses this version.
    """

    def train(self, Xs, ys):
        """Fit the underlying scikit-learn model.

        Args:
            Xs: Training features, passed straight to ``LogisticRegression.fit``.
            ys: Training labels, passed straight to ``LogisticRegression.fit``.
        """
        print("Training classifier 3")
        # The solver is pinned explicitly: L1 regularisation is not supported
        # by the 'lbfgs' solver that newer scikit-learn releases use by
        # default, so omitting it raises ValueError there. 'liblinear' was the
        # historical default and supports L1.
        self.lr = LogisticRegression(penalty='l1', C=0.78, solver='liblinear')
        self.lr.fit(Xs, ys)
        print("Trained")

    def predict(self, q_features):
        """Predict labels and class probabilities for the given features.

        Args:
            q_features: Feature rows to score.

        Returns:
            A 2-tuple ``(labels, probabilities)`` holding the results of
            ``predict`` and ``predict_proba`` on the fitted model.
        """
        labels = self.lr.predict(q_features)
        probabilities = self.lr.predict_proba(q_features)
        return (labels, probabilities)


# Instantiate the classifier defined in this cell and fit it on the training
# data produced by the feature generator. fact_check() reads the resulting
# `classifier` global.
classifier = LogisticRegressionClassifier()
classifier.train(Xs,ys)

Training classifier 3
Trained


### Runtime

Load the source data

In [7]:
# Load the table collection for the configured world and show which table
# files were registered. fact_check() reads the resulting `tables` global.
tables = load_collection(world)
print(tables.files)

LOADED:
[{'utterance': '0', 'answer': '0', 'table': 'eacl-demo/2.tsv', 'id': '0'}]
register table eacl-demo/2.tsv
{'eacl-demo/2.tsv'}


Define the fact checking function

In [8]:
def fact_check(q):
    """Check a numerical claim against the loaded tables and print the result.

    Relies on the notebook-level globals ``fg`` (feature generator), ``tables``
    (loaded table collection) and ``classifier`` (trained model).

    For every candidate tuple that passes the date filter, has a numeric value
    and is predicted as class 1, the tuple is printed as a "Possible Match".
    The claim counts as matched if one of its numbers is within the 0.05
    threshold of the tuple value, or if one of its dates equals the value.
    Finally the claim text and the overall verdict are printed; if no
    candidate tuples exist, a "no supporting information" message is printed
    instead.

    Args:
        q: Text of the claim to check.

    Returns:
        None. All results are reported via ``print``.
    """
    question = Question(text=q, type="NUM")
    question.parse()
    tuples, q_features = fg.generate_test(tables, question)

    # Nothing retrieved from the knowledge base: report and stop early.
    if len(tuples) == 0:
        print(question.text)
        print("No supporting information can be found in the knowledge base")
        print("\n\n")
        return

    q_match = False
    q_predicted = classifier.predict(q_features)

    # String forms of the dates found in the claim. This is the same for every
    # candidate, so it is built once instead of once per date per candidate.
    question_dates = {str(d) for d in question.dates}

    for i, candidate in enumerate(tuples):
        record = candidate[1]

        # Skip candidates whose dates share nothing with the claim's dates.
        if 'date' in record and question_dates:
            if not question_dates.intersection(record['date']):
                continue

        if not is_num(record['value']):
            continue

        # Element 0 of the prediction pair holds the predicted labels.
        prediction = q_predicted[0][i]
        if prediction != 1:
            continue

        print(str(candidate) + "\t\t" + "Possible Match")

        # Parse the cell value once; it does not change between comparisons.
        value = num(record['value'])

        if value is not None:
            for number in question.numbers:
                if f_threshold_match(number, value, 0.05):
                    print(str(candidate) + "\t\t" + "Threshold Match to 5%")
                    q_match = True

        # NOTE(review): this compares the claim's dates with the cell value;
        # presumably it covers numbers that were parsed as dates - confirm.
        for date in question.dates:
            if date == value:
                print(str(candidate) + "\t\t" + "Exact Match")
                q_match = True

    print(question.text)
    print(q_match)
    print("\n\n")

# Fact checking

In [9]:
# Check the same claim for two different years; only the year differs.
fact_check("Around 23,000 unaccompanied children claimed asylum in Germany in 2014")
fact_check("Around 23,000 unaccompanied children claimed asylum in Germany in 2015")

('eacl-demo/2.tsv', {'value': '4400', 'date': ['2014'], 'entity': 'Germany', 'relation': 'Asylum applicants considered to be unaccompanied minors'})		Possible Match
('eacl-demo/2.tsv', {'value': '4400', 'date': ['2014'], 'entity': 'Germany', 'relation': 'Asylum applicants considered to be unaccompanied minors'})		Possible Match
('eacl-demo/2.tsv', {'value': '4400', 'date': ['2014'], 'entity': 'Germany', 'relation': 'Asylum applicants considered to be unaccompanied minors'})		Possible Match
('eacl-demo/2.tsv', {'value': '4400', 'date': ['2014'], 'entity': 'Germany', 'relation': 'Asylum applicants considered to be unaccompanied minors'})		Possible Match
Around 23,000 unaccompanied children claimed asylum in Germany in 2014
False



('eacl-demo/2.tsv', {'value': '22255', 'date': ['2015'], 'entity': 'Germany', 'relation': 'Asylum applicants considered to be unaccompanied minors'})		Possible Match
('eacl-demo/2.tsv', {'value': '22255', 'date': ['2015'], 'entity': 'Germany', 'relation': 'Asy