In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import logging as log
import utils


# Data Processing
## Fever Dataset

In [44]:
# FEVER: fewer distinct claim labels than LIAR or Snopes, but (like Snopes) each
# claim comes with evidence references, which makes it the better fit for fact checking.
fever_ds = pd.read_json(
    path_or_buf='data/fever_dataset/fever_train.jsonl',
    lines=True,  # parse the file as line-delimited JSON, one record per line
)
fever_ds.head()

Unnamed: 0,id,verifiable,label,claim,evidence
0,75397,VERIFIABLE,SUPPORTS,Nikolaj Coster-Waldau worked with the Fox Broa...,"[[[92206, 104971, Nikolaj_Coster-Waldau, 7], [..."
1,150448,VERIFIABLE,SUPPORTS,Roman Atwood is a content creator.,"[[[174271, 187498, Roman_Atwood, 1]], [[174271..."
2,214861,VERIFIABLE,SUPPORTS,"History of art includes architecture, dance, s...","[[[255136, 254645, History_of_art, 2]]]"
3,156709,VERIFIABLE,REFUTES,Adrienne Bailon is an accountant.,"[[[180804, 193183, Adrienne_Bailon, 0]]]"
4,83235,NOT VERIFIABLE,NOT ENOUGH INFO,System of a Down briefly disbanded in limbo.,"[[[100277, None, None, None]]]"


In [45]:
# Column dtypes and non-null counts of the FEVER training frame.
fever_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145449 entries, 0 to 145448
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          145449 non-null  int64 
 1   verifiable  145449 non-null  object
 2   label       145449 non-null  object
 3   claim       145449 non-null  object
 4   evidence    145449 non-null  object
dtypes: int64(1), object(4)
memory usage: 5.5+ MB


In [46]:
# Class balance: number of FEVER claims per value of the 'label' column.
print(fever_ds['label'].value_counts())

label
SUPPORTS           80035
NOT ENOUGH INFO    35639
REFUTES            29775
Name: count, dtype: int64


In [47]:
# Snopes corpus: offers both a good range of verdict labels and the sources behind each claim.
snopes_ds = pd.read_csv(
    filepath_or_buffer='data/ukp_snopes_corpus/datasets_raw/snopes_corpus_4.csv',
)
snopes_ds.head()

Unnamed: 0,ID,Snopes URL,Claim,Truthfulness (Verdicts),Snippets (ETS),Source URL,Source,Documents (orginal documents),Match ETS and Documents
0,1,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,<p> The Marines patrolling through the green f...,http://archive.boston.com/news/world/asia/arti...,Marines learn lessons from tragedy in Afghanis...,Marines learn lessons from tragedy in Afghanis...,match
1,2,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,<p> An infantryman from Camp Pendleton's 3rd B...,,,,
2,3,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,<p> We are asking everyone to say a prayer for...,,,,
3,4,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,<p> U.S. Marines who recently inherited this l...,,,,
4,5,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,,https://web.archive.org/web/20081004102646/htt...,3rd Battalion 5th Marines History,"Starting in 1996, Alexa Internet has been dona...",


In [48]:
# Column dtypes and non-null counts of the Snopes frame.
snopes_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25778 entries, 0 to 25777
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   ID                             25778 non-null  int64 
 1   Snopes URL                     25778 non-null  object
 2   Claim                          25767 non-null  object
 3   Truthfulness (Verdicts)        25767 non-null  object
 4   Snippets (ETS)                 25778 non-null  object
 5   Source URL                     25778 non-null  object
 6   Source                         25713 non-null  object
 7   Documents (orginal documents)  25778 non-null  object
 8   Match ETS and Documents        25778 non-null  object
dtypes: int64(1), object(8)
memory usage: 1.8+ MB


In [49]:
# Evidence text snippets (ETS) of the first Snopes row.
# (The 'Source' column holds the sources backing the claim, split by ';' per the
# author's note, and can be inspected the same way.)
print(snopes_ds.loc[0, 'Snippets (ETS)'])
print('========')
# Distribution of the verdict labels. The author noted 5 classes here; row 0 is
# 'OUTDATED', so there may be more - TODO confirm against the printed counts.
print(snopes_ds['Truthfulness (Verdicts)'].value_counts())
print('========')
# Number of rows whose verdict is exactly 'OUTDATED'.
print((snopes_ds['Truthfulness (Verdicts)'] == 'OUTDATED').sum())

<p> The Marines patrolling through the green fields and tall mud compounds of Helmand province's Sangin district say they are literally in a race for their lives. They are trying to adjust their tactics to outwit Taliban fighters, who have killed more coalition troops here than in any other Afghan district this year.</p> <p>“As a new unit coming in, you are at a distinct disadvantage because the Taliban have been fighting here for years, have established fighting positions and have laid the ground with a ton of IEDs,” said Lt. Col. Jason Morris, commander of the 3rd Battalion, 5th Marine Regiment. “You have to evolve quickly because you have no other choice.”</p> <p>Many of the younger Marines also have had to cope for the first time with seeing their best friends die or suffer grievous wounds. Fifteen Marines have been killed and about 50 wounded since the battalion arrived in October [2010] - many by improvised explosive devices or IEDs. </p>
Truthfulness (Verdicts)
FALSE            

In [50]:
# LIAR: good range of labels, but no sources, so it is less interesting for fact checking.
# It also only covers American politics, which makes it less general.
# Its claims can still be used for testing :)
liar_ds = pd.read_csv(
    filepath_or_buffer='data/liar_dataset/train.tsv',
    sep='\t',
    header=None,  # no row is read as a header; column names are assigned just below
)
liar_ds.columns = [
    # statement and its verdict
    'id', 'label', 'statement', 'subjects',
    # speaker metadata
    'speaker', 'speaker_job', 'state', 'party',
    # per-label count columns
    'barely_true_count', 'false_count', 'half_true_count',
    'mostly_true_count', 'pants_on_fire_count',
    # free-text context of the statement (e.g. 'a mailer' in the preview)
    'context',
]

liar_ds.head()

Unnamed: 0,id,label,statement,subjects,speaker,speaker_job,state,party,barely_true_count,false_count,half_true_count,mostly_true_count,pants_on_fire_count,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [51]:
# Column dtypes and non-null counts of the LIAR training frame.
liar_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   10240 non-null  object 
 1   label                10240 non-null  object 
 2   statement            10240 non-null  object 
 3   subjects             10238 non-null  object 
 4   speaker              10238 non-null  object 
 5   speaker_job          7342 non-null   object 
 6   state                8030 non-null   object 
 7   party                10238 non-null  object 
 8   barely_true_count    10238 non-null  float64
 9   false_count          10238 non-null  float64
 10  half_true_count      10238 non-null  float64
 11  mostly_true_count    10238 non-null  float64
 12  pants_on_fire_count  10238 non-null  float64
 13  context              10138 non-null  object 
dtypes: float64(5), object(9)
memory usage: 1.1+ MB


### Random tests to understand the data

In [52]:
# Take the first record of each dataset as a concrete example to inspect.
fever_instance = fever_ds.iloc[0]
snopes_instance = snopes_ds.iloc[0]

# One print call with a newline separator: emits exactly the same text as
# printing each item on its own.
print(
    'Fever instance:',
    fever_instance,
    '========',
    'Snopes instance:',
    snopes_instance,
    sep='\n',
)

Fever instance:
id                                                        75397
verifiable                                           VERIFIABLE
label                                                  SUPPORTS
claim         Nikolaj Coster-Waldau worked with the Fox Broa...
evidence      [[[92206, 104971, Nikolaj_Coster-Waldau, 7], [...
Name: 0, dtype: object
Snopes instance:
ID                                                                               1
Snopes URL                       http://www.snopes.com/inboxer/prayer/darkhorse...
Claim                            Prayers are requested for the Darkhorse Marine...
Truthfulness (Verdicts)                                                   OUTDATED
Snippets (ETS)                   <p> The Marines patrolling through the green f...
Source URL                       http://archive.boston.com/news/world/asia/arti...
Source                           Marines learn lessons from tragedy in Afghanis...
Documents (orginal documents)    Marines le

In [53]:
# Print every field of the first Snopes record in full (the Series repr truncates
# long values), with a '========' line between consecutive fields.
# Each pair is (heading printed verbatim, column label read from the record).
snopes_fields = [
    ('ID :', 'ID'),
    ('Snopes URL :', 'Snopes URL'),
    ('Claim :', 'Claim'),
    ('Truthfulness (Verdicts) :', 'Truthfulness (Verdicts)'),
    ('Snippets (ETS) :', 'Snippets (ETS)'),
    ('Source URL:', 'Source URL'),
    ('Source :', 'Source'),
    # 'orginal' is the spelling used by the dataset's own column name.
    ('Documents (orginal documents) :', 'Documents (orginal documents)'),
    # Read by label instead of snopes_instance[-1]: an integer key on a
    # string-labelled Series is only treated as a position through a deprecated
    # pandas fallback. This is the last column, so the value is the same.
    ('Match ETS and Documents', 'Match ETS and Documents'),
]

for position, (heading, column) in enumerate(snopes_fields):
    if position:  # separator goes between fields, not before the first one
        print('========')
    print(heading)
    print(snopes_instance[column])

ID :
1
Snopes URL :
http://www.snopes.com/inboxer/prayer/darkhorse.asp
Claim :
Prayers are requested for the Darkhorse Marine battalion fighting in Afghanistan.
Truthfulness (Verdicts) :
OUTDATED
Snippets (ETS) :
<p> The Marines patrolling through the green fields and tall mud compounds of Helmand province's Sangin district say they are literally in a race for their lives. They are trying to adjust their tactics to outwit Taliban fighters, who have killed more coalition troops here than in any other Afghan district this year.</p> <p>“As a new unit coming in, you are at a distinct disadvantage because the Taliban have been fighting here for years, have established fighting positions and have laid the ground with a ton of IEDs,” said Lt. Col. Jason Morris, commander of the 3rd Battalion, 5th Marine Regiment. “You have to evolve quickly because you have no other choice.”</p> <p>Many of the younger Marines also have had to cope for the first time with seeing their best friends die or suffe

In [54]:
# Print every field of the first FEVER record in full, with a '========' line
# between consecutive fields.
# Each pair is (heading printed verbatim, column label read from the record).
for position, (heading, column) in enumerate([
    ('ID :', 'id'),
    ('verifiable : ', 'verifiable'),
    ('label : ', 'label'),
    ('claim : ', 'claim'),
    ('evidence : ', 'evidence'),
]):
    if position > 0:
        print('========')
    print(heading)
    print(fever_instance[column])

# Note on `evidence`: it is a nested list. For this record the outer list holds
# one group, and each entry in the group has four items (two integers, a page
# name, an integer). Indexing with [0] and then [0][0] walks down the nesting.


ID :
75397
verifiable : 
VERIFIABLE
label : 
SUPPORTS
claim : 
Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
evidence : 
[[[92206, 104971, 'Nikolaj_Coster-Waldau', 7], [92206, 104971, 'Fox_Broadcasting_Company', 0]]]


In [None]:
# Enrich a copy of the FEVER frame with two columns, 'ETS' and 'Documents',
# filled per row from utils.get_all_ETS_and_document(row['evidence']), then
# write the result to CSV.
# NOTE(review): the helper lives in the project's `utils` module; it is assumed
# to return an (ets, document) pair for one `evidence` value - confirm there.
#
# Failure policy (any exception while processing a row):
#   * label == 'NOT ENOUGH INFO' -> the row is kept, both new fields are None
#   * any other label            -> the row is removed from the output
f_ets_ds = fever_ds.copy()
f_ets_ds['ETS'] = None
f_ets_ds['Documents'] = None

# Index labels of rows to discard. Collected here and dropped once after the
# loop, because dropping inside the loop copies the whole frame per failure.
unresolved_rows = []

for i, row in f_ets_ds.iterrows():
    try:
        ets, document = utils.get_all_ETS_and_document(row['evidence'])
        f_ets_ds.at[i, 'ETS'] = ets
        f_ets_ds.at[i, 'Documents'] = document
    except Exception:
        # `Exception` rather than a bare `except:` so that a kernel interrupt
        # (KeyboardInterrupt) or SystemExit still stops this long-running loop.
        if row['label'] == 'NOT ENOUGH INFO':
            # Keep the claim; reset both fields in case only one was written.
            f_ets_ds.at[i, 'ETS'] = None
            f_ets_ds.at[i, 'Documents'] = None
        else:
            unresolved_rows.append(i)

if unresolved_rows:
    # Make the data loss visible instead of dropping rows silently.
    log.warning(
        'Dropping %d of %d FEVER rows whose evidence could not be processed',
        len(unresolved_rows),
        len(f_ets_ds),
    )
    f_ets_ds = f_ets_ds.drop(index=unresolved_rows)

f_ets_ds.to_csv('data/fever_dataset/fever_with_ETS.csv', index=False)

In [None]:
# Second export of the same frame to a different file, this time with the index
# written as the first CSV column (index=True). Since rows may have been dropped
# from f_ets_ds, that column keeps the original row labels.
# NOTE(review): presumably only one of the two CSV files is needed downstream - confirm.
f_ets_ds.to_csv('data/fever_dataset/fever_with_ETS2.csv', index=True)