In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import logging as log
import utils
import concurrent.futures

# Data Processing
## Fever Dataset

In [13]:
# FEVER's claim-label range is smaller than that of Liar and Snopes, but, like Snopes,
# it provides evidence sources, which makes it better suited to fact checking.
# The training split is a JSON Lines file (one record per line), hence lines=True.
fever_ds = pd.read_json('data/fever_dataset/fever_train.jsonl', lines=True)
fever_ds.head()

Unnamed: 0,id,verifiable,label,claim,evidence
0,75397,VERIFIABLE,SUPPORTS,Nikolaj Coster-Waldau worked with the Fox Broa...,"[[[92206, 104971, Nikolaj_Coster-Waldau, 7], [..."
1,150448,VERIFIABLE,SUPPORTS,Roman Atwood is a content creator.,"[[[174271, 187498, Roman_Atwood, 1]], [[174271..."
2,214861,VERIFIABLE,SUPPORTS,"History of art includes architecture, dance, s...","[[[255136, 254645, History_of_art, 2]]]"
3,156709,VERIFIABLE,REFUTES,Adrienne Bailon is an accountant.,"[[[180804, 193183, Adrienne_Bailon, 0]]]"
4,83235,NOT VERIFIABLE,NOT ENOUGH INFO,System of a Down briefly disbanded in limbo.,"[[[100277, None, None, None]]]"


In [14]:
# Summarise column dtypes, non-null counts and memory usage of the FEVER frame.
fever_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145449 entries, 0 to 145448
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          145449 non-null  int64 
 1   verifiable  145449 non-null  object
 2   label       145449 non-null  object
 3   claim       145449 non-null  object
 4   evidence    145449 non-null  object
dtypes: int64(1), object(4)
memory usage: 5.5+ MB


In [6]:
# Absolute number of FEVER claims per label.
print(fever_ds['label'].value_counts())

label
SUPPORTS           80035
NOT ENOUGH INFO    35639
REFUTES            29775
Name: count, dtype: int64


In [5]:
# Same label distribution, expressed as a fraction of all claims (normalize=True).
print(fever_ds['label'].value_counts(normalize=True))

label
SUPPORTS           0.550262
NOT ENOUGH INFO    0.245027
REFUTES            0.204711
Name: proportion, dtype: float64


In [12]:
# The UKP Snopes corpus provides both a good range of labels and sources.
snopes_ds = pd.read_csv('data/ukp_snopes_corpus/datasets_raw/snopes_corpus_4.csv')
snopes_ds.head()

Unnamed: 0,ID,Snopes URL,Claim,Truthfulness (Verdicts),Snippets (ETS),Source URL,Source,Documents (orginal documents),Match ETS and Documents
0,1,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,<p> The Marines patrolling through the green f...,http://archive.boston.com/news/world/asia/arti...,Marines learn lessons from tragedy in Afghanis...,Marines learn lessons from tragedy in Afghanis...,match
1,2,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,<p> An infantryman from Camp Pendleton's 3rd B...,,,,
2,3,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,<p> We are asking everyone to say a prayer for...,,,,
3,4,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,<p> U.S. Marines who recently inherited this l...,,,,
4,5,http://www.snopes.com/inboxer/prayer/darkhorse...,Prayers are requested for the Darkhorse Marine...,OUTDATED,,https://web.archive.org/web/20081004102646/htt...,3rd Battalion 5th Marines History,"Starting in 1996, Alexa Internet has been dona...",


In [13]:
# Summarise column dtypes, non-null counts and memory usage of the Snopes frame.
snopes_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25778 entries, 0 to 25777
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   ID                             25778 non-null  int64 
 1   Snopes URL                     25778 non-null  object
 2   Claim                          25767 non-null  object
 3   Truthfulness (Verdicts)        25767 non-null  object
 4   Snippets (ETS)                 25778 non-null  object
 5   Source URL                     25778 non-null  object
 6   Source                         25713 non-null  object
 7   Documents (orginal documents)  25778 non-null  object
 8   Match ETS and Documents        25778 non-null  object
dtypes: int64(1), object(8)
memory usage: 1.8+ MB


In [49]:
#print(snopes_ds['Source'][0]) #sources to back the claim, split by ';'
print(snopes_ds['Snippets (ETS)'][0]) # evidence text snippets (ETS) of the first row
print('========')
print(snopes_ds['Truthfulness (Verdicts)'].value_counts()) # rows per verdict class (5 classes)
print('========')
print((snopes_ds['Truthfulness (Verdicts)']=='OUTDATED').sum()) # number of rows whose verdict is OUTDATED

<p> The Marines patrolling through the green fields and tall mud compounds of Helmand province's Sangin district say they are literally in a race for their lives. They are trying to adjust their tactics to outwit Taliban fighters, who have killed more coalition troops here than in any other Afghan district this year.</p> <p>“As a new unit coming in, you are at a distinct disadvantage because the Taliban have been fighting here for years, have established fighting positions and have laid the ground with a ton of IEDs,” said Lt. Col. Jason Morris, commander of the 3rd Battalion, 5th Marine Regiment. “You have to evolve quickly because you have no other choice.”</p> <p>Many of the younger Marines also have had to cope for the first time with seeing their best friends die or suffer grievous wounds. Fifteen Marines have been killed and about 50 wounded since the battalion arrived in October [2010] - many by improvised explosive devices or IEDs. </p>
Truthfulness (Verdicts)
FALSE            

### Random tests to understand the data

In [7]:
# Take the first row of each dataset as a sample record to inspect field by field.
# NOTE: fever_ds and snopes_ds must already be loaded by the cells above; running this
# cell before the Snopes loading cell fails with
# "NameError: name 'snopes_ds' is not defined".
fever_instance = fever_ds.iloc[0]
snopes_instance = snopes_ds.iloc[0]

print('Fever instance:')
print(fever_instance)
print('========')
print('Snopes instance:')
print(snopes_instance)

NameError: name 'snopes_ds' is not defined

In [53]:
# Print every field of the sample Snopes record: a heading line, then the value,
# with a '========' rule between consecutive fields.
# Each pair is (heading text to print, column label to read). The last field is
# read by its column label rather than with snopes_instance[-1]: integer keys on a
# label-indexed Series are treated as positions only through a deprecated fallback.
snopes_fields = [
    ('ID :', 'ID'),
    ('Snopes URL :', 'Snopes URL'),
    ('Claim :', 'Claim'),
    ('Truthfulness (Verdicts) :', 'Truthfulness (Verdicts)'),
    ('Snippets (ETS) :', 'Snippets (ETS)'),
    ('Source URL:', 'Source URL'),
    ('Source :', 'Source'),
    ('Documents (orginal documents) :', 'Documents (orginal documents)'),
    ('Match ETS and Documents', 'Match ETS and Documents'),
]
for position, (heading, column) in enumerate(snopes_fields):
    if position:
        # Separator goes between fields only, never before the first one.
        print('========')
    print(heading)
    print(snopes_instance[column])

ID :
1
Snopes URL :
http://www.snopes.com/inboxer/prayer/darkhorse.asp
Claim :
Prayers are requested for the Darkhorse Marine battalion fighting in Afghanistan.
Truthfulness (Verdicts) :
OUTDATED
Snippets (ETS) :
<p> The Marines patrolling through the green fields and tall mud compounds of Helmand province's Sangin district say they are literally in a race for their lives. They are trying to adjust their tactics to outwit Taliban fighters, who have killed more coalition troops here than in any other Afghan district this year.</p> <p>“As a new unit coming in, you are at a distinct disadvantage because the Taliban have been fighting here for years, have established fighting positions and have laid the ground with a ton of IEDs,” said Lt. Col. Jason Morris, commander of the 3rd Battalion, 5th Marine Regiment. “You have to evolve quickly because you have no other choice.”</p> <p>Many of the younger Marines also have had to cope for the first time with seeing their best friends die or suffe

In [8]:
# Print every field of the sample FEVER record: a heading line, then the value,
# with a '========' rule between consecutive fields. A single print call with a
# newline separator emits exactly one item per line.
print(
    'ID :',
    fever_instance['id'],
    '========',
    'verifiable : ',
    fever_instance['verifiable'],
    '========',
    'label : ',
    fever_instance['label'],
    '========',
    'claim : ',
    fever_instance['claim'],
    '========',
    'evidence : ',
    fever_instance['evidence'],
    sep='\n',
)


#f_evidence = fever_instance['evidence']
#print(f_evidence)
#print(f_evidence[0])
#print(f_evidence[0][0])


ID :
75397
verifiable : 
VERIFIABLE
label : 
SUPPORTS
claim : 
Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.
evidence : 
[[[92206, 104971, 'Nikolaj_Coster-Waldau', 7], [92206, 104971, 'Fox_Broadcasting_Company', 0]]]


# Retrieving data for FEVER

In [14]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import logging as log
import utils
import concurrent.futures

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Load the FEVER training split (JSON Lines file) for the retrieval step.
fever_ds = pd.read_json('data/fever_dataset/fever_train.jsonl', lines=True)

In [16]:
# Work on a small sample first: the first 10 claims, minus those flagged NOT VERIFIABLE.
subset = fever_ds.head(10)
# .copy() makes `subset` an independent frame, so the per-row writes done later with
# .at cannot be mistaken for writes into a slice of fever_ds (SettingWithCopyWarning).
subset = subset.loc[subset['verifiable'] != 'NOT VERIFIABLE'].copy()
print(subset.shape)
# Page cache handed to utils.get_all_ETS_and_document: one row per page, with the
# page title in 'page' and its text in 'data'.
articles = pd.DataFrame(columns=['page', 'data'])


(7, 5)


In [17]:
# Attach the evidence text snippets (ETS) and the source document to every claim
# in `subset`. Rows whose retrieval fails are kept with empty values when the label
# is 'NOT ENOUGH INFO' and removed otherwise.
failed_rows = []  # index labels of rows to remove once the loop is done
for i, row in subset.iterrows():
    try:
        ets, document = utils.get_all_ETS_and_document(row['evidence'], articles)
        print(">>> OUTPUT ETS: ", ets)
        subset.at[i, 'ETS'] = ets
        subset.at[i, 'Documents'] = document
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so a long-running retrieval can still be interrupted.
        if row['label'] == 'NOT ENOUGH INFO':
            subset.at[i, 'ETS'] = None
            subset.at[i, 'Documents'] = None
        else:
            log.warning("Dropping row %s: ETS/document retrieval failed: %r", i, exc)
            failed_rows.append(i)

# Drop all failed rows in one go instead of re-binding `subset` while iterating it.
subset = subset.drop(index=failed_rows)

>>> Starting process for: [92206, 104971, 'Nikolaj_Coster-Waldau', 7]
>>> Getting data for: Nikolaj Coster-Waldau (Original name: Nikolaj_Coster-Waldau)
>>> TEST for data length: 10125 characters
>>> Saving data for: Nikolaj Coster-Waldau
>>> Getting ETS for: Nikolaj_Coster-Waldau , sentence id: 7
6
7
>>> OUTPUT ETS:   He is a UNDP Goodwill Ambassador, drawing public attention to issues such as gender equality and climate change
>>> Starting process for: [174271, 187498, 'Roman_Atwood', 1]
>>> Getting data for: Roman Atwood (Original name: Roman_Atwood)
>>> TEST for data length: 8206 characters
>>> Saving data for: Roman Atwood
>>> Getting ETS for: Roman_Atwood , sentence id: 1
6
7
>>> Starting process for: [174271, 187499, 'Roman_Atwood', 3]
>>> Already got data for: Roman_Atwood
>>> Getting ETS for: Roman_Atwood
>>> OUTPUT ETS:   He is best known for his vlogs, where he posts updates about his life.  He also has another YouTube channel called "RomanAtwood", where he used to post pran

In [19]:
# Show the page cache after the retrieval loop: one row per fetched page.
print(articles)

                               page  \
0             Nikolaj Coster-Waldau   
1                      Roman Atwood   
2                    History of art   
3                   Adrienne Bailon   
4              Homeland (TV series)   
5                    Boston Celtics   
6  The Ten Commandments (1956 film)   

                                                data  
0  Nikolaj William Coster-Waldau (Danish pronunci...  
1  Roman Bernard Atwood (born May 28, 1983) is an...  
2  The history of art focuses on objects made by ...  
3  Adrienne Eliza Bailon-Houghton (née Bailon ();...  
4  Homeland (stylized as HOMƎLAND) is an American...  
5  The Boston Celtics ( SEL-tiks) are an American...  
6  The Ten Commandments is a 1956 American epic r...  


In [20]:
# Persist the processed subset; index=False leaves the row index out of the CSV.
subset.to_csv('subset_data.csv', index=False)

In [47]:
# Read the CSV back in to check what was written to disk.
test = pd.read_csv('subset_data.csv')
print(test)

       id  verifiable     label  \
0   75397  VERIFIABLE  SUPPORTS   
1  150448  VERIFIABLE  SUPPORTS   
2  214861  VERIFIABLE  SUPPORTS   
3  156709  VERIFIABLE   REFUTES   
4  129629  VERIFIABLE  SUPPORTS   
5   33078  VERIFIABLE  SUPPORTS   
6    6744  VERIFIABLE  SUPPORTS   

                                               claim  \
0  Nikolaj Coster-Waldau worked with the Fox Broa...   
1                 Roman Atwood is a content creator.   
2  History of art includes architecture, dance, s...   
3                  Adrienne Bailon is an accountant.   
4  Homeland is an American television spy thrille...   
5  The Boston Celtics play their home games at TD...   
6              The Ten Commandments is an epic film.   

                                            evidence  
0  [[[92206, 104971, 'Nikolaj_Coster-Waldau', 7],...  
1  [[[174271, 187498, 'Roman_Atwood', 1]], [[1742...  
2          [[[255136, 254645, 'History_of_art', 2]]]  
3         [[[180804, 193183, 'Adrienne_Bailon', 0]

In [None]:
import concurrent.futures
import pandas as pd

# Replace this function with your actual processing logic
def process_row(row, articles):
    """Fetch the evidence text snippets (ETS) and source document for one claim.

    Args:
        row: A FEVER record that supports ``row['evidence']`` and ``row['label']``
            (a pandas Series when called from ``parallel_processing``).
        articles: DataFrame with 'page' and 'data' columns, passed straight
            through to ``utils.get_all_ETS_and_document``.

    Returns:
        The ``(ets, document)`` pair produced by the helper, or ``(None, None)``
        when the helper raises.
    """
    try:
        ets, document = utils.get_all_ETS_and_document(row['evidence'], articles)
        print(">>> OUTPUT ETS: ", ets)
        return ets, document
    except Exception as exc:
        # Every failure yields the same empty result, but it is no longer silent.
        # 'NOT ENOUGH INFO' failures are logged at DEBUG (presumably expected, as
        # such claims may carry no evidence page -- TODO confirm); any other label
        # is logged at WARNING so lost rows are visible.
        level = log.DEBUG if row['label'] == 'NOT ENOUGH INFO' else log.WARNING
        log.log(level, "ETS/document retrieval failed (label=%s): %r", row['label'], exc)
        return None, None

def parallel_processing(chunk):
    """Run ``process_row`` over every row of one DataFrame chunk.

    A fresh, empty page cache (columns 'page' and 'data') is created for the
    chunk and handed to each ``process_row`` call.

    Returns:
        A tuple ``(results, page_cache)`` where ``results`` is a list of
        ``(index_label, ets, document)`` triples in row order and ``page_cache``
        is the chunk-local cache DataFrame.
    """
    page_cache = pd.DataFrame(columns=['page', 'data'])

    def _handle(labelled_row):
        # Turn one (index label, row) pair from iterrows() into a result triple.
        row_label, record = labelled_row
        snippet, source_doc = process_row(record, page_cache)
        return (row_label, snippet, source_doc)

    return list(map(_handle, chunk.iterrows())), page_cache

# Split the FEVER frame into fixed-size chunks; each chunk is handled by one call
# to parallel_processing in a worker process.
chunk_size = 1000
df_chunks = [fever_ds.iloc[i:i + chunk_size] for i in range(0, len(fever_ds), chunk_size)]

# Process each chunk in parallel.
# NOTE(review): worker processes must be able to import/unpickle parallel_processing;
# functions defined in a notebook cell may not be available to workers under the
# 'spawn' start method -- confirm on the target platform or move them into a module.
with concurrent.futures.ProcessPoolExecutor() as executor:
    futures = [executor.submit(parallel_processing, chunk) for chunk in df_chunks]

    # Collect results in submission order (blocks until every chunk is done).
    results_list = [future.result() for future in futures]

# Create the result columns up front as object columns so their dtype does not
# depend on whether the first value written happens to be None or a string.
fever_ds['ETS'] = None
fever_ds['Documents'] = None

# Write the per-row results back and gather each chunk's local page cache.
article_frames = [articles]
for results, articles_local in results_list:
    for i, ets, document in results:
        fever_ds.at[i, 'ETS'] = ets
        fever_ds.at[i, 'Documents'] = document
    article_frames.append(articles_local)

# Merge all chunk-local caches into the main `articles` frame with one concat
# (instead of one concat per chunk), keeping the first occurrence of each page.
articles = pd.concat(article_frames).drop_duplicates(subset='page').reset_index(drop=True)


In [None]:
# Sequential (single-process) run over the whole FEVER frame: attach the evidence
# text snippets (ETS) and source document to every claim, then save the result.
f_ets_ds = fever_ds.copy()
f_ets_ds['ETS'] = None
f_ets_ds['Documents'] = None

failed_rows = []  # index labels of rows to remove once the loop is done
for i, row in f_ets_ds.iterrows():
    try:
        # Pass the shared `articles` page cache, as every other call site in this
        # notebook does; the one-argument call was inconsistent with them, and any
        # error it raised would have been hidden by the former bare `except:`.
        ets, document = utils.get_all_ETS_and_document(row['evidence'], articles)
        f_ets_ds.at[i, 'ETS'] = ets
        f_ets_ds.at[i, 'Documents'] = document
    except Exception as exc:
        # `except Exception` lets KeyboardInterrupt/SystemExit through, so this
        # long loop can still be interrupted.
        if row['label'] == 'NOT ENOUGH INFO':
            f_ets_ds.at[i, 'ETS'] = None
            f_ets_ds.at[i, 'Documents'] = None
        else:
            log.warning("Dropping row %s: ETS/document retrieval failed: %r", i, exc)
            failed_rows.append(i)

# Remove all failed rows with a single drop; dropping inside the loop copied the
# whole frame once per failed row.
f_ets_ds = f_ets_ds.drop(index=failed_rows)

f_ets_ds.to_csv('data/fever_dataset/fever_with_ETS.csv', index=True)