# LIAR Dataset
#### 12k+ entries of authentic, real-world short statements from various contexts with diverse speakers and topics
https://paperswithcode.com/dataset/liar

#### Necessary imports

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly
plotly.offline.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

## 1. Loading the training, testing and validation datasets and merging into one

In [3]:
# Read the three LIAR splits. The files are read without a header row (header=None),
# so the column names are assigned explicitly after merging.
# NOTE(review): train/test are parsed as tab-separated, while valid.csv is parsed with the
# default comma delimiter — confirm that valid.csv really is comma-separated.
LIAR_train_df = pd.read_csv('Initial_datasets/train.csv', delimiter='\t', header=None, 
                            index_col=False, low_memory=False)
LIAR_test_df = pd.read_csv('Initial_datasets/test.csv', delimiter='\t', header=None, 
                           index_col=False, low_memory=False)
LIAR_valid_df = pd.read_csv('Initial_datasets/valid.csv', header=None, index_col=False, low_memory=False)

# Stack the splits into one frame with a fresh 0..n-1 index.
LIAR_df = pd.concat([LIAR_train_df, LIAR_test_df, LIAR_valid_df], ignore_index=True)

# Name the 14 columns. Direct assignment is used because
# `DataFrame.set_axis(..., inplace=True)` was removed in pandas 2.0.
LIAR_df.columns = ['json', 'claim_veracity', 'claim', 'topics', 'speaker', 'job', 'state', 
                   'political_party', 'credit_history_1', 'credit_history_2', 'credit_history_3', 
                   'credit_history_4', 'credit_history_5', 'context']
print("size od df: ", LIAR_df.shape)
LIAR_df.head(2)

size od df:  (12791, 14)


Unnamed: 0,json,claim_veracity,claim,topics,speaker,job,state,political_party,credit_history_1,credit_history_2,credit_history_3,credit_history_4,credit_history_5,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.


## 2. Dropping useless columns
Dropping the statement id, the speaker metadata (job, state, political party) and the credit history counts, as they are very specific to this dataset and won't be useful in further analysis

In [4]:
# Columns removed from the merged frame: statement id, speaker metadata and
# the five credit-history counters.
unused_columns = [
    'json', 'political_party', 'job', 'state',
    'credit_history_1', 'credit_history_2', 'credit_history_3',
    'credit_history_4', 'credit_history_5',
]
LIAR_df.drop(columns=unused_columns, inplace=True)
LIAR_df.head(5)

Unnamed: 0,claim_veracity,claim,topics,speaker,context
0,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,a mailer
1,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,a floor speech.
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,Denver
3,false,Health care reform legislation is likely to ma...,health-care,blog-posting,a news release
4,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,an interview on CNN


## 3. Column formatting

### 3.1 Topics
Changing topics to be an array of topics not a string

In [5]:
# Turn the comma-separated topic string of every row into a list of topic names.
topic_strings = LIAR_df['topics']
LIAR_df['topics'] = topic_strings.str.split(',')

### 3.2 Speaker
ex: from hilary-clinton to Hilary Clinton

In [6]:
# Replace the hyphens of the speaker slug with spaces, then title-case every word.
speaker_with_spaces = LIAR_df['speaker'].str.replace("-", " ", regex=False)
LIAR_df['speaker'] = speaker_with_spaces.str.title()

### 3.3 Claims
- There are many claims in this dataset that paraphrase what speaker said ex: "Says John McCain has done nothing to help the vets."

- Because I am more focused on the actual claim rather than who said it, whenever I can I want to eliminate the speaker ex: "John McCain has done nothing to help the vets."

- Sometimes however if the claim is personal I add the speaker before the claim to form a sentence such as: "Donald Trump says John McCain has done nothing to help the vets."

In [9]:
# Claims that open with "Says" and contain a third-person pronoun (surrounded by spaces)
# get the speaker's name prepended instead of having the "Says" prefix removed.
personal_list = (' his ', ' him ', ' he ', ' hes ', ' she ', ' shes ', ' her ', ' hers ')
pattern = '|'.join(personal_list)
is_personal = LIAR_df['claim'].str.startswith('Says') & LIAR_df['claim'].str.contains(pattern)
LIAR_df.loc[is_personal, 'claim'] = LIAR_df['speaker'] + " " + LIAR_df['claim']

# Strip the reporting prefix from the remaining claims; longer prefixes are tried first.
# - 'Says that ' includes the trailing space, so claims such as "Says that's ..." or
#   "Says thatX ..." no longer lose their first real character (they fall through to 'Says ').
# - "Says the ..." is covered by the generic 'Says ' rule, which keeps the article "the".
# NOTE(review): 'Says a ' also removes the article "a" (unlike "the") — confirm this is intended.
for prefix in ('Says that ', 'Says a ', 'Says '):
    has_prefix = LIAR_df['claim'].str.startswith(prefix)
    LIAR_df.loc[has_prefix, 'claim'] = LIAR_df['claim'].str[len(prefix):]

# Deleting claims that start with "On " and don't contain ':' -> They usually don't have any claim
# Example claim: 'On mandating health care coverage' - It doesn't tell us anything
starts_with_on = LIAR_df['claim'].str.lower().str.startswith('on ')
has_colon = LIAR_df['claim'].str.contains(':', regex=False)
LIAR_df.drop(LIAR_df.index[starts_with_on & ~has_colon], inplace=True)

# Adding the speaker to the remaining claims that start with 'On ' (the leading "O" is lower-cased)
keeps_on = LIAR_df['claim'].str.startswith('On ')
LIAR_df.loc[keeps_on, 'claim'] = LIAR_df['speaker'] + " o" + LIAR_df['claim'].str[1:]

# Capitalise the first character of every claim, leaving the rest untouched
LIAR_df['claim'] = LIAR_df['claim'].str[0].str.capitalize() + LIAR_df['claim'].str[1:]

In [11]:
# Preview claims that contain "on" (case-insensitive) together with a colon.
mentions_on = LIAR_df['claim'].str.lower().str.contains('on')
has_colon = LIAR_df['claim'].str.contains(':')
LIAR_df[mentions_on & has_colon].head(5)

Unnamed: 0,claim_veracity,claim,topics,speaker,context
149,barely-true,"Donald Trump on the VA: Over 300,000 veterans ...","[health-care, veterans]",Donald Trump,a speech.
188,mostly-true,It is a commitment voters take very seriously:...,"[elections, taxes]",Americans Tax Reform,a news release
267,half-true,Hypocrisy at the Clinton Foundation: Top male ...,"[candidates-biography, women, workers]",Donald Trump,an Instagram post
357,true,You know we can't just pull out now... The tru...,[iraq],Joe Biden,"CNN/YouTube debate in Charleston, S.C."
375,pants-fire,"Sheila Jackson Lee of Texas said: Hey, all you...","[candidates-biography, diversity, campaign-adv...",Facebook Posts,a meme supposedly quoting Sheila Jackson Lee


### 3.4 claim_veracity
- TRUE: true, mostly-true
- FALSE: false, pants-fire, barely-true
- half-true data entries will go through google search verifier

In [None]:
# Histogram of the original labels, with the x-axis categories listed in this fixed order.
veracity_order = ['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true']
fig = px.histogram(LIAR_df, x='claim_veracity')
fig.update_xaxes(categoryarray=veracity_order)
fig.update_layout(bargap=0.2)
fig.show()

In [13]:
# Collapse the labels into booleans; any label outside the two groups
# (e.g. 'half-true') is kept unchanged through `default`.
veracity = LIAR_df['claim_veracity']
conditions = [veracity.isin(['true', 'mostly-true', 'TRUE']),
              veracity.isin(['pants-fire', 'false', 'barely-true', 'FALSE'])]
choices = [True, False]
LIAR_df['claim_veracity'] = np.select(conditions, choices, default=veracity)

Half-true claims are the grey area of this dataset: they are the hardest to categorise and require the most time to investigate. Half-true is also the biggest category, so adding all of its entries to either "false" or "true" would shift the balance quite dramatically.

In [None]:
# After the mapping cell the column holds True / False / 'half-true', so the six original
# labels can no longer be used to order the axis. Plot string labels with a matching order.
veracity_labels = LIAR_df['claim_veracity'].astype(str)
fig = px.histogram(LIAR_df.assign(claim_veracity=veracity_labels), x='claim_veracity')
fig.update_xaxes(categoryorder='array', categoryarray=['False', 'half-true', 'True'])
fig.update_layout(bargap=0.2)
fig.show()

## 4. Inspecting characteristics

### 4.1 Topics distribution

In [None]:
# Replacing NaN with empty arrays
isna = LIAR_df['topics'].isna()
LIAR_df.loc[isna, 'topics'] = pd.Series([[]] * isna.sum()).values

topics_occurances = LIAR_df.topics.sum()
topics_dict = {i:topics_occurances.count(i) for i in set(topics_occurances)}

fig = px.pie(LIAR_df, values=list(topics_dict.values()), names=list(topics_dict.keys()))
fig.update_traces(textposition='inside', textinfo='value')
fig.show()

We can clearly see that this dataset is diverse and doesn't focus on one specific topic

### 4.2 Context

#### Context modification
The original contexts are very specific, so this step maps each of them to a more general category

In [14]:
# Number of distinct raw contexts (a missing context counts as one value)
print(LIAR_df['context'].nunique(dropna=False)) #Over 5k possible contexts...

LIAR_df['context'] = LIAR_df['context'].fillna('None')

# Each entry is [keywords, label]: a context containing any of the keywords
# (case-insensitive substring match) is replaced by the label.
# NOTE(review): matching is by substring, so short keywords such as 'ad' or 'tv' also match
# inside longer words — consider word boundaries if precision matters.
speech_context = [['speech', 'senate', 'state', 'house', 'floor', 'meeting', 
                   'answer', 'commentary', 'presentation', 'hearing', 'opinion', 
                   'talk', 'appearance', 'interview', 'debate', 'radio', 'rally', 
                   'remark', 'conference', 'discussion', 'comment', 'town', 'response'], 'speech']
add_context = [['ad', 'mailer', 'flier', 'billboard', 'commercial', 'flyer', 'brochure', 'campaign'], 'ad']
social_media_context = [['twitter', 'tweet', 'facebook', 'post', 'social media', 
                     'media', 'web', 'forum', 'blog', 'internet', 'video', 'message'], 'social media']
# 'editorial' was previously misspelled ('editiorial') and therefore never matched.
news_context = [['press', 'news', 'website', 'cnn', 'tv', 'release', 'article', 'segment', 
             'abc', 'nbc', 'television', 'fox', 'msnbc', 'editorial', 'column', 'op-ed'], 'news']
writing_context = [['book', 'email', 'e-mail', 'letter', 'report', 'autobiography', 'petition', 'survey', 'statement'], 'writing']

# Changing context given keywords. The groups are applied in this order, so a context that
# matches several groups ends up with the label of the first group it matches.
contexts_array = [speech_context, add_context, social_media_context, news_context, writing_context]
for context in contexts_array:
    context_regex = "|".join(context[0])
    LIAR_df.loc[LIAR_df['context'].str.lower().str.contains(context_regex), 'context'] = context[1]


# Other contexts are treated as undefined (they are very vague).
# Exact membership is used: a substring test would leave raw contexts that merely contain
# a label word (e.g. "writing", which is not a keyword) unmapped.
possible_contexts = ['speech', 'ad', 'social media', 'news', 'writing']
LIAR_df.loc[~LIAR_df['context'].isin(possible_contexts), 'context'] = 'Undefined'

5047


#### New Context Distribution

In [None]:
# Pie chart of the generalised contexts; each slice is labelled with its name and share.
# NOTE(review): `text` is set to the value counts but `textinfo` does not include 'text' —
# confirm whether the counts were meant to be displayed.
context_counts = LIAR_df['context'].value_counts()
fig = px.pie(
    LIAR_df,
    names='context',
    color='context',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.update_traces(text=context_counts, textinfo='label+percent')
fig.show()

## 5. Examining Google-verified Half-True entries
Half-true entries were fed into google_scraper, where the average Levenshtein distance between the claim and the Google query results was calculated. This allows a more in-depth analysis of whether a half-true claim should be considered true or false

In [15]:
# Keep every row whose label is not 'half-true'.
is_half_true = LIAR_df['claim_veracity'].eq('half-true')
LIAR_df = LIAR_df[~is_half_true]

### 5.1. Loading the dataset

In [17]:
# Display settings: no limit on rows/columns, up to 200 characters per cell.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)

# The first CSV column is used as the index.
LIAR_score = pd.read_csv('../../Google_Scraper/LIAR_verified/LIAR_half_true_score.csv', 
                         index_col=0, low_memory=False)

# Delete the entries whose leven__score is zero or negative
non_positive_score = LIAR_score['leven__score'] <= 0
LIAR_score = LIAR_score.drop(index=LIAR_score.index[non_positive_score])

In [18]:
# Deleting entries that were omitted before
LIAR_score.drop(LIAR_score.loc[(LIAR_score['claim'].str.lower().str.startswith('on')) & 
                               (LIAR_score['claim'].str.contains(':') == False)].index, inplace=True)

### 5.2. Adjusting levenshtein distance score
Adjusting leven_score to account for the claim character count

In [19]:
# Character count of each claim, and the score divided by that count.
claim_lengths = LIAR_score['claim'].str.len()
LIAR_score['claim_count'] = claim_lengths
LIAR_score['adjusted_leven'] = LIAR_score['leven__score'].div(claim_lengths)

### 5.3. Splitting into groups based on sentence length
From each group, the 50% of half-true entries with the best (smallest) adjusted_leven will be considered true, and the remaining 50% will be considered false

In [20]:
# Cut the claim lengths into 4 equal-width intervals and show the resulting ranges.
length_intervals = pd.cut(LIAR_score['claim_count'], 4)
print("4 different ranges of characters length that split the dataset into 4 groups: \n\n", 
      length_intervals.unique())

# Same binning, stored with the integer labels 0..3 instead of the intervals.
LIAR_score['char_category'] = pd.cut(LIAR_score['claim_count'], bins=4, labels=[0, 1, 2, 3])
LIAR_score.sort_values(by='adjusted_leven').head(5)

4 different ranges of characters length that split the dataset into 4 groups: 

 [(94.75, 170.5], (18.697, 94.75], (170.5, 246.25], (246.25, 322.0]]
Categories (4, interval[float64]): [(18.697, 94.75] < (94.75, 170.5] < (170.5, 246.25] < (246.25, 322.0]]


Unnamed: 0,claim_veracity,claim,leven__score,claim_count,adjusted_leven,char_category
1805,half-true,Harvard Study Finds States With Most Gun Laws Have Fewest Gun Deaths.,37.222222,69,0.539452,0
1436,half-true,"People (are) paying more in taxes than they will for food, housing and clothing combined.",48.272727,89,0.54239,0
988,half-true,"Americans will spend more on taxes in 2015 than on food, clothing and housing combined.",47.333333,87,0.544061,0
812,half-true,Undocumented immigrantspay $12 billion a year into Social Security.,37.4,67,0.558209,0
2029,half-true,"Americans will spend more on taxes in 2014 than they will on food, clothing and housing combined.",56.545455,97,0.582943,1


### 5.4. Taking the best/worst 50% of entries from each group to be true/false

In [21]:
# For every length category, sort by adjusted_leven (ascending) and label the first half
# True and the rest False. `.assign` returns new frames, so no column is written into a
# slice of another frame (which would trigger SettingWithCopyWarning).
dfs = []

for category in [0, 1, 2, 3]:
    category_df = LIAR_score[LIAR_score['char_category'] == category].sort_values('adjusted_leven')
    half_index = int(len(category_df) * 0.5)

    true_df = category_df.iloc[:half_index].assign(claim_veracity=True)
    dfs.append(true_df)

    false_df = category_df.iloc[half_index:].assign(claim_veracity=False)
    dfs.append(false_df)

# Re-labelled half-true entries first, followed by the rest of the dataset.
half_true_df = pd.concat(dfs)
LIAR_Final = pd.concat([half_true_df, LIAR_df])

In [22]:
# Preview the first two rows of the combined frame.
LIAR_Final.head(2)

Unnamed: 0,claim_veracity,claim,leven__score,claim_count,adjusted_leven,char_category,topics,speaker,context
1805,True,Harvard Study Finds States With Most Gun Laws Have Fewest Gun Deaths.,37.222222,69.0,0.539452,0,,,
1436,True,"People (are) paying more in taxes than they will for food, housing and clothing combined.",48.272727,89.0,0.54239,0,,,


## 6. Saving modified dataset

In [None]:
# Dropping unncessery columns, renamining and rearranging remaining one
LIAR_Final = LIAR_Final.sample(frac=1).reset_index(drop=True).drop(['leven__score', 'claim_count', 'adjusted_leven', 'char_category', 'topics', 'speaker', 'context'], axis=1)
LIAR_Final = LIAR_Final[['claim', 'claim_veracity']]
LIAR_Final['claim_veracity'] = LIAR_Final['claim_veracity'].astype(int)

LIAR_Final.to_csv('LIAR_Final.csv')

## 