## Calculating the average word count for the column in each dataset which represents the main text of a news source e.g. "statement" in the LIAR dataset

## Reading LIAR dataset and calculating the mean word count 

In [1]:
import pandas as pd

# Reading both the training and testing dataset

LIAR_train = pd.read_csv("liar_dataset/train.tsv", sep='\t', header=0)
LIAR_test = pd.read_csv("liar_dataset/test.tsv", sep='\t', header=0)

# Adding the column names to both dataset

LIAR_train.columns =["id", "label", "statement", "subject", "speaker", "job title", "state info", "party affiliation", "barely true counts", "false counts", "half true counts", "mostly true counts", "pants on fire counts", "context"]
LIAR_test.columns =["id", "label", "statement", "subject", "speaker", "job title", "state info", "party affiliation", "barely true counts", "false counts", "half true counts", "mostly true counts", "pants on fire counts", "context"]

# Concatenating the training and test dataset

LIAR_data = pd.concat([LIAR_train, LIAR_test], ignore_index=True, sort=False)

# Dropping rows that do not represent exactly 'true' or 'false' information

LIAR_data = LIAR_data.drop(LIAR_data.query('label == "half-true"').index)
LIAR_data = LIAR_data.drop(LIAR_data.query('label == "barely-true"').index)
LIAR_data = LIAR_data.drop(LIAR_data.query('label == "mostly-true"').index)

#Calculating mean word count

LIAR_data['statement_length'] = LIAR_data['statement'].apply(lambda x: len(x) - x.count(" "))
LIAR_data['statement_length'].mean()

87.127348230176

## Reading BuzzFeedNews dataset and calculating the mean word count 

In [2]:
import xml.etree.ElementTree as et 
import os
import pandas as pd 

rows_list = []
path = "buzzfeed_dataset/"

# Each datapoint in the BuzzFeedNews dataset comes as a seperate xml file
# Therefore I loop through the file directory and read each xml file, adding the contents of mainText 
# (contents of a news source) and veracity (label) to a dataframe

for filename in os.listdir(path):
    if filename.endswith('.xml'):
        fullname = os.path.join(path, filename)
        xtree = et.parse(fullname)
        xroot = xtree.getroot() 
        main_text = xroot.find('mainText').text
        veracity = xroot.find('veracity').text
        new_row = {'mainText': main_text, 'label': veracity}
        rows_list.append(new_row)

# Dropping rows that do not represent exactly 'true' or 'false' information

buzzfeed_data = pd.DataFrame(rows_list)   
buzzfeed_data = buzzfeed_data.drop(buzzfeed_data.query('label == "mixture of true and false"').index)

# Rename rows that represent 'true' and 'false' information to "REAL" and "FAKE" since the model is trained to classify to these

buzzfeed_data['label'] = buzzfeed_data['label'].replace(['mostly true','mostly false', 'no factual content'], ['REAL', 'FAKE', 'FAKE'])

# Drop any rows which have an empty body

buzzfeed_data.dropna(subset = ['mainText'], inplace=True)
buzzfeed_data.dropna(subset = ['mainText'], inplace=True)

#Calculating mean word count

buzzfeed_data['statement_length'] = buzzfeed_data['mainText'].apply(lambda x: len(x) - x.count(" "))
buzzfeed_data['statement_length'].mean()

2924.1756272401435

## Reading McIntire dataset and calculating the mean word count 

In [3]:
#Reading dataset
pd.set_option("max_colwidth", 200)
McIntire_data = pd.read_csv("fake_or_real_news.csv")

#Removing unused column
McIntire_data = McIntire_data.drop(labels = "Unnamed: 0", axis = 1)

#Labelling remaining columns
McIntire_data.columns=["title", "body_text", "label"]

#Calculating mean word count

McIntire_data['statement_length'] = McIntire_data['body_text'].apply(lambda x: len(x) - x.count(" "))
McIntire_data['statement_length'].mean()

3942.6716653512235