In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from re import search

Preliminary Analysis

Importing the dataset

In [None]:
# Resolve the dataset folder relative to the notebook's working directory.
cwd = os.getcwd()
dataset_dir = os.path.join(cwd,'Dataset')
# No CSV is read into `df` here: the notebook builds `df` later by
# concatenating the BuzzFeed real/fake frames, so loading
# 'fake_or_real_news_mc.csv' would only add a dependency on a file whose
# contents are never used.

In [None]:
# Load the two BuzzFeed article tables: real stories first, then fake ones.
real_csv_path = os.path.join(dataset_dir, 'BuzzFeed_real_news_content.csv')
fake_csv_path = os.path.join(dataset_dir, 'BuzzFeed_fake_news_content.csv')
df_true = pd.read_csv(real_csv_path)
df_false = pd.read_csv(fake_csv_path)

In [None]:
# Report (rows, columns) for each source table.
print(f"True Dataset shape: {df_true.shape}")
print(f"Fake Dataset shape: {df_false.shape}")

In [None]:
df_true.head()

In [None]:
df_false.head()

In [None]:
# Collect every column whose name starts with 'Unnamed' and remove it from
# the real-news frame.
unnamed_columns = [
    name for name in df_true.columns
    if search(r'^Unnamed', name) is not None
]
df_true = df_true.drop(columns=unnamed_columns)

df_true.head()

In [None]:
# Collect every column whose name starts with 'Unnamed' and remove it from
# the fake-news frame.
unnamed_columns = [
    name for name in df_false.columns
    if search(r'^Unnamed', name) is not None
]
df_false = df_false.drop(columns=unnamed_columns)

df_false.head()

In [None]:
# Keep only the identifier, headline, body and author columns of the
# real-news frame; every other column is dropped.
columns_to_keep = ['id','title','text','authors']
extra_columns = df_true.columns.difference(columns_to_keep)
df_true = df_true.drop(columns=extra_columns)
df_true.shape

In [None]:
# Keep only the identifier, headline, body and author columns of the
# fake-news frame; every other column is dropped.
columns_to_keep = ['id','title','text','authors']
extra_columns = df_false.columns.difference(columns_to_keep)
df_false = df_false.drop(columns=extra_columns)
df_false.shape

In [None]:
# Real articles are tagged with label 0.
df_true = df_true.assign(label=0)
df_true.head()

In [None]:
# Fake articles are tagged with label 1.
df_false = df_false.assign(label=1)
df_false.head()

In [None]:
# Stack the real and fake frames, then shuffle all rows with a fixed seed so
# the row order is reproducible.
df = (
    pd.concat([df_true, df_false], axis=0)
    .sample(frac=1, random_state=1973)
)
df.head()

In [None]:
print("Dataset shape:", df.shape)

In [None]:
# Column names used by the rest of the notebook, held in variables so every
# cell refers to the same two columns.
news = 'text'  # column referred to below as the news article
headline = 'title'  # column referred to below as the headline

Cleaning the data

Checking for missing data

In [None]:
df.info()

In [None]:
df.label.value_counts()

This confirms that label 0 is used for true news and label 1 is used for fake news.

In [None]:
print('Dataset null values:\n',df.isna().sum())

In [None]:
def show_tf_distribution(df, column):
    """
    Print how the rows with a missing value in `column` split between labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `column` and a 'label' column, where 0 marks real
        news and 1 marks fake news.
    column : str
        Name of the column whose null rows are inspected.

    Returns
    -------
    None
        The total number of null rows and the percentage of them that are
        real / fake are printed. When `column` has no null rows the
        percentages are undefined, so a short notice is printed instead of
        dividing by zero.
    """
    # Labels of only those rows where `column` is missing.
    null_labels = df.loc[df[column].isna(), 'label']
    total = len(null_labels)
    notfake_cnt = int((null_labels == 0).sum())
    fake_cnt = int((null_labels == 1).sum())

    print('Null Values distribution for \''+column+'\' on basis for realness')
    print('Total:\t',total)
    if total == 0:
        # Guard: with no null rows there is nothing to split by label.
        print('No null values in \''+column+'\'; nothing to break down.')
        return
    print('Real %:\t',format(((notfake_cnt/total)*100),'.2f'))
    print('Fake %:\t',format(((fake_cnt/total)*100),'.2f'))

In [None]:
show_tf_distribution(df, news)

In [None]:
show_tf_distribution(df, headline)

As a missing value almost always indicates a fake news article, missing information can itself be an identifying factor. Hence rows with null values are not dropped here.

Checking for placeholder values and duplicates

In [None]:
# Count how often each headline occurs and show the ones that occur more
# than once.
headline_value_counts = df[headline].value_counts()
headline_value_counts.loc[headline_value_counts.gt(1)]

In [None]:
# Set of headlines that appear more than once, and every row carrying one
# of those headlines.
duplicate_headline_list = set(
    headline_value_counts.loc[headline_value_counts.gt(1)].index
)
df_dup_headline = df.loc[df[headline].isin(duplicate_headline_list)]
df_dup_headline

In [None]:
df_dup_headline[df_dup_headline.duplicated()]

There are no fully duplicated rows that need to be deleted.

In [None]:
df_dup_headline[df_dup_headline.duplicated([news])]

There are 4 rows with duplicated text

In [None]:
df_dup_headline[df_dup_headline.duplicated([headline, news])]

There are 4 rows with both headline and news duplicated. These need to be removed.

In [None]:
df_dup_headline[df_dup_headline[news] == ' ']

We can leave duplicate headlines, as repeated headlines are common when a story undergoes revision, but rows where both the headline and the news article are the same need to be dropped. Rows without news will also be removed.

Checking news for whitespaces.

In [None]:
df[df[news] == ' ']

Replacing white spaces with null.

In [None]:
# Turn every cell that is an empty string or consists only of whitespace
# into NaN, so the null checks below treat it as missing.
df = df.replace(r'^\s*$', np.nan, regex=True)

In [None]:
df.isna().sum()

This shows that the number of null values in the news column has increased, because articles that contained only whitespace have been replaced with null.

In [None]:
show_tf_distribution(df, news)

The distribution shows that all null news values still point to fake news. As these values are small compared to total dataset size, dropping them might be preferable.

Removing Duplicated Data

In [None]:
len(df)

Dropping rows with no news article or headlines

In [None]:
# Keep only rows that have both a headline and a news body; `df` itself is
# left untouched so the row counts can be compared afterwards.
df_clean = df.dropna(subset=[headline,news])

In [None]:
len(df_clean)

Dropping rows with same headline and news articles

In [None]:
# Remove rows that repeat an earlier (headline, news) pair, keeping the first
# occurrence; ignore_index=True renumbers the result 0..n-1.
df_clean = df_clean.drop_duplicates([headline, news], ignore_index=True)

In [None]:
len(df_clean)

Dropping rows with same news articles

In [None]:
# Remove rows whose news body repeats an earlier row, regardless of headline,
# keeping the first occurrence and renumbering the index.
# NOTE(review): this also covers the rows the (headline, news) de-duplication
# targets, since identical pairs necessarily share the same news body.
df_clean = df_clean.drop_duplicates([news], ignore_index=True)

In [None]:
df_clean.info()

In [None]:
df_clean.isna().sum()

In [None]:
len(df) - len(df_clean)

In [None]:
df_clean.label.value_counts()

Summary

0 rows of data were removed on the basis of missing information in the news column or headline column and/or having duplicate values.

Exploring the dataset

Helper Function

In [None]:
# Folder that every figure produced below is written to.
graph_dir = os.path.join(cwd,'Graphs')
# Create it up front: savefig does not create missing directories, so without
# this the first plot would fail on a checkout that has no 'Graphs' folder.
os.makedirs(graph_dir, exist_ok=True)

In [None]:
def show_hist_for_col(df, column, title):
    """
    Plot overlaid histograms of `column` for label-0 ('True') and label-1
    ('Fake') rows of `df`.

    The figure is titled with `title`, saved as '<title>.png' inside
    `graph_dir`, displayed, and then closed.
    """
    true_values = df.loc[df.label == 0, column]
    fake_values = df.loc[df.label == 1, column]

    fig, ax = plt.subplots(figsize=(12,8))
    true_values.hist(ax=ax, label='True')
    # The second histogram is semi-transparent so the first stays visible.
    fake_values.hist(ax=ax, alpha=0.4, label='Fake')
    ax.set_title(title)
    ax.legend()

    output_path = os.path.join(graph_dir,title+'.png')
    fig.savefig(output_path, bbox_inches='tight')
    plt.show()
    plt.close(fig)

Distribution of Labels

In [None]:
# Histogram of the label column after cleaning, saved under the figure title.
title = 'Fake or True News'
plt.figure(figsize=(12,8))
df_clean['label'].hist()
plt.title(title)
label_hist_path = os.path.join(graph_dir,title+'.png')
plt.savefig(label_hist_path, bbox_inches='tight')
plt.show()
plt.close()

After cleaning the data there is an imbalance present, but hopefully not enough to have an effect on the models.

Length of Headlines

In [None]:
# Number of characters in each headline.
df_clean['headline_len'] = df_clean[headline].str.len()

In [None]:
show_hist_for_col(df_clean, 'headline_len', 'Number of Characters in Headline')

In [None]:
df_clean[df_clean.label == 0].headline_len.describe()

In [None]:
df_clean[df_clean.label == 1].headline_len.describe()

Fake news stories have a wider range of title lengths than true news stories. The differences between quartiles are also much larger for fake news articles than for true news articles.

Length of News article

In [None]:
# Number of characters in each news body.
df_clean['news_len'] = df_clean[news].str.len()

In [None]:
show_hist_for_col(df_clean, 'news_len', 'Length of Text for News')

In [None]:
df_clean[df_clean.label == 0].news_len.describe()

In [None]:
df_clean[df_clean.label == 1].news_len.describe()

In [None]:
# Articles whose body is longer than 10,000 characters (the threshold is
# hard-coded here).
df_long = df_clean[df_clean.news_len > 10000]

In [None]:
df_long

In [None]:
df_long.label.value_counts()

There does not seem to be a bias in long stories towards either type of article, so they will be kept. Fake news articles also show a much higher maximum story size and significantly lower quartile values across the board.

Capital letters in headline.

In [None]:
# Count the uppercase characters in each headline.
df_clean['caps_in_headline'] = df_clean[headline].apply(
    lambda title_text: sum(map(str.isupper, title_text))
)

In [None]:
df_clean

In [None]:
show_hist_for_col(df_clean, 'caps_in_headline', 'Number of Capitals in Headline')

In [None]:
df_clean[df_clean.label == 0].caps_in_headline.describe()

In [None]:
df_clean[df_clean.label == 1].caps_in_headline.describe()

There is much more deviation in the number of capital letters in the headlines of fake news articles. Fake news articles also have many more capital letters in their headlines.

In [None]:
# Share of each headline's characters that are uppercase. This is a 0-1
# fraction (capital count divided by headline length), not scaled to 100.
df_clean['norm_caps_in_headline'] = df_clean['caps_in_headline'] / df_clean['headline_len']

In [None]:
show_hist_for_col(df_clean, 'norm_caps_in_headline', 'Percentage of Capitals in Headline')

In [None]:
df_clean[df_clean.label == 0].norm_caps_in_headline.describe()

In [None]:
df_clean[df_clean.label == 1].norm_caps_in_headline.describe()

The percentage of capitals may be useful, as even the first quartile for fake news is larger than the third quartile for true news articles. However, this might not be generalizable.

Capital in news article

In [None]:
# Count the uppercase characters in each news body.
df_clean['caps_in_news'] = df_clean[news].apply(
    lambda body_text: sum(map(str.isupper, body_text))
)

In [None]:
show_hist_for_col(df_clean, 'caps_in_news', 'Number of Capitals in News')

In [None]:
df_clean[df_clean.label == 0].caps_in_news.describe()

In [None]:
df_clean[df_clean.label == 1].caps_in_news.describe()

In [None]:
# Share of each news body's characters that are uppercase. This is a 0-1
# fraction (capital count divided by body length), not scaled to 100.
df_clean['norm_caps_in_news'] = df_clean['caps_in_news'] / df_clean['news_len']

In [None]:
show_hist_for_col(df_clean, 'norm_caps_in_news', 'Percentage of Capitals in News')

In [None]:
df_clean[df_clean.label == 0].norm_caps_in_news.describe()

In [None]:
df_clean[df_clean.label == 1].norm_caps_in_news.describe()

There is a lot of overlap in number of capitals in news article between fake and real news.

In [None]:
def check_string_for(substring, fullstring):
    """Check if the substring is in the fullstring.

    The comparison is a literal, case-sensitive containment test, so
    characters such as '.', '(' or '?' in `substring` are matched as
    themselves rather than being interpreted as regular-expression syntax.

    Parameters
    ----------
    substring : str
        Text to look for.
    fullstring : str
        Text to look in.

    Returns
    -------
    bool
        True when `substring` occurs anywhere in `fullstring`, else False.
    """
    return substring in fullstring

Via and Image Via in article

In [None]:
# Rows whose news body contains ' via'.
contains_via = df_clean[news].apply(lambda article: check_string_for(' via', article))
df_via = df_clean[contains_via]

In [None]:
df_via

In [None]:
df_via['label'].value_counts()

Via is much more indicative of fake news article compared to true news article.

In [None]:
# Rows whose news body contains 'image via'.
contains_image_via = df_clean[news].apply(lambda article: check_string_for('image via', article))
df_image_via = df_clean[contains_image_via]

In [None]:
df_image_via

In [None]:
df_image_via['label'].value_counts()

With all the posts with image via being Fake, it's highly indicative of that label, but this may be particular to this dataset and may not generalize.

Said in news article

In [None]:
# Rows whose news body contains 'said'.
contains_said = df_clean[news].apply(lambda article: check_string_for('said', article))
df_said = df_clean[contains_said]

In [None]:
df_said

In [None]:
df_said['label'].value_counts()

Stories containing the word "said" are indicative of the news story being true. With twice as many "true" news stories as "fake" ones containing "said", the true ones seem likely to be more concerned with providing quotations, or at least quotations in this style.

On in news article

In [None]:
# Rows whose news body contains ' on ' (with surrounding spaces).
contains_on = df_clean[news].apply(lambda article: check_string_for(' on ', article))
df_on = df_clean[contains_on]

In [None]:
df_on

In [None]:
df_on.label.value_counts()

The use of 'on' is fairly balanced although somewhat indicative of a 'true' story.

You in news article

In [None]:
# Rows whose news body contains ' you ' (with surrounding spaces).
contains_you = df_clean[news].apply(lambda article: check_string_for(' you ', article))
df_you = df_clean[contains_you]

In [None]:
df_you

In [None]:
df_you.label.value_counts()

You is present in both true and fake news stories with a bit more mention in fake news stories which might be due to a more informal writing approach.

In [None]:
# Give the two text columns their final names before saving.
final_column_names = {'title': 'headline', 'text': 'news'}
df_clean = df_clean.rename(columns=final_column_names)
df_clean.head()

Save the Cleaned Dataset

In [None]:
# Write the cleaned frame next to the source data; index=False keeps the
# row index out of the CSV.
df_clean.to_csv(os.path.join(dataset_dir,'train_news_cleaned.csv'),index=False)