In [1]:
import sys

In [2]:
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install regex



In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from re import search

Preliminary Analysis

Importing the dataset

In [4]:
# Locate the training CSV relative to the directory the notebook runs in,
# then load it into a DataFrame.
cwd = os.getcwd()
dataset_dir = os.path.join(cwd, 'Dataset')
train_csv_path = os.path.join(dataset_dir, 'train_news.csv')
df = pd.read_csv(train_csv_path)

Problem Description
The authenticity of information has become a longstanding issue affecting businesses and society, both for printed and digital media. On social networks, the reach and effects of information spread occur at such a fast pace and are so amplified that distorted, inaccurate, or false information acquires a tremendous potential to cause real-world impacts, within minutes, for millions of users. Recently, several public concerns about this problem and some approaches to mitigate it have been expressed.

Data Description:
There are 6 columns in the dataset provided to you. The description of each column is given below: "id": unique id of each news article; "headline": the title of the news article; "news": the full text of the news article; "Unnamed: 0": a serial number; "written_by": the author of the news article; "label": whether the news is fake (1) or not fake (0).

In [5]:
print("Dataset shape:", df.shape)

Dataset shape: (20800, 6)


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,headline,written_by,news,label
0,0,9653,Ethics Questions Dogged Agriculture Nominee as...,Eric Lipton and Steve Eder,"WASHINGTON — In Sonny Perdue’s telling, Geo...",0
1,1,10041,U.S. Must Dig Deep to Stop Argentina’s Lionel ...,David Waldstein,HOUSTON — Venezuela had a plan. It was a ta...,0
2,2,19113,Cotton to House: ’Do Not Walk the Plank and Vo...,Pam Key,"Sunday on ABC’s “This Week,” while discussing ...",0
3,3,6868,"Paul LePage, Besieged Maine Governor, Sends Co...",Jess Bidgood,"AUGUSTA, Me. — The beleaguered Republican g...",0
4,4,7596,A Digital 9/11 If Trump Wins,Finian Cunningham,Finian Cunningham has written extensively on...,1


In [7]:
# Collect the serial-number columns whose names start with "Unnamed" and
# remove them; they carry no information about the articles.
unnamed_columns = []
for col in df.columns:
    if search(r'^Unnamed', col):
        unnamed_columns.append(col)

df = df.drop(columns=unnamed_columns)

df.head()

Unnamed: 0,id,headline,written_by,news,label
0,9653,Ethics Questions Dogged Agriculture Nominee as...,Eric Lipton and Steve Eder,"WASHINGTON — In Sonny Perdue’s telling, Geo...",0
1,10041,U.S. Must Dig Deep to Stop Argentina’s Lionel ...,David Waldstein,HOUSTON — Venezuela had a plan. It was a ta...,0
2,19113,Cotton to House: ’Do Not Walk the Plank and Vo...,Pam Key,"Sunday on ABC’s “This Week,” while discussing ...",0
3,6868,"Paul LePage, Besieged Maine Governor, Sends Co...",Jess Bidgood,"AUGUSTA, Me. — The beleaguered Republican g...",0
4,7596,A Digital 9/11 If Trump Wins,Finian Cunningham,Finian Cunningham has written extensively on...,1


In [8]:
print("Dataset shape:", df.shape)

Dataset shape: (20800, 5)


Cleaning the data

Checking for missing data

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          20800 non-null  int64 
 1   headline    20242 non-null  object
 2   written_by  18843 non-null  object
 3   news        20761 non-null  object
 4   label       20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [10]:
print('Dataset null values:\n',df.isna().sum())

Dataset null values:
 id               0
headline       558
written_by    1957
news            39
label            0
dtype: int64


In [11]:
def show_tf_distribution(column, data=None):
    """
    Print how the rows with a missing value in `column` split between labels.

    Parameters
    ----------
    column : str
        Name of the column whose null rows are inspected.
    data : pandas.DataFrame, optional
        Frame to inspect; must contain `column` and a 'label' column.
        Defaults to the notebook-level `df` so existing calls keep working.

    Prints the number of null rows and the percentage of them labelled
    0 (real) and 1 (fake). When the column has no null rows the percentages
    are undefined, so only the total is reported instead of dividing by zero.
    """
    frame = df if data is None else data

    # Labels of the rows where `column` is missing.
    null_labels = frame.loc[frame[column].isna(), 'label']
    total = len(null_labels)
    notfake_cnt = int((null_labels == 0).sum())
    fake_cnt = int((null_labels == 1).sum())

    print('Null Values distribution for \''+column+'\' on basis for realness')
    print('Total:\t',total)
    if total == 0:
        # Guard: a column without nulls would otherwise raise ZeroDivisionError.
        print('No null values found; percentages are undefined.')
        return
    print('Real %:\t',format(((notfake_cnt/total)*100),'.2f'))
    print('Fake %:\t',format(((fake_cnt/total)*100),'.2f'))

In [12]:
show_tf_distribution('news')

Null Values distribution for 'news' on basis for realness
Total:	 39
Real %:	 0.00
Fake %:	 100.00


In [13]:
show_tf_distribution('headline')

Null Values distribution for 'headline' on basis for realness
Total:	 558
Real %:	 0.00
Fake %:	 100.00


In [14]:
show_tf_distribution('written_by')

Null Values distribution for 'written_by' on basis for realness
Total:	 1957
Real %:	 1.33
Fake %:	 98.67


Because a missing value almost always indicates a fake news article, missing information can itself be an identifying factor. Hence rows with null values are not dropped at this stage.

Checking for placeholder values and duplicates

In [15]:
# Count how often each headline occurs and show only the repeated ones.
headline_value_counts = df['headline'].value_counts()
headline_value_counts.loc[headline_value_counts.gt(1)]

The Dark Agenda Behind Globalism And Open Borders                                                                            5
Get Ready For Civil Unrest: Survey Finds That Most Americans Are Concerned About Election Violence                           5
The Fix Is In: NBC Affiliate Accidentally Posts Election Results A Week Early: Hillary Wins Presidency 42% to Trump’s 40%    4
Schools All Over America Are Closing On Election Day Due To Fears Of Violence                                                4
Will Barack Obama Delay Or Suspend The Election If Hillary Is Forced Out By The New FBI Email Investigation?                 4
                                                                                                                            ..
War Less Imminent After Clinton Defeat                                                                                       2
115 Million Americans Killed In 30 Minutes                                                                     

In [16]:
# Headlines that occur more than once, and every row carrying one of them.
duplicate_headline_list = set(
    headline_value_counts.loc[headline_value_counts > 1].index
)
df_dup_headline = df.loc[df['headline'].isin(duplicate_headline_list)]
df_dup_headline

Unnamed: 0,id,headline,written_by,news,label
17,4694,Millions of South Koreans Rise Up Against Shad...,Madeline,Your News Wire \nSouth Koreans are rising up i...,1
47,11814,Las imágenes libres de derechos más destacadas...,Tomás Fuentes,Las imágenes libres de derechos más destacadas...,1
66,14678,Young patient,-NO AUTHOR-,Anatomy lesson Published: 12 mins ago \nEditor...,1
79,17351,Thomas Frank Explores Whether Hillary Clinton ...,,,1
84,12395,Russia is Hoarding Gold at an Alarming Rate — ...,Jay Syrmopoulos,Home / Be The Change / Antiwar / Russia is Hoa...,1
...,...,...,...,...,...
20633,10988,WHO cancer agency under fire for withholding ‘...,Editor,The International Agency for Research on Cance...,1
20677,18176,"Rigged Primary, Media, and Candidate; the Amer...",,Email \n\nThe excitement over the US election ...,1
20717,20652,Jeddah airport was targeted by Ansar Allah mis...,,Email \n\nAn informed source in Yemen's AnsarA...,1
20753,9516,Las imágenes libres de derechos más destacadas...,Tomás Fuentes,Las imágenes libres de derechos más destacadas...,1


In [17]:
df_dup_headline[df_dup_headline.duplicated()]

Unnamed: 0,id,headline,written_by,news,label


There are no directly duplicated rows

In [18]:
df_dup_headline[df_dup_headline.duplicated(['headline', 'news'])]

Unnamed: 0,id,headline,written_by,news,label
1818,7178,JASON CHAFFETZ EXPOSED HILLARY CLINTON’S PLAN ...,,The Corruption of the Clinton’s is like an end...,1
5016,3244,"The U.S./Turkey Plan For “Seizing, Holding, An...",Brandon Turbeville,By Brandon Turbeville As the U.S. Presidential...,1
5110,11407,19 men cry rape by Iran's top Quran reader,Jay Baggett,"Print Saeed Toosi, right, and Ayatollah Khamen...",1
5284,14974,North Korea Threatens ‘Sacred’ Nuclear War Aga...,,Email \nNorth Korea’s Foreign Ministry slammed...,1
5390,15872,"FEAR OF TRUMP: BUSH, OBAMA, CLINTON ALL BUYING...",,Email \n\nIt appears Bill and Hillary Clinton ...,1
...,...,...,...,...,...
19953,12925,War Less Imminent After Clinton Defeat,Glen Ford,2016 presidential campaign A Black Agenda Radi...,1
20385,15010,Donald J. Trump’s 10 Point Plan to Put America...,Anonymous,Tweet Widget by Tanya Golash-Boza \nHow will D...,1
20580,6111,Rand Paul: Polls Showing Hillary Ahead Are ‘De...,,Email \n\nWednesday on 800 WVHU radio’s “The T...,1
20677,18176,"Rigged Primary, Media, and Candidate; the Amer...",,Email \n\nThe excitement over the US election ...,1


There are 70 rows with both headline and news duplicated. These need to be removed.

In [19]:
df_dup_headline[df_dup_headline.news == ' ']

Unnamed: 0,id,headline,written_by,news,label
79,17351,Thomas Frank Explores Whether Hillary Clinton ...,,,1
11572,5101,More on Trump’s Populism and How It Can Be Con...,Daily Bell Staff,,1
15286,20141,Thomas Frank Explores Whether Hillary Clinton ...,,,1
17458,1591,More on Trump’s Populism and How It Can Be Con...,Daily Bell Staff,,1


We can leave duplicate headlines, as they are a common part of news when an article undergoes revision, but rows where both the headline and the news article are the same need to be dropped. Rows without news will also be removed.

Checking news for whitespaces.

In [20]:
df[df.news == ' ']

Unnamed: 0,id,headline,written_by,news,label
79,17351,Thomas Frank Explores Whether Hillary Clinton ...,,,1
210,936,Hillary is Sick & Tired of Suffering from Wein...,Captain Craptek,,1
271,295,A Connecticut Reader Reports Record Voter Regi...,VDARE.com Reader,,1
472,16561,Not sure what to do with your time now Bake Of...,newsbiscuit editorial team,,1
1573,16715,Citizens Reject Forced Diversity Policy by Sup...,Brenda Walker,,1
...,...,...,...,...,...
19880,4275,"Yes, Virginia (Dare), There ARE Righteous Jews...",Paul Gottfried,,1
20232,16347,Trump’s Closing Argument,Steve Sailer,,1
20543,592,Is your promising internet career over now Vin...,newsbiscuit editorial team,,1
20559,2228,YIKES! HILLARY GOES OFF THE RAILS…Pulls A Howa...,EdJenner,,1


Replacing white spaces with null.

In [21]:
# Treat empty or whitespace-only strings as missing: any cell whose whole
# content matches ^\s*$ is replaced with NaN, so isna() also counts blank
# articles. The replacement is applied to every column of the frame.
df = df.replace(r'^\s*$', np.nan, regex=True)

In [22]:
df.isna().sum()

id               0
headline       558
written_by    1957
news           116
label            0
dtype: int64

This shows an increase in null values in the news column (from 39 to 116), because articles that contained only whitespace have been replaced with null.

In [23]:
show_tf_distribution('news')

Null Values distribution for 'news' on basis for realness
Total:	 116
Real %:	 0.00
Fake %:	 100.00


The distribution shows that all null news values still point to fake news. As these rows are few compared to the total dataset size, dropping them might be preferable.

Removing Duplicated Data

In [24]:
len(df)

20800

Dropping rows with no news article

In [25]:
df_clean = df.dropna(subset=['news'])

In [26]:
len(df_clean)

20684

Dropping rows with same headline and news articles

In [27]:
df_clean = df_clean.drop_duplicates(['headline', 'news'], ignore_index=True)

In [28]:
len(df_clean)

20576

Dropping rows with same news articles

In [29]:
# Keep only the first row for each distinct article text, whatever its
# headline, and renumber the index from 0 (ignore_index=True).
df_clean = df_clean.drop_duplicates(['news'], ignore_index=True)

In [30]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20383 entries, 0 to 20382
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          20383 non-null  int64 
 1   headline    19865 non-null  object
 2   written_by  18531 non-null  object
 3   news        20383 non-null  object
 4   label       20383 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 796.3+ KB


In [31]:
df_clean.isna().sum()

id               0
headline       518
written_by    1852
news             0
label            0
dtype: int64

In [32]:
len(df) - len(df_clean)

417

In [33]:
df_clean.label.value_counts()

0    10387
1     9996
Name: label, dtype: int64

Summary

417 rows were removed because they had no information in the news column and/or were duplicates.

Exploring the dataset

Helper Function

In [34]:
graph_dir = os.path.join(cwd,'Graphs')

In [36]:
def show_hist_for_col(df, column, title):
    """
    Display and save a histogram of one column, split by label.

    Rows with ``label == 0`` are drawn as 'True' and rows with ``label == 1``
    as 'Fake' (semi-transparent so both distributions stay visible). The
    figure is written to ``<graph_dir>/<title>.png`` and then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `column` and a ``label`` column.
    column : str
        Name of the column to plot.
    title : str
        Figure title; also used as the output file name (without extension).
    """
    # savefig does not create missing folders, so make sure the output
    # directory exists before writing (no-op when it is already there).
    os.makedirs(graph_dir, exist_ok=True)

    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so both histograms are guaranteed to land on the same axes.
    fig, ax = plt.subplots(figsize=(16, 8))
    df.loc[df.label == 0, column].hist(ax=ax, label='True')
    df.loc[df.label == 1, column].hist(ax=ax, alpha=0.4, label='Fake')
    ax.set_title(title)
    ax.legend()
    fig.savefig(os.path.join(graph_dir, title + '.png'), bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)