## Data Loading


In [1]:
# Import dependencies
from pathlib import Path
import pandas as pd 


In [2]:
# Define the file paths
true_news_file = Path('Resources/True.csv')
fake_news_file = Path('Resources/Fake.csv')

# Read the CSV files directly into pandas DataFrames
true_news_df = pd.read_csv(true_news_file, sep=',')
fake_news_df = pd.read_csv(fake_news_file, sep=',')



In [3]:
true_news_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
fake_news_df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


## Data Merging

In [5]:
# Adding 'category' column with the following values: 0 for fake news and 1 for true news
true_news_df['category'] = 1
fake_news_df['category'] = 0


In [6]:
#Merging two datasets into one
news_df = pd.concat([true_news_df,fake_news_df])
news_df


Unnamed: 0,title,text,subject,date,category
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


# Data Cleaning

In [7]:
# There are 44898 articles in the database
news_df.describe()


Unnamed: 0,category
count,44898.0
mean,0.477015
std,0.499477
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [8]:
# Count the number of true vs. fake articles
number_true_articles = news_df[news_df['category'] == 1].shape[0]
number_fake_articles = news_df[news_df['category'] == 0].shape[0]

print(f"Number of true articles: {number_true_articles}.")
print(f"Number of fake articles: {number_fake_articles}.")


Number of true articles: 21417.
Number of fake articles: 23481.


In [9]:
# Calculate the percentage of the true and fake articles within the dataset
total_articles = number_true_articles + number_fake_articles
percent_true = (number_true_articles / total_articles) * 100
percent_fake = (number_fake_articles / total_articles) * 100

print(f"Percentage of true articles: {percent_true:.2f}%")
print(f"Percentage of fake articles: {percent_fake:.2f}%")


Percentage of true articles: 47.70%
Percentage of fake articles: 52.30%


### The dataset is fairly balanced

With 21,417 true articles and 23,481 fake articles, the two categories are well-represented and neither significantly outnumbers the other.

In [10]:
# Looking at column types
news_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     44898 non-null  object
 1   text      44898 non-null  object
 2   subject   44898 non-null  object
 3   date      44898 non-null  object
 4   category  44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [11]:
#  Looking at 'subject' column
news_df['subject'].value_counts()


politicsNews       11272
worldnews          10145
News                9050
politics            6841
left-news           4459
Government News     1570
US_News              783
Middle-east          778
Name: subject, dtype: int64

In [12]:
# Get the count of missing values in each column
missing_values_count = news_df.isnull().sum()
print(missing_values_count)


title       0
text        0
subject     0
date        0
category    0
dtype: int64


### There are no NaN values in the dataset after initial loading

In [13]:
# Check for duplicates across the DataFrame
duplicates = news_df.duplicated()
print(f"Number of duplicate entries: {duplicates.sum()}")

# View the duplicate rows
duplicate_rows = news_df[news_df.duplicated()]
print(duplicate_rows)


Number of duplicate entries: 209
                                                   title  \
445    Senate tax bill stalls on deficit-focused 'tri...   
778    Trump warns 'rogue regime' North Korea of grav...   
892    Republicans unveil tax cut bill, but the hard ...   
896    Trump taps Fed centrist Powell to lead U.S. ce...   
974    Two ex-Trump aides charged in Russia probe, th...   
...                                                  ...   
21353  Thailand's ousted PM Yingluck has fled abroad:...   
21408  U.S., North Korea clash at U.N. forum over nuc...   
9942   HILLARY TWEETS MESSAGE In Defense Of DACA…OOPS...   
11446  FORMER DEMOCRAT WARNS Young Americans: “Rioter...   
14925  [VIDEO] #BlackLivesMatter Terrorists Storm Dar...   

                                                    text       subject  \
445    WASHINGTON (Reuters) - The U.S. Senate on Thur...  politicsNews   
778    BEIJING (Reuters) - U.S. President Donald Trum...  politicsNews   
892    WASHINGTON (Reute

In [14]:
# Remove duplicate rows
news_df = news_df.drop_duplicates()

# Reset the index to maintain consecutive indexing
news_df = news_df.reset_index(drop=True)

# Check the shape of the DataFrame to confirm duplicates are removed
print(news_df.shape)

(44689, 5)


### There are 209 duplicate rows in the DataFrame

In [15]:
# Convert the 'date' column to datetime and in a row that is not recognisable as date, mark it as 'Nat' (Not a Time)
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce') 

# Count the number of NaT values in the 'date' column
nat_count = news_df['date'].isna().sum()
print(f"Number of NaT entries in 'date' column: {nat_count}.")


Number of NaT entries in 'date' column: 10.


### There are 10 rows without the date value

In [16]:
# Drop rows with NaT values and reset index
news_df.dropna(subset=['date'], inplace=True)
news_df = news_df.reset_index(drop=True)


In [17]:
# Recheck missing values in the entire DataFrame after dropping NaT rows
missing_values_count = news_df.isnull().sum()
print("Missing values in each column after cleaning:")
print(missing_values_count)


Missing values in each column after cleaning:
title       0
text        0
subject     0
date        0
category    0
dtype: int64


In [18]:
# Print the date range of the articles
earliest_date = news_df['date'].min()
latest_date = news_df['date'].max()
print(f"The articles cover a period from {earliest_date} to {latest_date}")

The articles cover a period from 2015-03-31 00:00:00 to 2018-02-19 00:00:00


In [19]:
news_df.head()

Unnamed: 0,title,text,subject,date,category
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,2017-12-31,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,2017-12-29,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,2017-12-31,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,2017-12-30,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,2017-12-29,1


In [20]:
# Print the final shape of the DataFrame
print(f"Final DataFrame shape: {news_df.shape}")

Final DataFrame shape: (44679, 5)


### There are 44,679 articles in the DataFrame after Data Cleaning.
### The articles are for the following period: 31 March 2015 - 19 February 2018. 

# Standarizing text

In [21]:
# Lowercase the text in the 'title' and 'text' columns
news_df['title'] = news_df['title'].str.lower()
news_df['text'] = news_df['text'].str.lower()

In [22]:
# Strip leading and trailing whitespace from the 'title' and 'text' columns
news_df['title'] = news_df['title'].str.strip()
news_df['text'] = news_df['text'].str.strip()

Unnamed: 0,title,text,subject,date,category
0,"as u.s. budget fight looms, republicans flip t...",washington (reuters) - the head of a conservat...,politicsNews,2017-12-31,1
1,u.s. military to accept transgender recruits o...,washington (reuters) - transgender people will...,politicsNews,2017-12-29,1
2,senior u.s. republican senator: 'let mr. muell...,washington (reuters) - the special counsel inv...,politicsNews,2017-12-31,1
3,fbi russia probe helped by australian diplomat...,washington (reuters) - trump campaign adviser ...,politicsNews,2017-12-30,1
4,trump wants postal service to charge 'much mor...,seattle/washington (reuters) - president donal...,politicsNews,2017-12-29,1
