In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from pydrive.auth import GoogleAuth
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import files

# ISOT Dataset

## ISOT Fake News Prepping

In [None]:
# Download the ISOT fake-news CSV from Google Drive and load it into fake_df.
# NOTE: `drive` below rebinds the name imported from google.colab in the
# imports cell to a PyDrive GoogleDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

file_id = '1O-gi1M2_f0j9NVMPEptEsMzWBWukXaY3'
download = drive.CreateFile({'id': file_id})

# Download to a dataset-specific local file so the other downloads in this
# notebook cannot overwrite it (and a re-run of read_csv reads the right data).
download.GetContentFile('isot_fake.csv')
fake_df = pd.read_csv('isot_fake.csv')

In [None]:
# Inspect column names, non-null counts and dtypes of the raw fake-news frame
fake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [None]:
# Drop the columns that are not used for modelling, leaving only 'text'.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second run no longer raises KeyError for columns that
# were already removed.
fake_df = fake_df.drop(columns=['title', 'subject', 'date'], errors='ignore')

# Label every row of this frame as fake (1).
# TODO: consider renaming 'Fake_rating' to something like 'target'.
fake_df['Fake_rating'] = 1

# Preview the first five rows of the modified DataFrame
fake_df.head()

Unnamed: 0,text,Fake_rating
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1


In [None]:
# Number of NaN entries in the 'text' column
missing_values_text = fake_df['text'].isna().sum()
missing_values_text

0

In [None]:
# Checking for empty strings or strings containing only whitespace in the 'text' column
empty_strings_count = fake_df[fake_df['text'].str.strip() == ''].shape[0]
empty_strings_count

630

In [None]:
# Removing the rows where the 'text' column has empty strings or strings containing only whitespace
fake_df = fake_df[fake_df['text'].str.strip() != '']

# Checking the number of remaining rows in the DataFrame
# This will bring the dataset down to 22,851. We are losing 630 rows, but that's okay
remaining_rows = fake_df.shape[0]
remaining_rows

22851

In [None]:
# Note: there are duplicates in this dataset -- 5,398 to be exact.
# After removing them, 17,453 rows will remain.
# Open question: do we de-duplicate now, or after all datasets are combined?

In [None]:
# De-duplicate on article text, keeping the first occurrence of each
fake_df = fake_df.loc[~fake_df.duplicated(subset='text', keep='first')]

In [None]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded
fake_df = fake_df.reset_index(drop=True)

In [None]:
# View the dataset: every row except the last five (pandas truncates the display)
fake_df.iloc[:-5]

Unnamed: 0,text,Fake_rating
0,Donald Trump just couldn t wish all Americans ...,1
1,House Intelligence Committee Chairman Devin Nu...,1
2,"On Friday, it was revealed that former Milwauk...",1
3,"On Christmas day, Donald Trump announced that ...",1
4,Pope Francis used his annual Christmas Day mes...,1
...,...,...
17443,21st Century Wire says As 21WIRE reported earl...,1
17444,21st Century Wire says It s a familiar theme. ...,1
17445,Patrick Henningsen 21st Century WireRemember ...,1
17446,21st Century Wire says Al Jazeera America will...,1


In [None]:
# Dataset looks good and is ready to merge with ISOT Real.
# Expected shape: 17,453 de-duplicated articles x 2 columns ('text', 'Fake_rating')
fake_df.shape

(17453, 2)

## ISOT Real News Prepping

In [None]:
# Download the ISOT real-news CSV from Google Drive and load it into real_df.
# NOTE: `drive` below rebinds the name imported from google.colab in the
# imports cell to a PyDrive GoogleDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


file_id = '1qZs6CUN9jpJaiva09Bez24Pp9pCcLgwx'
download = drive.CreateFile({'id': file_id})

# Download to a dataset-specific local file so the other downloads in this
# notebook cannot overwrite it (and a re-run of read_csv reads the right data).
download.GetContentFile('isot_real.csv')
real_df = pd.read_csv('isot_real.csv')

In [None]:
# Inspect column names, non-null counts and dtypes of the raw real-news frame
real_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [None]:
# Preview the first five raw rows before any columns are dropped
real_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [None]:
# Drop the columns that are not used for modelling, leaving only 'text'.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second run no longer raises KeyError for columns that
# were already removed.
real_df = real_df.drop(columns=['title', 'subject', 'date'], errors='ignore')

# Label every row of this frame as real (0).
# TODO: consider renaming 'Fake_rating' to something like 'target'.
real_df['Fake_rating'] = 0

# Preview the first five rows of the modified DataFrame
real_df.head()

Unnamed: 0,text,Fake_rating
0,WASHINGTON (Reuters) - The head of a conservat...,0
1,WASHINGTON (Reuters) - Transgender people will...,0
2,WASHINGTON (Reuters) - The special counsel inv...,0
3,WASHINGTON (Reuters) - Trump campaign adviser ...,0
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,0


In [None]:
# Number of NaN entries in the 'text' column
missing_values_text = real_df['text'].isna().sum()
missing_values_text

0

In [None]:
# Count articles whose 'text' is empty once surrounding whitespace is stripped
empty_strings_count = len(real_df[real_df['text'].str.strip().eq('')])
empty_strings_count

1

In [None]:
# Keep only the rows whose 'text' is non-empty after stripping whitespace
real_df = real_df.loc[real_df['text'].str.strip().ne('')]

In [None]:
# Shape after removing the single blank-text row (21,417 -> 21,416 rows)
real_df.shape

(21416, 2)

In [None]:
# Number of rows whose 'text' repeats an earlier row
duplicates_count = real_df.duplicated(subset='text').sum()
duplicates_count

225

In [None]:
# Keep the first occurrence of every distinct article text
real_df = real_df.loc[~real_df.duplicated(subset='text')]

# Sanity check: no repeated text should remain (expect 0)
duplicates_count_after_removal = real_df.duplicated(subset='text').sum()
duplicates_count_after_removal


0

In [None]:
# Shape after de-duplication (21,416 - 225 duplicates = 21,191 rows)
real_df.shape

(21191, 2)

## Merge ISOT

ISOT Fake = 17,453 rows

ISOT Real = 21,191 rows

In [None]:
# Stack the real and fake ISOT articles into one frame with a fresh 0..n-1 index
ISOT = pd.concat((real_df, fake_df), axis=0, ignore_index=True)

In [None]:
# Renumber the rows of ISOT 0..n-1, discarding the previous index
ISOT = ISOT.reset_index(drop=True)

In [None]:
# Expected row count: 21,191 real + 17,453 fake = 38,644
ISOT.shape

(38644, 2)

In [None]:
# Show every row except the last five (pandas truncates the display)
ISOT.iloc[:-5]

Unnamed: 0,text,Fake_rating
0,WASHINGTON (Reuters) - The head of a conservat...,0
1,WASHINGTON (Reuters) - Transgender people will...,0
2,WASHINGTON (Reuters) - The special counsel inv...,0
3,WASHINGTON (Reuters) - Trump campaign adviser ...,0
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,0
...,...,...
38634,21st Century Wire says As 21WIRE reported earl...,1
38635,21st Century Wire says It s a familiar theme. ...,1
38636,Patrick Henningsen 21st Century WireRemember ...,1
38637,21st Century Wire says Al Jazeera America will...,1


In [None]:
# Sanity check on the merged frame: no article should have empty or
# whitespace-only 'text' (expect 0)
empty_strings_count = len(ISOT[ISOT['text'].str.strip().eq('')])
empty_strings_count

0

In [None]:
# Check that the Fake_rating counts match what we expect from the per-frame
# row counts: 21,191 rows labelled 0 (real) and 17,453 rows labelled 1 (fake)
ISOT['Fake_rating'].value_counts()

0    21191
1    17453
Name: Fake_rating, dtype: int64

# FakeNewsNet Dataset (Might not be feasible)

Issue: The Politico Fake and Politico Real files appear to be the exact same dataset, so I can't tell which articles are fake and which are real. I tried matching them against the GitHub dataset on the 'title' column but couldn't find any matches, which is odd.

## Politico Fake

In [None]:
# # Read in Politico Fake news dataset
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)


# file_id = '1tuMZy4gL4Nhc4NfDHmYTqnuP_rak23-m'
# download = drive.CreateFile({'id': file_id})

# # Download the file to a local disc
# download.GetContentFile('file.csv')
# pol_fake_df = pd.read_csv('file.csv')

In [None]:
# pol_fake_df.head()

Unnamed: 0,id,title,text,url,top_img,authors,source,publish_date,movies,images,canonical_link,meta_data
0,Real_1-Webpage,Trump Just Insulted Millions Who Lost Everythi...,16.8k SHARES SHARE THIS STORY\n\nHillary Clint...,http://occupydemocrats.com/2016/09/27/trump-ju...,http://occupydemocrats.com/wp-content/uploads/...,"Brett Bose,Grant Stern,Steve Bernstein,Natalie...",http://occupydemocrats.com,{'$date': 1474934400000},,http://occupydemocrats.com/wp-content/uploads/...,http://occupydemocrats.com/2016/09/27/trump-ju...,"{""generator"": ""Powered by Visual Composer - dr..."
1,Real_10-Webpage,Famous dog killed in spot she waited a year fo...,Famous dog killed in spot she waited a year fo...,http://rightwingnews.com/top-news/famous-dog-k...,http://rightwingnews.com/wp-content/uploads/20...,,http://rightwingnews.com,{'$date': 1474948336000},,http://rightwingnews.com/wp-content/uploads/20...,http://rightwingnews.com/top-news/famous-dog-k...,"{""googlebot"": ""noimageindex"", ""og"": {""site_nam..."
2,Real_100-Webpage,House oversight panel votes Clinton IT chief i...,Story highlights The House Oversight panel vot...,http://cnn.it/2deaH2d,http://i2.cdn.cnn.com/cnnnext/dam/assets/16091...,"Tom Lobianco,Deirdre Walsh",http://cnn.it,,,http://i2.cdn.cnn.com/cnnnext/dam/assets/17050...,http://www.cnn.com/2016/09/22/politics/bryan-p...,"{""description"": ""Members of the House Oversigh..."
3,Real_101-Webpage,America Just Tragically Lost A Country Music I...,We are absolutely heartbroken to hear about th...,http://newsbake.com/entertainment-news/music-e...,http://newsbake.com/wp-content/uploads/2016/05...,Nancy Wells,http://newsbake.com,{'$date': 1474898600000},https://www.youtube.com/embed/8ozTJcu-_BU,http://0.gravatar.com/avatar/0d702c6042933cd78...,http://newsbake.com/entertainment-news/music-e...,"{""shareaholic"": {""site_name"": ""NewsBake"", ""lan..."
4,Real_102-Webpage,Monuments to the Battle for the New South,"Nine years ago, a driver lost control of his p...",http://politi.co/2dd9U1x,http://static.politico.com/25/ed/85332de14c45b...,"Jack Shafer,Lisa Rab",http://politi.co,{'$date': 1473941820000},,http://static.politico.com/25/ed/85332de14c45b...,http://www.politico.com/magazine/story/2016/09...,"{""description"": ""Virginia, increasingly divers..."


In [None]:
# # Drop all columns except for 'text'
# pol_fake_df = pol_fake_df[['text']]

# # Adding the new column 'Fake_rating' with 1's for every row
# # We could rename this to target or something??
# pol_fake_df['Fake_rating'] = 1

# pol_fake_df.head()

Unnamed: 0,text,Fake_rating
0,16.8k SHARES SHARE THIS STORY\n\nHillary Clint...,1
1,Famous dog killed in spot she waited a year fo...,1
2,Story highlights The House Oversight panel vot...,1
3,We are absolutely heartbroken to hear about th...,1
4,"Nine years ago, a driver lost control of his p...",1


In [None]:
# # Check for missing values
# missing_values_text = pol_fake_df['text'].isnull().sum()
# missing_values_text

0

In [None]:
# # Checking for empty strings or strings containing only whitespace in the 'text' column
# empty_strings_count = pol_fake_df[pol_fake_df['text'].str.strip() == ''].shape[0]
# empty_strings_count

0

In [None]:
# pol_fake_df.shape

(120, 2)

## Politico Real

In [None]:
# Need to figure out issue above before doing anything with this dataset... (It's the same as the fake...)

## BuzzFeed Fake

In [None]:
#

## BuzzFeed Real

# Getting Real about Fake News dataset

Info: https://opendatascience.com/how-to-build-a-fake-news-classification-model/

Source: https://github.com/GeorgeMcIntire/fake_real_grafn_df

This version of the "Getting Real about Fake News" dataset is used because it is balanced with real news. The original, published on Kaggle, contains only fake news.

Original source: https://www.kaggle.com/datasets/mrisdal/fake-news

In [None]:
# Download the "Getting Real about Fake News" CSV from the Google Drive folder
# and load it into grafn_df.
# NOTE: `drive` below rebinds the name imported from google.colab in the
# imports cell to a PyDrive GoogleDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

file_id = '1qQuqE3ab36NiY1vo8qJ1FHjN-0hrY3oj'
download = drive.CreateFile({'id': file_id})

# Download to a dataset-specific local file so the other downloads in this
# notebook cannot overwrite it (and a re-run of read_csv reads the right data).
download.GetContentFile('grafn.csv')
grafn_df = pd.read_csv('grafn.csv')


In [None]:
# Inspect column names, non-null counts and dtypes of the raw frame
grafn_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4594 entries, 0 to 4593
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   idd     4594 non-null   object
 1   title   4593 non-null   object
 2   text    4594 non-null   object
 3   label   4594 non-null   object
dtypes: object(4)
memory usage: 143.7+ KB


In [None]:
# Reduce the frame to the modelling columns in one chain:
#   1. drop the 'idd' and 'title' columns,
#   2. rename 'label' to 'Fake_rating',
#   3. encode the label as 1 for 'FAKE' and 0 for 'REAL'.
grafn_df = (
    grafn_df
    .drop(columns=['idd', 'title'])
    .rename(columns={'label': 'Fake_rating'})
    .assign(Fake_rating=lambda frame: frame['Fake_rating'].map({'FAKE': 1, 'REAL': 0}))
)

In [None]:
# Preview the first five rows after dropping columns and encoding the label
grafn_df.head()

Unnamed: 0,text,Fake_rating
0,UPDATE: Gov. Fallin vetoed the bill on Friday....,0
1,Ever since Texas laws closed about half of the...,0
2,"Donald Trump and Hillary Clinton, now at the s...",0
3,A Houston grand jury investigating criminal al...,0
4,WASHINGTON -- Forty-three years after the Supr...,0


In [None]:
# Per-column count of NaN entries
missing_values = grafn_df.isna().sum()
missing_values

text           0
Fake_rating    0
dtype: int64

In [None]:
# Count articles whose 'text' is empty once surrounding whitespace is stripped
empty_strings_count = len(grafn_df[grafn_df['text'].str.strip().eq('')])
empty_strings_count

25

In [None]:
# Shape before the blank-text rows are removed
grafn_df.shape

(4594, 2)

In [None]:
# Keep only the rows whose 'text' is non-empty after stripping whitespace
grafn_df = grafn_df.loc[grafn_df['text'].str.strip().ne('')]

In [None]:
# Shape after removing the 25 blank-text rows (4,594 -> 4,569 rows)
grafn_df.shape

(4569, 2)

In [None]:
# Target balance: number of rows per class (0 = REAL, 1 = FAKE)
grafn_df['Fake_rating'].value_counts()

0    2297
1    2272
Name: Fake_rating, dtype: int64

In [None]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded
grafn_df = grafn_df.reset_index(drop=True)

In [None]:
# Show every row except the last five (pandas truncates the display)
grafn_df.iloc[:-5]

Unnamed: 0,text,Fake_rating
0,UPDATE: Gov. Fallin vetoed the bill on Friday....,0
1,Ever since Texas laws closed about half of the...,0
2,"Donald Trump and Hillary Clinton, now at the s...",0
3,A Houston grand jury investigating criminal al...,0
4,WASHINGTON -- Forty-three years after the Supr...,0
...,...,...
4559,"20 Views November 03, 2016 GOLD , KWN King Wor...",1
4560,WATCH: Pay no attention to 2008 Michelle Obama...,1
4561,The Book Of Alien Races Exposed # Timotei Simo...,1
4562,Leave a reply \nJames Corbett – FBI Director J...,1


# Merge ISOT and Getting Real about Fake News datasets

In [None]:
# Stack the "Getting Real about Fake News" frame on top of the ISOT frame,
# producing one combined frame with a fresh 0..n-1 index
merged_df = pd.concat((grafn_df, ISOT), axis=0, ignore_index=True)

In [None]:
# Expected row count: 4,569 (grafn_df) + 38,644 (ISOT) = 43,213
merged_df.shape

(43213, 2)

In [None]:
# Show every row except the last five (pandas truncates the display)
merged_df.iloc[:-5]

Unnamed: 0,text,Fake_rating
0,UPDATE: Gov. Fallin vetoed the bill on Friday....,0
1,Ever since Texas laws closed about half of the...,0
2,"Donald Trump and Hillary Clinton, now at the s...",0
3,A Houston grand jury investigating criminal al...,0
4,WASHINGTON -- Forty-three years after the Supr...,0
...,...,...
43203,21st Century Wire says As 21WIRE reported earl...,1
43204,21st Century Wire says It s a familiar theme. ...,1
43205,Patrick Henningsen 21st Century WireRemember ...,1
43206,21st Century Wire says Al Jazeera America will...,1


In [None]:
# Target balance after merging: number of rows per class
# 0 - Real article
# 1 - Fake article
merged_df['Fake_rating'].value_counts()

0    23488
1    19725
Name: Fake_rating, dtype: int64

In [None]:
# Randomly permute all rows (fixed seed so the order is reproducible),
# then renumber the index 0..n-1
merged_df = merged_df.sample(frac=1, random_state=42)
merged_df = merged_df.reset_index(drop=True)

In [None]:
# Shuffled frame: every row except the last five (pandas truncates the display)
merged_df.iloc[:-5]

Unnamed: 0,text,Fake_rating
0,"PLYMOUTH, N.H. (Reuters) - U.S. Republican pre...",0
1,If sane Americans have done one stupid thing o...,1
2,Mexicans have been given a green light by Obam...,1
3,China warned President Obama on Tuesday not to...,0
4,"It’s all old news to those of us who, unlike D...",1
...,...,...
43203,Meet Alice Miller who single-handedly scr*wed ...,1
43204,GENEVA (Reuters) - U.N. Syria envoy Staffan de...,0
43205,"KUALA LUMPUR (Reuters) - Malaysia, which until...",0
43206,Pelosi claims to have not had a meeting with ...,1


In [None]:
#----------------- Save merged dataset to CSV -----------------
# NOTE(review): `files` is already imported from google.colab in the imports
# cell near the top of the notebook; this re-import is redundant but harmless.
from google.colab import files

# Dataset was saved to Shared Google Drive on 9/27/2023

## Use this to save dataset to CSV
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; pass index=False if that column is not wanted.
# merged_df.to_csv('merged_dataset.csv')
# files.download('merged_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TODO: Push the merged dataset to Hugging Face.

In [None]:
# Code here to push to hugging face