In [1]:
# Import dependencies
import numpy as np
import pandas as pd
import re

### Read and Parse Text

In [2]:
# Load the raw fake and real news CSVs, keeping only the first four columns of each
first_four_columns = range(4)
fake_news_df = pd.read_csv('data/fake_news.csv', usecols=first_four_columns)
real_news_df = pd.read_csv('data/real_news.csv', usecols=first_four_columns)

In [3]:
# Count missing values per column in the fake-news data (all zeros means none missing)
fake_news_df.isna().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [4]:
# Count missing values per column in the real-news data (all zeros means none missing)
real_news_df.isna().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [5]:
# Replace any empty strings with NaN.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the column selection: an in-place call on
# `df['title']` operates on an intermediate object and is not guaranteed to
# update the parent DataFrame (it does not under pandas Copy-on-Write).
fake_news_df['title'] = fake_news_df['title'].replace('', np.nan)
real_news_df['title'] = real_news_df['title'].replace('', np.nan)

# Remove rows whose title is missing so the dataset is complete
fake_news_df.dropna(subset=['title'], inplace=True)
real_news_df.dropna(subset=['title'], inplace=True)

In [6]:
# Tag every row with its class label: 0 for fake news, 1 for real news
for frame, label_value in ((fake_news_df, 0), (real_news_df, 1)):
    frame['label'] = label_value

In [7]:
# Stack the fake and real news frames into one dataset with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
news_df = pd.concat([fake_news_df, real_news_df], ignore_index=True)
news_df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,0
...,...,...,...,...,...
44862,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,2017-08-22,1
44863,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,2017-08-22,1
44864,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,2017-08-22,1
44865,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,2017-08-22,1


In [8]:
# Build 'combined_text' as "<title> <text>" (title and body joined by one space);
# this is the text that will be used to train our model
news_df['combined_text'] = news_df['title'].add(' ').add(news_df['text'])
news_df.head()

Unnamed: 0,title,text,subject,date,label,combined_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31,0,Donald Trump Sends Out Embarrassing New Year’...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31,0,Drunk Bragging Trump Staffer Started Russian ...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30,0,Sheriff David Clarke Becomes An Internet Joke...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29,0,Trump Is So Obsessed He Even Has Obama’s Name...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25,0,Pope Francis Just Called Out Donald Trump Dur...


### Remove special characters and stopwords, and lemmatize

In [None]:
# Pre-processing: punctuation removal, stop-word removal and lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize

stop_words = stopwords.words('english')
# Set copy of the stop words for O(1) membership tests; `stop_words` itself
# stays a list so any other code that expects a list keeps working.
stop_word_set = set(stop_words)
lemmatizer = WordNetLemmatizer()


def clean_text(raw_text):
    """Return a cleaned, lemmatized, lower-case version of `raw_text`.

    Steps, in order:
      1. remove every character that is neither a word character nor whitespace;
      2. lower-case the text;
      3. tokenize with NLTK;
      4. drop tokens found in the English stop-word list;
      5. lemmatize each remaining token.

    Parameters
    ----------
    raw_text : str
        The text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string when no
        token survives).
    """
    # Cleaning the sentence with regex
    without_punctuation = re.sub(r'[^\w\s]', '', raw_text)
    # Lower-case BEFORE filtering: the stop-word list is lower case, so
    # capitalised stop words ("The", "A", ...) would otherwise slip through.
    tokens = nltk.word_tokenize(without_punctuation.lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_word_set
    )


# Apply the cleaning to the whole column in one pass instead of writing
# row by row through iterrows()/.loc
news_df['combined_text'] = news_df['combined_text'].map(clean_text)

In [None]:
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = stopwords.words('english')
# Set copy for O(1) membership tests; `stop_words` itself stays a list
stop_word_set = set(stop_words)
lemmatizer = WordNetLemmatizer()

# Clean every article and, along the way, collect all non-stop-word tokens of
# the whole corpus (fake AND real news) in `news_words`, e.g. for a later
# word-frequency analysis.
news_words = []
cleaned_texts = []
for text in news_df['combined_text']:
    # Convert to lower case
    text = text.lower()

    # Remove numbers
    text = re.sub(r'[0-9]+', '', text)

    # Remove everything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize and remove stop words
    row_words = [str(w) for w in nltk.word_tokenize(text) if w not in stop_word_set]

    # Corpus-wide accumulator (un-lemmatized tokens)
    news_words.extend(row_words)

    # Lemmatize ONLY this row's tokens. Iterating over the corpus-wide
    # `news_words` here would write the words of every previous article into
    # each row and make the loop quadratic.
    cleaned_texts.append(' '.join(lemmatizer.lemmatize(w) for w in row_words))

# Replace 'combined_text' with the lemmatized sentences (one per row, in row order)
news_df['combined_text'] = cleaned_texts

In [None]:
# Rename 'subject' -> 'topic' and 'date' -> 'news_date' in a single call
column_renames = {
    "subject": "topic",
    "date": "news_date",
}
news_df.rename(columns=column_renames, inplace=True)

In [None]:
# 'title' and 'text' are no longer needed once 'combined_text' exists; remove them
news_df.drop(columns=['title', 'text'], inplace=True)

In [None]:
 # Write the processed dataset to CSV, leaving out the DataFrame index
news_df.to_csv(path_or_buf="data/news_df.csv", index=False)