In [None]:
# Import dependencies
import numpy as np
import pandas as pd
import re

### Read and Parse Text

In [None]:
# Load the first four columns of the fake-news and real-news CSV files
fake_news_df, real_news_df = (
    pd.read_csv(csv_path, usecols=range(4))
    for csv_path in ('data/fake_news.csv', 'data/real_news.csv')
)

In [None]:
# Count the missing values in each column of the fake-news frame
fake_news_df.isna().sum()

In [None]:
# Count the missing values in each column of the real-news frame
real_news_df.isna().sum()

In [None]:
# Replace any empty-string titles with NaN so they count as missing.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is chained in-place mutation, which pandas warns about
# and which does not update the parent frame when Copy-on-Write is enabled.
fake_news_df['title'] = fake_news_df['title'].replace('', np.nan)
real_news_df['title'] = real_news_df['title'].replace('', np.nan)

# Remove rows whose title is missing so every remaining article has a title
fake_news_df.dropna(subset=['title'], inplace=True)
real_news_df.dropna(subset=['title'], inplace=True)

In [None]:
# Tag every article with its class label: 0 for fake news, 1 for real news
for frame, class_label in ((fake_news_df, 0), (real_news_df, 1)):
    frame['label'] = class_label

In [None]:
# Stack the fake and real articles into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
news_df = pd.concat([fake_news_df, real_news_df], ignore_index=True)
news_df

In [None]:
# Add a new column that joins the title and the body text with a single space;
# this combined text is the data used to train our model.
# A missing body is treated as an empty string: concatenating a string with NaN
# yields NaN, and the regex-based cleaning applied to this column afterwards
# only accepts strings.
news_df['combined_text'] = news_df['title'] + ' ' + news_df['text'].fillna('')
news_df.head()

### Remove special characters and stopwords, and lemmatize

In [None]:
# Pre-processing: strip punctuation, remove stop words, and lemmatize.
# NOTE: needs the NLTK tokenizer, stop-word and WordNet resources to be
# installed locally (e.g. via nltk.download) before this cell is run.
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Set copy of the stop words for constant-time membership tests
_stop_word_set = set(stop_words)


def _normalize_text(raw_text):
    """Return `raw_text` lower-cased, de-punctuated, stop-word-free and lemmatized.

    Parameters
    ----------
    raw_text : str
        The text to clean.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces ('' when none remain).
    """
    # Lower-case BEFORE filtering and lemmatizing: the NLTK stop-word list is
    # lower case, so capitalised stop words such as "The" would otherwise slip
    # through, and the lemmatizer matches words against lower-case forms.
    text = re.sub(r'[^\w\s]', '', raw_text.lower())
    tokens = [token for token in word_tokenize(text) if token not in _stop_word_set]
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Clean every article in one pass instead of writing rows back one at a time
news_df['combined_text'] = news_df['combined_text'].apply(_normalize_text)

In [None]:
# NOTE: needs the NLTK tokenizer, stop-word and WordNet resources to be
# installed locally (e.g. via nltk.download) before this cell is run.
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Set copy of the stop words for constant-time membership tests
_stop_word_lookup = set(stop_words)

# Collect the non-stop-word tokens of ALL articles (fake and real) in
# `news_words` (presumably for a word-frequency analysis -- confirm against the
# cells that consume it), and rebuild each article's 'combined_text' from its
# own lemmatized tokens.
news_words = []
lemmatized_texts = []
for text in news_df['combined_text']:
    # Convert to lower case
    text = text.lower()

    # Remove numbers
    text = re.sub(r'[0-9]+', '', text)

    # Remove everything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize and drop stop words
    row_words = [str(w) for w in word_tokenize(text) if w not in _stop_word_lookup]

    # Add this article's words to the corpus-wide list
    news_words.extend(row_words)

    # Lemmatize only THIS article's words. Iterating the corpus-wide
    # `news_words` here would copy the words of every earlier article into
    # the current row.
    lemmatized_texts.append(' '.join(lemmatizer.lemmatize(w) for w in row_words))

# Replace 'combined_text' with the lemmatized sentences (same row order)
news_df['combined_text'] = lemmatized_texts

In [None]:
# Rename the 'subject' and 'date' columns in a single call
column_renames = {"subject": "topic", "date": "news_date"}
news_df.rename(columns=column_renames, inplace=True)

In [None]:
# 'title' and 'text' are already merged into 'combined_text', so remove them
news_df.drop(columns=['title', 'text'], inplace=True)

In [None]:
# Export the cleaned dataset to a CSV file
# The row index is left out of the written file
output_csv_path = "data/news_df.csv"
news_df.to_csv(output_csv_path, index=False)