In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

The Natural Language Toolkit (NLTK) is a platform for building Python programs that work with human language data, commonly used for statistical natural language processing (NLP). It contains text processing libraries for tokenization, parsing, classification, stemming, tagging and semantic reasoning.

The NLTK corpora and modules must be installed using the standard NLTK downloader.

In [None]:
import nltk

Download only the specific packages needed, or run `nltk.download('all')` to fetch everything.

In [None]:
# Fetch only the NLTK resources this notebook relies on:
# the stop-word lists and the WordNet database used by the lemmatizer.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)


The news dataset comprises various authors' original and fictitious article titles and text.

In [None]:
# Read the raw news articles from disk into a DataFrame
news_data = pd.read_csv(filepath_or_buffer="Data/news.csv")

In [None]:
# Structural overview: dimensions and column names first, then the
# per-column dtype / non-null summary produced by DataFrame.info().
for caption, detail in (
    ("Shape of News data: ", news_data.shape),
    ("News data columns: ", news_data.columns),
):
    print(caption, detail)
print("News data info:")
news_data.info()

In [None]:
# Preview the first five rows to get a feel for the content of each column
news_data.head(5)

In [None]:
# Word-count summary for the article body: count, mean, std, min, quartiles and max
txt_length = (
    news_data.text
    .str.split()   # whitespace tokenisation
    .str.len()     # number of words per article
)
txt_length.describe()

In [None]:
# Word-count summary for the article titles: count, mean, std, min, quartiles and max
title_length = (
    news_data['title']
    .str.split()   # whitespace tokenisation
    .str.len()     # number of words per title
)
title_length.describe()

Word-count statistics for the news dataset are as follows:
- The text attribute has a high word count, with an average of about 776 words and a 75th percentile of roughly 1,000 words (i.e. about 75% of articles contain at most that many words).
- The title attribute is a short statement with an average of about 10 words, and 75% of titles contain at most around 13 words.
- The experiment will use both text and title together.

In [None]:
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer
from collections import Counter

# English stop-word list, plus a Counter built from it that the cleaning
# code below uses for membership tests.
stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

# Word normalisers: a Porter stemmer and a WordNet lemmatizer
ps = PorterStemmer()
wnl = nltk.stem.WordNetLemmatizer()

In [None]:
def wrangle(dataset):
    """Clean the raw news DataFrame and return the cleaned frame.

    Steps performed, in order:

    1. Drop the leftover ``'Unnamed: 0'`` id column (skipped if it is absent).
    2. Replace missing values in every remaining column with the string
       ``"None"``.
    3. Normalise the ``text`` and ``title`` columns: strip URLs, keep ASCII
       letters only, lowercase, remove English stop words and lemmatize the
       remaining words.
    4. Encode ``label`` as an integer: ``'FAKE'`` -> 1, ``'REAL'`` -> 0. Any
       other value is passed through unchanged to the final ``int`` cast.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw news data. Must contain ``text``, ``title`` and ``label`` columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with an integer ``label`` column.

    Raises
    ------
    KeyError
        If ``text``, ``title`` or ``label`` is missing from ``dataset``.
    ValueError
        If a label cannot be converted to ``int`` (for example a missing
        label, which step 2 turns into the string ``"None"``).

    Notes
    -----
    Relies on the notebook-level ``wnl`` lemmatizer and ``stopwords_dict``.
    As a side effect the global pandas option ``display.precision`` is set
    to 0, which affects how every DataFrame is displayed afterwards.
    """
    # errors='ignore' lets frames that were loaded without the stray index
    # column pass through instead of failing with a KeyError.
    data = dataset.drop(columns=['Unnamed: 0'], errors='ignore')

    def replace_null(data):
        """Fill missing values in every column with the string "None"."""
        for col in data:
            data.loc[data[col].isnull(), col] = "None"
        return data

    df = replace_null(data)

    # Compiled once per call instead of being re-looked-up for every row.
    url_pattern = re.compile(r'http[\w:/\.]+')
    non_letter_pattern = re.compile(r'[^a-zA-Z]')

    def clean_text(text):
        """Return ``text`` lowercased, letters-only, stop-word-free and lemmatized."""
        # Remove URLs first so their fragments do not survive as words.
        text = url_pattern.sub(' ', str(text))
        # Replace every character that is not an ASCII letter (punctuation,
        # digits, underscores, newlines, ...) with a space. split() then
        # discards repeated and leading/trailing whitespace, so no separate
        # collapse/strip pass is required.
        wordlist = non_letter_pattern.sub(' ', text).lower().split()
        # Drop stop words (low-information words such as "but", "and", "or"),
        # lemmatize what is left and rejoin into a single string.
        return ' '.join(
            wnl.lemmatize(word) for word in wordlist if word not in stopwords_dict
        )

    df['text'] = df['text'].apply(clean_text)
    df['title'] = df['title'].apply(clean_text)

    label_codes = {'FAKE': 1, 'REAL': 0}

    def category_sort(label):
        """Map 'FAKE' -> 1 and 'REAL' -> 0; return any other value unchanged."""
        return label_codes.get(label, label)

    # astype(int) raises ValueError for anything that is not FAKE/REAL or
    # already integer-like, surfacing unexpected labels early.
    df['label'] = df['label'].apply(category_sort).astype(int)

    # NOTE: global side effect kept for backward compatibility -- numbers in
    # every DataFrame displayed after this call are shown without decimals.
    pd.set_option('display.precision', 0)

    return df

In [None]:
# Run the cleaning pipeline on the raw frame and preview the first five rows
df = wrangle(dataset=news_data)
df.head(5)

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Build a word cloud from the cleaned text of every article
wordcloud = WordCloud(background_color='black', width=800, height=600)
text_cloud = wordcloud.generate(' '.join(df['text']))

# Render the cloud without axes
plt.figure(figsize=(20, 30))
plt.imshow(text_cloud)
plt.axis('off')

# Save BEFORE plt.show(): once the figure has been shown it is no longer the
# current figure, so a savefig() issued afterwards writes a blank image.
plt.savefig('saved_plots/text-cloud.png')
plt.show()

In [None]:
# REAL news only: keep the rows whose label is 0 and join their text
real_news = ' '.join(df[df['label'] == 0]['text'])
text_cloud_real = wordcloud.generate(real_news)

# Render the cloud without axes
plt.figure(figsize=(20, 30))
plt.imshow(text_cloud_real)
plt.axis('off')

# Save BEFORE plt.show(): once the figure has been shown it is no longer the
# current figure, so a savefig() issued afterwards writes a blank image.
plt.savefig('saved_plots/text-cloud-real.png')
plt.show()

In [None]:
# FAKE news only: keep the rows whose label is 1 and join their text
fake_news = ' '.join(df[df['label'] == 1]['text'])
text_cloud_fake = wordcloud.generate(fake_news)

# Render the cloud without axes
plt.figure(figsize=(20, 30))
plt.imshow(text_cloud_fake)
plt.axis('off')

# Save BEFORE plt.show(): once the figure has been shown it is no longer the
# current figure, so a savefig() issued afterwards writes a blank image.
plt.savefig('saved_plots/text-cloud-fake.png')
plt.show()

In [None]:
# Class balance: bar chart of how many articles carry each label value
label_ax = sns.countplot(data=df, x="label")
label_ax.set_xlabel('Real & Fake News')
label_ax.set_ylabel('Count')
label_ax.set_title("Distribution of labels")
print(df['label'].value_counts())

# Write the chart to disk
label_ax.figure.savefig('saved_plots/real_&_fake-count_barplot.png')

In [None]:
# Define a function to plot an n-gram
def plot_top_ngrams(corpus, title, ylabel, xlabel="Number of Occurences", n=2, top_k=20):
    """Plot, save and show a horizontal bar chart of the most frequent n-grams.

    Parameters
    ----------
    corpus : str
        Text to analyse; it is tokenised by splitting on whitespace.
    title : str
        Chart title. Also used as the output file name: the figure is saved
        to ``saved_plots/{title}.png``.
    ylabel : str
        Label for the y axis (the n-grams).
    xlabel : str, optional
        Label for the x axis (the counts).
    n : int, optional
        Size of the n-grams (2 = bigrams, 3 = trigrams, ...).
    top_k : int, optional
        How many of the most frequent n-grams to plot. Defaults to 20.
    """
    tokens = corpus.split()
    ngram_counts = pd.Series(nltk.ngrams(tokens, n)).value_counts()
    # value_counts() is sorted by descending frequency, so the head holds
    # the most common n-grams.
    top_ngrams = ngram_counts.head(top_k)
    # Sort ascending so the most frequent n-gram ends up as the top bar.
    top_ngrams.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    # Save before showing; saving after plt.show() would write a blank image.
    plt.savefig(f'saved_plots/{title}.png')
    plt.show()
  

In [None]:
# Most frequent two-word combinations in the reliable (REAL) news
plot_top_ngrams(
    corpus=real_news,
    title='Top 20 Frequently Occuring Real news Bigrams',
    ylabel="Bigram",
    n=2,
)

In [None]:
# Most frequent two-word combinations in the FAKE news
plot_top_ngrams(
    corpus=fake_news,
    title='Top 20 Frequently Occuring Fake news Bigrams',
    ylabel="Bigram",
    n=2,
)

In [None]:
# Most frequent three-word combinations in the reliable (REAL) news
plot_top_ngrams(
    corpus=real_news,
    title='Top 20 Frequently Occuring Real news Trigrams',
    ylabel="Trigrams",
    n=3,
)

In [None]:
# Most frequent three-word combinations in the FAKE news
plot_top_ngrams(
    corpus=fake_news,
    title='Top 20 Frequently Occuring Fake news Trigrams',
    ylabel="Trigrams",
    n=3,
)

This concludes the news analysis.