## Data Loading


In [1]:
# Import dependencies
from pathlib import Path
import pandas as pd 
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
import numpy as np


In [2]:
# Relative path of the cleaned news dataset
news_file = Path('Resources') / 'news_initial_clean_df.csv'

# Parse the CSV (comma is the default separator) into a DataFrame
news_df = pd.read_csv(news_file)



In [3]:
# Preview the top five rows of the freshly loaded data
news_df.head(n=5)


Unnamed: 0.1,Unnamed: 0,title,text,subject,date,Class
0,0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) The head of a conservativ...,politicsNews,"December 31, 2017",1
1,1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) Transgender people will b...,politicsNews,"December 29, 2017",1
2,2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) The special counsel inves...,politicsNews,"December 31, 2017",1
3,3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) Trump campaign adviser Ge...,politicsNews,"December 30, 2017",1
4,4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) President Donald ...,politicsNews,"December 29, 2017",1


In [4]:
# Remove the 'Unnamed: 0' column and rename 'Class' to 'class'.
# errors='ignore' keeps this cell safe to re-run: on a second execution the
# column is already gone and a plain drop() would raise KeyError.
# (`axis` is not needed when `columns=` is given.)
news_df = news_df.drop(columns='Unnamed: 0', errors='ignore')
news_df = news_df.rename(columns={'Class': 'class'})
news_df


Unnamed: 0,title,text,subject,date,class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) The head of a conservativ...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) Transgender people will b...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) The special counsel inves...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) Trump campaign adviser Ge...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) President Donald ...,politicsNews,"December 29, 2017",1
...,...,...,...,...,...
44682,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
44683,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
44684,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
44685,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0


In [5]:
# Report (rows, columns) of the DataFrame after the column clean-up
print("Final DataFrame shape: {}".format(news_df.shape))

Final DataFrame shape: (44687, 5)


### There are 44,687 articles in the DataFrame after Data Cleaning.

# Standardizing text

In [6]:
# Normalise case: lowercase both free-text columns ('title' and 'text')
news_df = news_df.assign(
    title=news_df['title'].str.lower(),
    text=news_df['text'].str.lower(),
)


In [7]:
# Trim surrounding whitespace from both free-text columns
for text_column in ('title', 'text'):
    news_df[text_column] = news_df[text_column].str.strip()


In [8]:
# Sanity check: the first rows should now be lowercase and trimmed
news_df.head(n=5)


Unnamed: 0,title,text,subject,date,class
0,"as u.s. budget fight looms, republicans flip t...",washington (reuters) the head of a conservativ...,politicsNews,"December 31, 2017",1
1,u.s. military to accept transgender recruits o...,washington (reuters) transgender people will b...,politicsNews,"December 29, 2017",1
2,senior u.s. republican senator: 'let mr. muell...,washington (reuters) the special counsel inves...,politicsNews,"December 31, 2017",1
3,fbi russia probe helped by australian diplomat...,washington (reuters) trump campaign adviser ge...,politicsNews,"December 30, 2017",1
4,trump wants postal service to charge 'much mor...,seattle/washington (reuters) president donald ...,politicsNews,"December 29, 2017",1


# Text Analysis - Natural Language Processing

## 1. Tokenization: Breaking the text into individual words (tokens). 

In [9]:
# Load spaCy's small English model; `nlp` is the shared text processor used below
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)


In [10]:
# Fetch the NLTK 'punkt' and 'stopwords' resources (no-op if already up to date)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Keep the English stop words in a set for fast membership tests
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)
print(stop_words)

{'couldn', 'because', 'hadn', "it's", 'them', 'is', 'out', 'theirs', 've', 'who', 'which', 'yours', 'hers', 'him', 'there', "hadn't", 're', 'mightn', 'more', 'all', "don't", 'too', 'are', 'so', 'should', 'how', 'until', 'it', 'ain', "shouldn't", 'those', 'himself', 'these', 'other', 'weren', 'your', 'she', 'on', 'mustn', 'itself', 'me', 'some', "wouldn't", 'yourself', 'further', 'both', 'while', 'whom', 'was', 'off', 'at', 'to', 'where', 'after', 'under', "that'll", "you'll", "aren't", 'being', 'no', 'their', 'ourselves', 'down', 'aren', "haven't", 'we', 'having', 'in', "hasn't", 'up', 'same', "doesn't", 'below', 'and', "mustn't", 'but', 'd', "shan't", 'not', 'again', 'of', "she's", 'has', 'shan', 'above', 'own', 'won', 'i', 'over', 'between', 'needn', 'most', 'once', 'shouldn', 'they', 'the', 't', 'themselves', 'll', 'ma', 'such', 'with', 'did', 'than', 'as', 'isn', 'don', 'doesn', 'doing', 'few', "wasn't", 'by', 'myself', 'very', 'have', 'does', 'from', 'against', 'wasn', 'now', 'its

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joann\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\joann\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Tokenization approach adapted from the spaCy Doc API docs: https://spacy.io/api/doc

def tokenize_and_remove_stopwords(text):
    """Split text into alphabetic tokens, dropping NLTK English stop words.

    Parameters
    ----------
    text : str
        Article text to tokenize. Any non-string value (pandas represents a
        missing text cell as float NaN) yields an empty list instead of an error.

    Returns
    -------
    list of str
        Token texts in document order, keeping only tokens that are purely
        alphabetic and whose lowercase form is not in the global ``stop_words``.
    """
    # spaCy cannot process non-string input such as NaN; treat it as "no tokens"
    if not isinstance(text, str):
        return []

    # Only token text and the lexical `is_alpha` flag are used, so run the
    # tokenizer alone instead of the full pipeline -- same tokens, far less work.
    doc = nlp.make_doc(text)

    # The NLTK stop-word list is lowercase, so compare on the lowercase form
    return [
        token.text
        for token in doc
        if token.is_alpha and token.text.lower() not in stop_words
    ]

# Tokenize every article and store the resulting lists in a new 'tokens' column
article_tokens = news_df['text'].apply(tokenize_and_remove_stopwords)
news_df['tokens'] = article_tokens

# Peek at the first rows to confirm the column was populated
news_df['tokens'].head()

## 2. Word Frequency Distribution: Calculate the frequency of each word in the dataset. 

In [None]:
# Token lists of the true-news rows (class == 1)
is_true_news = news_df['class'].eq(1)
true_news_tokens = news_df.loc[is_true_news, 'tokens']
true_news_tokens


In [None]:
# Token lists of the fake-news rows (class == 0)
is_fake_news = news_df['class'].eq(0)
fake_news_tokens = news_df.loc[is_fake_news, 'tokens']
fake_news_tokens


In [None]:
# Each row of true_news_tokens holds a list of tokens; flatten all rows
# into one list so word frequencies can be counted across every true article.
true_tokens_list = [
    word
    for article_words in true_news_tokens
    for word in article_words
]



In [None]:
# Flatten the per-article token lists of fake_news_tokens into one list
fake_tokens_list = [
    word
    for article_words in fake_news_tokens
    for word in article_words
]
        


In [None]:
# Count token occurrences across all true news and list the 30 most frequent
true_freq_dist = FreqDist(true_tokens_list)
print("Most common words in true news:", true_freq_dist.most_common(30), sep="\n")



In [None]:
# Count token occurrences across all fake news and list the 30 most frequent
fake_freq_dist = FreqDist(fake_tokens_list)
print("Most common words in fake news:", fake_freq_dist.most_common(30), sep="\n")


In [None]:
# Build one word cloud per class from the token frequencies, sharing the same canvas settings
cloud_settings = dict(width=800, height=400, background_color='white')
true_wordcloud = WordCloud(**cloud_settings).generate_from_frequencies(true_freq_dist)
fake_wordcloud = WordCloud(**cloud_settings).generate_from_frequencies(fake_freq_dist)


In [None]:
# Show the true and fake word clouds side by side
fig, cloud_axes = plt.subplots(1, 2, figsize=(15, 7))

cloud_panels = (
    (true_wordcloud, 'True News Word Cloud'),
    (fake_wordcloud, 'Fake News Word Cloud'),
)
for ax, (cloud, panel_title) in zip(cloud_axes, cloud_panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(panel_title)
    ax.axis('off')

plt.show()

## 3. Sentiment Analysis: Assess the sentiment of the news articles to examine their emotional tone.


In [None]:
# Define a function to get the polarity of text
def sentiment_analysis_polarity(text):
    """Return the TextBlob polarity score of ``text``.

    Parameters
    ----------
    text : object
        Value to analyse; it is converted with ``str()`` before analysis.

    Returns
    -------
    float or None
        ``TextBlob(...).sentiment.polarity``, or ``None`` if the analysis raises.
    """
    try:
        return TextBlob(str(text)).sentiment.polarity
    except Exception:
        # Best-effort scoring: a failing row yields None rather than aborting the
        # whole column. Catch Exception rather than using a bare `except` so that
        # KeyboardInterrupt still stops a long-running .apply().
        return None

# Score every article's text and store the result in a new 'polarity' column
polarity_scores = news_df['text'].apply(sentiment_analysis_polarity)
news_df['polarity'] = polarity_scores

# Peek at the first few polarity values
news_df['polarity'].head()



In [None]:
# Define a function to get the subjectivity of text
def sentiment_analysis_subjectivity(text):
    """Return the TextBlob subjectivity score of ``text``.

    Parameters
    ----------
    text : object
        Value to analyse; it is converted with ``str()`` before analysis.

    Returns
    -------
    float or None
        ``TextBlob(...).sentiment.subjectivity``, or ``None`` if the analysis raises.
    """
    try:
        return TextBlob(str(text)).sentiment.subjectivity
    except Exception:
        # Best-effort scoring: a failing row yields None rather than aborting the
        # whole column. Catch Exception rather than using a bare `except` so that
        # KeyboardInterrupt still stops a long-running .apply().
        return None

# Score every article's text and store the result in a new 'subjectivity' column
subjectivity_scores = news_df['text'].apply(sentiment_analysis_subjectivity)
news_df['subjectivity'] = subjectivity_scores

# Peek at the first few subjectivity values
news_df['subjectivity'].head()


In [None]:
# Determine if the sentiment is positive, negative, or neutral

def determine_sentiment(polarity):
    """Map a polarity score to a sentiment label.

    Parameters
    ----------
    polarity : float or None
        Polarity score; ``None``/NaN means the score is missing.

    Returns
    -------
    str or None
        "positive" for scores above 0, "negative" for scores below 0,
        "neutral" for exactly 0, and ``None`` when the score is missing.
    """
    # NaN compares False against everything, so without this guard a missing
    # score would fall through to "neutral" (and None would raise TypeError).
    if pd.isna(polarity):
        return None
    if polarity > 0:
        return "positive"
    if polarity < 0:
        return "negative"
    return "neutral"

# Label each article from its 'polarity' score and store it in a new 'sentiment' column
sentiment_labels = news_df['polarity'].apply(determine_sentiment)
news_df['sentiment'] = sentiment_labels

# Peek at the first few sentiment labels
news_df['sentiment'].head()


In [None]:
# List the distinct labels present in the 'sentiment' column
unique_sentiments = news_df['sentiment'].unique()
print("Unique values in 'sentiment' column:", unique_sentiments)


In [None]:
# Count rows whose 'sentiment' label is missing
null_sentiments = news_df['sentiment'].isna().sum()
print("Number of null values in 'sentiment' column:", null_sentiments)


In [None]:
# Count how many true-news articles (class == 1) fall under each sentiment label
true_news_rows = news_df['class'] == 1
true_news_sentiment = news_df.loc[true_news_rows, 'sentiment'].value_counts()

# Show the counts for true news
print("Sentiment distribution in True News:", true_news_sentiment, sep="\n")


In [None]:
# Count how many fake-news articles (class == 0) fall under each sentiment label
fake_news_rows = news_df['class'] == 0
fake_news_sentiment = news_df.loc[fake_news_rows, 'sentiment'].value_counts()

# Show the counts for fake news
print("Sentiment distribution in Fake News:", fake_news_sentiment, sep="\n")


In [None]:
# Side-by-side bar charts of the sentiment counts (left: true news, right: fake news)
fig, (ax_true, ax_fake) = plt.subplots(1, 2, figsize=(15, 7))

# True news panel; rot=0 keeps the category labels horizontal
true_news_sentiment.plot(kind='bar', color='blue', ax=ax_true, rot=0)
ax_true.set_title('True News Sentiment')
ax_true.set_xlabel('Sentiment')
ax_true.set_ylabel('Count')

# Fake news panel
fake_news_sentiment.plot(kind='bar', color='orange', ax=ax_fake, rot=0)
ax_fake.set_title('Fake News Sentiment')
ax_fake.set_xlabel('Sentiment')
ax_fake.set_ylabel('Count')

# Render the figure
plt.show()



In [None]:
# Side-by-side histograms of article polarity (left: true news, right: fake news)
fig, (ax_true, ax_fake) = plt.subplots(1, 2, figsize=(14, 6))

# True news polarity, 20 bins
true_news_polarity = news_df.loc[news_df['class'] == 1, 'polarity']
ax_true.hist(true_news_polarity, bins=20, color='blue', alpha=0.7)
ax_true.set_title('Polarity Distribution in True News')
ax_true.set_xlabel('Polarity')
ax_true.set_ylabel('Number of Articles')

# Fake news polarity, 20 bins
fake_news_polarity = news_df.loc[news_df['class'] == 0, 'polarity']
ax_fake.hist(fake_news_polarity, bins=20, color='orange', alpha=0.7)
ax_fake.set_title('Polarity Distribution in Fake News')
ax_fake.set_xlabel('Polarity')
ax_fake.set_ylabel('Number of Articles')

# Tighten the spacing, then render
plt.tight_layout()
plt.show()

In [None]:
# Mean polarity and subjectivity over the true-news rows (class == 1)
true_news_rows_df = news_df[news_df['class'] == 1]
average_polarity_true = true_news_rows_df['polarity'].mean()
average_subjectivity_true = true_news_rows_df['subjectivity'].mean()

# Report both averages
print("Average Polarity for True News: {}".format(average_polarity_true))
print("Average Subjectivity for True News: {}".format(average_subjectivity_true))


In [None]:
# Mean polarity and subjectivity over the fake-news rows (class == 0)
fake_news_rows_df = news_df[news_df['class'] == 0]
average_polarity_fake = fake_news_rows_df['polarity'].mean()
average_subjectivity_fake = fake_news_rows_df['subjectivity'].mean()

# Report both averages
print("Average Polarity for Fake News: {}".format(average_polarity_fake))
print("Average Subjectivity for Fake News: {}".format(average_subjectivity_fake))


In [None]:
# Which words appear most often in fake news that carries a negative sentiment?
# Select the token lists of rows that are both fake (class == 0) and 'negative'.
is_negative_fake = (news_df['class'] == 0) & (news_df['sentiment'] == 'negative')
negative_fake_news = news_df.loc[is_negative_fake, 'tokens']

# Flatten the per-article token lists into a single list
negative_fake_token_list = [
    word
    for article_words in negative_fake_news
    for word in article_words
]

# Frequency distribution of tokens in negative fake news
negative_fake_freq_dist = FreqDist(negative_fake_token_list)
print("Most common words in fake news with negative sentiment:")
print(negative_fake_freq_dist.most_common(30))

# Build the word cloud from the frequencies
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(negative_fake_freq_dist)

# Draw the word cloud without axes
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.title('Word Cloud for Negative Sentiment in Fake News Articles')
plt.show()

In [None]:
# Generic function to generate word cloud for specific sentiment and category
def generate_wordcloud(sentiment, class_label, title):
    """Print the 30 most common tokens and draw a word cloud for one news subset.

    The subset is every row of the global ``news_df`` whose 'class' equals
    ``class_label`` and whose 'sentiment' equals ``sentiment``.

    Parameters
    ----------
    sentiment : str
        Value to match in the 'sentiment' column (e.g. 'positive', 'negative').
    class_label : int
        Value to match in the 'class' column; 0 is reported as "fake",
        anything else as "true".
    title : str
        Title shown above the word cloud.

    Raises
    ------
    ValueError
        If the selection contains no tokens, so there is nothing to plot.
    """
    selection = (news_df['class'] == class_label) & (news_df['sentiment'] == sentiment)
    filtered_news = news_df.loc[selection, 'tokens']

    # Flatten the per-article token lists into one list
    flat_token_list = [token for sublist in filtered_news for token in sublist]

    news_kind = 'fake' if class_label == 0 else 'true'
    # Fail early with a descriptive message instead of letting the word-cloud
    # call fail on an empty frequency table (e.g. after a mistyped sentiment).
    if not flat_token_list:
        raise ValueError(
            f"No tokens found for {news_kind} news with {sentiment} sentiment; "
            "cannot build a word cloud."
        )

    # Calculate the frequency distribution
    freq_dist = FreqDist(flat_token_list)
    print(f"Most common words in {news_kind} news with {sentiment} sentiment:")
    print(freq_dist.most_common(30))

    # Visualization using a word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(freq_dist)

    # Plotting the WordCloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    # Set the title before tight_layout so the layout pass can account for it
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.show()



In [None]:
# Fake news with positive sentiment: top tokens and word cloud
generate_wordcloud(
    sentiment='positive',
    class_label=0,
    title='Word Cloud for Positive Sentiment in Fake News Articles',
)


In [None]:
# True news with positive sentiment: top tokens and word cloud
generate_wordcloud(
    sentiment='positive',
    class_label=1,
    title='Word Cloud for Positive Sentiment in True News Articles',
)


In [None]:
# Fake news with negative sentiment: top tokens and word cloud
generate_wordcloud(
    sentiment='negative',
    class_label=0,
    title='Word Cloud for Negative Sentiment in Fake News Articles',
)


In [None]:
# True news with negative sentiment: top tokens and word cloud
generate_wordcloud(
    sentiment='negative',
    class_label=1,
    title='Word Cloud for Negative Sentiment in True News Articles',
)
