In [1]:
# Importing dependencies
import numpy as np
from PIL import Image
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

import psycopg2
import sqlalchemy
import pandas as pd
import re

In [2]:
# Read data
fake_news_df = pd.read_csv('data/fake_news.csv', usecols=range(4))
real_news_df = pd.read_csv('data/real_news.csv', usecols=range(4))

In [3]:
# Add new column that will combine all columns to help determine fake and real news; this data will be used to train our model
fake_news_df['combined_text'] = fake_news_df['title'] + ' ' + fake_news_df['text']
real_news_df['combined_text'] = real_news_df['title'] + ' ' + real_news_df['text']

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize

wordcloud_words_fake = []

# Tokenize lemmatized words for wordcloud_words_fake
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
for index, row in fake_news_df.iterrows():
    filter_sentence = ''
    sentence = row['combined_text']
    # Convert to lower case
    sentence = sentence.lower()
    # Remove numbers
    sentence = re.sub(r'[0-9]+', '', sentence)
    # Cleaning the sentence with regex
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # Tokenization
    words = nltk.word_tokenize(sentence)
    # Stopwords removal / lemmatize
    wordcloud_words_fake = [str(w) for w in words if not w in stop_words]

In [5]:
wordcloud_words_real = []

# Tokenize lemmatized words for wordcloud_words_real
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
for index, row in real_news_df.iterrows():
    filter_sentence = ''
    sentence = row['combined_text']
    # Convert to lower case
    sentence = sentence.lower()
    # Remove numbers
    sentence = re.sub(r'[0-9]+', '', sentence)
    # Cleaning the sentence with regex
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # Tokenization
    words = nltk.word_tokenize(sentence)
    # Stopwords removal / lemmatize
    wordcloud_words_real = [str(w) for w in words if not w in stop_words]

In [6]:
wordcloud_fake_news_df = pd.DataFrame(wordcloud_words_fake, columns=['Words'])
wordcloud_fake_news_df['Count'] = 1

wordcloud_real_news_df = pd.DataFrame(wordcloud_words_real, columns=['Words'])
wordcloud_real_news_df['Count'] = 1

In [7]:
wordcloud_fake_news_df

Unnamed: 0,Words,Count
0,us,1
1,navy,1
2,sailors,1
3,held,1
4,iranian,1
...,...,...
513,st,1
514,century,1
515,wire,1
516,iran,1


In [None]:
wordcloud_fake_news_df = wordcloud_fake_news_df.groupby('Words')
wordcloud_fake_news_df = wordcloud_fake_news_df.agg({"Count": "sum"})

wordcloud_fake_news = wordcloud_fake_news_df.sort_values(by="Count", ascending=False)

In [None]:
wordcloud_fake_news_df.to_csv('research.csv')

In [None]:
# Extract unique list of words
unique_wordcloud_words_fake = pd.unique(wordcloud_words_fake)
unique_wordcloud_words_real = pd.unique(wordcloud_words_real)

# Combine all words into one big string
wordcloud_words_fake = " ".join(word for word in unique_wordcloud_words_fake)
wordcloud_words_real = " ".join(word for word in unique_wordcloud_words_real)

In [None]:
# Create function to swap numbers 0 to 255 (white)
def transform_format(val):
    if val == 0:
        return 255
    else:
        return val

In [None]:
# Import PNG mask
mask = np.array(Image.open(".static/images/magnifying_glass.png"))
    
# Transform mask
transformed_mask = np.ndarray((mask.shape[0],mask.shape[1]), np.int32)

# for i in range(len(mask)):
#     transformed_mask[i] = list(map(transform_format, mask[i]))

In [None]:
# Create a word cloud image
wc = WordCloud(background_color="white", max_words=1000, mask=transformed_mask,
               contour_width=3, contour_color='black')

# Generate a wordcloud
wc.generate(word_cloud_text)

# Store to file
wc.to_file("Images/word_cloud.png")

# Show
plt.figure(figsize=[20,10])
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()