# ICS 438 Project: Fake News
## by: Leilani Reich

### Link to Dataset: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset?select=True.csv

### GitHub Repo: https://github.com/leilani-reich/ICS438-FinalProject-FakeNews

## Install Libraries

In [None]:
#!pip install pyspark
#!python -m pip install -U gensim
#%pip install -U sentence-transformers
!pip install --user annoy

In [1]:
# Create new Spark Context
from pyspark import SparkContext
sc = SparkContext()

In [2]:
# Create new Spark Session
from pyspark.sql import SparkSession
session = SparkSession(sc)

### Load in Data

In [3]:
# Some of the data like the text contains double quotes, which really cause a lot of issues!
# So I need escape='"'
fake_df = session.read.csv("Fake.csv", inferSchema = True, header=True, multiLine=True, escape='"')

print(type(fake_df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [4]:
fake_df.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)



In [5]:
true_df = session.read.csv("True.csv", inferSchema = True, header=True, multiLine=True, escape='"')

print(type(true_df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [6]:
true_df.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)



## Preprocess Data

In [7]:
# Remove missing info

fake_df = fake_df.dropna()
true_df = true_df.dropna()


In [None]:
import matplotlib.pyplot as plt

# Visualize the types of fake news by frequency

# Get the unique subject names for the articles
fake_news_types = fake_df.select("subject").distinct()
fake_news_types = list(fake_news_types.toPandas()["subject"])
print("fake news types", fake_news_types)

# Get the total counts for each type of article
fake_news_types_counts = fake_df.groupBy("subject").count().select("count")
fake_news_types_counts = list(fake_news_types_counts.toPandas()["count"])
print("fake news types counts:", fake_news_types_counts)

# Show subject names and corresponding counts in table
fake_df.groupBy("subject").count().show()

# Create dictionary with subjects as keys and counts as values
fake_news_dict = dict(zip(fake_news_types, fake_news_types_counts))

# Sort in ascending order by value
fake_news_by_frequency = sorted(fake_news_dict.items(), key=lambda x: x[1], reverse=True)

# Get sorted keys and values
fn_subjects, fn_counts = zip(*fake_news_by_frequency)

# Show subject names and corresponding counts in barchart
plt.bar(x = fn_subjects, height = fn_counts)

plt.xticks(rotation=-45)

plt.tight_layout()

plt.show()

In [None]:
# Visualize the types of true news by frequency

# Get the unique subject names for the articles
true_news_types = true_df.select("subject").distinct()
true_news_types = list(true_news_types.toPandas()["subject"])
print("true news types", true_news_types)

# Get the total counts for each type of article
true_news_types_counts = true_df.groupBy("subject").count().select("count")
true_news_types_counts = list(true_news_types_counts.toPandas()["count"])
print("true news types counts:", true_news_types_counts)

# Show subject names and corresponding counts in table
true_df.groupBy("subject").count().show()

# Create dictionary with subjects as keys and counts as values
true_news_dict = dict(zip(true_news_types, true_news_types_counts))

# Sort in ascending order by value
true_news_by_frequency = sorted(true_news_dict.items(), key=lambda x: x[1], reverse=True)

# Get sorted keys and values
fn_subjects, fn_counts = zip(*true_news_by_frequency)

# Show subject names and corresponding counts in barchart
plt.bar(x = fn_subjects, height = fn_counts)

plt.xticks(rotation=-45)

plt.tight_layout()


In [None]:
# Visualizing the top 20 most prominent dates of fake news

# Get the unique dates for the articles
fake_news_dates = fake_df.select("date").distinct()
fake_news_dates = list(fake_news_dates.toPandas()["date"])
#print("fake news dates", fake_news_dates)

# Get the total counts for each type of article
fake_news_dates_counts = fake_df.groupBy("date").count().select("count")
fake_news_dates_counts = list(fake_news_dates_counts.toPandas()["count"])
#print("fake news dates counts:", fake_news_dates_counts)

# Show dates and corresponding counts in table
fake_df.groupBy("date").count().show()

# Create dictionary with subjects as keys and counts as values
fake_news_dict = dict(zip(fake_news_dates, fake_news_dates_counts))

# Sort in ascending order by value
fake_news_dates_by_frequency = sorted(fake_news_dict.items(), key=lambda x: x[1], reverse=True)

print("Top 10 most prevalent dates of fake news posts", list(fake_news_dict.items())[:10])

# Get sorted keys and values
fn_dates, fn_counts = zip(*fake_news_dates_by_frequency)

# Show subject names and corresponding counts in barchart
plt.bar(x = fn_dates[:20], height = fn_counts[:20])

plt.xticks(rotation=-90)

plt.tight_layout()

plt.show()

In [None]:
# Visualizing the top 20 most prominent dates of true news

# Get the unique dates for the articles
true_news_dates = true_df.select("date").distinct()
true_news_dates = list(true_news_dates.toPandas()["date"])
#print("true news dates", true_news_dates)

# Get the total counts for each type of article
true_news_dates_counts = true_df.groupBy("date").count().select("count")
true_news_dates_counts = list(true_news_dates_counts.toPandas()["count"])
#print("true news dates counts:", true_news_dates_counts)

# Show dates and corresponding counts in table
true_df.groupBy("date").count().show()

# Create dictionary with subjects as keys and counts as values
true_news_dict = dict(zip(true_news_dates, true_news_dates_counts))

# Sort in ascending order by value
true_news_dates_by_frequency = sorted(true_news_dict.items(), key=lambda x: x[1], reverse=True)

print("Top 10 most prevalent dates of true news posts", list(true_news_dict.items())[:10])

# Get sorted keys and values
fn_dates, fn_counts = zip(*true_news_dates_by_frequency)

# Show subject names and corresponding counts in barchart
plt.bar(x = fn_dates[:20], height = fn_counts[:20])

plt.xticks(rotation=-90)

plt.tight_layout()

plt.show()

In [8]:
# Preprocess the text data
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_short

# Do some common cleaning options to remove noise from text
def clean_text(text):
    text_p1 = remove_stopwords(text)
    text_p2 = strip_punctuation(text_p1)
    text_p3 = strip_short(text_p2)
    return text_p3.lower()
    
    # How to remove @ from text?


In [9]:
# I combined the title and text and am considering them together
fake_text = fake_df.rdd.map(lambda x: clean_text(x["title"]+ " " + x["text"]))

print(type(fake_text))

print(fake_text.first())

<class 'pyspark.rdd.PipelinedRDD'>
donald trump sends out embarrassing new year’s eve message this disturbing donald trump couldn wish americans happy new year leave that instead shout enemies haters dishonest fake news media the reality star job couldn country rapidly grows stronger smarter want wish friends supporters enemies haters dishonest fake news media happy healthy new year president angry pants tweeted 2018 great year america country rapidly grows stronger smarter want wish friends supporters enemies haters dishonest fake news media happy healthy new year 2018 great year america donald trump realdonaldtrump december 2017trump tweet went welll expect what kind president sends new year greeting like despicable petty infantile gibberish only trump his lack decency won allow rise gutter long wish american citizens happy new year bishop talbert swan talbertswan december 2017no likes calvin calvinstowell december 2017your impeachment 2018 great year america accept regaining control

In [10]:
# I combined the title and text and am considering them together
true_text = true_df.rdd.map(lambda x: clean_text(x["title"]+ " " + x["text"]))

print(type(true_text))

print(true_text.first())

<class 'pyspark.rdd.PipelinedRDD'>
budget fight looms republicans flip fiscal script washington reuters the head conservative republican faction congress voted month huge expansion national debt pay tax cuts called “fiscal conservative” sunday urged budget restraint 2018 keeping sharp pivot way republicans representative mark meadows speaking cbs’ “face nation drew hard line federal spending lawmakers bracing battle january when return holidays wednesday lawmakers begin trying pass federal budget fight likely linked issues immigration policy november congressional election campaigns approach republicans seek control congress president donald trump republicans want big budget increase military spending democrats want proportional increases non defense “discretionary” spending programs support education scientific research infrastructure public health environmental protection “the trump administration willing say ‘we’re going increase non defense discretionary spending percent meadows ch

## Encode Data

In [11]:
# Create SentenceTransformer model

from sentence_transformers import SentenceTransformer

# https://www.sbert.net/docs/pretrained_models.html
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')


In [12]:
# Create embeddings - BATCH THIS?

print("Embedding Dimension:", model.encode(fake_text.first()).reshape(1, -1).shape)


Embedding Dimension: (1, 384)


In [30]:
import numpy as np
from pyspark.sql import Row

# Setting up for embedding for fake text:

fake_text_df = fake_text.map(Row("value")).toDF()

# Splitting data into train and test
fake_text_train_df, fake_text_test_df = fake_text_df.randomSplit([0.9, 0.1])

# Double check randomsplit gives what we expect
fake_train_len = fake_text_train_df.count()
fake_test_len = fake_text_test_df.count()
total_len = fake_train_len + fake_test_len

print("Percent training:", round(fake_train_len / total_len, 2))
print("Percent testing:", round(fake_test_len / total_len, 2))


Percent training: 0.9
Percent testing: 0.1


In [38]:
# Embed the fake training and testing text

fake_embed_train = fake_text_train_df.rdd.map(lambda x: model.encode(x["value"]).reshape(1, -1))
fake_embed_test = fake_text_test_df.rdd.map(lambda x: model.encode(x["value"]).reshape(1, -1))

print(type(fake_embed_train))
print(type(fake_embed_test))

<class 'pyspark.rdd.PipelinedRDD'>
<class 'pyspark.rdd.PipelinedRDD'>


In [39]:
# Checking values

print(type(fake_embed_train.first()))
print(fake_embed_train.first().shape)
#print(fake_embed_train.first())

print(type(fake_embed_test.first()))
print(fake_embed_test.first().shape)
#print(fake_embed_train.first())

<class 'numpy.ndarray'>
(1, 384)
<class 'numpy.ndarray'>
(1, 384)


In [40]:
# Setting up for embedding for true text:

true_text_df = true_text.map(Row("value")).toDF()

# Splitting data into train and test
true_text_train_df, true_text_test_df = true_text_df.randomSplit([0.9, 0.1])

# Double check randomsplit gives what we expect
true_train_len = true_text_train_df.count()
true_test_len = true_text_test_df.count()
total_len = true_train_len + true_test_len

print("Percent training:", round(true_train_len / total_len, 2))
print("Percent testing:", round(true_test_len / total_len, 2))

Percent training: 0.9
Percent testing: 0.1


In [42]:
# Embed the true training and testing text

true_embed_train = true_text_train_df.rdd.map(lambda x: model.encode(x["value"]).reshape(1, -1))
true_embed_test = true_text_test_df.rdd.map(lambda x: model.encode(x["value"]).reshape(1, -1))

print(type(true_embed_train))
print(type(true_embed_test))

<class 'pyspark.rdd.PipelinedRDD'>
<class 'pyspark.rdd.PipelinedRDD'>


In [43]:
# Checking values

print(type(true_embed_train.first()))
print(fake_embed_train.first().shape)
#print(fake_embed_train.first())

print(type(true_embed_test.first()))
print(true_embed_test.first().shape)
#print(fake_embed_train.first())

<class 'numpy.ndarray'>
(1, 384)
<class 'numpy.ndarray'>
(1, 384)


## Can we classify news as being fake or true?

In [None]:
# Using Approximate Nearest Neighbors Oh Yeah! (ANNOY)
from annoy import AnnoyIndex
from pyspark.sql.functions import array_position

# https://github.com/spotify/annoy

f = 384 # Embedding dimension

t = AnnoyIndex(f, 'angular')

i = 0
fake_embed.foreach(lambda vector: t.add_item(i, vector), i += 1)
    
t.build(10) # 10 trees
t.save('test.ann')

u = AnnoyIndex(f, 'angular')
u.load('test.ann') # super fast, will just mmap the file
print(u.get_nns_by_item(0, 10)) # will find the 10 nearest neighbors

In [None]:

# Which type of news has the most negative sentiment (Sentiment Analysis)?
# What/who are the topics of fake vs. true news (Name Entity Recognition)?
# Can we classify news as being fake or true?