# ICS 438 Project: Fake News
## by: Leilani Reich

### Link to Dataset: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset?select=True.csv

### GitHub Repo: https://github.com/leilani-reich/ICS438-FinalProject-FakeNews

## Install Libraries

In [46]:
#!pip install pyspark
#!python -m pip install -U gensim
#%pip install -U sentence-transformers
#!pip install --user annoy
#!pip install faiss-cpu --no-cache
!pip install autofaiss

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.3


In [1]:
# Create new Spark Context
from pyspark import SparkContext
sc = SparkContext()

In [2]:
# Create new Spark Session
from pyspark.sql import SparkSession
session = SparkSession(sc)

### Load in Data

In [3]:
# Some of the data like the text contains double quotes, which really cause a lot of issues!
# So I need escape='"'
fake_df = session.read.csv("Fake.csv", inferSchema = True, header=True, multiLine=True, escape='"')

print(type(fake_df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [4]:
fake_df.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)



In [5]:
true_df = session.read.csv("True.csv", inferSchema = True, header=True, multiLine=True, escape='"')

print(type(true_df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [6]:
true_df.printSchema()

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- date: string (nullable = true)



## Preprocess Data

In [7]:
# Remove missing info

fake_df = fake_df.dropna()
true_df = true_df.dropna()


In [None]:
import matplotlib.pyplot as plt

# Visualize the types of fake news by frequency

# Get the unique subject names for the articles
fake_news_types = fake_df.select("subject").distinct()
fake_news_types = list(fake_news_types.toPandas()["subject"])
print("fake news types", fake_news_types)

# Get the total counts for each type of article
fake_news_types_counts = fake_df.groupBy("subject").count().select("count")
fake_news_types_counts = list(fake_news_types_counts.toPandas()["count"])
print("fake news types counts:", fake_news_types_counts)

# Show subject names and corresponding counts in table
fake_df.groupBy("subject").count().show()

# Create dictionary with subjects as keys and counts as values
fake_news_dict = dict(zip(fake_news_types, fake_news_types_counts))

# Sort in ascending order by value
fake_news_by_frequency = sorted(fake_news_dict.items(), key=lambda x: x[1], reverse=True)

# Get sorted keys and values
fn_subjects, fn_counts = zip(*fake_news_by_frequency)

# Show subject names and corresponding counts in barchart
plt.bar(x = fn_subjects, height = fn_counts)

plt.xticks(rotation=-45)

plt.tight_layout()

plt.show()

In [None]:
# Visualize the types of true news by frequency

# Get the unique subject names for the articles
true_news_types = true_df.select("subject").distinct()
true_news_types = list(true_news_types.toPandas()["subject"])
print("true news types", true_news_types)

# Get the total counts for each type of article
true_news_types_counts = true_df.groupBy("subject").count().select("count")
true_news_types_counts = list(true_news_types_counts.toPandas()["count"])
print("true news types counts:", true_news_types_counts)

# Show subject names and corresponding counts in table
true_df.groupBy("subject").count().show()

# Create dictionary with subjects as keys and counts as values
true_news_dict = dict(zip(true_news_types, true_news_types_counts))

# Sort in ascending order by value
true_news_by_frequency = sorted(true_news_dict.items(), key=lambda x: x[1], reverse=True)

# Get sorted keys and values
fn_subjects, fn_counts = zip(*true_news_by_frequency)

# Show subject names and corresponding counts in barchart
plt.bar(x = fn_subjects, height = fn_counts)

plt.xticks(rotation=-45)

plt.tight_layout()


In [None]:
# Visualizing the top 20 most prominent dates of fake news

# Get the unique dates for the articles
fake_news_dates = fake_df.select("date").distinct()
fake_news_dates = list(fake_news_dates.toPandas()["date"])
#print("fake news dates", fake_news_dates)

# Get the total counts for each type of article
fake_news_dates_counts = fake_df.groupBy("date").count().select("count")
fake_news_dates_counts = list(fake_news_dates_counts.toPandas()["count"])
#print("fake news dates counts:", fake_news_dates_counts)

# Show dates and corresponding counts in table
fake_df.groupBy("date").count().show()

# Create dictionary with subjects as keys and counts as values
fake_news_dict = dict(zip(fake_news_dates, fake_news_dates_counts))

# Sort in ascending order by value
fake_news_dates_by_frequency = sorted(fake_news_dict.items(), key=lambda x: x[1], reverse=True)

print("Top 10 most prevalent dates of fake news posts", list(fake_news_dict.items())[:10])

# Get sorted keys and values
fn_dates, fn_counts = zip(*fake_news_dates_by_frequency)

# Show subject names and corresponding counts in barchart
plt.bar(x = fn_dates[:20], height = fn_counts[:20])

plt.xticks(rotation=-90)

plt.tight_layout()

plt.show()

In [None]:
# Visualizing the top 20 most prominent dates of true news

# Get the unique dates for the articles
true_news_dates = true_df.select("date").distinct()
true_news_dates = list(true_news_dates.toPandas()["date"])
#print("true news dates", true_news_dates)

# Get the total counts for each type of article
true_news_dates_counts = true_df.groupBy("date").count().select("count")
true_news_dates_counts = list(true_news_dates_counts.toPandas()["count"])
#print("true news dates counts:", true_news_dates_counts)

# Show dates and corresponding counts in table
true_df.groupBy("date").count().show()

# Create dictionary with subjects as keys and counts as values
true_news_dict = dict(zip(true_news_dates, true_news_dates_counts))

# Sort in ascending order by value
true_news_dates_by_frequency = sorted(true_news_dict.items(), key=lambda x: x[1], reverse=True)

print("Top 10 most prevalent dates of true news posts", list(true_news_dict.items())[:10])

# Get sorted keys and values
fn_dates, fn_counts = zip(*true_news_dates_by_frequency)

# Show subject names and corresponding counts in barchart
plt.bar(x = fn_dates[:20], height = fn_counts[:20])

plt.xticks(rotation=-90)

plt.tight_layout()

plt.show()

In [8]:
# Preprocess the text data
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_short

# Do some common cleaning options to remove noise from text
def clean_text(text):
    text_p1 = remove_stopwords(text)
    text_p2 = strip_punctuation(text_p1)
    text_p3 = strip_short(text_p2)
    return text_p3.lower()
    
    # How to remove @ from text?


In [9]:
# I combined the title and text and am considering them together
fake_text = fake_df.rdd.map(lambda x: clean_text(x["title"]+ " " + x["text"]))

print(type(fake_text))

print(fake_text.first())

<class 'pyspark.rdd.PipelinedRDD'>
donald trump sends out embarrassing new year’s eve message this disturbing donald trump couldn wish americans happy new year leave that instead shout enemies haters dishonest fake news media the reality star job couldn country rapidly grows stronger smarter want wish friends supporters enemies haters dishonest fake news media happy healthy new year president angry pants tweeted 2018 great year america country rapidly grows stronger smarter want wish friends supporters enemies haters dishonest fake news media happy healthy new year 2018 great year america donald trump realdonaldtrump december 2017trump tweet went welll expect what kind president sends new year greeting like despicable petty infantile gibberish only trump his lack decency won allow rise gutter long wish american citizens happy new year bishop talbert swan talbertswan december 2017no likes calvin calvinstowell december 2017your impeachment 2018 great year america accept regaining control

In [10]:
# I combined the title and text and am considering them together
true_text = true_df.rdd.map(lambda x: clean_text(x["title"]+ " " + x["text"]))

print(type(true_text))

print(true_text.first())

<class 'pyspark.rdd.PipelinedRDD'>
budget fight looms republicans flip fiscal script washington reuters the head conservative republican faction congress voted month huge expansion national debt pay tax cuts called “fiscal conservative” sunday urged budget restraint 2018 keeping sharp pivot way republicans representative mark meadows speaking cbs’ “face nation drew hard line federal spending lawmakers bracing battle january when return holidays wednesday lawmakers begin trying pass federal budget fight likely linked issues immigration policy november congressional election campaigns approach republicans seek control congress president donald trump republicans want big budget increase military spending democrats want proportional increases non defense “discretionary” spending programs support education scientific research infrastructure public health environmental protection “the trump administration willing say ‘we’re going increase non defense discretionary spending percent meadows ch

## Get words with highest tf-idfs for types of news

In [137]:
# Load in news text as spark dataframes and add column for the type of news (fake or true)
from pyspark.sql.functions import lit


fake_text_df = fake_text.map(Row("value")).toDF()
# adding new column for class_name, which is all "fake"
fake_text_df = fake_text_df.withColumn("class_name", lit("fake"))

true_text_df = true_text.map(Row("value")).toDF()
# adding new column for class_name, which is all "true"
true_text_df = true_text_df.withColumn("class_name", lit("true"))

print(fake_text_df.columns)
print(true_text_df.columns)

['value', 'class_name']
['value', 'class_name']


In [141]:
# Combine the dataframes into one
import pyspark.sql.functions as F

news_text_df = fake_text_df.union(true_text_df)
# make the order of fake/true news random
news_text_df = news_text_df.select("*").orderBy(F.rand())

news_text_df.show()

+--------------------+----------+
|               value|class_name|
+--------------------+----------+
|bill maher trump’...|      fake|
|black pastors ral...|      fake|
|just former dnc c...|      fake|
|when you see trum...|      fake|
|britain boris joh...|      true|
|watch trump call ...|      fake|
|exclusive trump c...|      true|
|holy moly trump g...|      fake|
|the list mainstre...|      fake|
|turkey seeks arre...|      true|
|breaking the elec...|      fake|
|community agitato...|      fake|
|self driving car ...|      true|
|pakistan issues l...|      true|
|germany deports f...|      true|
|republicans lack ...|      true|
|halloween firesid...|      fake|
|liberia johnson s...|      true|
|may tells busines...|      true|
|whoa clinton grif...|      fake|
+--------------------+----------+
only showing top 20 rows



In [143]:
# Let's get the tf-idf to see the most common words
# https://spark.apache.org/docs/latest/mllib-feature-extraction.html

from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Start by tokenizing text
tokenizer = Tokenizer(inputCol="value", outputCol="tokens")
news_text_tokenized = tokenizer.transform(news_text_df)

news_text_tokenized.show()


+--------------------+----------+--------------------+
|               value|class_name|              tokens|
+--------------------+----------+--------------------+
|bill maher trump’...|      fake|[bill, maher, tru...|
|black pastors ral...|      fake|[black, pastors, ...|
|just former dnc c...|      fake|[just, former, dn...|
|when you see trum...|      fake|[when, you, see, ...|
|britain boris joh...|      true|[britain, boris, ...|
|watch trump call ...|      fake|[watch, trump, ca...|
|exclusive trump c...|      true|[exclusive, trump...|
|holy moly trump g...|      fake|[holy, moly, trum...|
|the list mainstre...|      fake|[the, list, mains...|
|turkey seeks arre...|      true|[turkey, seeks, a...|
|breaking the elec...|      fake|[breaking, the, e...|
|community agitato...|      fake|[community, agita...|
|self driving car ...|      true|[self, driving, c...|
|pakistan issues l...|      true|[pakistan, issues...|
|germany deports f...|      true|[germany, deports...|
|republica

In [160]:
# Computing the tf-idf

hashingTF = HashingTF(inputCol="tokens", outputCol="raw_features")
tf = hashingTF.transform(news_text_tokenized)

tf.cache()
idf = IDF(inputCol="raw_features", outputCol="features").fit(tf)
tfidf = idf.transform(tf)

tfidf.select(["class_name", "tokens", "features"]).show()


+----------+--------------------+--------------------+
|class_name|              tokens|            features|
+----------+--------------------+--------------------+
|      fake|[bill, maher, tru...|(262144,[1512,256...|
|      fake|[black, pastors, ...|(262144,[531,1512...|
|      fake|[just, former, dn...|(262144,[1546,157...|
|      fake|[when, you, see, ...|(262144,[2564,304...|
|      true|[britain, boris, ...|(262144,[10723,24...|
|      fake|[watch, trump, ca...|(262144,[1125,130...|
|      true|[exclusive, trump...|(262144,[2564,328...|
|      fake|[holy, moly, trum...|(262144,[6991,800...|
|      fake|[the, list, mains...|(262144,[3657,410...|
|      true|[turkey, seeks, a...|(262144,[1096,154...|
|      fake|[breaking, the, e...|(262144,[511,619,...|
|      fake|[community, agita...|(262144,[3657,635...|
|      true|[self, driving, c...|(262144,[2410,256...|
|      true|[pakistan, issues...|(262144,[4223,504...|
|      true|[germany, deports...|(262144,[511,2366...|
|      tru

In [158]:
# What does the data look like?

print(type(tfidf.select("features").first()))

print(tfidf.select("features").first())

<class 'pyspark.sql.types.Row'>
Row(features=SparseVector(262144, {1512: 1.8092, 2564: 1.2176, 3532: 4.8288, 4254: 4.1769, 8798: 3.8585, 12035: 3.946, 16108: 3.8706, 21823: 3.7285, 24657: 2.5067, 24692: 4.0338, 24862: 2.6366, 26101: 5.9415, 26593: 12.2956, 28622: 4.7384, 29241: 3.5397, 34202: 6.1796, 34289: 6.2695, 35925: 7.5536, 41567: 8.7663, 43734: 2.6733, 48448: 1.3241, 49652: 4.4004, 50777: 3.2006, 52351: 4.9179, 52914: 1.0851, 54558: 4.696, 54569: 7.311, 55666: 5.0667, 57264: 2.7698, 58839: 7.3127, 59513: 4.6029, 61581: 5.6098, 62058: 2.8186, 62790: 2.8758, 63316: 3.131, 64382: 2.6391, 67087: 4.7232, 68164: 5.6062, 69536: 4.7257, 70065: 4.6123, 72125: 5.3877, 74096: 2.4141, 74375: 1.5733, 75571: 1.4192, 75881: 3.0535, 77886: 2.3679, 78896: 2.1515, 81095: 3.9541, 81566: 2.1956, 81662: 2.8897, 81788: 2.8683, 81916: 3.413, 83742: 3.9311, 84738: 4.4114, 85624: 3.4531, 87257: 4.332, 87567: 2.5144, 88941: 3.494, 89833: 1.0212, 91243: 4.4738, 93484: 3.4679, 95889: 0.4415, 96803: 5.873, 

In [176]:
# Get most important words according to tf-idf

from pyspark.sql.types import ArrayType, DoubleType, StringType, MapType

# I used code from stackoverflow and applied it to my data:
# source - https://stackoverflow.com/questions/69218494/pyspark-display-top-10-words-of-document

ndf = tfidf.select('class_name',F.explode('tokens').name('exptokens')).withColumn('tokens',F.array('exptokens'))
hashudf = F.udf(lambda vector : vector.indices.tolist()[0],StringType())
wordtf = hashingTF.transform(ndf).withColumn('wordhash',hashudf(F.col('raw_features')))
wordtf.show()

+----------+-----------+-------------+--------------------+--------+
|class_name|  exptokens|       tokens|        raw_features|wordhash|
+----------+-----------+-------------+--------------------+--------+
|      fake|       bill|       [bill]|(262144,[58839],[...|   58839|
|      fake|      maher|      [maher]|(262144,[202188],...|  202188|
|      fake|    trump’s|    [trump’s]|(262144,[101468],...|  101468|
|      fake|    orlando|    [orlando]|(262144,[219544],...|  219544|
|      fake|   massacre|   [massacre]|(262144,[256965],...|  256965|
|      fake|   response|   [response]|(262144,[218285],...|  218285|
|      fake|      shows|      [shows]|(262144,[62058],[...|   62058|
|      fake|    america|    [america]|(262144,[230876],...|  230876|
|      fake|      ‘what|      [‘what]|(262144,[54569],[...|   54569|
|      fake|narcissist’|[narcissist’]|(262144,[176064],...|  176064|
|      fake|      video|      [video]|(262144,[154594],...|  154594|
|      fake|     donald|     [dona

In [177]:
# I used code from stackoverflow and applied it to my data:
# source - https://stackoverflow.com/questions/69218494/pyspark-display-top-10-words-of-document# 

udf1 = F.udf(lambda vec : dict(zip(vec.indices.tolist(),vec.values.tolist())),MapType(StringType(),StringType()))
valuedf = tfidf.select('class_name',F.explode(udf1(F.col('features'))).name('wordhash','value'))
valuedf.show()

+----------+--------+------------------+
|class_name|wordhash|             value|
+----------+--------+------------------+
|      fake|   84738| 4.411385007104161|
|      fake|   12035|3.9459790871070552|
|      fake|    2564|1.2175541504742746|
|      fake|  219915|2.7484581703950015|
|      fake|  116745| 6.463675559718046|
|      fake|  239374| 4.698455645724604|
|      fake|  177679| 4.033828687113074|
|      fake|  238861| 5.829368879181034|
|      fake|   87567|2.5143567695462035|
|      fake|   67087| 4.723209384877542|
|      fake|  168976|0.3113684874932338|
|      fake|  145690| 4.509635284579483|
|      fake|  113432|2.0274313391393677|
|      fake|   54558| 4.696013642069052|
|      fake|   24862|2.6365881650956857|
|      fake|   83742| 3.931113175831226|
|      fake|  115491|2.0834362356182607|
|      fake|   96803|  5.87295010370432|
|      fake|  201511| 3.342570081240996|
|      fake|   54569|  7.31097342010525|
+----------+--------+------------------+
only showing top

In [184]:
from pyspark.sql import Window

# I used code from stackoverflow and applied it to my data:
# source - https://stackoverflow.com/questions/69218494/pyspark-display-top-10-words-of-document# 

w = Window.partitionBy("class_name").orderBy(F.desc('value'))
valuedf = valuedf.withColumn('rank',F.rank().over(w)).where(F.col('rank')<=3) # used 3 for testing.
topn_df = valuedf.join(wordtf,['class_name','wordhash']).groupby('class_name').agg(F.sort_array(F.collect_list(F.struct(F.col('value'),F.col('exptokens'))),asc=False).name('topn'))

topn_df.show()

+----------+--------------------+
|class_name|                topn|
+----------+--------------------+
|      fake|[{99.890111211870...|
|      true|[{99.992282147450...|
+----------+--------------------+



In [191]:
#print(topn_df.first()[1])

# Dungeons, descent, macy are top words for fake news??

## Encode Data

In [192]:
# Create SentenceTransformer model

from sentence_transformers import SentenceTransformer

# https://www.sbert.net/docs/pretrained_models.html
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')


In [193]:
# Create embeddings

print("Embedding Dimension:", model.encode(fake_text.first()).reshape(1, -1).shape)


Embedding Dimension: (1, 384)


In [194]:
import numpy as np
from pyspark.sql import Row

# Setting up for embedding for news text:

# Splitting data into train and test
news_text_train_df, news_text_test_df = news_text_df.randomSplit([0.9, 0.1])

# Double check randomsplit gives what we expect
news_train_len = news_text_train_df.count()
news_test_len = news_text_test_df.count()
total_len = news_train_len + news_test_len

print("Percent training:", round(news_train_len / total_len, 2))
print("Percent testing:", round(news_test_len / total_len, 2))


Percent training: 0.9
Percent testing: 0.1


In [286]:
# Embed the training text

# Converting numpy to list because pyspark cannot handle numpy
news_embed_train = news_text_train_df.rdd.map(lambda x: (x["class_name"], model.encode(x["value"]).tolist()))

# Don't embed now
#news_embed_test = news_text_test_df.rdd.map(lambda x: model.encode(x["value"]).reshape(1, -1))

print(type(news_embed_train))

<class 'pyspark.rdd.PipelinedRDD'>


In [287]:
# Checking values

# print(type(news_embed_train.first()[1]))
# print(news_embed_train.first()[1].shape)
# #print(news_embed_train.first())

## Can we classify news as being fake or true?

- use classes of sentences closest to query to assign class to query
- and what sentences are closest to query?

In [289]:
# Creating dataframe for news training

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf

# Help from https://stackoverflow.com/questions/42138482/how-do-i-convert-an-array-i-e-list-column-to-vector

# User defined function to transform list to vector, which I'll need for the later model
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

news_embed_train_df_initial = news_embed_train.toDF(["class_name", "embedding"])

# Testing:
# import pandas as pd
# # Model expects Vectors
# pandas_df = pd.DataFrame({
#     'a': [1, 2, 3],
   
#     'b': [[1,2,3,4], [5,6,7,8], [9,10,11,12]]})
# df = session.createDataFrame(pandas_df)

# df.select(df["a"], list_to_vector_udf(df["b"])).show()
# print(type(df.first()[1]))


news_embed_train_df = news_embed_train_df_initial.select(news_embed_train_df_initial["class_name"], list_to_vector_udf(news_embed_train_df_initial["embedding"]).alias("embedding"))

In [290]:
print(type(news_embed_train_df))

print(news_embed_train_df.columns)

print(news_embed_train_df.show())

<class 'pyspark.sql.dataframe.DataFrame'>
['class_name', 'embedding']
+----------+--------------------+
|class_name|           embedding|
+----------+--------------------+
|      fake|[0.06772855669260...|
|      fake|[0.15943287312984...|
|      fake|[0.15943287312984...|
|      fake|[0.00743169244378...|
|      fake|[0.00743169244378...|
|      fake|[-0.0523787103593...|
|      fake|[0.16917179524898...|
|      fake|[0.16917179524898...|
|      fake|[0.16917179524898...|
|      fake|[-0.1691941618919...|
|      fake|[0.19559511542320...|
|      fake|[0.01163810491561...|
|      fake|[0.26805040240287...|
|      fake|[0.13052178919315...|
|      fake|[0.06194987520575...|
|      fake|[0.09986516833305...|
|      fake|[0.13429512083530...|
|      fake|[0.13429512083530...|
|      fake|[0.14780071377754...|
|      fake|[0.19369360804557...|
+----------+--------------------+
only showing top 20 rows

None


In [291]:
from pyspark.ml.feature import BucketedRandomProjectionLSH

# Using Approximate Nearest Neighbors in PySpark
brp = BucketedRandomProjectionLSH(inputCol="embedding", outputCol="hashes", bucketLength=2.0,
                                  numHashTables=3)

# Fit the model
model = brp.fit(news_embed_train_df)

In [293]:
# Transform the model
model.transform(news_embed_train_df).show()

+----------+--------------------+--------------------+
|class_name|           embedding|              hashes|
+----------+--------------------+--------------------+
|      fake|[0.06772855669260...|[[0.0], [0.0], [0...|
|      fake|[0.15943287312984...|[[0.0], [-1.0], [...|
|      fake|[0.15943287312984...|[[0.0], [-1.0], [...|
|      fake|[0.00743169244378...|[[0.0], [-1.0], [...|
|      fake|[0.00743169244378...|[[0.0], [-1.0], [...|
|      fake|[-0.0523787103593...|[[0.0], [-1.0], [...|
|      fake|[0.16917179524898...|[[0.0], [-1.0], [...|
|      fake|[0.16917179524898...|[[0.0], [-1.0], [...|
|      fake|[0.16917179524898...|[[0.0], [-1.0], [...|
|      fake|[-0.1691941618919...|[[-1.0], [-1.0], ...|
|      fake|[0.19559511542320...|[[0.0], [-1.0], [...|
|      fake|[0.01163810491561...|[[0.0], [-1.0], [...|
|      fake|[0.26805040240287...|[[-1.0], [-1.0], ...|
|      fake|[0.13052178919315...|[[0.0], [-1.0], [...|
|      fake|[0.06194987520575...|[[0.0], [-1.0], [...|
|      fak

In [None]:

# Which type of news has the most negative sentiment (Sentiment Analysis)?
# What/who are the topics of fake vs. true news (Name Entity Recognition)?
# Can we classify news as being fake or true?