## <u>__Dependencies__<u/>

In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
 from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("FakeNewsPoject_Naive_Bayes").getOrCreate()

## <u>__Extract__<u/>

In [17]:
 from pyspark import SparkFiles
# Load in Fake.csv from S3 into a DataFrame
fake_url = "https://bootcamp-proj-3.s3.us-east-2.amazonaws.com/Fake.csv"
spark.sparkContext.addFile(fake_url)

raw_fake_df = spark.read.csv(SparkFiles.get("Fake.csv"), sep=",", header=True)
raw_fake_df.show(10)

+--------------------+--------------------+-------+-----------------+
|               title|                text|subject|             date|
+--------------------+--------------------+-------+-----------------+
| Donald Trump Sen...|Donald Trump just...|   News|December 31, 2017|
| Drunk Bragging T...|House Intelligenc...|   News|December 31, 2017|
| Sheriff David Cl...|On Friday, it was...|   News|December 30, 2017|
| Trump Is So Obse...|On Christmas day,...|   News|December 29, 2017|
| Pope Francis Jus...|Pope Francis used...|   News|December 25, 2017|
| Racist Alabama C...|The number of cas...|   News|December 25, 2017|
| Fresh Off The Go...|Donald Trump spen...|   News|December 23, 2017|
| Trump Said Some ...|In the wake of ye...|   News|December 23, 2017|
| Former CIA Direc...|Many people have ...|   News|December 22, 2017|
| WATCH: Brand-New...|Just when you mig...|   News|December 21, 2017|
+--------------------+--------------------+-------+-----------------+
only showing top 10 

In [18]:
# Load in True.csv from S3 into a DataFrame
true_url = "https://bootcamp-proj-3.s3.us-east-2.amazonaws.com/True.csv"
spark.sparkContext.addFile(true_url)

raw_true_df = spark.read.csv(SparkFiles.get("True.csv"), sep=",", header=True)
raw_true_df.show(10)


+--------------------+--------------------+------------+------------------+
|               title|                text|     subject|              date|
+--------------------+--------------------+------------+------------------+
|As U.S. budget fi...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|U.S. military to ...|WASHINGTON (Reute...|politicsNews|December 29, 2017 |
|Senior U.S. Repub...|WASHINGTON (Reute...|politicsNews|December 31, 2017 |
|FBI Russia probe ...|WASHINGTON (Reute...|politicsNews|December 30, 2017 |
|Trump wants Posta...|SEATTLE/WASHINGTO...|politicsNews|December 29, 2017 |
|White House, Cong...|WEST PALM BEACH, ...|politicsNews|December 29, 2017 |
|Trump says Russia...|WEST PALM BEACH, ...|politicsNews|December 29, 2017 |
|Factbox: Trump on...|The following sta...|politicsNews|December 29, 2017 |
|Trump on Twitter ...|The following sta...|politicsNews|December 29, 2017 |
|Alabama official ...|WASHINGTON (Reute...|politicsNews|December 28, 2017 |
+-----------

## <u>__Transform__<u/>

In [0]:
import pyspark.sql.functions as sf

# Add true/fake categories
add_category_fake = raw_fake_df.withColumn('category',sf.lit('Fake'))
add_category_true = raw_true_df.withColumn('category',sf.lit('True'))

# add_category_fake.show(10)
# add_category_true.show(10)

In [20]:
#Append and select data

# import pandas as pd
appended_data = add_category_fake.union(add_category_true)\
                                 .select(['category', 'text'])\
                                .dropna(subset=('text'))

appended_data.show()

+--------+--------------------+
|category|                text|
+--------+--------------------+
|    Fake|Donald Trump just...|
|    Fake|House Intelligenc...|
|    Fake|On Friday, it was...|
|    Fake|On Christmas day,...|
|    Fake|Pope Francis used...|
|    Fake|The number of cas...|
|    Fake|Donald Trump spen...|
|    Fake|In the wake of ye...|
|    Fake|Many people have ...|
|    Fake|Just when you mig...|
|    Fake|A centerpiece of ...|
|    Fake|Republicans are w...|
|    Fake|Republicans have ...|
|    Fake|The media has bee...|
|    Fake|Abigail Disney is...|
|    Fake|Donald Trump just...|
|    Fake|A new animatronic...|
|    Fake|Trump supporters ...|
|    Fake|Right now, the wh...|
|    Fake|Senate Majority W...|
+--------+--------------------+
only showing top 20 rows



In [21]:
from pyspark.sql.functions import length, trim

# Create a length column to be used as a future feature 
review_data = appended_data.withColumn('length', length(appended_data['text']))\
                            .where("length>=100")\
                            .orderBy('length')\
                            .withColumn("text", trim(appended_data.text))
review_data.show()

+--------+--------------------+------+
|category|                text|length|
+--------+--------------------+------+
|    Fake|It s been said th...|   100|
|    Fake|This is hysterica...|   100|
|    Fake|I d like to sugge...|   100|
|    Fake|Trump spokesperso...|   100|
|    Fake|The left has had ...|   100|
|    Fake|Obama is off his ...|   101|
|    Fake|The Senate is doi...|   101|
|    Fake|Ted Cruz takes th...|   101|
|    Fake|Usain Bolt was mi...|   101|
|    Fake|Wow! Hillary Clin...|   101|
|    Fake|Wow! The DNC is p...|   101|
|    Fake|You don t know wh...|   102|
|    Fake|Clinton Stumbles ...|   102|
|    Fake|What in the world...|   102|
|    Fake|THIS IS YET ANOTH...|   102|
|    Fake|I m shocked! I th...|   102|
|    Fake|Lou Dobbs rips in...|   103|
|    Fake|You will love, lo...|   103|
+--------+--------------------+------+
only showing top 20 rows



In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='category',outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')


In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [0]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [0]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(review_data)
cleaned = cleaner.transform(review_data)

In [26]:
# Show label and resulting features
cleaned.select(['label', 'features']).show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(262145,[7612,487...|
|  0.0|(262145,[7612,258...|
|  0.0|(262145,[20832,36...|
|  0.0|(262145,[27526,10...|
|  0.0|(262145,[7695,148...|
|  0.0|(262145,[7612,141...|
|  0.0|(262145,[36974,79...|
|  0.0|(262145,[8985,217...|
|  0.0|(262145,[3924,218...|
|  0.0|(262145,[3908,790...|
+-----+--------------------+
only showing top 10 rows



## <u>__Load__<u/>

## <u>__Analysis__<u/>

In [27]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

training.show(10)
testing.show(10)

+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|category|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|
+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    Fake|"""Fake news"" wa...|   124|  0.0|["""fake, news"",...|["""fake, news"",...|(262144,[35014,10...|(262144,[35014,10...|(262145,[35014,10...|
|    Fake|#SourcesHaveConfi...|   131|  0.0|[#sourceshaveconf...|[#sourceshaveconf...|(262144,[7612,202...|(262144,[7612,202...|(262145,[7612,202...|
|    Fake|.@Nigel_Farage te...|   141|  0.0|[.@nigel_farage, ...|[.@nigel_farage, ...|(262144,[3283,880...|(262144,[3283,880...|(262145,[3283,880...|
|    Fake|2 Corinthians 9:7...|   138|  0.0|[2, corinthians, ...|[2, corinthians, ...|(262144,[33264

In [0]:
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [29]:
 # Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(10, truncate=False)

+--------+----------------------------------------------------------------------------------------------------------------------------------------------+------+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [30]:
 # Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting articles' truthfullness was: %f" % acc)

Accuracy of model at predicting articles' truthfullness was: 0.984125
