**Import Dependencies**

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib as plt
import sklearn as skl
import tensorflow as tf

In [2]:
spark_version = 'spark-3.2.3'
os.environ['SPARK_VERSION']=spark_version

!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

import findspark
findspark.init()

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Connecting to security.ub                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Connecting to security.ub                                                                               Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Connecting to security.ub0% [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpad.net                                                                               Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  Release
0% [Waiting for headers] [

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NaiveBayes").getOrCreate()

In [4]:
from pyspark import SparkFiles
url ="Resources/spam_emails.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("spam_emails.csv"), sep=",", header=True)

df.show()

+--------+--------------------+
|Category|             Message|
+--------+--------------------+
|     ham|Go until jurong p...|
|     ham|Ok lar... Joking ...|
|    spam|Free entry in 2 a...|
|     ham|U dun say so earl...|
|     ham|Nah I don't think...|
|    spam|FreeMsg Hey there...|
|     ham|Even my brother i...|
|     ham|As per your reque...|
|    spam|WINNER!! As a val...|
|    spam|Had your mobile 1...|
|     ham|I'm gonna be home...|
|    spam|SIX chances to wi...|
|    spam|URGENT! You have ...|
|     ham|I've been searchi...|
|     ham|I HAVE A DATE ON ...|
|    spam|XXXMobileMovieClu...|
|     ham|Oh k...i'm watchi...|
|     ham|Eh u remember how...|
|     ham|Fine if thats th...|
|    spam|England v Macedon...|
+--------+--------------------+
only showing top 20 rows



In [5]:
from pyspark.sql.functions import length

data_df = df.withColumn('length', length(df['Message']))
data_df.show()

+--------+--------------------+------+
|Category|             Message|length|
+--------+--------------------+------+
|     ham|Go until jurong p...|   111|
|     ham|Ok lar... Joking ...|    29|
|    spam|Free entry in 2 a...|   155|
|     ham|U dun say so earl...|    49|
|     ham|Nah I don't think...|    61|
|    spam|FreeMsg Hey there...|   147|
|     ham|Even my brother i...|    77|
|     ham|As per your reque...|   160|
|    spam|WINNER!! As a val...|   157|
|    spam|Had your mobile 1...|   154|
|     ham|I'm gonna be home...|   109|
|    spam|SIX chances to wi...|   136|
|    spam|URGENT! You have ...|   155|
|     ham|I've been searchi...|   196|
|     ham|I HAVE A DATE ON ...|    35|
|    spam|XXXMobileMovieClu...|   149|
|     ham|Oh k...i'm watchi...|    26|
|     ham|Eh u remember how...|    81|
|     ham|Fine if thats th...|    56|
|    spam|England v Macedon...|   155|
+--------+--------------------+------+
only showing top 20 rows



**Cleaning the Data/Adding Features**

In [14]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

pos_neg_to_num = StringIndexer(inputCol='Category',outputCol='label')
tokenizer = Tokenizer(inputCol="Message", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [15]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [16]:
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [17]:
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [18]:
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(262145,[38555,52...|
|  0.0|(262145,[51783,15...|
|  1.0|(262145,[9443,122...|
|  0.0|(262145,[2306,332...|
|  0.0|(262145,[25964,64...|
|  1.0|(262145,[19835,23...|
|  0.0|(262145,[103497,1...|
|  0.0|(262145,[12650,27...|
|  1.0|(262145,[4314,232...|
|  1.0|(262145,[1546,219...|
|  0.0|(262145,[12716,17...|
|  1.0|(262145,[7415,161...|
|  1.0|(262145,[23209,35...|
|  0.0|(262145,[15585,41...|
|  0.0|(262145,[39504,13...|
|  1.0|(262145,[26364,44...|
|  0.0|(262145,[18184,22...|
|  0.0|(262145,[12524,16...|
|  0.0|(262145,[37132,51...|
|  1.0|(262145,[16168,29...|
+-----+--------------------+
only showing top 20 rows



In [19]:
from pyspark.ml.classification import NaiveBayes
training, testing = cleaned.randomSplit([0.7, 0.3])

nb = NaiveBayes()
predictor = nb.fit(training)

In [20]:
test_results = predictor.transform(testing)
test_results.show(5)

+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|Category|             Message|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|     ham|"7 wonders in My ...|   155|  0.0|["7, wonders, in,...|["7, wonders, wor...|(262144,[59381,60...|(262144,[59381,60...|(262145,[59381,60...|[-1576.1658865825...|[1.0,1.7390252805...|       0.0|
|     ham|"7 wonders in My ...|   155|  0.0|["7, wonders, in,...|["7, wonders, wor...|(262144,[59381,60...|(262144,[59381,60...|(262145,[59381,60...|[-1576.1658865825...|[1.0,1.7390252805.

**Results**

In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: %f" % acc)

Accuracy of model at predicting spam was: 0.953669
