In [1]:
import os
# Find the latest version of spark 3.2  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.3'
spark_version = 'spark-3.2.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.2.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.                                                                               Get:3 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  Release
Hit:5 http://archive.ubuntu.com/ubuntu focal InRelease
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
Get:7 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:8 http://archive.ubuntu.com/ubuntu focal-updates InRelease [

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NaiveBayes").getOrCreate()

In [3]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
# url ="https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.1/22-big-data/day_2/yelp_reviews.csv"
spark.sparkContext.addFile("Resources/sms_spam.csv")
df = spark.read.csv(SparkFiles.get("sms_spam.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+----+--------------------+----+----+----+
|  v1|                  v2| _c2| _c3| _c4|
+----+--------------------+----+----+----+
| ham|Go until jurong p...|null|null|null|
| ham|Ok lar... Joking ...|null|null|null|
|spam|Free entry in 2 a...|null|null|null|
| ham|U dun say so earl...|null|null|null|
| ham|Nah I don't think...|null|null|null|
|spam|FreeMsg Hey there...|null|null|null|
| ham|Even my brother i...|null|null|null|
| ham|As per your reque...|null|null|null|
|spam|WINNER!! As a val...|null|null|null|
|spam|Had your mobile 1...|null|null|null|
| ham|I'm gonna be home...|null|null|null|
|spam|SIX chances to wi...|null|null|null|
|spam|URGENT! You have ...|null|null|null|
| ham|I've been searchi...|null|null|null|
| ham|I HAVE A DATE ON ...|null|null|null|
|spam|XXXMobileMovieClu...|null|null|null|
| ham|Oh k...i'm watchi...|null|null|null|
| ham|Eh u remember how...|null|null|null|
| ham|Fine if that��s t...|null|null|null|
|spam|England v Macedon...|null|null|null|
+----+-----

In [9]:

from pyspark.sql.functions import col, udf, length
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

In [5]:
df = df.select(col('v1'),col('v2'))
df.show()

+----+--------------------+
|  v1|                  v2|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if that��s t...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [6]:
df.count()

5574

In [7]:
df = df.dropna()

In [8]:
df.count()

5573

In [11]:

# Create a length column to be used as a future feature 
data_df = df.withColumn('length', length(df['v2']))
data_df.show()

+----+--------------------+------+
|  v1|                  v2|length|
+----+--------------------+------+
| ham|Go until jurong p...|   111|
| ham|Ok lar... Joking ...|    29|
|spam|Free entry in 2 a...|   155|
| ham|U dun say so earl...|    49|
| ham|Nah I don't think...|    61|
|spam|FreeMsg Hey there...|   147|
| ham|Even my brother i...|    77|
| ham|As per your reque...|   160|
|spam|WINNER!! As a val...|   157|
|spam|Had your mobile 1...|   154|
| ham|I'm gonna be home...|   109|
|spam|SIX chances to wi...|   136|
|spam|URGENT! You have ...|   155|
| ham|I've been searchi...|   196|
| ham|I HAVE A DATE ON ...|    35|
|spam|XXXMobileMovieClu...|   149|
| ham|Oh k...i'm watchi...|    26|
| ham|Eh u remember how...|    81|
| ham|Fine if that��s t...|    58|
|spam|England v Macedon...|   155|
+----+--------------------+------+
only showing top 20 rows



### Feature Transformations


In [12]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='v1',outputCol='label')
tokenizer = Tokenizer(inputCol="v2", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')


In [13]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [14]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [15]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [16]:
# Show label and resulting features
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(262145,[38555,52...|
|  0.0|(262145,[51783,15...|
|  1.0|(262145,[9443,122...|
|  0.0|(262145,[2306,332...|
|  0.0|(262145,[25964,64...|
|  1.0|(262145,[19835,23...|
|  0.0|(262145,[103497,1...|
|  0.0|(262145,[12650,27...|
|  1.0|(262145,[4314,232...|
|  1.0|(262145,[1546,219...|
|  0.0|(262145,[12716,17...|
|  1.0|(262145,[7415,161...|
|  1.0|(262145,[23209,35...|
|  0.0|(262145,[15585,41...|
|  0.0|(262145,[39504,13...|
|  1.0|(262145,[26364,44...|
|  0.0|(262145,[18184,22...|
|  0.0|(262145,[12524,16...|
|  0.0|(262145,[51471,51...|
|  1.0|(262145,[16168,29...|
+-----+--------------------+
only showing top 20 rows



In [26]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
model = NaiveBayes()
predictor = model.fit(training)

In [27]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+---+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| v1|                  v2|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+---+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|ham| and  picking the...|    41|  0.0|[, and, , picking...|[, , picking, var...|(262144,[59463,11...|(262144,[59463,11...|(262145,[59463,11...|[-300.52460066267...|[1.0,1.9857776779...|       0.0|
|ham| gonna let me kno...|    95|  0.0|[, gonna, let, me...|[, gonna, let, kn...|(262144,[238,1777...|(262144,[238,1777...|(262145,[238,1777...|[-757.56305773812...|[1.0,9.9726487876...|       0.0|
|ham| what

In [28]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.954923


In [29]:
import os
import tensorflow as tf

In [30]:
from tensorflow import keras


In [39]:
model.save("trained_nb.h5")