In [1]:
# findspark.init() is called before `import pyspark` on purpose: findspark's
# documented job is to locate the Spark installation and put its pyspark on
# sys.path, so keep this ordering.
import findspark
findspark.init()

import pyspark

In [2]:
# One-time setup: uncomment and run this before the first cell if the
# `import findspark` there fails because the package is not installed.
# !pip install findspark

In [3]:
# Session handle used by every cell below; getOrCreate() is called on the
# default builder with no extra configuration.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [4]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip

--2021-03-03 19:50:07--  https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 203415 (199K) [application/x-httpd-php]
Saving to: ‘smsspamcollection.zip.4’


2021-03-03 19:50:10 (250 KB/s) - ‘smsspamcollection.zip.4’ saved [203415/203415]



In [5]:
# !unzip smsspamcollection.zip

In [6]:
# Peek at the raw file: one message per line, "<label>\t<text>", no header row.
!head SMSSpamCollection

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only

In [7]:
# Load the tab-separated, header-less file and replace the auto-generated
# _c0/_c1 column names with meaningful ones.
src = (
    spark.read
    .csv("SMSSpamCollection", sep="\t")
    .withColumnRenamed("_c0", "label")
    .withColumnRenamed("_c1", "message")
)

In [8]:
# Preview the loaded frame (show() prints the first 20 rows, long strings truncated).
src.show()

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [9]:
from pyspark.ml import feature

In [10]:
# Stand-alone tokenizer for exploration: reads `message`, writes `tokens`.
tokenizer = feature.Tokenizer(inputCol="message", outputCol="tokens")

In [11]:
# Inspect the tokenizer output: tokens are lower-cased and punctuation stays
# attached to words (e.g. "winner!!", "lar...").
tokenizer.transform(src).show()

+-----+--------------------+--------------------+
|label|             message|              tokens|
+-----+--------------------+--------------------+
|  ham|Go until jurong p...|[go, until, juron...|
|  ham|Ok lar... Joking ...|[ok, lar..., joki...|
| spam|Free entry in 2 a...|[free, entry, in,...|
|  ham|U dun say so earl...|[u, dun, say, so,...|
|  ham|Nah I don't think...|[nah, i, don't, t...|
| spam|FreeMsg Hey there...|[freemsg, hey, th...|
|  ham|Even my brother i...|[even, my, brothe...|
|  ham|As per your reque...|[as, per, your, r...|
| spam|WINNER!! As a val...|[winner!!, as, a,...|
| spam|Had your mobile 1...|[had, your, mobil...|
|  ham|I'm gonna be home...|[i'm, gonna, be, ...|
| spam|SIX chances to wi...|[six, chances, to...|
| spam|URGENT! You have ...|[urgent!, you, ha...|
|  ham|I've been searchi...|[i've, been, sear...|
|  ham|I HAVE A DATE ON ...|[i, have, a, date...|
| spam|XXXMobileMovieClu...|[xxxmobilemoviecl...|
|  ham|Oh k...i'm watchi...|[oh, k...i'm, wat...|


In [12]:
from pyspark.ml import pipeline

In [13]:
# Class balance check: number of messages per label (ham vs. spam).
src.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  ham| 4827|
| spam|  747|
+-----+-----+



In [14]:
from pyspark.ml import classification

In [15]:
# Random 70/30 train/test split with a fixed seed.
train, test = src.randomSplit(weights=(70., 30.), seed=1234)

In [16]:
# Spam classifier pipeline:
#   message -> tokens -> term-count vector -> Naive Bayes,
# with the string label indexed into the numeric column `y`.
main_pipe = pipeline.Pipeline(stages=[
    feature.Tokenizer(inputCol="message", outputCol="tokens"),
    # Vocabulary limited by document frequency: minDF=5, maxDF=4000.
    feature.CountVectorizer(
        inputCol="tokens",
        outputCol="vector",
        minDF=5,
        maxDF=4000,
    ),
    feature.StringIndexer(inputCol="label", outputCol="y"),
    classification.NaiveBayes(featuresCol="vector", labelCol="y"),
])

In [17]:
# Fit every pipeline stage on the training split only.
pipe_model = main_pipe.fit(train)

In [18]:
# Score the held-out split and preview every column the pipeline adds
# (tokens, vector, y, rawPrediction, probability, prediction).
pipe_model.transform(test).show()

+-----+--------------------+--------------------+--------------------+---+--------------------+--------------------+----------+
|label|             message|              tokens|              vector|  y|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+---+--------------------+--------------------+----------+
|  ham| &lt;#&gt;  in mc...|[, &lt;#&gt;, , i...|(1502,[7,20,24,26...|0.0|[-29.189065862648...|[0.99999999839701...|       0.0|
|  ham| &lt;#&gt;  mins ...|[, &lt;#&gt;, , m...|(1502,[0,1,20,24,...|0.0|[-57.645920593005...|[0.99999995129602...|       0.0|
|  ham| came to look at ...|[, came, to, look...|(1502,[0,4,7,8,20...|0.0|[-106.80303662568...|[0.99999999987274...|       0.0|
|  ham|&lt;#&gt;  w jett...|[&lt;#&gt;, , w, ...|(1502,[2,20,29,40...|0.0|[-34.942124728177...|[0.99999960047460...|       0.0|
|  ham|&lt;#&gt; ISH MIN...|[&lt;#&gt;, ish, ...|(1502,[40,58,408,...|0.0|[-34.609496567593...|[0.998687

In [19]:
# Sort the scored test rows ascending on the `probability` column; in the
# captured output this lists rows predicted as spam (prediction 1.0) first.
# NOTE(review): `probability` is a vector column — presumably the ordering is
# driven by its first element (class 0); confirm before relying on it.
pipe_model.transform(test).orderBy("probability").select("prediction", "label", "message").show()

+----------+-----+--------------------+
|prediction|label|             message|
+----------+-----+--------------------+
|       1.0| spam|URGENT! Your Mobi...|
|       1.0| spam|URGENT! We are tr...|
|       1.0| spam|74355 XMAS iscomi...|
|       1.0| spam|Urgent! call 0906...|
|       1.0| spam|Congratulations u...|
|       1.0| spam|You have WON a gu...|
|       1.0| spam|8007 FREE for 1st...|
|       1.0| spam|Urgent! Please ca...|
|       1.0| spam|Last Chance! Clai...|
|       1.0| spam|Great NEW Offer -...|
|       1.0| spam|GENT! We are tryi...|
|       1.0| spam|URGENT! We are tr...|
|       1.0| spam|Urgent -call 0906...|
|       1.0| spam|URGENT! We are tr...|
|       1.0| spam|PRIVATE! Your 200...|
|       1.0| spam|PRIVATE! Your 200...|
|       1.0| spam|FREE for 1st week...|
|       1.0| spam|HOT LIVE FANTASIE...|
|       1.0| spam|You are a winner ...|
|       1.0| spam|Today's Offer! Cl...|
+----------+-----+--------------------+
only showing top 20 rows



In [20]:
from pyspark.sql import functions

In [21]:
# Same view sorted descending on `probability`; in the captured output this
# lists rows predicted as ham (prediction 0.0, y 0.0) first.
# NOTE(review): ordering on a vector column — presumably compares the first
# element (class 0) first; confirm.
pipe_model.transform(test).orderBy(functions.desc("probability")).select("prediction", "y", "label", "message").show()

+----------+---+-----+--------------------+
|prediction|  y|label|             message|
+----------+---+-----+--------------------+
|       0.0|0.0|  ham|Hi, my love! How ...|
|       0.0|0.0|  ham|Good afternoon, m...|
|       0.0|0.0|  ham|Although i told u...|
|       0.0|0.0|  ham|Yar lor wait 4 my...|
|       0.0|0.0|  ham|Ah, well that con...|
|       0.0|0.0|  ham|see, i knew givin...|
|       0.0|0.0|  ham|I bought the test...|
|       0.0|0.0|  ham|I hope your alrig...|
|       0.0|0.0|  ham|Idk. You keep say...|
|       0.0|0.0|  ham|NEFT Transaction ...|
|       0.0|0.0|  ham|Then ü wait 4 me ...|
|       0.0|0.0|  ham|If i not meeting ...|
|       0.0|0.0|  ham|I wish! I don't t...|
|       0.0|0.0|  ham|Good morning, my ...|
|       0.0|0.0|  ham|The guy (kadeem) ...|
|       0.0|0.0|  ham|Sad story of a Ma...|
|       0.0|0.0|  ham|My love ! How com...|
|       0.0|0.0|  ham|What do u want wh...|
|       0.0|0.0|  ham|I got it before t...|
|       0.0|0.0|  ham|I cant kee

In [22]:
from pyspark.ml import evaluation

In [23]:
# IPython `?` help lookup for the evaluator class (interactive aid only;
# it has no effect on later cells).
evaluation.MulticlassClassificationEvaluator?

In [24]:
# F1 of the fitted pipeline on the held-out split, using the indexed label `y`.
evaluation.MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="y",
).evaluate(pipe_model.transform(test))

0.9764323269877127

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# ROC-AUC evaluator that scores the `probability` column against the
# indexed label `y`.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="probability",
    labelCol="y",
)


In [27]:
# Keep only the two columns the evaluator reads, then compute ROC-AUC
# on the held-out split.
predictionAndTarget = (
    pipe_model
    .transform(test)
    .select("y", "probability")
)
auc = evaluator.evaluate(predictionAndTarget)

In [28]:
# Display the area under the ROC curve returned by evaluator.evaluate().
auc

0.9790307548928255

In [44]:
# Persist the fitted pipeline to disk.
# overwrite() replaces a model left at this path by an earlier run; without
# it save() raises when ./fittedNB already exists, so the cell could only be
# executed once.
pipe_model.write().overwrite().save("./fittedNB")

In [52]:
# Reload the persisted pipeline, rebinding `pipe_model` to the copy read from disk.
pipe_model = pipeline.PipelineModel.load("./fittedNB")

In [53]:
# Display the repr of the reloaded PipelineModel.
pipe_model

PipelineModel_f5000c54b300

In [54]:
# Round-trip check: score the test split with the reloaded model; the visible
# rows match the output shown for the in-memory model earlier in the notebook.
pipe_model.transform(test).show()

+-----+--------------------+--------------------+--------------------+---+--------------------+--------------------+----------+
|label|             message|              tokens|              vector|  y|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+---+--------------------+--------------------+----------+
|  ham| &lt;#&gt;  in mc...|[, &lt;#&gt;, , i...|(1502,[7,20,24,26...|0.0|[-29.189065862648...|[0.99999999839701...|       0.0|
|  ham| &lt;#&gt;  mins ...|[, &lt;#&gt;, , m...|(1502,[0,1,20,24,...|0.0|[-57.645920593005...|[0.99999995129602...|       0.0|
|  ham| came to look at ...|[, came, to, look...|(1502,[0,4,7,8,20...|0.0|[-106.80303662568...|[0.99999999987274...|       0.0|
|  ham|&lt;#&gt;  w jett...|[&lt;#&gt;, , w, ...|(1502,[2,20,29,40...|0.0|[-34.942124728177...|[0.99999960047460...|       0.0|
|  ham|&lt;#&gt; ISH MIN...|[&lt;#&gt;, ish, ...|(1502,[40,58,408,...|0.0|[-34.609496567593...|[0.998687