In [1]:
# Install PySpark
!pip install -q findspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.0.0.tar.gz (204.7 MB)
[K     |████████████████████████████████| 204.7 MB 7.0 MB/s eta 0:00:012   |█▊                              | 11.3 MB 4.7 MB/s eta 0:00:42�████████▍                  | 85.5 MB 1.6 MB/s eta 0:01:15     |██████████████████▍             | 117.3 MB 44.3 MB/s eta 0:00:02     |███████████████████▏            | 122.6 MB 1.9 MB/s eta 0:00:43     |████████████████████▊           | 132.8 MB 8.1 MB/s eta 0:00:09     |████████████████████████████▏   | 179.9 MB 2.4 MB/s eta 0:00:11     |████████████████████████████▍   | 181.5 MB 2.4 MB/s eta 0:00:10     |█████████████████████████████   | 185.7 MB 5.5 MB/s eta 0:00:04     |██████████████████████████████▉ | 197.5 MB 2.6 MB/s eta 0:00:03
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 4.7 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ..

In [2]:
# %%writefile sparkSpam.py
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import RegexTokenizer, Tokenizer, CountVectorizer
from pyspark.ml.classification import NaiveBayes
from pyspark.mllib.evaluation import MulticlassMetrics
from datetime import datetime
import sys

# Name under which this application registers with Spark.
APP_NAME = "spamFilter"

# Reuse an already-running session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .getOrCreate()
)

# Location of the raw SMS spam data set, relative to the notebook.
file_path = './data/spam.csv'

# Read the CSV using its first line as the header and letting Spark infer
# the column types.
# NOTE(review): the previewed rows contain garbled characters, so the file may
# not be UTF-8 encoded — confirm and set the reader's `encoding` option if so.
data = spark.read.csv(file_path, header=True, inferSchema=True)

## Exploring data

In [3]:
# Preview the first 20 rows of the raw frame (long cell values are truncated).
data.show(n=20, truncate=True)

+----+--------------------+----+----+----+
|  v1|                  v2| _c2| _c3| _c4|
+----+--------------------+----+----+----+
| ham|Go until jurong p...|null|null|null|
| ham|Ok lar... Joking ...|null|null|null|
|spam|Free entry in 2 a...|null|null|null|
| ham|U dun say so earl...|null|null|null|
| ham|Nah I don't think...|null|null|null|
|spam|FreeMsg Hey there...|null|null|null|
| ham|Even my brother i...|null|null|null|
| ham|As per your reque...|null|null|null|
|spam|WINNER!! As a val...|null|null|null|
|spam|Had your mobile 1...|null|null|null|
| ham|I'm gonna be home...|null|null|null|
|spam|SIX chances to wi...|null|null|null|
|spam|URGENT! You have ...|null|null|null|
| ham|I've been searchi...|null|null|null|
| ham|I HAVE A DATE ON ...|null|null|null|
|spam|XXXMobileMovieClu...|null|null|null|
| ham|Oh k...i'm watchi...|null|null|null|
| ham|Eh u remember how...|null|null|null|
| ham|Fine if that��s t...|null|null|null|
|spam|England v Macedon...|null|null|null|
+----+-----

## Removing extra columns and keeping only v1 and v2

In [4]:
# Keep only the important columns: the label (v1) and the message text (v2).
# Selecting them explicitly, instead of dropping a hard-coded list of extras
# (which named a non-existent `_c1`), states the intent directly and does not
# depend on how many spill-over columns the CSV parser produced.
df = data.select('v1', 'v2')
print('keep only v1 and v2')
df.show()

keep only v1 and v2
+----+--------------------+
|  v1|                  v2|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if that��s t...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



## Checking unique values of the target variable

In [5]:
print('checking unique values in label')
# Count how many rows carry each distinct value of the label column `v1`.
v1_counts = df.groupBy('v1').count()
v1_counts.show()

checking unique values in label
+------+-----+
|    v1|count|
+------+-----+
|ham"""|    2|
|   ham| 4825|
|  spam|  747|
+------+-----+



## Removing rows with label ham"""

As we can see, there are 2 rows whose label is ham""". Since there are only two such instances, we decided to remove them.

In [6]:
# Expose the frame to Spark SQL under the view name `data`.
df.createOrReplaceTempView('data')

# Keep only rows whose label is exactly 'ham' or 'spam'; this discards the
# malformed `ham"""` rows.
df = spark.sql("SELECT * FROM data WHERE v1 = 'ham' OR v1 = 'spam'")
print('Removing ham"""')
group = df.groupby('v1').count()
group.show()

# Collect the per-label counts once and look them up by label name.
# The row order of a groupBy result is not guaranteed, so indexing the
# collected rows by position could silently swap the ham and spam counts.
counts_by_label = {row['v1']: row['count'] for row in group.collect()}
ham = counts_by_label['ham']
spam = counts_by_label['spam']
total = ham + spam

print('class priori probability')
print(f'spam: {spam/total} \nham:{ham/total}')

Removing ham"""
+----+-----+
|  v1|count|
+----+-----+
| ham| 4825|
|spam|  747|
+----+-----+

class priori probability
spam: 0.13406317300789664 
ham:0.8659368269921034


## Indexing labels to 0 and 1
We index the labels so that ham = 0 and spam = 1, then assign the result to a new column named `label`.

In [7]:
print('indexing label to 1 and 0')
# Index the string label in `v1` into a numeric `label` column with
# ham = 0 and spam = 1.
# The default StringIndexer ordering is by descending frequency, which gives
# ham = 0 only because ham happens to be the majority class. Ordering
# alphabetically pins the mapping (ham < spam) regardless of class balance;
# the later evaluation relies on label 1 meaning spam.
indexer = StringIndexer(inputCol = 'v1', outputCol = 'label', stringOrderType = 'alphabetAsc')
index_label = indexer.fit(df)

df = index_label.transform(df)
df.show(10)

indexing label to 1 and 0
+----+--------------------+-----+
|  v1|                  v2|label|
+----+--------------------+-----+
| ham|Go until jurong p...|  0.0|
| ham|Ok lar... Joking ...|  0.0|
|spam|Free entry in 2 a...|  1.0|
| ham|U dun say so earl...|  0.0|
| ham|Nah I don't think...|  0.0|
|spam|FreeMsg Hey there...|  1.0|
| ham|Even my brother i...|  0.0|
| ham|As per your reque...|  0.0|
|spam|WINNER!! As a val...|  1.0|
|spam|Had your mobile 1...|  1.0|
+----+--------------------+-----+
only showing top 10 rows



## Tokenize documents
Tokenize column v2 and assign the result to the `tokens` column.

In [8]:
print('toknize documents and assign to "tokens" column')
# Split each message in `v2` on single spaces and store the resulting list
# of tokens in `tokens`. Punctuation stays attached to the neighbouring word.
tokenizer = RegexTokenizer(
    pattern=" ",
    inputCol="v2",
    outputCol="tokens",
)
df = tokenizer.transform(df)
df.show(n=10)

toknize documents and assign to "tokens" column
+----+--------------------+-----+--------------------+
|  v1|                  v2|label|              tokens|
+----+--------------------+-----+--------------------+
| ham|Go until jurong p...|  0.0|[go, until, juron...|
| ham|Ok lar... Joking ...|  0.0|[ok, lar..., joki...|
|spam|Free entry in 2 a...|  1.0|[free, entry, in,...|
| ham|U dun say so earl...|  0.0|[u, dun, say, so,...|
| ham|Nah I don't think...|  0.0|[nah, i, don't, t...|
|spam|FreeMsg Hey there...|  1.0|[freemsg, hey, th...|
| ham|Even my brother i...|  0.0|[even, my, brothe...|
| ham|As per your reque...|  0.0|[as, per, your, r...|
|spam|WINNER!! As a val...|  1.0|[winner!!, as, a,...|
|spam|Had your mobile 1...|  1.0|[had, your, mobil...|
+----+--------------------+-----+--------------------+
only showing top 10 rows



## Create feature vectors from tokens

In [9]:
print('change token column to feature vectors as "features" column')
# Turn each token list into a sparse term-count vector, with the vocabulary
# capped at 20,000 terms.
# NOTE(review): the vocabulary is fitted on the full data set, before the
# train/test split performed later — confirm this is acceptable.
vectorize = CountVectorizer(
    vocabSize=20000,
    inputCol='tokens',
    outputCol='features',
)
model_cv = vectorize.fit(df)
df = model_cv.transform(df)
df.show()

change token column to feature vectors as "features" column
+----+--------------------+-----+--------------------+--------------------+
|  v1|                  v2|label|              tokens|            features|
+----+--------------------+-----+--------------------+--------------------+
| ham|Go until jurong p...|  0.0|[go, until, juron...|(13539,[8,41,51,6...|
| ham|Ok lar... Joking ...|  0.0|[ok, lar..., joki...|(13539,[5,73,401,...|
|spam|Free entry in 2 a...|  1.0|[free, entry, in,...|(13539,[0,3,8,20,...|
| ham|U dun say so earl...|  0.0|[u, dun, say, so,...|(13539,[5,21,59,1...|
| ham|Nah I don't think...|  0.0|[nah, i, don't, t...|(13539,[0,1,65,88...|
|spam|FreeMsg Hey there...|  1.0|[freemsg, hey, th...|(13539,[0,2,6,10,...|
| ham|Even my brother i...|  0.0|[even, my, brothe...|(13539,[0,7,9,13,...|
| ham|As per your reque...|  0.0|[as, per, your, r...|(13539,[0,10,11,4...|
|spam|WINNER!! As a val...|  1.0|[winner!!, as, a,...|(13539,[0,2,3,14,...|
|spam|Had your mobile 1...| 

## Create Naive Bayes models with different Laplace smoothing factors
Train the model with different Laplace smoothing factors and observe the effect of the factor.

In [16]:
# Split the vectorised data 80/20 into train and test sets; the fixed seed
# keeps the split reproducible between runs.
train, test = df.select('features','label').randomSplit([0.8,0.2],seed = 0)

# Laplace (additive) smoothing factors to compare; 0 disables smoothing.
laplace_smooth = [0,0.25,0.5,0.75,1,2]
print('-------------------------------------------')
print('\t\tModel Evalutaion')
print('-------------------------------------------')
for laplace in laplace_smooth:
    start = datetime.now()
    nb = NaiveBayes(featuresCol = 'features', labelCol = 'label',smoothing=laplace, modelType='multinomial')
    nb_model = nb.fit(train)
    prediction = nb_model.transform(test)
    # MulticlassMetrics consumes an RDD of (prediction, label) pairs.
    result = prediction.select('prediction','label').rdd.map(tuple)
    metrics = MulticlassMetrics(result)
    # Query each metric once for label 1 and reuse the values below.
    precision = metrics.precision(label=1)
    recall = metrics.recall(label=1)
    # Guard the harmonic mean: when the model finds no true positives for
    # label 1, both precision and recall are 0 and the ratio is undefined.
    if recall + precision:
        f_measure = (2 * recall * precision) / (recall + precision)
    else:
        f_measure = 0.0
    print(f'with Laplace smoothing = {laplace}')
    print(f'Accuracy : {metrics.accuracy}')
    print(f'Precision : {precision}')
    print(f'Recall : {recall}')
    print('F-measure : ', f_measure )
    print('Run Time: ', datetime.now()-start, '\n')

-------------------------------------------
		Model Evalutaion
-------------------------------------------
with Laplace smoothing = 0
Accuracy : 0.9052173913043479
Precision : 0.9423076923076923
Recall : 0.3161290322580645
F-measure :  0.47342995169082125
Run Time:  0:00:01.335840 

with Laplace smoothing = 0.25
Accuracy : 0.9686956521739131
Precision : 0.8287292817679558
Recall : 0.967741935483871
F-measure :  0.8928571428571429
Run Time:  0:00:01.336856 

with Laplace smoothing = 0.5
Accuracy : 0.971304347826087
Precision : 0.8426966292134831
Recall : 0.967741935483871
F-measure :  0.9009009009009008
Run Time:  0:00:01.240171 

with Laplace smoothing = 0.75
Accuracy : 0.9730434782608696
Precision : 0.8563218390804598
Recall : 0.9612903225806452
F-measure :  0.905775075987842
Run Time:  0:00:01.122621 

with Laplace smoothing = 1
Accuracy : 0.9756521739130435
Precision : 0.8713450292397661
Recall : 0.9612903225806452
F-measure :  0.9141104294478527
Run Time:  0:00:01.068811 

with Lap

We can see that without a smoothing factor (i.e. smoothing = 0) the model does a bad job on recall. The reason the model still yields high accuracy with smoothing = 0 is the imbalance of the classes in the data set: the class priors are 0.87 (ham) and 0.13 (spam), so a model that labels most messages as ham is still correct most of the time.