### Requirement: Build a spam filter. Use the various NLP tools and a new classifier, Naive Bayes to predict if one email is ham or spam.

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [4]:
# Load data
data = spark.read.csv("../../Data/smsspamcollection/SMSSpamCollection", header=False, inferSchema=True, sep='\t')

In [5]:
data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')

In [6]:
data.show(5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



### Clean and prepare the data

In [7]:
data = data.withColumn('length', length(data['text']))

In [8]:
data.show(5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [9]:
# Pretty Clear Difference
data.groupby('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



### Feature Transformation

In [10]:
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')

In [11]:
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')

### The model

In [12]:
nb = NaiveBayes()

### Pipeline

In [13]:
data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove, count_vec, idf, clean_up])

In [14]:
clearer = data_prep_pipe.fit(data)

In [15]:
clean_data = clearer.transform(data)

### Traing and Evaluation

In [16]:
clean_data = clean_data.select(['label', 'features'])

In [17]:
clean_data.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
|  1.0|(13424,[10,60,139...|
|  0.0|(13424,[10,53,103...|
|  0.0|(13424,[125,184,4...|
|  1.0|(13424,[1,47,118,...|
|  1.0|(13424,[0,1,13,27...|
+-----+--------------------+
only showing top 10 rows



In [18]:
(training, testing) = clean_data.randomSplit([0.7, 0.3])

In [19]:
spam_predictor = nb.fit(training)

In [20]:
data.printSchema()

root
 |-- class: string (nullable = true)
 |-- text: string (nullable = true)
 |-- length: integer (nullable = true)



In [21]:
test_results = spam_predictor.transform(testing)

In [22]:
test_results.show(10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,7,8...|[-792.79034828082...|[1.0,1.4805110028...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-1169.2232877244...|[1.0,8.1175560568...|       0.0|
|  0.0|(13424,[0,1,7,15,...|[-655.34277725349...|[1.0,1.6828206137...|       0.0|
|  0.0|(13424,[0,1,11,32...|[-886.44082056488...|[1.0,1.0793443485...|       0.0|
|  0.0|(13424,[0,1,14,31...|[-216.73467066729...|[1.0,1.3453794081...|       0.0|
|  0.0|(13424,[0,1,20,27...|[-966.44711485338...|[1.0,7.9774235370...|       0.0|
|  0.0|(13424,[0,1,43,69...|[-631.27756648910...|[0.00244260233112...|       1.0|
|  0.0|(13424,[0,1,46,17...|[-1136.1193158210...|[3.71213280589205...|       1.0|
|  0.0|(13424,[0,1,146,1...|[-249.85876843146...|[0.97167859142874...|       0.0|
|  0.0|(13424,[0

In [23]:
test_results.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|  204|
|  0.0|       1.0|  141|
|  1.0|       0.0|    8|
|  0.0|       0.0| 1336|
+-----+----------+-----+



In [24]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting spam was: {}".format(acc))

Accuracy of model at predicting spam was: 0.9202348380853032
