In [3]:
import pyspark.sql.functions as F

In [4]:
spark = SparkSession.builder.appName("SpamDetection Notebook").getOrCreate()

In [5]:
raw = spark.read.option("delimiter","\t").csv("use_cases/SMSSpamCollection").toDF("spam","message")
raw.count()
raw.show(2)

+----+--------------------+
|spam|             message|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
+----+--------------------+
only showing top 2 rows



In [6]:
# Extract word
from pyspark.ml.feature import Tokenizer
tockenizer = Tokenizer().setInputCol("message").setOutputCol("words")
transformed = tockenizer.transform(raw)
transformed.show(2)

+----+--------------------+--------------------+
|spam|             message|               words|
+----+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|
| ham|Ok lar... Joking ...|[ok, lar..., joki...|
+----+--------------------+--------------------+
only showing top 2 rows



In [7]:
# Remove stopwords
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)
cleaned.show(2)
StopWordsRemover().getStopWords()

+----+--------------------+--------------------+--------------------+
|spam|             message|               words|            filtered|
+----+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
| ham|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|
+----+--------------------+--------------------+--------------------+
only showing top 2 rows



[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u'her',
 u'hers',
 u'herself',
 u'it',
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'then',
 u'once',
 u'here',
 u'there',
 u'when',
 u'where',
 u'why',
 u'how',
 u'all

In [8]:
# custom stopwords
stopwords = StopWordsRemover().getStopWords() + ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)

In [9]:
# Generate features
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
featured = cvmodel.transform(cleaned)
featured.show(2)

+----+--------------------+--------------------+--------------------+--------------------+
|spam|             message|               words|            filtered|            features|
+----+--------------------+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|(13457,[8,12,33,6...|
| ham|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|(13457,[0,26,307,...|
+----+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [10]:
# convert to binary label
from pyspark.ml.feature import OneHotEncoder, StringIndexer
indexer = StringIndexer().setInputCol("spam").setOutputCol("label").fit(featured)
indexed = indexer.transform(featured)
indexed.show(20)

+----+--------------------+--------------------+--------------------+--------------------+-----+
|spam|             message|               words|            filtered|            features|label|
+----+--------------------+--------------------+--------------------+--------------------+-----+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|(13457,[8,12,33,6...|  0.0|
| ham|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|(13457,[0,26,307,...|  0.0|
|spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|(13457,[2,14,20,3...|  1.0|
| ham|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|(13457,[0,71,83,1...|  0.0|
| ham|Nah I don't think...|[nah, i, don't, t...|[nah, don't, thin...|(13457,[36,39,141...|  0.0|
|spam|FreeMsg Hey there...|[freemsg, hey, th...|[freemsg, hey, da...|(13457,[11,57,62,...|  1.0|
| ham|Even my brother i...|[even, my, brothe...|[even, brother, l...|(13457,[11,55,108...|  0.0|
| ham|As per your reque...|[as

In [11]:
# Split to train and test
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
training, test = indexed.randomSplit([0.7, 0.3], seed = 123)
training.show(1)

+----+--------------------+--------------------+--------------------+--------------------+-----+
|spam|             message|               words|            filtered|            features|label|
+----+--------------------+--------------------+--------------------+--------------------+-----+
| ham| &lt;#&gt;  in mc...|[, &lt;#&gt;, , i...|[, &lt;#&gt;, , m...|(13457,[3,7,5193,...|  0.0|
+----+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 1 row



In [12]:
# Logistic regression
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
lrModel = lr.fit(training)
predictions = lrModel.transform(test)
predictions.select("features", "label", "prediction").show(2)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(13457,[3,7,44,21...|  0.0|       0.0|
|(13457,[3,87,117,...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 2 rows

('Accuracy', 0.5)


In [13]:
# Random Forest
from pyspark.ml.classification import RandomForestClassificationModel, RandomForestClassifier
rf = RandomForestClassifier().setLabelCol("label").setFeaturesCol("features").setNumTrees(10)
model = rf.fit(training)
predictions = model.transform(test)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)

('Accuracy', 0.5021834061135371)


In [14]:
from pyspark.ml.feature import NGram
ngram = NGram().setN(2).setInputCol("filtered").setOutputCol("ngrams")
ngramDataFrame = ngram.transform(cleaned)
ngramDataFrame.select("ngrams").show(2, False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ngrams                                                                                                                                                                                       |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[go jurong, jurong point,, point, crazy.., crazy.. available, available bugis, bugis n, n great, great world, world la, la e, e buffet..., buffet... cine, cine got, got amore, amore wat...]|
|[ok lar..., lar... joking, joking wif, wif u, u oni...]                                                                                                                                      |
+---------------------------------------

In [20]:
from pyspark.ml import Pipeline, PipelineModel
tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")

stopwords = StopWordsRemover().getStopWords()+ ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features")
indexer = StringIndexer().setInputCol("spam").setOutputCol("label")
#lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
rf = RandomForestClassifier().setLabelCol("label").setFeaturesCol("features").setNumTrees(10)
pipeline = Pipeline().setStages([tokenizer, remover, cvmodel, indexer, rf])
model = pipeline.fit(raw)
model.write().overwrite().save("use_cases/spam_model4.4")

In [25]:
pipeline_model = PipelineModel.load("use_cases/spam_model4.4")
pipeline.stages

[Tokenizer_49338d5723284896f832,
 StopWordsRemover_4119b791bc44870af081,
 CountVectorizer_4069ba267fc1ec8c5b5c,
 StringIndexer_460097ce0707a30782e4,
 RandomForestClassificationModel (uid=rfc_764684cd4958) with 10 trees]

In [30]:
pipeline.transform(raw).show(5)

+----+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|spam|             message|               words|            filtered|            features|label|       rawPrediction|         probability|prediction|
+----+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|(13457,[8,12,33,6...|  0.0|[8.95971895993946...|[0.89597189599394...|       0.0|
| ham|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|(13457,[0,26,307,...|  0.0|[8.95971895993946...|[0.89597189599394...|       0.0|
|spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|(13457,[2,14,20,3...|  1.0|[7.09109101771172...|[0.70910910177117...|       0.0|
| ham|U dun say so earl...|[u, dun, say, so,...|[u, dun, say, ear...|(13457,[0,71,83,1...|  0.0|[8.9