In [None]:
import pyspark.sql.functions as F

In [None]:
# Create (or reuse) the Spark session that every later cell reads from.
# SparkSession has to be imported here: the only import above this cell is
# pyspark.sql.functions, so on a fresh kernel the name would be undefined.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SpamDetection Notebook").getOrCreate()

In [None]:
# Read the tab-separated SMS file and name its two columns `spam` and `message`.
# NOTE(review): the path is an absolute, user-specific location — consider a configurable data dir.
raw = (
    spark.read
    .csv("/user/edureka_524533/Datasets/SMSSpamCollection", sep="\t")
    .toDF("spam", "message")
)

In [None]:
# Tokenize: turn each `message` string into a `words` array column
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol="message", outputCol="words")
transformed = tokenizer.transform(raw)

In [None]:
# Filter stop words out of `words` (remover's default word list) into `filtered`
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
cleaned = remover.transform(transformed)

In [None]:
# Same stop-word filtering, but with "-" appended to the default word list.
# Rebinds `remover` and `cleaned`, replacing the default-list versions.
stopwords = StopWordsRemover().getStopWords() + ["-"]
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwords)
cleaned = remover.transform(transformed)

In [None]:
# Fit a CountVectorizer on the filtered tokens, then add the resulting `features` column
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer(inputCol="filtered", outputCol="features").fit(cleaned)
featured = cvmodel.transform(cleaned)

In [None]:
# Index the string `spam` column into a numeric `label` column
from pyspark.ml.feature import OneHotEncoder, StringIndexer
indexer = StringIndexer(inputCol="spam", outputCol="label").fit(featured)
indexed = indexer.transform(featured)

In [None]:
# Random 70/30 split (by weight) into training and test sets, using a fixed seed
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
training, test = indexed.randomSplit(weights=[0.7, 0.3], seed=12345)

In [None]:
# Logistic regression: fit on the training split, then score the test split
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
lrModel = lr.fit(training)
predictions = lrModel.transform(test)
predictions.select("features", "label", "prediction").show(2)

from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Area under ROC needs the model's continuous scores, so read "rawPrediction".
# Feeding it the hard 0/1 "prediction" column collapses the ROC curve to a
# single operating point and gives a misleading AUC.
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("rawPrediction").setMetricName("areaUnderROC")
auc = evaluator.evaluate(predictions)
print("Area under ROC", auc)

# Accuracy is a different metric from AUC: the fraction of test rows whose
# hard prediction equals the label. Compute it separately so the printed
# "Accuracy" value really is accuracy.
accuracy = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
).evaluate(predictions)
print("Accuracy", accuracy)

In [None]:
# Single Pipeline with the same stages as the step-by-step cells above:
# tokenize -> remove stop words -> count-vectorize -> index label -> logistic regression.
from pyspark.ml import Pipeline, PipelineModel

tokenizer = Tokenizer(inputCol="message", outputCol="words")

# Default stop words plus "-"
stopwords = StopWordsRemover().getStopWords() + ["-"]
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwords)

# Note: `cvmodel` is rebound here to an *unfitted* CountVectorizer; the pipeline fits it.
cvmodel = CountVectorizer(inputCol="filtered", outputCol="features")
indexer = StringIndexer(inputCol="spam", outputCol="label")
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

pipeline = Pipeline(stages=[tokenizer, remover, cvmodel, indexer, lr])

# Fitted on `raw` (every row), not on the `training` split.
model = pipeline.fit(raw)

# Persist the fitted pipeline, replacing any model already saved at this path.
model.write().overwrite().save("use_cases/spam_model4.4")