In [0]:
# Read the tab-separated SMS file. No header option is passed, so the columns
# arrive with Spark's positional default names and are renamed in a later cell.
data = spark.read.csv(
    "/FileStore/tables/SMSSpamCollection",
    sep='\t',
    inferSchema=True,
)

In [0]:
# Preview the freshly loaded rows (columns still carry their default names).
data.show()

In [0]:
# Give the two positional columns meaningful names: the label and the message.
data = (
    data
    .withColumnRenamed('_c0', 'class')
    .withColumnRenamed('_c1', 'text')
)

In [0]:
# Preview the rows again to confirm the 'class' / 'text' column names.
data.show()

In [0]:
from pyspark.sql.functions import length

In [0]:
# Add a numeric 'length' column derived from each message's text.
data = data.withColumn(
    'length',
    length(data['text']),
)

In [0]:
# Preview the rows with the new 'length' column.
data.show()

In [0]:
# Compare the average message length of each class.
(
    data
    .groupBy('class')
    .mean('length')
    .show()
)

In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

In [0]:
 tokenizer = Tokenizer(inputCol = 'text', outputCol = 'token_text')

In [0]:
# Drop stop words from the token lists: 'token_text' -> 'stop_token'.
stop_remove = StopWordsRemover(
    inputCol='token_text',
    outputCol='stop_token',
)

In [0]:
# Turn the filtered tokens into term-count vectors: 'stop_token' -> 'c_vec'.
count_vec = CountVectorizer(
    inputCol='stop_token',
    outputCol='c_vec',
)

In [0]:
# Re-weight the term counts by inverse document frequency: 'c_vec' -> 'tf_idf'.
idf = IDF(
    inputCol='c_vec',
    outputCol='tf_idf',
)

In [0]:
# Encode the string 'class' column as the numeric 'label' column.
ham_spam = StringIndexer(
    inputCol='class',
    outputCol='label',
)

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# Combine the TF-IDF vector and the message length into one 'features' vector.
clean_up = VectorAssembler(
    inputCols=['tf_idf', 'length'],
    outputCol='features',
)

In [0]:
from pyspark.ml.classification import NaiveBayes  # classic models for NLP

In [0]:
from pyspark.ml import Pipeline

In [0]:
# Chain the preprocessing stages in the order their columns depend on each other:
# label encoding, tokenising, stop-word removal, counting, IDF, feature assembly.
data_pipe = Pipeline(
    stages=[
        ham_spam,
        tokenizer,
        stop_remove,
        count_vec,
        idf,
        clean_up,
    ]
)

In [0]:
# Fit every preprocessing stage on the dataset, then apply the fitted stages
# to the same rows. Note: the fit here uses all rows of `data`.
cleaner = (
    data_pipe
    .fit(data)
    .transform(data)
)

In [0]:
# Preview the transformed rows, including the columns added by the pipeline.
cleaner.show()

In [0]:
# Split the raw rows first and fit the preprocessing stages on the training
# portion only, so the vocabulary and IDF weights never see held-out messages
# (splitting the already-transformed frame leaks test text into those stages).
# A fixed seed makes the 70/30 split, and therefore the reported accuracy,
# reproducible across runs.
raw_train, raw_test = data.randomSplit([0.7, 0.3], seed=42)
split_pipe = data_pipe.fit(raw_train)
train_data = split_pipe.transform(raw_train)
test_data = split_pipe.transform(raw_test)

In [0]:
# Naive Bayes classifier reading the assembled 'features' and the numeric 'label'
# (these are the library's default column names, spelled out for the reader).
nb = NaiveBayes(featuresCol='features', labelCol='label')

In [0]:
# Train the classifier on the training split.
spam_detector = nb.fit(train_data)

In [0]:
 test_results = spam_detector.transform(test_data)

In [0]:
 test_results.select('label', 'prediction').show()

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# Accuracy evaluator comparing 'prediction' against 'label'
# (both are the library's default column names, spelled out for the reader).
acc_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [0]:
# Compute the accuracy metric over the scored test rows.
acc = acc_eval.evaluate(test_results)

In [0]:
# Display the accuracy as the cell's output.
acc