In [1]:
from pyspark.sql import SparkSession

# Obtain the session for this notebook; getOrCreate() reuses an already
# running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('spamham')
    .getOrCreate()
)

In [2]:
# The file is tab-separated and is read without a header option, so Spark
# assigns the positional column names _c0 and _c1 (renamed in the next cell).
df = spark.read.csv(
    'FileStore/tables/SMSSpamCollection',
    sep='\t',
    inferSchema=True,
)

In [3]:
# Replace the auto-generated column names with meaningful ones.
df = (
    df
    .withColumnRenamed('_c0', 'class')
    .withColumnRenamed('_c1', 'text')
)

In [4]:
# Preview the renamed frame (20 rows is show()'s default).
df.show(n=20)

In [5]:
from pyspark.sql.functions import length

In [6]:
# Add a `length` column holding the character count of each message.
df = df.withColumn('length', length(df.text))

In [7]:
# Spot-check the new `length` column on the first five rows.
df.show(n=5)

In [8]:
# Mean of every numeric column, computed separately for each class value.
(
    df
    .groupBy('class')
    .mean()
    .show()
)

In [9]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

In [10]:
# Text feature stages; each stage reads the column the previous one writes:
#   text -> tokenText -> stop_token -> c_vec -> tf_idf
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='tokenText',
)
stop_rem = StopWordsRemover(
    inputCol='tokenText',
    outputCol='stop_token',
)
cv = CountVectorizer(
    inputCol='stop_token',
    outputCol='c_vec',
)
idf = IDF(
    inputCol='c_vec',
    outputCol='tf_idf',
)

# Index the string `class` column into a numeric `label` column.
ham_spam_to_num = StringIndexer(
    inputCol='class',
    outputCol='label',
)

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
# Combine the TF-IDF vector and the message length into one `features` vector.
cleanup = VectorAssembler(
    inputCols=['tf_idf', 'length'],
    outputCol='features',
)

In [13]:
from pyspark.ml.classification import NaiveBayes

In [14]:
# Naive Bayes classifier; the column names below are the library defaults,
# spelled out so the link to the `label` / `features` columns is visible.
nb = NaiveBayes(
    featuresCol='features',
    labelCol='label',
)

In [15]:
from pyspark.ml import Pipeline

In [16]:
# Data-preparation pipeline: label indexing, then the text feature stages,
# then assembly of the final feature vector.
data_prep_pipe = Pipeline(
    stages=[
        ham_spam_to_num,
        tokenizer,
        stop_rem,
        cv,
        idf,
        cleanup,
    ]
)

In [17]:
# Fit the preparation stages on `df` and apply them to the same frame.
# NOTE(review): the pipeline is fitted on the full dataset before the
# train/test split further down, so the vocabulary and IDF weights are
# learned from rows that later end up in the test set — consider fitting
# on the training split only.
final_df = (
    data_prep_pipe
    .fit(df)
    .transform(df)
)

In [18]:
# Keep only the two columns the classifier consumes.
final_df = final_df.select('label', 'features')

In [19]:
# Preview the model-ready frame.
final_df.show(n=5)

In [20]:
# 70/30 train/test split. A fixed seed is passed so that re-running the
# notebook on the same input yields the same split (and therefore comparable
# metrics); without it randomSplit draws a new random seed on every run.
train_df, test_df = final_df.randomSplit([0.7, 0.3], seed=42)

In [21]:
# Train the classifier on the training split.
spam_det = nb.fit(dataset=train_df)

In [22]:
# Apply the fitted model to the held-out split.
results = spam_det.transform(dataset=test_df)

In [23]:
# Inspect the model output on the test rows (20 rows is show()'s default).
results.show(n=20)

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [25]:
# Evaluator for classification accuracy. metricName must be set explicitly:
# MulticlassClassificationEvaluator defaults to 'f1', so a bare constructor
# would make the value stored in `acc` an F1 score rather than accuracy.
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')

In [26]:
# Score the test-set predictions with the metric configured on acc_eval.
acc = acc_eval.evaluate(dataset=results)

In [27]:
# Display the score returned by acc_eval.evaluate(results) as the cell output.
acc