In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer,StopWordsRemover, CountVectorizer,IDF,StringIndexer
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml.classification import LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Create the Spark session for this notebook, or reuse one that is already
# running; 'HamSpam' is the application name it is registered under.
spark = (
    SparkSession.builder
    .appName('HamSpam')
    .getOrCreate()
)

# **Part 1 - Data Preprocessing**

In [None]:
# Load the tab-separated SMS dataset. The first row supplies the column
# names, and Spark infers each column's type from the data.
df = spark.read.csv(
    "/FileStore/tables/smsspamcollection.tsv",
    sep='\t',
    header=True,
    inferSchema=True,
)
# Preview the first five rows.
df.show(n=5)

In [None]:
# Move the raw 'label' column to 'label_cat' so that the name 'label' is
# free for the numeric column written by the StringIndexer stage.
df = df.withColumnRenamed(existing='label', new='label_cat')
# Display the resulting column names.
df.columns

**1.1 Checking for Null Values**

In [None]:
# Count the missing values in every column to check whether any cleaning
# is needed before the text is vectorised.
#
# A value counts as missing when it is NULL. NaN exists only for
# floating-point data, so isnan() is applied to float/double columns alone;
# calling it on string columns relies on an implicit string-to-double cast,
# which is not meaningful for free text and may be rejected under strict
# (ANSI) SQL settings.
missing_counts = []
for name, dtype in df.dtypes:
    is_missing = col(name).isNull()
    if dtype in ('float', 'double'):
        is_missing = is_missing | isnan(col(name))
    # when() yields a non-null value only for missing rows, and count()
    # counts exactly those non-null values.
    missing_counts.append(count(when(is_missing, name)).alias(name))

df.select(missing_counts).show()

**1.2 TF-IDF**

In [None]:
# Index the string class in 'label_cat' into the numeric 'label' column.
hamspam_cat = StringIndexer(
    inputCol='label_cat',
    outputCol='label',
)

# Text-to-feature stages; each one reads the column written by the previous:
#   message -> token_message -> stop_tokens -> c_vec -> tf_idf

# Split each message into tokens.
tokenize = Tokenizer(
    inputCol="message",
    outputCol="token_message",
)

# Drop stop words from the token lists.
stopwords = StopWordsRemover(
    inputCol='token_message',
    outputCol='stop_tokens',
)

# Turn the remaining tokens into term-count vectors.
count_vectorize = CountVectorizer(
    inputCol='stop_tokens',
    outputCol='c_vec',
)

# Re-weight the term counts by inverse document frequency.
idf = IDF(
    inputCol="c_vec",
    outputCol="tf_idf",
)

In [None]:
# Assemble the TF-IDF vector into the single 'features' column that is
# later selected for training.
features = VectorAssembler(
    inputCols=['tf_idf'],
    outputCol='features',
)

**1.3 Linear SVC Model**


In [None]:
# Linear support-vector classifier limited to 10 iterations, with a
# regularisation parameter of 0.1. The feature and label column names are
# spelled out for readability; they are the estimator's defaults.
lsvc = LinearSVC(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
    regParam=0.1,
)

# **Part 2 - Creating Pipeline**


In [None]:
# Chain the preprocessing stages in the order they must run. The classifier
# is not part of this pipeline; it is trained separately on the output.
pipe = Pipeline(
    stages=[
        hamspam_cat,      # label_cat   -> label
        tokenize,         # message     -> token_message
        stopwords,        # token_message -> stop_tokens
        count_vectorize,  # stop_tokens -> c_vec
        idf,              # c_vec       -> tf_idf
        features,         # tf_idf      -> features
    ]
)

In [None]:
# Fit every preprocessing stage on the loaded data.
# NOTE(review): the fit uses the whole of `df`, before the train/test split
# made later in the notebook, so the fitted stages also see the rows that
# end up in the test set.
pipeline_model = pipe.fit(dataset=df)

In [None]:
# Run the fitted stages over the data to append the label and feature columns.
pipeline_transform = pipeline_model.transform(dataset=df)

# **Part 3 - Training and Evaluating Model**


In [None]:
# Keep only the two columns the classifier needs, then preview five rows.
df_final = pipeline_transform.select('label', 'features')
df_final.show(n=5)

**3.1 Train Test Split**


In [None]:
# Randomly split the rows into roughly 80% for training and 20% for testing.
# The weights are sampling probabilities, so the resulting sizes are
# approximate. A fixed seed keeps the split, and therefore the reported
# metrics, repeatable from one run of the notebook to the next.
train, test = df_final.randomSplit([0.8, 0.2], seed=42)

**3.2 Training on our Linear SVC**


In [None]:
# Train the linear SVC on the training split. Despite its name, `pred`
# holds the fitted model, not predictions.
pred = lsvc.fit(dataset=train)

In [None]:
# Score the held-out test split with the fitted model and preview five rows.
pred_test = pred.transform(dataset=test)
pred_test.show(n=5)

**3.3 Evaluating Results**

In [None]:
# Measure accuracy on the test predictions.
# The evaluator's default metric is F1, so accuracy must be requested
# explicitly; otherwise the number printed below as "Accuracy" would in
# fact be an F1 score. The label and prediction column names are the
# evaluator's defaults, written out for clarity.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = evaluator.evaluate(pred_test)
print("Accuracy of model at predicting spam was: {}".format(accuracy))