In [1]:
from pyspark.sql.session import SparkSession

In [2]:
import string

In [3]:
from pyspark.sql.types import IntegerType

In [4]:
from pyspark.sql.functions import *

In [5]:
spark = SparkSession.builder.appName("Module 8 Case Study 1").getOrCreate()

In [6]:
smsDF = spark.read.csv("/user/edureka_524533/Datasets/SMSSpamCollection",inferSchema=True,header=False,sep='\t').withColumnRenamed("_c0","message_type").withColumnRenamed("_c1","message_content")

In [7]:
smsDF.printSchema()

root
 |-- message_type: string (nullable = true)
 |-- message_content: string (nullable = true)



In [8]:
smsDF.show(5)

+------------+--------------------+
|message_type|     message_content|
+------------+--------------------+
|         ham|Go until jurong p...|
|         ham|Ok lar... Joking ...|
|        spam|Free entry in 2 a...|
|         ham|U dun say so earl...|
|         ham|Nah I don't think...|
+------------+--------------------+
only showing top 5 rows



In [9]:
def numeric_messType(messType):
    if messType == 'ham':
        return 1
    else:
        return 0    

In [10]:
udf_convertToNumeric = udf(numeric_messType,IntegerType())

In [11]:
#Replace all 'ham' with 1 and 'spam' with 0, so we have numeric fields instead of string
smsDFStat = smsDF.select('*',udf_convertToNumeric(smsDF['message_type']).alias('message_status'))

In [12]:
smsDFStat.show(5)

+------------+--------------------+--------------+
|message_type|     message_content|message_status|
+------------+--------------------+--------------+
|         ham|Go until jurong p...|             1|
|         ham|Ok lar... Joking ...|             1|
|        spam|Free entry in 2 a...|             0|
|         ham|U dun say so earl...|             1|
|         ham|Nah I don't think...|             1|
+------------+--------------------+--------------+
only showing top 5 rows



In [13]:
smsDFStat.printSchema()

root
 |-- message_type: string (nullable = true)
 |-- message_content: string (nullable = true)
 |-- message_status: integer (nullable = true)



In [14]:
def remove_punctuations(message):
    messageEdit = [char for char in message if char not in string.punctuation]
    message = ''.join(messageEdit)
    return message

In [15]:
udf_puncEdit = udf(remove_punctuations)

In [16]:
smsDF1 = (smsDFStat.select('*', udf_puncEdit(smsDFStat['message_content']).alias('message_punc')))

In [17]:
smsDF1.show(5)

+------------+--------------------+--------------+--------------------+
|message_type|     message_content|message_status|        message_punc|
+------------+--------------------+--------------+--------------------+
|         ham|Go until jurong p...|             1|Go until jurong p...|
|         ham|Ok lar... Joking ...|             1|Ok lar Joking wif...|
|        spam|Free entry in 2 a...|             0|Free entry in 2 a...|
|         ham|U dun say so earl...|             1|U dun say so earl...|
|         ham|Nah I don't think...|             1|Nah I dont think ...|
+------------+--------------------+--------------+--------------------+
only showing top 5 rows



In [18]:
from pyspark.ml.feature import StopWordsRemover

In [19]:
stopWords = StopWordsRemover.loadDefaultStopWords('english')

In [20]:
def remove_stopWords(message):
    wordList = message.split(' ')
    messageEdit = [word for word in wordList if word not in stopWords]
    message = ' '.join(messageEdit)
    return message
    

In [21]:
udf_stopWEdit = udf(remove_stopWords)

In [22]:
smsDF2 = (smsDF1.select('*', udf_stopWEdit(smsDF1['message_punc']).alias('message_stopW')))

In [23]:
smsDF2.show(5)

+------------+--------------------+--------------+--------------------+--------------------+
|message_type|     message_content|message_status|        message_punc|       message_stopW|
+------------+--------------------+--------------+--------------------+--------------------+
|         ham|Go until jurong p...|             1|Go until jurong p...|Go jurong point c...|
|         ham|Ok lar... Joking ...|             1|Ok lar Joking wif...|Ok lar Joking wif...|
|        spam|Free entry in 2 a...|             0|Free entry in 2 a...|Free entry 2 wkly...|
|         ham|U dun say so earl...|             1|U dun say so earl...|U dun say early h...|
|         ham|Nah I don't think...|             1|Nah I dont think ...|Nah I dont think ...|
+------------+--------------------+--------------+--------------------+--------------------+
only showing top 5 rows



In [24]:
smsDF2.printSchema()

root
 |-- message_type: string (nullable = true)
 |-- message_content: string (nullable = true)
 |-- message_status: integer (nullable = true)
 |-- message_punc: string (nullable = true)
 |-- message_stopW: string (nullable = true)



# Reduce the number of columns in the dataset to message_status and message_stopW

In [25]:
df = smsDF2.select('message_status','message_stopW')

In [26]:
df.printSchema()

root
 |-- message_status: integer (nullable = true)
 |-- message_stopW: string (nullable = true)



In [27]:
from pyspark.sql.functions import col, split

In [28]:
df = df.withColumn("Message_Array", split(col("message_stopW")," "))

In [29]:
df.show()

+--------------+--------------------+--------------------+
|message_status|       message_stopW|       Message_Array|
+--------------+--------------------+--------------------+
|             1|Go jurong point c...|[Go, jurong, poin...|
|             1|Ok lar Joking wif...|[Ok, lar, Joking,...|
|             0|Free entry 2 wkly...|[Free, entry, 2, ...|
|             1|U dun say early h...|[U, dun, say, ear...|
|             1|Nah I dont think ...|[Nah, I, dont, th...|
|             0|FreeMsg Hey darli...|[FreeMsg, Hey, da...|
|             1|Even brother like...|[Even, brother, l...|
|             1|As per request Me...|[As, per, request...|
|             0|WINNER As valued ...|[WINNER, As, valu...|
|             0|Had mobile 11 mon...|[Had, mobile, 11,...|
|             1|Im gonna home soo...|[Im, gonna, home,...|
|             0|SIX chances win C...|[SIX, chances, wi...|
|             0|URGENT You 1 week...|[URGENT, You, 1, ...|
|             1|Ive searching rig...|[Ive, searching, ..

In [30]:
df.printSchema()

root
 |-- message_status: integer (nullable = true)
 |-- message_stopW: string (nullable = true)
 |-- Message_Array: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [31]:
df1 = df.select('message_status','Message_Array')

In [32]:
df1.printSchema()

root
 |-- message_status: integer (nullable = true)
 |-- Message_Array: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [33]:
df1.show(5)

+--------------+--------------------+
|message_status|       Message_Array|
+--------------+--------------------+
|             1|[Go, jurong, poin...|
|             1|[Ok, lar, Joking,...|
|             0|[Free, entry, 2, ...|
|             1|[U, dun, say, ear...|
|             1|[Nah, I, dont, th...|
+--------------+--------------------+
only showing top 5 rows



# Train Data

In [34]:
#sms_data = df1.select('Message_Array')

In [35]:
#status_data = df1.select('message_status')

In [36]:
#Split the whole dataframe into train and test DF
trainData,testData = df1.randomSplit([0.7,0.3])

In [37]:
from pyspark.ml.feature import CountVectorizer,IDF,StringIndexer

In [38]:
from pyspark.ml.classification import LogisticRegression

In [39]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [40]:
#Create CountVectorizer on the column which needs to be vectorized which is 'Message_Array'
cv = CountVectorizer(inputCol="Message_Array", outputCol="cv", vocabSize=4, minDF=1.0)

In [41]:
idf = IDF(inputCol='cv', outputCol="features", minDocFreq=5)

In [42]:
label_stringIdx = StringIndexer(inputCol = "message_status", outputCol = "label")

In [43]:
lr = LogisticRegression(maxIter=100)

In [44]:
pipeline = Pipeline(stages=[cv, idf, label_stringIdx, lr])

In [45]:
pipelineFit = pipeline.fit(trainData)

In [46]:
predictions = pipelineFit.transform(testData)

In [47]:
predictions.show(5)

+--------------+--------------------+-------------------+--------------------+-----+--------------------+--------------------+----------+
|message_status|       Message_Array|                 cv|            features|label|       rawPrediction|         probability|prediction|
+--------------+--------------------+-------------------+--------------------+-----+--------------------+--------------------+----------+
|             0|[08714712388, 10a...|          (4,[],[])|           (4,[],[])|  1.0|[1.72320551645813...|[0.84854126650309...|       0.0|
|             0|[123, Congratulat...|(4,[0,2],[1.0,1.0])|(4,[0,2],[1.69448...|  1.0|[2.30787005352106...|[0.90952673939586...|       0.0|
|             0|[18, days, Euro20...|          (4,[],[])|           (4,[],[])|  1.0|[1.72320551645813...|[0.84854126650309...|       0.0|
|             0|[1st, wk, FREE, G...|(4,[2,3],[1.0,1.0])|(4,[2,3],[2.33296...|  1.0|[1.17969670190232...|[0.76489326557647...|       0.0|
|             0|     [22, 146tf150

In [48]:
predictions.select('message_status','prediction').show(100)

+--------------+----------+
|message_status|prediction|
+--------------+----------+
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       1.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|       0.0|
|             0|    

In [49]:
accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(testData.count())

In [50]:
evaluate1 = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='message_status')

In [51]:
roc_auc = evaluate1.evaluate(predictions)

In [52]:
accuracy

0.872277810476751

In [53]:
roc_auc

0.48325363246162417

# Random Forest

In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="message_status", featuresCol="features")

In [None]:
from pyspark.ml import Pipeline

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
pipeline = Pipeline(stages=[cv, idf, label_stringIdx, rf])

In [None]:
pipelineFit = pipeline.fit(trainData)

In [None]:
predictions = pipelineFit.transform(testData)

In [None]:
predictions.show(5)

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="message_status", predictionCol="prediction", metricName="accuracy")

In [None]:
accuracy = evaluator.evaluate(predictions)

In [None]:
accuracy

In [None]:
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
evaluatorwp = MulticlassClassificationEvaluator(labelCol="message_status", predictionCol="prediction", metricName="weightedPrecision")
wp = evaluatorwp.evaluate(predictions)
print("weightedPrecision = %g" % wp)

# NGram

In [54]:
from pyspark.ml.feature import NGram, CountVectorizer, VectorAssembler

In [55]:
def build_ngrams(inputCol="Message_Array", n=3):

    ngrams = [
        NGram(n=i, inputCol="Message_Array", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    vectorizers = [
        CountVectorizer(inputCol="{0}_grams".format(i),
            outputCol="{0}_counts".format(i))
        for i in range(1, n + 1)
    ]

    assembler = [VectorAssembler(
        inputCols=["{0}_counts".format(i) for i in range(1, n + 1)],
        outputCol="features"
    )]

    return Pipeline(stages=ngrams + vectorizers + assembler)

In [None]:
#pipeline = Pipeline()

In [None]:
#Pipeline = build_ngrams()

In [56]:
result = build_ngrams().fit(trainData).transform(testData) 

In [57]:
result.show(5)

+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|message_status|       Message_Array|             1_grams|             2_grams|             3_grams|            1_counts|            2_counts|            3_counts|            features|
+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             0|[08714712388, 10a...|[08714712388, 10a...|[08714712388 10am...|[08714712388 10am...|(9441,[649,826,86...|(28685,[1341,1507...|(29594,[28827],[1...|(67720,[649,826,8...|
|             0|[123, Congratulat...|[123, Congratulat...|[123 Congratulati...|[123 Congratulati...|(9441,[0,2,4,84,1...|(28685,[67,2372,2...|(29594,[1098,2200...|(67720,[0,2,4,84,...|
|             0|[18, days, Euro20...|[18, days, Euro20...|[18 days, days Eu