# Jonathan Halverson
# Wednesday, February 7, 2018
# Spam classification

In this notebook we build a classifier for emails. We begin by preprocessing the text:

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.config("spark.driver.host", "localhost").getOrCreate()

In [2]:
import pyspark
pyspark.__version__

'2.2.1'

Read in the raw data and relabel the columns of the dataframe:

In [3]:
texts = spark.read.csv('smsspamcollection/SMSSpamCollection', sep='\t', header=False, inferSchema=True)
texts = texts.withColumnRenamed('_c0', 'label')
texts = texts.withColumnRenamed('_c1', 'messages')

In [4]:
texts.show(5)

+-----+--------------------+
|label|            messages|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



Convert labels to numerical values and change data type:

In [5]:
from pyspark.sql.functions import regexp_replace

texts = texts.withColumn('label', regexp_replace('label', 'ham', '0'))\
             .withColumn('label', regexp_replace('label', 'spam', '1'))
df = texts.withColumn('label', texts['label'].cast('int'))
texts.show(3)

+-----+--------------------+
|label|            messages|
+-----+--------------------+
|    0|Go until jurong p...|
|    0|Ok lar... Joking ...|
|    1|Free entry in 2 a...|
+-----+--------------------+
only showing top 3 rows



In [6]:
df.printSchema()

root
 |-- label: integer (nullable = true)
 |-- messages: string (nullable = true)



In [7]:
from pyspark.sql.functions import lit

df = df.withColumn('weight', lit(1))

In [8]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re

def clean_messages(message):
     letters_only = re.sub("[^a-zA-Z]", " ", message)
     words = letters_only.lower().split()
     return " ".join(words)

In [9]:
trans_udf = udf(clean_messages, StringType())
df = df.withColumn('messages', trans_udf(df.messages))

In [10]:
df.select('messages').show(10)

+--------------------+
|            messages|
+--------------------+
|go until jurong p...|
|ok lar joking wif...|
|free entry in a w...|
|u dun say so earl...|
|nah i don t think...|
|freemsg hey there...|
|even my brother i...|
|as per your reque...|
|winner as a value...|
|had your mobile m...|
+--------------------+
only showing top 10 rows



In [11]:
df.printSchema()

root
 |-- label: integer (nullable = true)
 |-- messages: string (nullable = true)
 |-- weight: integer (nullable = false)



In [12]:
df.groupby('label').count().toPandas()

Unnamed: 0,label,count
0,1,747
1,0,4827


Create a test train split:

In [47]:
df_train, df_test = df.randomSplit([0.7, 0.3], seed=42)

The text is converted to word count vectors (bag of words):

In [14]:
from pyspark.ml.feature import HashingTF, Tokenizer, IDF

In [15]:
tf = HashingTF(numFeatures=5000, inputCol='words', outputCol='rawFeatures')
tokenizer = Tokenizer(inputCol="messages", outputCol="words")
wordsData = tokenizer.transform(df_train)

In [16]:
featurizedData = tf.transform(wordsData)
featurizedData.show(3)

+-----+--------------------+------+--------------------+--------------------+
|label|            messages|weight|               words|         rawFeatures|
+-----+--------------------+------+--------------------+--------------------+
|    0|                    |     1|                  []| (5000,[3372],[1.0])|
|    0|                    |     1|                  []| (5000,[3372],[1.0])|
|    0|a bloo bloo bloo ...|     1|[a, bloo, bloo, b...|(5000,[329,387,68...|
+-----+--------------------+------+--------------------+--------------------+
only showing top 3 rows



In [17]:
featurizedData.first()

Row(label=0, messages=u'', weight=1, words=[u''], rawFeatures=SparseVector(5000, {3372: 1.0}))

In [18]:
featurizedData.printSchema()

root
 |-- label: integer (nullable = true)
 |-- messages: string (nullable = true)
 |-- weight: integer (nullable = false)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)



In [19]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData).cache()

In [20]:
rescaledData.printSchema()

root
 |-- label: integer (nullable = true)
 |-- messages: string (nullable = true)
 |-- weight: integer (nullable = false)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)



In [21]:
rescaledData.show(3)

+-----+--------------------+------+--------------------+--------------------+--------------------+
|label|            messages|weight|               words|         rawFeatures|            features|
+-----+--------------------+------+--------------------+--------------------+--------------------+
|    0|                    |     1|                  []| (5000,[3372],[1.0])|(5000,[3372],[7.1...|
|    0|                    |     1|                  []| (5000,[3372],[1.0])|(5000,[3372],[7.1...|
|    0|a bloo bloo bloo ...|     1|[a, bloo, bloo, b...|(5000,[329,387,68...|(5000,[329,387,68...|
+-----+--------------------+------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [22]:
rescaledData.first()

Row(label=0, messages=u'', weight=1, words=[u''], rawFeatures=SparseVector(5000, {3372: 1.0}), features=SparseVector(5000, {3372: 7.1699}))

In [23]:
from pyspark.ml.classification import LogisticRegression

In [24]:
lr = LogisticRegression(featuresCol='features', labelCol='label', weightCol='weight', maxIter=10, threshold=0.5)
model = lr.fit(rescaledData)

Let's try two out-of-sample emails are see if they are correctly classified:

In [50]:
y_pred = model.transform(idfModel.transform(tf.transform(tokenizer.transform(df_test)))).select('prediction')
y_pred.show(5)

+----------+
|prediction|
+----------+
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
+----------+
only showing top 5 rows



In [51]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [52]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="raw")
evaluator.evaluate(y_pred, {evaluator.metricName: "accuracy"})

IllegalArgumentException: u'BinaryClassificationEvaluator_48c58412f7c991cb6ccf parameter metricName given invalid value accuracy.'

In [26]:
sample = spark.createDataFrame([(0, "Get a free mansion by sending 1 million dollars to me.")], ["label", "messages"])
model.transform(idfModel.transform(tf.transform(tokenizer.transform(sample)))).select('prediction').show()

+----------+
|prediction|
+----------+
|       0.0|
+----------+



In [27]:
sample = spark.createDataFrame([(0, "Hi Mark, Let's meet at the coffee shop at 3 pm.")], ["label", "messages"])
model.transform(idfModel.transform(tf.transform(tokenizer.transform(sample)))).select('prediction').show()

+----------+
|prediction|
+----------+
|       0.0|
+----------+



We see that both predictions are correct. One could extend this example by doing more pre-processing on the emails and working with more data. The model could also be evaluated by looking at an ROC curve.

# Repeat above with sklearn

In [28]:
Xy = df.toPandas()

In [29]:
Xy.messages[10]

u'i m gonna be home soon and i don t want to talk about this stuff anymore tonight k i ve cried enough today'

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(Xy.messages, Xy.label, test_size=0.3,
                                                    shuffle=True, stratify=Xy.label)

In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, max_features=5000)
param_grid = [{'vect__ngram_range': [(1, 5)],
               'clf__C': [0.01, 0.1, 1.0],
               'clf__class_weight': ['balanced'],
               'vect__use_idf': [True], 'vect__norm': [None]}]
lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression())])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, refit=True)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   24.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'vect__ngram_range': [(1, 5)], 'vect__norm': [None], 'vect__use_idf': [True], 'clf__C': [0.01, 0.1, 1.0], 'clf__class_weight': ['balanced']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [34]:
print gs_lr_tfidf.best_params_

{'vect__ngram_range': (1, 5), 'vect__norm': None, 'vect__use_idf': True, 'clf__C': 0.1, 'clf__class_weight': 'balanced'}


In [35]:
print 'Accuracy (train):', gs_lr_tfidf.score(X_train, y_train)
print 'Accuracy (test):', gs_lr_tfidf.score(X_test, y_test)

Accuracy (train): 0.999487310946
Accuracy (test): 0.983861326958
