In [1]:
# Import the PySpark module
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [2]:
# Create (or reuse) the SparkSession first. If a SparkContext is already
# running when the builder's getOrCreate() is called, that context is reused
# and the master / appName configured on the builder do not apply to it.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('textMining')
    .getOrCreate()
)

# Take the SparkContext from the session so both handles share one context
sc = spark.sparkContext

In [3]:
# Declare the schema up front instead of using inferSchema=True: inference
# needs an extra pass over the file, which is a poor choice for large data sets.
# The file is read without a header row, so the column names are assigned here.
sms = spark.read.csv(
    'sms.csv',
    sep=';',
    schema='id INT, text STRING, label INT',
    nullValue='NA',
)

In [4]:
# Swap Spark's positional default column names for descriptive ones
for default_name, readable_name in (('_c0', 'id'), ('_c1', 'text'), ('_c2', 'label')):
    sms = sms.withColumnRenamed(default_name, readable_name)

In [5]:
# Preview the renamed frame (20 rows, long cells truncated: show()'s defaults)
sms.show(n=20, truncate=True)

+---+--------------------+-----+
| id|                text|label|
+---+--------------------+-----+
|  1|Sorry, I'll call ...|    0|
|  2|Dont worry. I gue...|    0|
|  3|Call FREEPHONE 08...|    1|
|  4|Win a 1000 cash p...|    1|
|  5|Go until jurong p...|    0|
|  6|Ok lar... Joking ...|    0|
|  7|Free entry in 2 a...|    1|
|  8|U dun say so earl...|    0|
|  9|Nah I don't think...|    0|
| 10|FreeMsg Hey there...|    1|
| 11|Even my brother i...|    0|
| 12|As per your reque...|    0|
| 13|WINNER!! As a val...|    1|
| 14|Had your mobile 1...|    1|
| 15|I'm gonna be home...|    0|
| 16|SIX chances to wi...|    1|
| 17|URGENT! You have ...|    1|
| 18|I've been searchi...|    0|
| 19|I HAVE A DATE ON ...|    0|
| 20|XXXMobileMovieClu...|    1|
+---+--------------------+-----+
only showing top 20 rows



In [6]:
# Import the necessary functions
from pyspark.sql.functions import regexp_replace, trim
from pyspark.ml.feature import Tokenizer

# Replace punctuation (REGEX provided) and digits with spaces
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge runs of spaces, then strip the ends. A message that began with a
# digit or punctuation mark now starts with a space, and Tokenizer (which
# splits on whitespace) would otherwise emit an empty-string token for it.
wrangled = wrangled.withColumn('text', trim(regexp_replace(wrangled.text, ' +', ' ')))

# Split the (lowercased) text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

In [7]:
# Rebind `sms` to the cleaned, tokenized frame so later cells see the `words` column.
# NOTE(review): this overwrites the raw frame, so the cleaning cell above is
# not meant to be re-run after this one.
sms = wrangled

In [8]:
# Preview the cleaned text next to its token list (show()'s default settings)
sms.show(n=20, truncate=True)

+---+--------------------+-----+--------------------+
| id|                text|label|               words|
+---+--------------------+-----+--------------------+
|  1|Sorry I'll call l...|    0|[sorry, i'll, cal...|
|  2|Dont worry I gues...|    0|[dont, worry, i, ...|
|  3| Call FREEPHONE now |    1|[call, freephone,...|
|  4|Win a cash prize ...|    1|[win, a, cash, pr...|
|  5|Go until jurong p...|    0|[go, until, juron...|
|  6|Ok lar Joking wif...|    0|[ok, lar, joking,...|
|  7|Free entry in a w...|    1|[free, entry, in,...|
|  8|U dun say so earl...|    0|[u, dun, say, so,...|
|  9|Nah I don't think...|    0|[nah, i, don't, t...|
| 10|FreeMsg Hey there...|    1|[freemsg, hey, th...|
| 11|Even my brother i...|    0|[even, my, brothe...|
| 12|As per your reque...|    0|[as, per, your, r...|
| 13|WINNER As a value...|    1|[winner, as, a, v...|
| 14|Had your mobile m...|    1|[had, your, mobil...|
| 15|I'm gonna be home...|    0|[i'm, gonna, be, ...|
| 16|SIX chances to wi...|  

In [9]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Build the two stateless transformers up front
stop_word_filter = StopWordsRemover(inputCol='words', outputCol='terms')
term_hasher = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)

# Drop stop words, then hash the remaining terms into a 1024-slot vector
wrangled = term_hasher.transform(stop_word_filter.transform(sms))

# Fit the IDF weights on the hashed frame and rescale it to TF-IDF.
# NOTE(review): the IDF model is fitted on every row of `sms`, i.e. before the
# train/test split performed later in the notebook.
idf_model = IDF(inputCol='hash', outputCol='features').fit(wrangled)
tf_idf = idf_model.transform(wrangled)

In [10]:
# Carry the TF-IDF frame forward under the name `sms`, then preview it
sms = tf_idf
sms.show()

+---+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
| id|                text|label|               words|               terms|                hash|            features|
+---+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|  1|Sorry I'll call l...|    0|[sorry, i'll, cal...|[sorry, call, lat...|(1024,[138,344,37...|(1024,[138,344,37...|
|  2|Dont worry I gues...|    0|[dont, worry, i, ...|[dont, worry, gue...|(1024,[53,233,329...|(1024,[53,233,329...|
|  3| Call FREEPHONE now |    1|[call, freephone,...|   [call, freephone]|(1024,[138,396],[...|(1024,[138,396],[...|
|  4|Win a cash prize ...|    1|[win, a, cash, pr...|[win, cash, prize...|(1024,[31,69,387,...|(1024,[31,69,387,...|
|  5|Go until jurong p...|    0|[go, until, juron...|[go, jurong, poin...|(1024,[116,262,33...|(1024,[116,262,33...|
|  6|Ok lar Joking wif...|    0|[ok, lar, joking,...|[ok, lar, j

In [11]:
# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

# 80/20 train/test split; seed=13 fixes the random sampling
splits = sms.randomSplit([0.8, 0.2], seed=13)
sms_train, sms_test = splits

# Configure the estimator (column names spelled out; they match the defaults),
# then fit it to the training rows
logistic = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    regParam=0.2,
).fit(sms_train)

# Score the held-out rows
prediction = logistic.transform(sms_test)

# Confusion matrix: row counts per (known label, predicted label) pair
confusion = prediction.groupBy('label', 'prediction').count()
confusion.show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   47|
|    0|       0.0|  987|
|    1|       1.0|  124|
|    0|       1.0|    3|
+-----+----------+-----+



### Pipeline

In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, IDF, StopWordsRemover, Tokenizer

# Stage 1: lowercase the text and split it on whitespace
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Stage 2: drop stop words from the token list
remover = StopWordsRemover(inputCol='words', outputCol='terms')

# Stages 3-4: hash the terms into term-frequency vectors, then rescale to
# TF-IDF. numFeatures is left at the library default here, unlike the 1024
# used in the step-by-step version above.
hasher = HashingTF(inputCol='terms', outputCol='hash')
idf = IDF(inputCol='hash', outputCol="features")

# Stage 5: logistic regression with default settings
logistic = LogisticRegression()

# Chain every stage, in order, into a single pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])