#### Natural Language Processing with PySpark

NLP Tools
- Tokenizer
- StopWordsRemover
- n-grams
- TF-IDF
- CountVectorizer

In [None]:
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip >> smsspamcollection.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  198k  100  198k    0     0   441k      0 --:--:-- --:--:-- --:--:--  441k


In [None]:
# !apt install unzip
!unzip /content/smsspamcollection.zip

Archive:  /content/smsspamcollection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 35 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 40.6 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=017d1532593466cb1f48bc09c014dd638c5ed34575b94ca7872c874828465087
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [None]:
from pyspark.sql import SparkSession

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("NLP Learning")
    .getOrCreate()
)

#### Tokenizer

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [None]:
# Small demo corpus: three whitespace-separated sentences plus one
# comma-separated row (id 3) that exposes the difference between tokenizers.
sent_df = spark.createDataFrame(
    data=[
        (0, "hello i am happy to be learning Apache Spark"),
        (1, "I enjoy learning about python and javascript progamming"),
        (2, "i am familiar with Machine Learning applications"),
        (3, "here, is,a,list,of,words"),
    ],
    schema=['id', 'sentences'],
)

sent_df.show(truncate=False)

+---+-------------------------------------------------------+
|id |sentences                                              |
+---+-------------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |
|1  |I enjoy learning about python and javascript progamming|
|2  |i am familiar with Machine Learning applications       |
|3  |here, is,a,list,of,words                               |
+---+-------------------------------------------------------+



In [None]:
# Plain tokenizer: lower-cases and splits on whitespace only.
tokenizer = Tokenizer(
    inputCol='sentences',
    outputCol='tokenOutput',
)

# Regex tokenizer: splits on every non-word character (\W), so commas and
# other punctuation also act as separators.
regexTokenizer = RegexTokenizer(
    inputCol='sentences',
    outputCol='regxOutput',
    pattern="\\W",
)

# UDF returning the number of tokens in an array column.
countTokens = udf(lambda tokens: len(tokens), returnType=IntegerType())



In [None]:
# Apply the whitespace tokenizer; row 3 stays glued together at its commas.
tokenized = tokenizer.transform(dataset=sent_df)
tokenized.show(truncate=False)

+---+-------------------------------------------------------+----------------------------------------------------------------+
|id |sentences                                              |tokenOutput                                                     |
+---+-------------------------------------------------------+----------------------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |
|1  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|
|2  |i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |
|3  |here, is,a,list,of,words                               |[here,, is,a,list,of,words]                                     |
+---+-------------------------------------------------------+--------------------------------------------------

In [None]:
# Show each sentence with its whitespace tokens and the token count.
(
    tokenized
    .select('sentences', 'tokenOutput')
    .withColumn("tokens", countTokens(col('tokenOutput')))
    .show(truncate=False)
)

+-------------------------------------------------------+----------------------------------------------------------------+------+
|sentences                                              |tokenOutput                                                     |tokens|
+-------------------------------------------------------+----------------------------------------------------------------+------+
|hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |9     |
|I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|8     |
|i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |7     |
|here, is,a,list,of,words                               |[here,, is,a,list,of,words]                                     |2     |
+-------------------------------------------------------+---------------------------------

In [None]:
# Apply the regex tokenizer and show the token count per sentence.
regexTokenized = regexTokenizer.transform(dataset=sent_df)
(
    regexTokenized
    .select('sentences', 'regxOutput')
    .withColumn("tokens", countTokens(col('regxOutput')))
    .show(truncate=False)
)

+-------------------------------------------------------+----------------------------------------------------------------+------+
|sentences                                              |regxOutput                                                      |tokens|
+-------------------------------------------------------+----------------------------------------------------------------+------+
|hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |9     |
|I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|8     |
|i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |7     |
|here, is,a,list,of,words                               |[here, is, a, list, of, words]                                  |6     |
+-------------------------------------------------------+---------------------------------

#### Stop Word Removal

In [None]:
from pyspark.ml.feature import StopWordsRemover

In [None]:
# Drop stop words from the regex tokens; no custom word list is passed,
# so the remover's default list is used.
remover = StopWordsRemover(
    inputCol='regxOutput',
    outputCol='cleaned',
)
remover.transform(dataset=regexTokenized).show(truncate=False)

+---+-------------------------------------------------------+----------------------------------------------------------------+-------------------------------------------------+
|id |sentences                                              |regxOutput                                                      |cleaned                                          |
+---+-------------------------------------------------------+----------------------------------------------------------------+-------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |[hello, happy, learning, apache, spark]          |
|1  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|[enjoy, learning, python, javascript, progamming]|
|2  |i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, application

#### n-gram

In [None]:
from pyspark.ml.feature import NGram

In [None]:
# Re-display the whitespace-tokenized frame that the n-gram step below consumes.
tokenized.show(truncate=False)

+---+-------------------------------------------------------+----------------------------------------------------------------+
|id |sentences                                              |tokenOutput                                                     |
+---+-------------------------------------------------------+----------------------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |
|1  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|
|2  |i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |
|3  |here, is,a,list,of,words                               |[here,, is,a,list,of,words]                                     |
+---+-------------------------------------------------------+--------------------------------------------------

In [None]:
# Build 2-grams (pairs of consecutive tokens) from the whitespace tokens.
bigram = NGram(
    n=2,
    inputCol='tokenOutput',
    outputCol='bigrams',
)
bigram_df = bigram.transform(dataset=tokenized)
bigram_df.show(truncate=False)

+---+-------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------------------------------------------------+
|id |sentences                                              |tokenOutput                                                     |bigrams                                                                                                   |
+---+-------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |[hello i, i am, am happy, happy to, to be, be learning, learning apache, apache spark]                    |
|1  |I enjoy learning about python and javascript progamming|[i,

#### Feature Extraction
- TF-IDF

In [None]:
from pyspark.ml.feature import IDF, HashingTF, Tokenizer

In [None]:
# Same demo sentences as before, now with a numeric `label` column.
# NOTE: this rebinds `sent_df`, replacing the unlabeled frame defined earlier.
sent_df = spark.createDataFrame(
    data=[
        (0, 0.0, "hello i am happy to be learning Apache Spark"),
        (1, 0.0, "I enjoy learning about python and javascript progamming"),
        (2, 1.0, "i am familiar with Machine Learning applications"),
        (3, 1.0, "here, is,a,list,of,words"),
    ],
    schema=['id', 'label', 'sentences'],
)

sent_df.show(truncate=False)

+---+-----+-------------------------------------------------------+
|id |label|sentences                                              |
+---+-----+-------------------------------------------------------+
|0  |0.0  |hello i am happy to be learning Apache Spark           |
|1  |0.0  |I enjoy learning about python and javascript progamming|
|2  |1.0  |i am familiar with Machine Learning applications       |
|3  |1.0  |here, is,a,list,of,words                               |
+---+-----+-------------------------------------------------------+



In [None]:
# Tokenize on non-word characters so the comma-separated row splits cleanly.
# NOTE: this rebinds `tokenizer` from the earlier whitespace Tokenizer.
tokenizer = RegexTokenizer(
    inputCol='sentences',
    outputCol='words',
    pattern="\\W",
)
words_df = tokenizer.transform(dataset=sent_df)
words_df.show(truncate=False)

+---+-----+-------------------------------------------------------+----------------------------------------------------------------+
|id |label|sentences                                              |words                                                           |
+---+-----+-------------------------------------------------------+----------------------------------------------------------------+
|0  |0.0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |
|1  |0.0  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|
|2  |1.0  |i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |
|3  |1.0  |here, is,a,list,of,words                               |[here, is, a, list, of, words]                                  |
+---+-----+-------------------------------------------------------+--

In [None]:
# Hash each word into one of 20 buckets to get term-frequency vectors.
# With so few buckets, distinct words can share an index (row 1 of the
# output below has a count of 2.0 even though all of its words differ).
hashingTF = HashingTF(
    numFeatures=20,
    inputCol='words',
    outputCol='rawFeatures',
)
featurized = hashingTF.transform(dataset=words_df)

In [None]:
# Inspect the hashed term-frequency vectors (rawFeatures) next to the tokens.
featurized.show(truncate=False)

+---+-----+-------------------------------------------------------+----------------------------------------------------------------+-----------------------------------------------------------------+
|id |label|sentences                                              |words                                                           |rawFeatures                                                      |
+---+-----+-------------------------------------------------------+----------------------------------------------------------------+-----------------------------------------------------------------+
|0  |0.0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |(20,[3,5,6,7,8,9,12,15,16],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1  |0.0  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|(20,[5,6,9,11,12,14,16],[1.0,1.0,1.0,1.0,1.0,1.0,2.0])           |
|2  |

In [None]:
# Fit an IDF model on the hashed term-frequency vectors.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)
idf_model = idf.fit(dataset=featurized)

In [None]:
# Rescale the raw term frequencies with the fitted IDF weights and show the
# resulting TF-IDF vector for each labelled row.
rescale = idf_model.transform(dataset=featurized)
(
    rescale
    .select("label", 'features')
    .show(truncate=False)
)

+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                    |
+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(20,[3,5,6,7,8,9,12,15,16],[0.9162907318741551,0.22314355131420976,0.5108256237659907,0.5108256237659907,0.5108256237659907,0.22314355131420976,0.0,0.5108256237659907,0.22314355131420976])|
|0.0  |(20,[5,6,9,11,12,14,16],[0.22314355131420976,0.5108256237659907,0.22314355131420976,0.9162907318741551,0.0,0.9162907318741551,0.44628710262841953])                                         |
|1.0  |(20,[0,2

#### Count Vectorization

In [None]:
from pyspark.ml.feature import CountVectorizer

In [None]:
# Two toy "documents" whose words are single letters.
df = spark.createDataFrame(
    data=[
        (0, list('abcde')),
        (1, list('abbbccddee')),
    ],
    schema=['id', 'words'],
)

df.show()

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|     [a, b, c, d, e]|
|  1|[a, b, b, b, c, c...|
+---+--------------------+



In [None]:
# Learn a vocabulary of at most 5 terms from the toy documents and turn each
# document into a vector of per-term counts.
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=5,
    minDF=2.0,
)
model = cv.fit(dataset=df)
res = model.transform(dataset=df)
res.show(truncate=False)

+---+------------------------------+-------------------------------------+
|id |words                         |features                             |
+---+------------------------------+-------------------------------------+
|0  |[a, b, c, d, e]               |(5,[0,1,2,3,4],[1.0,1.0,1.0,1.0,1.0])|
|1  |[a, b, b, b, c, c, d, d, e, e]|(5,[0,1,2,3,4],[3.0,2.0,2.0,2.0,1.0])|
+---+------------------------------+-------------------------------------+



#### NLP with Naive Bayes: PySpark

In [None]:
# Peek at the first lines of the raw file: each line is a label (ham/spam),
# a tab, and the message text.
!head SMSSpamCollection

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam	H

In [None]:
# Load the tab-separated file; it has no header row, so Spark assigns the
# default column names _c0 and _c1.
df = spark.read.csv(
    path="/content/SMSSpamCollection",
    sep='\t',
    inferSchema=True,
)
df.show(5)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [None]:
# Replace the auto-generated column names with meaningful ones.
df = (
    df
    .withColumnRenamed('_c0', "class")
    .withColumnRenamed("_c1", "text")
)
df.show(5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



#### Clean Data

In [None]:
from pyspark.sql.functions import length

In [None]:
# Add the character length of each message as a numeric feature.
df = df.withColumn(
    colName="length",
    col=length(df['text']),
)
df.show(5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [None]:
# Average message length per class; in the output below spam messages are
# roughly twice as long as ham on average.
(
    df
    .groupBy('class')
    .avg()
    .show()
)

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



#### Feature Transformation

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, StringIndexer

In [None]:
# Feature-engineering stages, chained by column name:
#   text -> token_text -> stop_tokens -> c_vec -> tf_idf, and class -> label.
# NOTE: `tokenizer` and `idf` rebind names used in earlier demo cells.
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='token_text',
)
stop_word_remover = StopWordsRemover(
    inputCol='token_text',
    outputCol='stop_tokens',
)
count_vec = CountVectorizer(
    inputCol='stop_tokens',
    outputCol='c_vec',
)
idf = IDF(
    inputCol="c_vec",
    outputCol='tf_idf',
)
ham_spam_to_num = StringIndexer(
    inputCol='class',
    outputCol='label',
)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [None]:
# Concatenate the TF-IDF vector and the message length into one `features`
# vector column.
cleaned = VectorAssembler(
    inputCols=['tf_idf', 'length'],
    outputCol='features',
)

#### loading model

In [None]:
from pyspark.ml.classification import NaiveBayes

In [None]:
# Naive Bayes classifier constructed with no arguments (library defaults).
nb = NaiveBayes()

In [None]:
# pipeline
from pyspark.ml import Pipeline

In [None]:
# Data-preparation pipeline; the classifier itself is not part of it and is
# trained separately on the prepared frame.
pipeline = Pipeline(
    stages=[
        ham_spam_to_num,    # class       -> label
        tokenizer,          # text        -> token_text
        stop_word_remover,  # token_text  -> stop_tokens
        count_vec,          # stop_tokens -> c_vec
        idf,                # c_vec       -> tf_idf
        cleaned,            # tf_idf + length -> features
    ]
)

In [None]:
# Fit every pipeline stage on the full dataset; `cleaner` is the fitted pipeline.
cleaner = pipeline.fit(df)

In [None]:
# Apply the fitted pipeline to add the label and feature columns to df.
clean_df = cleaner.transform(df)

In [None]:
# Preview the transformed frame, including every intermediate column the
# pipeline stages added.
clean_df.show(5)

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|  0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|(13424,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,297,...|(13423,[0,24,297,...|(13424,[0,24,297,...|
| spam|Free entry in 2 a...|   155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|(13424,[2,13,19,3...|
|  ham|U dun say so earl...|    49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|(13423,[0,70,8

#### Training and evaluation

In [None]:
# Keep only the two columns that are carried into training and evaluation.
clean_df = clean_df.select('label', 'features')
clean_df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
+-----+--------------------+
only showing top 5 rows



In [None]:
# 70/30 train/test split with a fixed seed.
train, test = clean_df.randomSplit(weights=[0.7, 0.3], seed=42)

In [None]:
# Train Naive Bayes on the training split. Despite its name, `prediction`
# is the fitted model: the next cell calls .transform() on it.
prediction = nb.fit(train)

In [None]:
# Score the held-out test split with the fitted model.
# NOTE: this rebinds `res`, which held the CountVectorizer demo output earlier.
res = prediction.transform(test)

In [None]:
# Preview the scored rows: true label alongside the rawPrediction,
# probability and prediction columns.
res.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,41,...|[-1060.7325420854...|[1.0,9.6391158107...|       0.0|
|  0.0|(13424,[0,1,5,20,...|[-803.13623340156...|[1.0,2.7071860143...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-1152.0926413349...|[1.0,6.3682506790...|       0.0|
|  0.0|(13424,[0,1,7,15,...|[-656.71821333935...|[1.0,7.6641099247...|       0.0|
|  0.0|(13424,[0,1,12,33...|[-444.22584589378...|[1.0,1.4534997554...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Evaluate the predictions on the held-out test split.
# metricName is set explicitly: the evaluator's default metric is F1, so the
# previous no-argument construction stored an F1 score in `acc`, not accuracy.
# The variable is named `evaluator` so Python's built-in eval() is not shadowed.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)
acc = evaluator.evaluate(res)
acc

0.9266021977210805