#### Natural Language Processing with PySpark

NLP Tools
- Tokenizer
- StopWordsRemover
- n-grams
- TF-IDF
- CountVectorizer

In [8]:
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip >> smsspamcollection.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  198k  100  198k    0     0   185k      0  0:00:01  0:00:01 --:--:--  185k


In [None]:
# !apt install unzip
!unzip /content/smsspamcollection.zip

In [None]:
!pip install pyspark

In [12]:
from pyspark.sql import SparkSession

In [14]:
# Obtain the Spark session for this notebook (an existing session is reused
# if one is already active).
spark = (
    SparkSession.builder
    .appName("NLP Learning")
    .getOrCreate()
)

#### Tokenizer

In [15]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [23]:
# Small demo corpus: one sentence per row. The last row separates its words
# with commas instead of spaces, which makes the difference between the
# tokenizers visible in the cells below.
sent_df = spark.createDataFrame(
    data=[
        (0, "hello i am happy to be learning Apache Spark"),
        (1, "I enjoy learning about python and javascript progamming"),
        (2, "i am familiar with Machine Learning applications"),
        (3, "here, is,a,list,of,words"),
    ],
    schema=["id", "sentences"],
)

sent_df.show(n=20, truncate=False)

+---+-------------------------------------------------------+
|id |sentences                                              |
+---+-------------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |
|1  |I enjoy learning about python and javascript progamming|
|2  |i am familiar with Machine Learning applications       |
|3  |here, is,a,list,of,words                               |
+---+-------------------------------------------------------+



In [24]:
# Two tokenizers reading the same `sentences` column:
#   * Tokenizer       -> writes `tokenOutput`
#   * RegexTokenizer  -> writes `regxOutput`, splitting on non-word characters (\W)
tokenizer = Tokenizer(
    inputCol="sentences",
    outputCol="tokenOutput",
)
regexTokenizer = RegexTokenizer(
    inputCol="sentences",
    outputCol="regxOutput",
    pattern="\\W",
)

# UDF that returns the number of tokens in a token-array column.
countTokens = udf(lambda tokens: len(tokens), IntegerType())


In [25]:
# Run the plain Tokenizer over the sentences; the result gains a `tokenOutput` column.
tokenized = tokenizer.transform(dataset=sent_df)
tokenized.show(n=20, truncate=False)

+---+-------------------------------------------------------+----------------------------------------------------------------+
|id |sentences                                              |tokenOutput                                                     |
+---+-------------------------------------------------------+----------------------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |
|1  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|
|2  |i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |
|3  |here, is,a,list,of,words                               |[here,, is,a,list,of,words]                                     |
+---+-------------------------------------------------------+--------------------------------------------------

In [29]:
# Show each sentence with its Tokenizer output and the number of tokens produced.
(
    tokenized
    .select("sentences", "tokenOutput")
    .withColumn("tokens", countTokens(col("tokenOutput")))
    .show(truncate=False)
)

+-------------------------------------------------------+----------------------------------------------------------------+------+
|sentences                                              |tokenOutput                                                     |tokens|
+-------------------------------------------------------+----------------------------------------------------------------+------+
|hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |9     |
|I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|8     |
|i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |7     |
|here, is,a,list,of,words                               |[here,, is,a,list,of,words]                                     |2     |
+-------------------------------------------------------+---------------------------------

In [28]:
# Same comparison using the RegexTokenizer: tokenize, then show the tokens
# next to their count.
regexTokenized = regexTokenizer.transform(dataset=sent_df)
(
    regexTokenized
    .select("sentences", "regxOutput")
    .withColumn("tokens", countTokens(col("regxOutput")))
    .show(truncate=False)
)

+-------------------------------------------------------+----------------------------------------------------------------+------+
|sentences                                              |regxOutput                                                      |tokens|
+-------------------------------------------------------+----------------------------------------------------------------+------+
|hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |9     |
|I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|8     |
|i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |7     |
|here, is,a,list,of,words                               |[here, is, a, list, of, words]                                  |6     |
+-------------------------------------------------------+---------------------------------

#### Stop Word Removal

In [30]:
from pyspark.ml.feature import StopWordsRemover

In [36]:
# Filter stop words out of the regex tokens into a new `cleaned` column.
# No explicit stop-word list is passed, so the remover's defaults apply.
remover = StopWordsRemover(
    inputCol="regxOutput",
    outputCol="cleaned",
)
remover.transform(dataset=regexTokenized).show(n=20, truncate=False)

+---+-------------------------------------------------------+----------------------------------------------------------------+-------------------------------------------------+
|id |sentences                                              |regxOutput                                                      |cleaned                                          |
+---+-------------------------------------------------------+----------------------------------------------------------------+-------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |[hello, happy, learning, apache, spark]          |
|1  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|[enjoy, learning, python, javascript, progamming]|
|2  |i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, application

#### n-gram

In [37]:
from pyspark.ml.feature import NGram

In [42]:
# Recap of the Tokenizer output that the n-gram step below takes as input.
tokenized.show(n=20, truncate=False)

+---+-------------------------------------------------------+----------------------------------------------------------------+
|id |sentences                                              |tokenOutput                                                     |
+---+-------------------------------------------------------+----------------------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |
|1  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|
|2  |i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |
|3  |here, is,a,list,of,words                               |[here,, is,a,list,of,words]                                     |
+---+-------------------------------------------------------+--------------------------------------------------

In [44]:
# Build 2-grams (pairs of consecutive tokens) from the `tokenOutput` column.
bigram = NGram(
    n=2,
    inputCol="tokenOutput",
    outputCol="bigrams",
)
bigram_df = bigram.transform(dataset=tokenized)
bigram_df.show(n=20, truncate=False)

+---+-------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------------------------------------------------+
|id |sentences                                              |tokenOutput                                                     |bigrams                                                                                                   |
+---+-------------------------------------------------------+----------------------------------------------------------------+----------------------------------------------------------------------------------------------------------+
|0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |[hello i, i am, am happy, happy to, to be, be learning, learning apache, apache spark]                    |
|1  |I enjoy learning about python and javascript progamming|[i,

#### Feature Extraction
- TF-IDF

In [47]:
from pyspark.ml.feature import IDF, HashingTF, Tokenizer

In [49]:
# Demo corpus with an added float `label` column.
# NOTE: this rebinds `sent_df`, replacing the two-column frame created earlier.
sent_df = spark.createDataFrame(
    data=[
        (0, 0.0, "hello i am happy to be learning Apache Spark"),
        (1, 0.0, "I enjoy learning about python and javascript progamming"),
        (2, 1.0, "i am familiar with Machine Learning applications"),
        (3, 1.0, "here, is,a,list,of,words"),
    ],
    schema=["id", "label", "sentences"],
)

sent_df.show(n=20, truncate=False)

+---+-----+-------------------------------------------------------+
|id |label|sentences                                              |
+---+-----+-------------------------------------------------------+
|0  |0.0  |hello i am happy to be learning Apache Spark           |
|1  |0.0  |I enjoy learning about python and javascript progamming|
|2  |1.0  |i am familiar with Machine Learning applications       |
|3  |1.0  |here, is,a,list,of,words                               |
+---+-----+-------------------------------------------------------+



In [50]:
# Split the labelled sentences on non-word characters into a `words` column.
# NOTE: this rebinds `tokenizer`, which previously referred to the plain Tokenizer.
tokenizer = RegexTokenizer(
    inputCol="sentences",
    outputCol="words",
    pattern="\\W",
)
words_df = tokenizer.transform(dataset=sent_df)
words_df.show(n=20, truncate=False)

+---+-----+-------------------------------------------------------+----------------------------------------------------------------+
|id |label|sentences                                              |words                                                           |
+---+-----+-------------------------------------------------------+----------------------------------------------------------------+
|0  |0.0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |
|1  |0.0  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|
|2  |1.0  |i am familiar with Machine Learning applications       |[i, am, familiar, with, machine, learning, applications]        |
|3  |1.0  |here, is,a,list,of,words                               |[here, is, a, list, of, words]                                  |
+---+-----+-------------------------------------------------------+--

In [52]:
# Term frequencies via the hashing trick: each word is hashed into one of
# 20 buckets and occurrences are counted per bucket. With so few buckets,
# different words can land in the same bucket (row 1 in the output below has
# a count of 2.0 even though none of its words repeat).
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)
featurized = hashingTF.transform(dataset=words_df)

In [54]:
# Inspect the sparse term-frequency vectors in `rawFeatures`.
featurized.show(n=20, truncate=False)

+---+-----+-------------------------------------------------------+----------------------------------------------------------------+-----------------------------------------------------------------+
|id |label|sentences                                              |words                                                           |rawFeatures                                                      |
+---+-----+-------------------------------------------------------+----------------------------------------------------------------+-----------------------------------------------------------------+
|0  |0.0  |hello i am happy to be learning Apache Spark           |[hello, i, am, happy, to, be, learning, apache, spark]          |(20,[3,5,6,7,8,9,12,15,16],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1  |0.0  |I enjoy learning about python and javascript progamming|[i, enjoy, learning, about, python, and, javascript, progamming]|(20,[5,6,9,11,12,14,16],[1.0,1.0,1.0,1.0,1.0,1.0,2.0])           |
|2  |

In [55]:
# Fit an IDF model on the raw term-frequency vectors; the fitted model will
# write its rescaled vectors to `features`.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idf_model = idf.fit(dataset=featurized)

In [57]:
# Apply the fitted IDF model and show the resulting TF-IDF vector for each label.
rescale = idf_model.transform(dataset=featurized)
(
    rescale
    .select("label", "features")
    .show(truncate=False)
)

+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                    |
+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(20,[3,5,6,7,8,9,12,15,16],[0.9162907318741551,0.22314355131420976,0.5108256237659907,0.5108256237659907,0.5108256237659907,0.22314355131420976,0.0,0.5108256237659907,0.22314355131420976])|
|0.0  |(20,[5,6,9,11,12,14,16],[0.22314355131420976,0.5108256237659907,0.22314355131420976,0.9162907318741551,0.0,0.9162907318741551,0.44628710262841953])                                         |
|1.0  |(20,[0,2

#### Count Vectorization

In [58]:
from pyspark.ml.feature import CountVectorizer

In [59]:
# Toy corpus for CountVectorizer: each row's `words` is a list of single letters.
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c", "d", "e"]),
        (1, ["a", "b", "b", "b", "c", "c", "d", "d", "e", "e"]),
    ],
    schema=["id", "words"],
)

df.show(n=20)

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|     [a, b, c, d, e]|
|  1|[a, b, b, b, c, c...|
+---+--------------------+



In [61]:
# Learn a vocabulary from `words` and encode each row as a vector of term counts.
#   vocabSize=5 : keep at most 5 terms in the vocabulary
#   minDF=2.0   : a term must occur in at least 2 rows to enter the vocabulary
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=5,
    minDF=2.0,
)
model = cv.fit(dataset=df)
res = model.transform(dataset=df)
res.show(n=20, truncate=False)

+---+------------------------------+-------------------------------------+
|id |words                         |features                             |
+---+------------------------------+-------------------------------------+
|0  |[a, b, c, d, e]               |(5,[0,1,2,3,4],[1.0,1.0,1.0,1.0,1.0])|
|1  |[a, b, b, b, c, c, d, d, e, e]|(5,[0,1,2,3,4],[3.0,2.0,2.0,2.0,1.0])|
+---+------------------------------+-------------------------------------+

