In [None]:
# Initialise findspark before the pyspark imports in the following cells.
# NOTE(review): presumably this makes a local Spark installation importable — confirm.
import findspark
findspark.init()

In [None]:
import pyspark

In [None]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [None]:
# Obtain the SparkSession shared by every cell below
# (an already-running session is reused, otherwise a new one is created).
spark = (
    SparkSession.builder
    .appName('lr_demo')
    .getOrCreate()
)

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [None]:
# Three sample sentences: two are space-separated, the last one is
# comma-separated so the tokenizers below can be contrasted on it.
sentenceDataFrame = spark.createDataFrame(
    [
        (0, 'Hi I heard about Spark'),
        (1, 'I know Spark can work well with NLP'),
        (2, 'Logistic,regression,models,are,supervised'),
    ],
    schema=['id', 'sentence'],
)

In [None]:
# Preview the rows; with default arguments long cell values are cut short
# (note the trailing "..." in the sentence column of the output).
sentenceDataFrame.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I know Spark can ...|
|  2|Logistic,regressi...|
+---+--------------------+



## Tokenizer

In [None]:
# Plain tokenizer: lower-cases the sentence and splits it on whitespace only,
# so the comma-separated sentence stays a single token.
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

# Regex tokenizer: the pattern here describes the separators between tokens
# (any non-word character), so commas split the text as well.
# Alternatively, match the tokens themselves: pattern='\\w+', gaps=False.
regexTokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

# UDF that returns how many tokens a words array holds.
countTokens = udf(lambda tokens: len(tokens), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
(
    tokenized
    .select('sentence', 'words')
    .withColumn('tokens', countTokens(col('words')))
    .show(truncate=False)
)

+-----------------------------------------+--------------------------------------------+------+
|sentence                                 |words                                       |tokens|
+-----------------------------------------+--------------------------------------------+------+
|Hi I heard about Spark                   |[hi, i, heard, about, spark]                |5     |
|I know Spark can work well with NLP      |[i, know, spark, can, work, well, with, nlp]|8     |
|Logistic,regression,models,are,supervised|[logistic,regression,models,are,supervised] |1     |
+-----------------------------------------+--------------------------------------------+------+



In [None]:
# Same report as for the plain tokenizer, this time using the regex tokenizer,
# which also splits the comma-separated sentence into individual words.
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
(
    regexTokenized
    .select('sentence', 'words')
    .withColumn('tokens', countTokens(col('words')))
    .show(truncate=False)
)

+-----------------------------------------+-----------------------------------------------+------+
|sentence                                 |words                                          |tokens|
+-----------------------------------------+-----------------------------------------------+------+
|Hi I heard about Spark                   |[hi, i, heard, about, spark]                   |5     |
|I know Spark can work well with NLP      |[i, know, spark, can, work, well, with, nlp]   |8     |
|Logistic,regression,models,are,supervised|[logistic, regression, models, are, supervised]|5     |
+-----------------------------------------+-----------------------------------------------+------+



## StopWordsRemover

In [None]:
from pyspark.ml.feature import StopWordsRemover

# Two pre-tokenised sentences; the 'raw' column holds arrays of words.
sentenceData = spark.createDataFrame(
    [
        (0, ['I', 'go', 'to', 'school', 'by', 'bus']),
        (1, ['Minh', 'has', 'lots', 'of', 'pencils']),
    ],
    schema=['id', 'raw'],
)

# No custom stop-word list is supplied, so the remover's default list applies.
remover = StopWordsRemover(inputCol='raw', outputCol='filtered')
remover.transform(sentenceData).show(truncate=False)

+---+------------------------------+---------------------+
|id |raw                           |filtered             |
+---+------------------------------+---------------------+
|0  |[I, go, to, school, by, bus]  |[go, school, bus]    |
|1  |[Minh, has, lots, of, pencils]|[Minh, lots, pencils]|
+---+------------------------------+---------------------+



## NGram

In [None]:
from pyspark.ml.feature import NGram

# Pre-tokenised sentences. Every token must be its own list element:
# adjacent string literals with no comma between them are concatenated by
# Python into one string, which would collapse several words into one token.
wordDataFrame = spark.createDataFrame([
    (0, ['Hi', 'I', 'heard', 'about', 'Spark']),
    (1, ['I', 'know', 'Spark', 'can', 'work', 'well', 'with', 'NLP']),
    (2, ['Logistic', 'regression', 'models', 'are', 'supervised'])
], ['id', 'words'])

# n=2 yields bigrams: each output element is two consecutive tokens joined by a space.
ngram = NGram(n=2, inputCol='words', outputCol='ngrams')

ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select('ngrams').show(truncate=False)

+--------------------------------------------------------------------+
|ngrams                                                              |
+--------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                           |
|[I know, know SparkcanworkwellwithNLP]                              |
|[Logistic regression, regression models, models are, are supervised]|
+--------------------------------------------------------------------+



## TF-IDF

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [None]:
# Tiny labelled corpus for the TF-IDF walkthrough: 'a', 'b' and 'c' occur in
# every sentence, 'd' only in the last one.
sentenceData = spark.createDataFrame(
    [
        (0, 'a b c'),
        (0, 'a b c a'),
        (1, 'a b d d a c c'),
    ],
    schema=['label', 'sentence'],
)

In [None]:
# Display the corpus with full, untruncated cell values.
sentenceData.show(truncate=False)

+-----+-------------+
|label|sentence     |
+-----+-------------+
|0    |a b c        |
|0    |a b c a      |
|1    |a b d d a c c|
+-----+-------------+



In [None]:
# Split each sentence into a 'words' array with a whitespace tokenizer.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)
wordsData = tokenizer.transform(sentenceData)
wordsData.show(truncate=False)

+-----+-------------+---------------------+
|label|sentence     |words                |
+-----+-------------+---------------------+
|0    |a b c        |[a, b, c]            |
|0    |a b c a      |[a, b, c, a]         |
|1    |a b d d a c c|[a, b, d, d, a, c, c]|
+-----+-------------+---------------------+



In [None]:
# Term frequencies via feature hashing: every word is mapped to one of
# numFeatures slots and 'rawFeatures' holds the per-slot counts as a sparse vector.
hashingTF = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
    numFeatures=10,
)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show(truncate=False)

+-----+-------------+---------------------+--------------------------------+
|label|sentence     |words                |rawFeatures                     |
+-----+-------------+---------------------+--------------------------------+
|0    |a b c        |[a, b, c]            |(10,[0,1,2],[1.0,1.0,1.0])      |
|0    |a b c a      |[a, b, c, a]         |(10,[0,1,2],[2.0,1.0,1.0])      |
|1    |a b d d a c c|[a, b, d, d, a, c, c]|(10,[0,1,2,4],[2.0,1.0,2.0,2.0])|
+-----+-------------+---------------------+--------------------------------+



In [None]:
# Fit inverse document frequencies on the raw counts, then rescale them.
# Terms present in every document end up with weight 0.0 (see the output),
# leaving only the slot of the term unique to the last sentence non-zero.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select('label', 'features').show(truncate=False)

+-----+-----------------------------------------------+
|label|features                                       |
+-----+-----------------------------------------------+
|0    |(10,[0,1,2],[0.0,0.0,0.0])                     |
|0    |(10,[0,1,2],[0.0,0.0,0.0])                     |
|1    |(10,[0,1,2,4],[0.0,0.0,0.0,1.3862943611198906])|
+-----+-----------------------------------------------+



## CountVectorizer

In [None]:
from pyspark.ml.feature import CountVectorizer

# Input data: each row is a bag of words with an ID.
df = spark.createDataFrame(
    [
        (0, 'a b c'.split(' ')),
        (0, 'a b c a'.split(' ')),
        (1, 'a b d d a c c'.split(' ')),
    ],
    schema=['id', 'words'],
)

# Fit a CountVectorizerModel from the corpus.
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=4,
    minDF=1,
)
model = cv.fit(df)

# Turn each word list into a sparse vector of per-term counts.
result = model.transform(df)
result.show(truncate=False)

+---+---------------------+-------------------------------+
|id |words                |features                       |
+---+---------------------+-------------------------------+
|0  |[a, b, c]            |(4,[0,1,2],[1.0,1.0,1.0])      |
|0  |[a, b, c, a]         |(4,[0,1,2],[2.0,1.0,1.0])      |
|1  |[a, b, d, d, a, c, c]|(4,[0,1,2,3],[2.0,2.0,1.0,2.0])|
+---+---------------------+-------------------------------+

