In [1]:
# Stop the notebook from wrapping <pre> output so that wide DataFrame.show()
# tables keep their column alignment and scroll horizontally instead.
# HTML and display are taken from the public IPython.display API rather than
# the internal IPython.core.display module / the kernel-injected builtin.
from IPython.display import HTML, display

display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
# findspark.init() runs before the pyspark imports in the following cells; it is
# expected to locate the local Spark installation and make pyspark importable.
# NOTE(review): confirm SPARK_HOME (or an equivalent) is configured in the target environment.
import findspark
findspark.init()

In [3]:
import pyspark
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [5]:
# Obtain the notebook-wide SparkSession (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName('association_rule')
    .getOrCreate()
)

In [6]:
# Sample corpus: two space-separated sentences and one comma-separated sentence,
# so whitespace-based and regex-based tokenization give visibly different results.
sentenceDataFrame = spark.createDataFrame(
    [
        (0, "Hi I heard about Spark"),
        (1, "I know Spark can work well with NLP"),
        (2, "Logistic,regession,models,are,supervised"),
    ],
    schema=["id", "sentence"],
)

### Tokenizer

In [7]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [8]:
# Plain Tokenizer: lowercases the sentence and splits on whitespace only, so the
# comma-separated sentence ends up as a single token.
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

# RegexTokenizer with pattern "\\W": every non-word character (space, comma, ...)
# is treated as a separator between tokens.
# Equivalent alternative: pattern="\\w+" with gaps=False, in which case the regex
# matches the tokens themselves instead of the gaps between them.
regexTokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    pattern="\\W",
)

# UDF that returns the number of tokens held in an array column.
countTokens = udf(lambda tokens: len(tokens), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
(
    tokenized
    .select('sentence', 'words')
    .withColumn('tokens', countTokens(col('words')))
    .show(truncate=False)
)

+----------------------------------------+--------------------------------------------+------+
|sentence                                |words                                       |tokens|
+----------------------------------------+--------------------------------------------+------+
|Hi I heard about Spark                  |[hi, i, heard, about, spark]                |5     |
|I know Spark can work well with NLP     |[i, know, spark, can, work, well, with, nlp]|8     |
|Logistic,regession,models,are,supervised|[logistic,regession,models,are,supervised]  |1     |
+----------------------------------------+--------------------------------------------+------+



In [9]:
# Same sentences through the regex tokenizer: the comma-separated sentence is now
# split into individual words.
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
(
    regexTokenized
    .select('sentence', 'words')
    .withColumn('tokens', countTokens(col('words')))
    .show(truncate=False)
)

+----------------------------------------+----------------------------------------------+------+
|sentence                                |words                                         |tokens|
+----------------------------------------+----------------------------------------------+------+
|Hi I heard about Spark                  |[hi, i, heard, about, spark]                  |5     |
|I know Spark can work well with NLP     |[i, know, spark, can, work, well, with, nlp]  |8     |
|Logistic,regession,models,are,supervised|[logistic, regession, models, are, supervised]|5     |
+----------------------------------------+----------------------------------------------+------+



### Stopword Remover

In [10]:
from pyspark.ml.feature import StopWordsRemover

In [11]:
# Pre-tokenized input: the "raw" column already holds a list of words per row.
sentenceData = spark.createDataFrame(
    [
        (0, ['I', "go", "to", "school", "by", "bus"]),
        (1, ['Minh', "has", 'lots', 'of', "pencils"]),
    ],
    schema=["id", "raw"],
)

# No stop-word list is passed, so the remover uses its built-in default list;
# the filtered tokens are written to a new "filtered" column.
remover = StopWordsRemover(inputCol='raw', outputCol='filtered')
remover.transform(sentenceData).show(truncate=False)

+---+------------------------------+---------------------+
|id |raw                           |filtered             |
+---+------------------------------+---------------------+
|0  |[I, go, to, school, by, bus]  |[go, school, bus]    |
|1  |[Minh, has, lots, of, pencils]|[Minh, lots, pencils]|
+---+------------------------------+---------------------+



### NGram

In [12]:
from pyspark.ml.feature import NGram

In [13]:
# Build bigrams (n=2) from the regex-tokenized words; each n-gram is emitted as
# the consecutive tokens joined by a single space.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol="ngrams",
)
ngramDataFrame = ngram.transform(regexTokenized)
(
    ngramDataFrame
    .select('sentence', 'words', 'ngrams')
    .show(truncate=False)
)

+----------------------------------------+----------------------------------------------+-------------------------------------------------------------------------+
|sentence                                |words                                         |ngrams                                                                   |
+----------------------------------------+----------------------------------------------+-------------------------------------------------------------------------+
|Hi I heard about Spark                  |[hi, i, heard, about, spark]                  |[hi i, i heard, heard about, about spark]                                |
|I know Spark can work well with NLP     |[i, know, spark, can, work, well, with, nlp]  |[i know, know spark, spark can, can work, work well, well with, with nlp]|
|Logistic,regession,models,are,supervised|[logistic, regession, models, are, supervised]|[logistic regession, regession models, models are, are supervised]       |
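+----------------------------------------+----------------------------------------------+-------------------------------------------------------------------------+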

### TF-IDF

In [14]:
from pyspark.ml.feature import HashingTF

In [15]:
# Term frequencies via feature hashing into 10 buckets. With so few buckets
# distinct words collide (row 0 has five different words but only three
# non-zero buckets), which is acceptable for a demo but not for real data.
hashingTF = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
    numFeatures=10,
)
featureizedData = hashingTF.transform(regexTokenized)
featureizedData.show(truncate=False)

+---+----------------------------------------+----------------------------------------------+--------------------------------------+
|id |sentence                                |words                                         |rawFeatures                           |
+---+----------------------------------------+----------------------------------------------+--------------------------------------+
|0  |Hi I heard about Spark                  |[hi, i, heard, about, spark]                  |(10,[3,6,8],[1.0,3.0,1.0])            |
|1  |I know Spark can work well with NLP     |[i, know, spark, can, work, well, with, nlp]  |(10,[0,3,6,7,9],[1.0,1.0,2.0,2.0,2.0])|
|2  |Logistic,regession,models,are,supervised|[logistic, regession, models, are, supervised]|(10,[1,3,4,5,8],[1.0,1.0,1.0,1.0,1.0])|
+---+----------------------------------------+----------------------------------------------+--------------------------------------+



In [16]:
from pyspark.ml.feature import IDF

In [17]:
# Fit inverse-document-frequency weights on the hashed term counts and rescale
# them. Buckets that occur in every document receive a weight of 0.0.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)
idfModel = idf.fit(featureizedData)
rescaledData = idfModel.transform(featureizedData)

(
    rescaledData
    .select('id', 'features')
    .show(truncate=False)
)

+---+---------------------------------------------------------------------------------------------------+
|id |features                                                                                           |
+---+---------------------------------------------------------------------------------------------------+
|0  |(10,[3,6,8],[0.0,0.8630462173553426,0.28768207245178085])                                          |
|1  |(10,[0,3,6,7,9],[0.6931471805599453,0.0,0.5753641449035617,1.3862943611198906,1.3862943611198906]) |
|2  |(10,[1,3,4,5,8],[0.6931471805599453,0.0,0.6931471805599453,0.6931471805599453,0.28768207245178085])|
+---+---------------------------------------------------------------------------------------------------+



### CountVectorizer

In [18]:
from pyspark.ml.feature import CountVectorizer

In [19]:
# Fit a CountVectorizerModel on the corpus (only the "words" column is read).
# vocabSize: number of unique words kept in the vocabulary (here at most 4).
# minDF=1: a word has to appear in at least one document to be included.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features_C",
    vocabSize=4,
    minDF=1,
)
model = cv.fit(featureizedData)
result = model.transform(featureizedData)
result.show(truncate=False)

+---+----------------------------------------+----------------------------------------------+--------------------------------------+-------------------------+
|id |sentence                                |words                                         |rawFeatures                           |features_C               |
+---+----------------------------------------+----------------------------------------------+--------------------------------------+-------------------------+
|0  |Hi I heard about Spark                  |[hi, i, heard, about, spark]                  |(10,[3,6,8],[1.0,3.0,1.0])            |(4,[0,1,3],[1.0,1.0,1.0])|
|1  |I know Spark can work well with NLP     |[i, know, spark, can, work, well, with, nlp]  |(10,[0,3,6,7,9],[1.0,1.0,2.0,2.0,2.0])|(4,[0,1],[1.0,1.0])      |
|2  |Logistic,regession,models,are,supervised|[logistic, regession, models, are, supervised]|(10,[1,3,4,5,8],[1.0,1.0,1.0,1.0,1.0])|(4,[2],[1.0])            |
+---+----------------------------------------+