# Spark MLlib TF-IDF – Example

Source tutorial: <https://www.tutorialkart.com/apache-spark/spark-mllib-tf-idf/>

In [None]:
!pip install pyspark

In [None]:
from __future__ import print_function
 
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession

In [None]:
# Obtain a Spark session for this notebook: getOrCreate() returns the
# already-active session if one exists, otherwise it starts a new one
# under the application name given here.
spark = (
    SparkSession.builder
    .appName("TfIdf Example")
    .getOrCreate()
)

In [None]:
# Toy corpus: four short sentences, each paired with a numeric label.
# The resulting DataFrame has two columns, "label" and "sentence".
sentenceData = spark.createDataFrame(
    data=[
        (0.0, "Welcome to TutorialKart."),
        (0.0, "Learn Spark at TutorialKart. cows"),
        (1.0, "Spark Mllib has TF-IDF."),
        (1.0, "Spark Mllib has has has has has has has TF-IDF cows."),
    ],
    schema=["label", "sentence"],
)

In [None]:
# Step 1 - tokenize: read the "sentence" column and add a "words" column
# containing the tokens produced for each sentence.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
wordsData = tokenizer.transform(sentenceData)

In [None]:
# Step 2 - term frequencies: hash the tokens in "words" into a vector of
# 199 buckets and store the result in "rawFeatures".
# NOTE(review): 199 is a very small, non-power-of-two bucket count, so
# distinct words may collide in the same bucket - confirm this is intended.
# Alternatively, CountVectorizer can also be used to get term frequency vectors.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=199,
)
featurizedData = hashingTF.transform(wordsData)





In [None]:
# Inspect the class of the tokenizer object created above.
type(tokenizer)

In [None]:
# Inspect the class of the object returned by tokenizer.transform(...).
type(wordsData)

In [None]:
# Print the tokenized rows. Truncation is disabled because show() cuts every
# cell to 20 characters by default, which hides most of each sentence and
# its token list.
wordsData.show(truncate=False)

In [None]:
# Print the rows including the "rawFeatures" term-frequency vectors.
# Truncation is disabled because show() cuts every cell to 20 characters by
# default, which would hide the vector contents.
featurizedData.show(truncate=False)

In [None]:
# Step 3 - inverse document frequency: fit an IDF model on the raw term
# frequencies, then use the fitted model to add the rescaled "features"
# column alongside "rawFeatures".
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [None]:
# Print each label with its TF-IDF vector. Truncation is disabled because
# show() cuts every cell to 20 characters by default, which would hide the
# TF-IDF values this notebook is meant to demonstrate.
rescaledData.select("label", "features").show(truncate=False)

In [None]:
# Display the fitted IDF model object returned by idf.fit(...).
idfModel


In [None]:
# IDFModel has no `vocabulary` attribute (that belongs to
# CountVectorizerModel), so accessing it raises AttributeError. Because the
# terms were hashed by HashingTF there is no word list to show; instead,
# display the fitted IDF weight vector (one weight per hash bucket).
idfModel.idf