<a href="https://colab.research.google.com/github/kishan20-00/Spark_NLP/blob/main/Spark_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install -q pyspark py4j

In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession

In [3]:
# Reuse the active Spark session if there is one; otherwise start a new one
# with default settings.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Toy corpus: one (label, sentence) pair per row. The label is carried along
# but is not used by any of the feature transformers below.
sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use some classes"),
        (0.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

In [5]:
# Preview the raw input rows (long strings are truncated in the printed table).
sentenceData.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  0.0|Logistic regressi...|
+-----+--------------------+



In [6]:
# Transformer that reads the `sentence` column and writes its tokens to a new
# `words` column.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [7]:
# Apply the tokenizer; the result keeps the original columns and adds `words`.
wordsData = tokenizer.transform(sentenceData)

In [8]:
# Inspect the tokenized output next to the source sentences.
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  0.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [9]:
# Term-frequency stage: hashes each token in `words` into a fixed-size vector
# written to `rawFeatures`. Only 20 buckets are used here to keep the demo
# output small; with so few buckets, distinct words can share an index.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)

In [10]:
# Compute the hashed term-frequency vectors; adds the `rawFeatures` column.
featurizedData = hashingTF.transform(wordsData)

In [11]:
# Indexing a DataFrame by column name yields a Column expression, not data, so
# this cell only displays the column's repr. To view the actual vectors, use
# featurizedData.select("rawFeatures").show(truncate=False) instead.
featurizedData['rawFeatures']

Column<'rawFeatures'>

In [12]:
# IDF estimator: reads the term-frequency vectors in `rawFeatures` and will
# write the rescaled vectors to `features`.
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [13]:
# Fit the IDF estimator on the whole corpus; the result is a model (transformer)
# that can rescale term-frequency vectors.
idfModel = idf.fit(featurizedData)

In [14]:
# Apply the fitted IDF model; adds the `features` column alongside the
# existing columns.
rescaledData = idfModel.transform(featurizedData)

In [15]:
# Show the label together with the final feature vectors. The default show()
# truncates long cells; pass truncate=False to see the full sparse vectors.
rescaledData.select("label", "features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[6,8,13,16],[...|
|  0.0|(20,[0,7,13,15,16...|
|  0.0|(20,[3,4,6,11,19]...|
+-----+--------------------+



In [16]:
# Shut down the Spark session now that the demo is finished. Earlier cells
# need a fresh session (re-run from the top) after this point.
spark.stop()