In [12]:
# Bootstrap PySpark for this notebook.
# NOTE(review): findspark.init() is deliberately called *before* `import pyspark`;
# presumably it is what makes the pyspark package importable in this kernel
# (confirm against the findspark docs). Keep this ordering.
import findspark  
findspark.init()  
import pyspark

In [13]:
from pyspark.sql import SparkSession

# Obtain the SparkSession shared by every cell below, registered under a
# descriptive application name.
spark = (
    SparkSession.builder
    .appName("IndexToStringExample and ngrams")
    .getOrCreate()
)


In [14]:
# Round-trip demo: StringIndexer (string -> numeric index) followed by
# IndexToString (numeric index -> original string).
from pyspark.ml.feature import IndexToString, StringIndexer

# Small example DataFrame with a string "category" column ("a" appears twice).
df = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"), (5, "a")],
    ["id", "category"])

# Fit the indexer on df, then add the numeric "categoryIndex" column.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = indexer.fit(df)
indexed_output = model.transform(df)

print("Transformed string column '%s' to indexed column '%s'"
      % (indexer.getInputCol(), indexer.getOutputCol()))
indexed_output.show()

print("Labels of StringIndexer are stored in output column metadata\n")

# Map the indices back to their string labels.
# The input must be `indexed_output` (the frame produced by the fitted
# StringIndexer model in this cell) so the cell runs on a fresh kernel
# instead of depending on a variable left over from an earlier session.
converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
converted = converter.transform(indexed_output)

print("Output indexed column '%s' back to original string column '%s' using "
      "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
converted.select("id", "categoryIndex", "originalCategory").show()
       

Transformed string column 'category' to indexed column 'categoryIndex'
+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          1.0|
|  2|       c|          2.0|
|  3|       d|          3.0|
|  4|       e|          4.0|
|  5|       a|          0.0|
+---+--------+-------------+

Labels of StringIndexer are stored in output column metadata

Output indexed column 'categoryIndex' back to original string column 'originalCategory' using labels in metadata
+---+-------------+----------------+
| id|categoryIndex|originalCategory|
+---+-------------+----------------+
|  0|          0.0|               a|
|  1|          1.0|               b|
|  2|          2.0|               c|
|  3|          3.0|               d|
|  4|          4.0|               e|
|  5|          0.0|               a|
+---+-------------+----------------+



In [15]:
from pyspark.ml.feature import NGram

# Pre-tokenised sample sentences: one (id, words) row per sentence.
sentence_rows = [
    (0, ["Hi", "I", "heard", "about", "learn", "Python"]),
    (1, ["I", "desire", "Spark", "could", "use", "case", "classes"]),
    (2, ["Linear", "regression", "models", "are", "fast"]),
]
wordDataFrame = spark.createDataFrame(sentence_rows, ["id", "words"])

# Build 3-grams from the "words" column into a new "ngrams" column.
ngram = NGram(
    n=3,
    inputCol="words",
    outputCol="ngrams",
)
ngramDataFrame = ngram.transform(wordDataFrame)

# Display only the n-gram column, without truncating long cells.
ngramDataFrame.select("ngrams").show(truncate=False)
    


+---------------------------------------------------------------------------------------+
|ngrams                                                                                 |
+---------------------------------------------------------------------------------------+
|[Hi I heard, I heard about, heard about learn, about learn Python]                     |
|[I desire Spark, desire Spark could, Spark could use, could use case, use case classes]|
|[Linear regression models, regression models are, models are fast]                     |
+---------------------------------------------------------------------------------------+

