In [1]:
import findspark
findspark.init('/Users/kiranrudresha/Documents/spark/spark-2.2.0-bin-hadoop2.7')

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [6]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [7]:
from pyspark.sql.functions import col,udf

In [8]:
from pyspark.sql.types import IntegerType

In [9]:
data = spark.createDataFrame(
    [('1','Hi i am kiran' ),
     ('2','what do you do'),
     ('3', 'i,work,has,Data,engineer')], 
     ['id', 'sentence'])

In [10]:
data

DataFrame[id: string, sentence: string]

In [11]:
data.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  1|       Hi i am kiran|
|  2|      what do you do|
|  3|i,work,has,Data,e...|
+---+--------------------+



In [12]:
tokenizer = Tokenizer(inputCol='sentence', outputCol='tokens')

In [13]:
type(tokenizer)

pyspark.ml.feature.Tokenizer

In [28]:
token_data = tokenizer.transform(data)

In [29]:
token_data.show()

+---+--------------------+--------------------+
| id|            sentence|              tokens|
+---+--------------------+--------------------+
|  1|       Hi i am kiran|  [hi, i, am, kiran]|
|  2|      what do you do| [what, do, you, do]|
|  3|i,work,has,Data,e...|[i,work,has,data,...|
+---+--------------------+--------------------+



In [30]:
count_token = udf(lambda words : len(words), IntegerType())
# IntergetType is return type 

In [31]:
type(token_data)

pyspark.sql.dataframe.DataFrame

In [32]:
token_data.withColumn('token_count', count_token(token_data['tokens'])).show()

+---+--------------------+--------------------+-----------+
| id|            sentence|              tokens|token_count|
+---+--------------------+--------------------+-----------+
|  1|       Hi i am kiran|  [hi, i, am, kiran]|          4|
|  2|      what do you do| [what, do, you, do]|          4|
|  3|i,work,has,Data,e...|[i,work,has,data,...|          1|
+---+--------------------+--------------------+-----------+



In [39]:
regex_token = RegexTokenizer(inputCol='sentence', outputCol='token',pattern = '\\w')

In [40]:
regex_data = regex_token.transform(data)

In [41]:
regex_data.show()

+---+--------------------+------------+
| id|            sentence|       token|
+---+--------------------+------------+
|  1|       Hi i am kiran|   [ ,  ,  ]|
|  2|      what do you do|   [ ,  ,  ]|
|  3|i,work,has,Data,e...|[,, ,, ,, ,]|
+---+--------------------+------------+



In [43]:
regex_data.withColumn('token', count_token(regex_data['token'])).show()

+---+--------------------+-----+
| id|            sentence|token|
+---+--------------------+-----+
|  1|       Hi i am kiran|    3|
|  2|      what do you do|    3|
|  3|i,work,has,Data,e...|    4|
+---+--------------------+-----+



In [44]:
from pyspark.ml.feature import StopWordsRemover

In [46]:
data = spark.createDataFrame([
    ('0', ['I', 'saw', 'green', 'horse']),
    ('1', ['Mary', 'had', 'a', 'little', 'lamb'])],
    ['id', 'token'])

In [47]:
data.show()

+---+--------------------+
| id|               token|
+---+--------------------+
|  0|[I, saw, green, h...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [48]:
stopword = StopWordsRemover(inputCol='token', outputCol='filtered')

In [49]:
stop_data = stopword.transform(data)

In [50]:
stop_data.show()

+---+--------------------+--------------------+
| id|               token|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, green, h...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



In [51]:
from pyspark.ml.feature import NGram

In [55]:
ngram = NGram(n=2, inputCol='token', outputCol='Grams')

In [58]:
ngram.transform(data).show(truncate = False)

+---+----------------------------+----------------------------------------+
|id |token                       |Grams                                   |
+---+----------------------------+----------------------------------------+
|0  |[I, saw, green, horse]      |[I saw, saw green, green horse]         |
|1  |[Mary, had, a, little, lamb]|[Mary had, had a, a little, little lamb]|
+---+----------------------------+----------------------------------------+

