In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

In [6]:
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [11]:
sen_df = spark.createDataFrame([
    (0, 'I love learning spark'),
    (1, 'I would also like to learn scala'),
    (2, 'Yesterday the temperature was -7 celcius'),
    (3, 'Enthusiastic,innovative,happy,energetic,willing')
],['id','sentence'])

In [13]:
sen_df.show(truncate=False)

+---+-----------------------------------------------+
|id |sentence                                       |
+---+-----------------------------------------------+
|0  |I love learning spark                          |
|1  |I would also like to learn scala               |
|2  |Yesterday the temperature was -7 celcius       |
|3  |Enthusiastic,innovative,happy,energetic,willing|
+---+-----------------------------------------------+



In [14]:
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

In [16]:
regex_tokenizer = RegexTokenizer(inputCol='sentence',outputCol='words', pattern='\\W')

In [17]:
count_tokens = udf(lambda words:len(words),IntegerType())

In [18]:
tokenized = tokenizer.transform(sen_df)

In [20]:
tokenized.show(truncate=False)

+---+-----------------------------------------------+-------------------------------------------------+
|id |sentence                                       |words                                            |
+---+-----------------------------------------------+-------------------------------------------------+
|0  |I love learning spark                          |[i, love, learning, spark]                       |
|1  |I would also like to learn scala               |[i, would, also, like, to, learn, scala]         |
|2  |Yesterday the temperature was -7 celcius       |[yesterday, the, temperature, was, -7, celcius]  |
|3  |Enthusiastic,innovative,happy,energetic,willing|[enthusiastic,innovative,happy,energetic,willing]|
+---+-----------------------------------------------+-------------------------------------------------+



In [21]:
tokenized.withColumn('tokens', count_tokens(col('words'))).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|I love learning s...|[i, love, learnin...|     4|
|  1|I would also like...|[i, would, also, ...|     7|
|  2|Yesterday the tem...|[yesterday, the, ...|     6|
|  3|Enthusiastic,inno...|[enthusiastic,inn...|     1|
+---+--------------------+--------------------+------+



In [22]:
rg_tokenized = regex_tokenizer.transform(sen_df)

In [24]:
rg_tokenized.withColumn('tokens', count_tokens(col('words'))).show(truncate=False)

+---+-----------------------------------------------+-----------------------------------------------------+------+
|id |sentence                                       |words                                                |tokens|
+---+-----------------------------------------------+-----------------------------------------------------+------+
|0  |I love learning spark                          |[i, love, learning, spark]                           |4     |
|1  |I would also like to learn scala               |[i, would, also, like, to, learn, scala]             |7     |
|2  |Yesterday the temperature was -7 celcius       |[yesterday, the, temperature, was, 7, celcius]       |6     |
|3  |Enthusiastic,innovative,happy,energetic,willing|[enthusiastic, innovative, happy, energetic, willing]|5     |
+---+-----------------------------------------------+-----------------------------------------------------+------+



In [25]:
from pyspark.ml.feature import StopWordsRemover

In [28]:
sentence_df = spark.createDataFrame([
    (0, ['I','will','pizza','tonight','with','Theo']),
    (1, ['The','sun','has','not','appeared','for','40','days'])
],['id','tokens'])

In [32]:
remover = StopWordsRemover(inputCol='tokens', outputCol='filtered')

In [35]:
remover.transform(sentence_df).show(truncate=False)

+---+---------------------------------------------+-------------------------+
|id |tokens                                       |filtered                 |
+---+---------------------------------------------+-------------------------+
|0  |[I, will, pizza, tonight, with, Theo]        |[pizza, tonight, Theo]   |
|1  |[The, sun, has, not, appeared, for, 40, days]|[sun, appeared, 40, days]|
+---+---------------------------------------------+-------------------------+



In [36]:
from pyspark.ml.feature import NGram

In [40]:
word_df = spark.createDataFrame([
    (0, ['I','will','pizza','tonight','with','Theo']),
    (1, ['The','sun','has','not','appeared','for','40','days'])
],['id','words'])

In [41]:
ngram = NGram(n=2, inputCol='words',outputCol='grams')

In [42]:
ngram.transform(word_df).show(trunc)

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[I, will, pizza, ...|[I will, will piz...|
|  1|[The, sun, has, n...|[The sun, sun has...|
+---+--------------------+--------------------+

