In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session (named 'nlp_tools') that the later cells use
# to build their DataFrames.
spark = (
    SparkSession.builder
    .appName('nlp_tools')
    .getOrCreate()
)

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer

from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [4]:
# Three (id, sentence) rows. The first two sentences are space-separated;
# the third uses commas between words instead.
sent_df = spark.createDataFrame(
    [
        (0, 'Hi I heard about Spark'),
        (1, 'I wish Java can use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [5]:
# Display the sample sentences before any tokenization is applied.
sent_df.show()

In [6]:
# Basic tokenizer stage: reads the 'sentence' column, writes a 'words' column.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [7]:
# Regex-driven tokenizer over the same columns; the pattern \W matches any
# non-word character.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [8]:
def _count_tokens(words):
    """Return the number of tokens in ``words``.

    Args:
        words: A sequence of tokens, or None when the array cell is null.

    Returns:
        ``len(words)``, or None (SQL null) when ``words`` is None.
    """
    # A null array reaches the Python UDF as None; calling len() on it would
    # raise TypeError and fail the whole job, so null is propagated instead.
    if words is None:
        return None
    return len(words)


# Column-level UDF yielding the per-row token count as an integer.
count_tokens = udf(_count_tokens, IntegerType())

In [9]:
# Apply the basic tokenizer to the sample sentences; the result carries the
# tokenizer's 'words' output column.
token_df = tokenizer.transform(sent_df)

In [10]:
# Inspect the output of the basic tokenizer.
token_df.show()

In [11]:
# Append a 'tokens' column holding the token count of each row, then display.
(
    token_df
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

In [12]:
# Apply the regex tokenizer to the same sample sentences for comparison
# with token_df.
reg_df = regex_tokenizer.transform(sent_df)

In [13]:
# Same token-count display as for token_df, this time on the regex
# tokenizer's output.
(
    reg_df
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

In [14]:
from pyspark.ml.feature import StopWordsRemover

In [15]:
# Two already-tokenized sentences, used as input for the stop-word remover.
sentence_df = spark.createDataFrame(
    [
        (0, ['I', 'saw', 'a', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [16]:
# Display the tokenized sentences before stop-word removal.
sentence_df.show()

In [17]:
# Stop-word removal stage: reads 'tokens', writes 'filtered'. No custom
# stop-word list is passed here.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [18]:
# Run the remover and display the result, including the 'filtered' column.
(
    remover
    .transform(sentence_df)
    .show()
)

In [19]:
from pyspark.ml.feature import NGram

In [20]:
# Pre-tokenized versions of the three sample sentences, used as n-gram input.
word_df = spark.createDataFrame(
    [
        (0, ['Hi', 'I', 'heard', 'about', 'Spark']),
        (1, ['I', 'wish', 'Java', 'can', 'use', 'case', 'classes']),
        (2, ['Logistic', 'regression', 'models', 'are', 'neat']),
    ],
    schema=['id', 'words'],
)

In [21]:
# N-gram stage with n=2: reads 'words', writes 'grams'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [22]:
# Show only the generated 'grams' column, untruncated so each row is
# printed in full.
(
    ngram
    .transform(word_df)
    .select('grams')
    .show(truncate=False)
)

In [23]:
from pyspark.ml.feature import HashingTF, IDF

In [24]:
# Labelled sample sentences (labels 0, 1, 1) for the TF-IDF walkthrough.
sentenceData = spark.createDataFrame(
    [
        (0, 'Hi I heard about Spark'),
        (1, 'I wish Java can use case classes'),
        (1, 'Logistic regression models are neat'),
    ],
    schema=['label', 'sentence'],
)

In [25]:
# Display the labelled sentences that feed the TF-IDF steps.
sentenceData.show()

In [26]:
# Re-create the tokenizer for the TF-IDF steps: 'sentence' in, 'words' out.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [27]:
# Tokenize the labelled sentences; the result carries the 'words' column
# that hashing_tf reads.
words_data = tokenizer.transform(sentenceData)

In [28]:
# Inspect the tokenized, labelled sentences.
words_data.show()

In [29]:
# HashingTF stage: reads 'words', writes 'rawFeatures'. numFeatures is
# not set, so the library default applies.
hashing_tf = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
)

In [30]:
# Apply HashingTF to the tokenized rows; this adds the 'rawFeatures'
# column that the IDF stage reads.
featuredData = hashing_tf.transform(words_data)

In [31]:
# IDF estimator: reads 'rawFeatures', writes 'features'.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)

In [32]:
# Fit the IDF estimator on featuredData, then use the fitted model to
# transform that same DataFrame.
rescaledData = (
    idf
    .fit(featuredData)
    .transform(featuredData)
)

In [33]:
# Display the 'features' vectors alongside their labels, untruncated so the
# full vectors are visible.
(
    rescaledData
    .select('features', 'label')
    .show(truncate=False)
)

In [34]:
from pyspark.ml.feature import CountVectorizer

In [35]:
# Two tiny token lists over the letters a, b, c for the CountVectorizer demo.
df = spark.createDataFrame(
    [
        (0, 'a b c'.split()),
        (1, 'a b b c a'.split()),
    ],
    schema=['id', 'words'],
)

In [36]:
# CountVectorizer estimator: reads 'words', writes 'features', configured
# with vocabSize=3 and minDF=2.
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2,
)

In [37]:
# Fit the CountVectorizer on df, then transform the same DataFrame with
# the fitted model.
results = (
    cv
    .fit(df)
    .transform(df)
)

In [38]:
# Display the CountVectorizer output without truncating the 'features' column.
results.show(truncate=False)