In [1]:
# Always needs to be done on the Raspberry Pi: findspark must locate the Spark
# installation and add it to sys.path before any pyspark import can succeed.
import os

import findspark

# Honour SPARK_HOME when it is set (and non-empty) so the notebook also runs on
# other machines; otherwise fall back to the original Raspberry Pi install path.
SPARK_HOME = os.environ.get('SPARK_HOME') or '/home/baxman/spark-2.4.7-bin-hadoop2.7'
findspark.init(SPARK_HOME)

# This import has to come after findspark.init(), which makes pyspark importable.
from pyspark.sql import SparkSession

# Reuse an existing session if one is already running, otherwise start a new one.
spark = SparkSession.builder.appName('nlp_tools').getOrCreate()

In [2]:
from pyspark.ml.feature import RegexTokenizer,Tokenizer

In [5]:
from pyspark.sql.functions import col,udf
from pyspark.sql.types import IntegerType

In [17]:
# Build a three-row demo DataFrame of (id, sentence) pairs
sen_df = spark.createDataFrame(
    [
        (0, 'spanky 2811, sparky doodle mcguyver pants'),
        (1, 'what makes a man a man? what was will be, what will be was!'),
        (2, 'I,ike,so,much,this game,one,we.call,stellaris!'),
    ],
    schema=['id', 'sentence'],
)

In [18]:
# Preview the three sample rows; long sentences are truncated in this default display.
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|spanky 2811, spar...|
|  1|what makes a man ...|
|  2|I,ike,so,much,thi...|
+---+--------------------+



In [19]:
# Two tokenizers, both reading 'sentence' and writing a 'words' array column:
#   - Tokenizer:      lower-cases the text and splits it on whitespace
#   - RegexTokenizer: splits wherever the pattern matches (here \W, a non-word character)
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
regextokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [20]:
# UDF that returns the number of tokens held in an array column.
# A null array yields null rather than failing the job with the TypeError
# that len(None) would raise inside the Python worker.
count_tokens = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType(),
)

In [21]:
# Apply the plain Tokenizer to the sample sentences; the result keeps the
# original columns and adds the 'words' array column.
tokenized_df = tokenizer.transform(sen_df)

In [22]:
# Inspect the plain tokenizer's output: text is lower-cased, punctuation stays
# attached to tokens (e.g. '2811,'), and the comma-separated row 2 is barely split.
tokenized_df.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|spanky 2811, spar...|[spanky, 2811,, s...|
|  1|what makes a man ...|[what, makes, a, ...|
|  2|I,ike,so,much,thi...|[i,ike,so,much,th...|
+---+--------------------+--------------------+



In [23]:
# Token count per sentence for the whitespace tokenizer
(
    tokenized_df
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|spanky 2811, spar...|[spanky, 2811,, s...|     6|
|  1|what makes a man ...|[what, makes, a, ...|    14|
|  2|I,ike,so,much,thi...|[i,ike,so,much,th...|     2|
+---+--------------------+--------------------+------+



In [24]:
# Tokenize the same sentences with the RegexTokenizer (pattern \W), so that
# punctuation acts as a delimiter instead of staying glued to the words.
tokenized_regex_df = regextokenizer.transform(sen_df)

In [25]:
# Inspect the regex tokenizer's output: tokens no longer carry trailing commas,
# and row 2 is now split into its individual words.
tokenized_regex_df.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|spanky 2811, spar...|[spanky, 2811, sp...|
|  1|what makes a man ...|[what, makes, a, ...|
|  2|I,ike,so,much,thi...|[i, ike, so, much...|
+---+--------------------+--------------------+



In [26]:
# Token count per sentence for the regex tokenizer
(
    tokenized_regex_df
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|spanky 2811, spar...|[spanky, 2811, sp...|     6|
|  1|what makes a man ...|[what, makes, a, ...|    14|
|  2|I,ike,so,much,thi...|[i, ike, so, much...|    10|
+---+--------------------+--------------------+------+



In [27]:
# Stop-word removal: drop very common words from already-tokenized rows
from pyspark.ml.feature import StopWordsRemover

sentenceData = spark.createDataFrame(
    [
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "raw"],
)

# Reads the 'raw' token arrays and writes the surviving tokens to 'filtered'
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)
remover.transform(sentenceData).show(truncate=False)


+---+----------------------------+--------------------+
|id |raw                         |filtered            |
+---+----------------------------+--------------------+
|0  |[I, saw, the, red, balloon] |[saw, red, balloon] |
|1  |[Mary, had, a, little, lamb]|[Mary, little, lamb]|
+---+----------------------------+--------------------+



In [28]:
# N-grams: with n=2, every pair of adjacent tokens becomes one space-joined string (a bigram)
from pyspark.ml.feature import NGram

wordDataFrame = spark.createDataFrame(
    [
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)

ngram = NGram(
    n=2,
    inputCol="words",
    outputCol="ngrams",
)

ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select("ngrams").show(truncate=False)


+------------------------------------------------------------------+
|ngrams                                                            |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



In [32]:
# TF-IDF demo: start from a small labelled corpus of raw sentences
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

sentenceData.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



In [33]:
# Split each sentence into lower-cased words, stored in a new 'words' column
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
wordsData = tokenizer.transform(sentenceData)
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



In [35]:
# TF step only: hash the words of each row into a 20-slot term-frequency vector
# stored in 'rawFeatures'. The IDF rescaling is a separate step.
# NOTE(review): numFeatures=20 is very small, so distinct words may share a slot;
# confirm this size is only intended for the demo.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)
featurizedData = hashingTF.transform(wordsData)

In [36]:
# IDF step: fit the IDF model on the raw term-frequency vectors, then use it to
# rescale those vectors into the final 'features' column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Only the columns a downstream ML estimator would consume
rescaledData.select("label", "features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[0,5,9,17],[0...|
|  0.0|(20,[2,7,9,13,15]...|
|  1.0|(20,[4,6,13,15,18...|
+-----+--------------------+



In [37]:
# Each sentence is now represented as a label plus a numeric features vector, ready for ML

In [38]:
# Count vectorizer
from pyspark.ml.feature import CountVectorizer

# Input data: each row is a bag of words with an ID.
df = spark.createDataFrame(
    [
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    schema=["id", "words"],
)

In [39]:
# Configure the CountVectorizer estimator; fitting happens in a later cell.
# vocabSize caps the vocabulary at 3 terms.
# NOTE(review): minDF=2.0 presumably requires a term to appear in at least two
# rows to enter the vocabulary — confirm against the CountVectorizer docs.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)

In [41]:
# Fit the CountVectorizer estimator on the two-row corpus; the fitted model is
# what transforms rows into count vectors.
model = cv.fit(df)

In [42]:
# Vectorize the corpus. 'features' is printed as (size, [indices], [counts]);
# e.g. row 1's counts 2.0, 2.0, 1.0 match its two a's, two b's and one c.
result = model.transform(df)
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



In [None]:
# The features column shows, for each row, how many times each vocabulary word occurs