In [18]:
# findspark.init() runs before the first pyspark import in this notebook.
# NOTE(review): presumably this is what makes the local Spark install
# importable — confirm against the findspark documentation.
import findspark
findspark.init()

In [19]:
from pyspark.sql import SparkSession

# Session handle used by every later cell; the application is named "nlp".
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)


# Tokenizer

In [20]:
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [21]:
import os

# Location of the BBC text CSV. Set the BBC_TEXT_CSV environment variable to
# point at your own copy; the fallback is the original author's local path,
# so existing behaviour is unchanged when the variable is not set.
BBC_TEXT_CSV = os.environ.get(
    "BBC_TEXT_CSV",
    r"C:\Users\Kartikeya Mandhar\Desktop\AWSMachineLearning\bbc-text.csv",
)

# header=True: the first row supplies the column names (category, text).
# inferSchema=True: let Spark work out the column types from the data.
df = spark.read.csv(BBC_TEXT_CSV, header=True, inferSchema=True)

In [22]:
# Preview the loaded frame: one "category" label and one "text" document per row.
df.show()

+-------------+--------------------+
|     category|                text|
+-------------+--------------------+
|         tech|tv future in the ...|
|     business|worldcom boss  le...|
|        sport|tigers wary of fa...|
|        sport|yeading face newc...|
|entertainment|ocean s twelve ra...|
|     politics|howard hits back ...|
|     politics|blair prepares to...|
|        sport|henman hopes ende...|
|        sport|wilkinson fit to ...|
|entertainment|last star wars  n...|
|entertainment|berlin cheers for...|
|     business|virgin blue share...|
|     business|crude oil prices ...|
|     politics|hague  given up  ...|
|        sport|moya emotional af...|
|     business|s korean credit c...|
|     politics|howard backs stem...|
|        sport|connors boost for...|
|     business|japanese banking ...|
|         tech|games maker fight...|
+-------------+--------------------+
only showing top 20 rows



In [23]:
# Plain tokenizer: reads "text", writes the token array to "words".
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Regex tokenizer writing to the same output column; "\\W" matches any single
# non-word character.
regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
    pattern="\\W",
)

# UDF returning the number of tokens in a "words" array (word count per document).
countTokens = udf(lambda words: len(words), IntegerType())


In [24]:
# Apply the plain tokenizer; the result is `df` plus a "words" column.
tokenized = tokenizer.transform(df)

In [25]:
# Preview the tokenized frame (category, text, words).
tokenized.show()

+-------------+--------------------+--------------------+
|     category|                text|               words|
+-------------+--------------------+--------------------+
|         tech|tv future in the ...|[tv, future, in, ...|
|     business|worldcom boss  le...|[worldcom, boss, ...|
|        sport|tigers wary of fa...|[tigers, wary, of...|
|        sport|yeading face newc...|[yeading, face, n...|
|entertainment|ocean s twelve ra...|[ocean, s, twelve...|
|     politics|howard hits back ...|[howard, hits, ba...|
|     politics|blair prepares to...|[blair, prepares,...|
|        sport|henman hopes ende...|[henman, hopes, e...|
|        sport|wilkinson fit to ...|[wilkinson, fit, ...|
|entertainment|last star wars  n...|[last, star, wars...|
|entertainment|berlin cheers for...|[berlin, cheers, ...|
|     business|virgin blue share...|[virgin, blue, sh...|
|     business|crude oil prices ...|[crude, oil, pric...|
|     politics|hague  given up  ...|[hague, , given, ...|
|        sport

In [26]:
# Token count per document for the plain tokenizer's output.
(
    tokenized
    .select("text", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show()
)

+--------------------+--------------------+------+
|                text|               words|tokens|
+--------------------+--------------------+------+
|tv future in the ...|[tv, future, in, ...|   806|
|worldcom boss  le...|[worldcom, boss, ...|   332|
|tigers wary of fa...|[tigers, wary, of...|   270|
|yeading face newc...|[yeading, face, n...|   390|
|ocean s twelve ra...|[ocean, s, twelve...|   287|
|howard hits back ...|[howard, hits, ba...|   701|
|blair prepares to...|[blair, prepares,...|   284|
|henman hopes ende...|[henman, hopes, e...|   202|
|wilkinson fit to ...|[wilkinson, fit, ...|   163|
|last star wars  n...|[last, star, wars...|   253|
|berlin cheers for...|[berlin, cheers, ...|   326|
|virgin blue share...|[virgin, blue, sh...|   216|
|crude oil prices ...|[crude, oil, pric...|   360|
|hague  given up  ...|[hague, , given, ...|   324|
|moya emotional af...|[moya, emotional,...|   498|
|s korean credit c...|[s, korean, credi...|   311|
|howard backs stem...|[howard, 

In [27]:
# Same preview as the previous cell, but tokenized with the regex tokenizer so
# the two "tokens" columns can be compared row by row.
regexTokenized = regexTokenizer.transform(df)
(
    regexTokenized
    .select("text", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show()
)

+--------------------+--------------------+------+
|                text|               words|tokens|
+--------------------+--------------------+------+
|tv future in the ...|[tv, future, in, ...|   750|
|worldcom boss  le...|[worldcom, boss, ...|   300|
|tigers wary of fa...|[tigers, wary, of...|   248|
|yeading face newc...|[yeading, face, n...|   349|
|ocean s twelve ra...|[ocean, s, twelve...|   269|
|howard hits back ...|[howard, hits, ba...|   625|
|blair prepares to...|[blair, prepares,...|   269|
|henman hopes ende...|[henman, hopes, e...|   198|
|wilkinson fit to ...|[wilkinson, fit, ...|   165|
|last star wars  n...|[last, star, wars...|   236|
|berlin cheers for...|[berlin, cheers, ...|   311|
|virgin blue share...|[virgin, blue, sh...|   202|
|crude oil prices ...|[crude, oil, pric...|   342|
|hague  given up  ...|[hague, given, up...|   291|
|moya emotional af...|[moya, emotional,...|   466|
|s korean credit c...|[s, korean, credi...|   294|
|howard backs stem...|[howard, 

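The regex tokenizer splits on every non-word character, so it picks out each individual word, whereas the plain Tokenizer only splits on spaces, which gives rise to inaccurate tokens (for example, the empty token produced by two consecutive spaces) and therefore inaccurate length counts. For this reason regex tokenization is generally preferred.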

# Stop Word Removal

In [28]:
# Stop-word filter: reads the "words" token array, writes the result to "cleaned".
remover = StopWordsRemover(
    inputCol="words",
    outputCol="cleaned",
)

In [42]:
# Regex-tokenized text with its per-document token count (text, words, tokens).
df_token = (
    regexTokenized
    .select("text", "words")
    .withColumn("tokens", countTokens(col("words")))
)

In [43]:
# Add the stop-word-filtered "cleaned" column produced by `remover`.
# This cell rebinds df_token to its own output, so on a second run the input
# already carries "cleaned" and the transformer rejects a frame whose output
# column already exists. Dropping "cleaned" first keeps the cell re-runnable;
# drop() is a no-op when the column is absent, so the first run is unaffected
# and df_token ends up with the same columns as before.
df_token = remover.transform(df_token.drop("cleaned"))

# n-grams

In [44]:
# Transformer building 2-grams from the "words" column into "bigrams".
bigrams = NGram(n=2, inputCol="words", outputCol="bigrams")

In [45]:
# Apply the bigram transformer; it reads "words" (the unfiltered tokens), not "cleaned".
bigrams_df = bigrams.transform(df_token)

In [47]:
# Preview only the generated bigram column.
bigrams_df.select("bigrams").show()

+--------------------+
|             bigrams|
+--------------------+
|[tv future, futur...|
|[worldcom boss, b...|
|[tigers wary, war...|
|[yeading face, fa...|
|[ocean s, s twelv...|
|[howard hits, hit...|
|[blair prepares, ...|
|[henman hopes, ho...|
|[wilkinson fit, f...|
|[last star, star ...|
|[berlin cheers, c...|
|[virgin blue, blu...|
|[crude oil, oil p...|
|[hague given, giv...|
|[moya emotional, ...|
|[s korean, korean...|
|[howard backs, ba...|
|[connors boost, b...|
|[japanese banking...|
|[games maker, mak...|
+--------------------+
only showing top 20 rows



# TF-IDF (Term Frequency-Inverse Document Frequency)

In [48]:
# Regex-tokenize the raw text; `words_df` is the input for the TF-IDF and
# count-vectorizer steps.
# NOTE(review): this rebinds `tokenizer`, which the Tokenizer section bound to
# a plain Tokenizer; re-running those earlier cells afterwards will silently
# use this regex tokenizer instead.
tokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="words",
    pattern="\\W",
)
words_df = tokenizer.transform(df)

In [50]:
# Preview the regex-tokenized frame (category, text, words).
words_df.show()

+-------------+--------------------+--------------------+
|     category|                text|               words|
+-------------+--------------------+--------------------+
|         tech|tv future in the ...|[tv, future, in, ...|
|     business|worldcom boss  le...|[worldcom, boss, ...|
|        sport|tigers wary of fa...|[tigers, wary, of...|
|        sport|yeading face newc...|[yeading, face, n...|
|entertainment|ocean s twelve ra...|[ocean, s, twelve...|
|     politics|howard hits back ...|[howard, hits, ba...|
|     politics|blair prepares to...|[blair, prepares,...|
|        sport|henman hopes ende...|[henman, hopes, e...|
|        sport|wilkinson fit to ...|[wilkinson, fit, ...|
|entertainment|last star wars  n...|[last, star, wars...|
|entertainment|berlin cheers for...|[berlin, cheers, ...|
|     business|virgin blue share...|[virgin, blue, sh...|
|     business|crude oil prices ...|[crude, oil, pric...|
|     politics|hague  given up  ...|[hague, given, up...|
|        sport

In [51]:
# Turn each document's tokens into a length-20 term-frequency vector
# ("rawFeatures"); numFeatures fixes the vector size.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)
featurized = hashingTF.transform(words_df)

In [55]:
# Preview the frame with the added "rawFeatures" term-frequency vectors.
featurized.show()

+-------------+--------------------+--------------------+--------------------+
|     category|                text|               words|         rawFeatures|
+-------------+--------------------+--------------------+--------------------+
|         tech|tv future in the ...|[tv, future, in, ...|(20,[0,1,2,3,4,5,...|
|     business|worldcom boss  le...|[worldcom, boss, ...|(20,[0,1,2,3,4,5,...|
|        sport|tigers wary of fa...|[tigers, wary, of...|(20,[0,1,2,3,4,5,...|
|        sport|yeading face newc...|[yeading, face, n...|(20,[0,1,2,3,4,5,...|
|entertainment|ocean s twelve ra...|[ocean, s, twelve...|(20,[0,1,2,3,4,5,...|
|     politics|howard hits back ...|[howard, hits, ba...|(20,[0,1,2,3,4,5,...|
|     politics|blair prepares to...|[blair, prepares,...|(20,[0,1,2,3,4,5,...|
|        sport|henman hopes ende...|[henman, hopes, e...|(20,[0,1,2,3,4,5,...|
|        sport|wilkinson fit to ...|[wilkinson, fit, ...|(20,[0,1,2,3,4,5,...|
|entertainment|last star wars  n...|[last, star, war

In [56]:
# Fit the IDF estimator on the hashed term frequencies; the fitted model
# writes its output to "features".
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf_model = idf.fit(featurized)

In [57]:
# Apply the fitted IDF model, then preview each category next to its
# rescaled feature vector.
rescale = idf_model.transform(featurized)
rescale.select("category", "features").show()

+-------------+--------------------+
|     category|            features|
+-------------+--------------------+
|         tech|(20,[0,1,2,3,4,5,...|
|     business|(20,[0,1,2,3,4,5,...|
|        sport|(20,[0,1,2,3,4,5,...|
|        sport|(20,[0,1,2,3,4,5,...|
|entertainment|(20,[0,1,2,3,4,5,...|
|     politics|(20,[0,1,2,3,4,5,...|
|     politics|(20,[0,1,2,3,4,5,...|
|        sport|(20,[0,1,2,3,4,5,...|
|        sport|(20,[0,1,2,3,4,5,...|
|entertainment|(20,[0,1,2,3,4,5,...|
|entertainment|(20,[0,1,2,3,4,5,...|
|     business|(20,[0,1,2,3,4,5,...|
|     business|(20,[0,1,2,3,4,5,...|
|     politics|(20,[0,1,2,3,4,5,...|
|        sport|(20,[0,1,2,3,4,5,...|
|     business|(20,[0,1,2,3,4,5,...|
|     politics|(20,[0,1,2,3,4,5,...|
|        sport|(20,[0,1,2,3,4,5,...|
|     business|(20,[0,1,2,3,4,5,...|
|         tech|(20,[0,1,2,3,4,5,...|
+-------------+--------------------+
only showing top 20 rows



# Count Vectorization

In [59]:
from pyspark.ml.feature import CountVectorizer

# Fit a CountVectorizer on the tokenized documents, then encode every
# document's "words" into a sparse "features" vector of term counts.
cv = CountVectorizer(inputCol="words", outputCol="features")
model = cv.fit(words_df)
result = model.transform(words_df)

In [61]:
# Inspect the count vector of the first document only (head() returns a single Row).
result.select("features").head()

Row(features=SparseVector(29457, {0: 43.0, 1: 17.0, 2: 26.0, 3: 24.0, 4: 8.0, 5: 13.0, 6: 6.0, 7: 8.0, 8: 12.0, 9: 7.0, 10: 6.0, 11: 3.0, 12: 3.0, 14: 1.0, 15: 6.0, 16: 10.0, 17: 2.0, 18: 3.0, 19: 2.0, 20: 6.0, 21: 1.0, 22: 6.0, 23: 4.0, 24: 9.0, 26: 1.0, 27: 3.0, 28: 12.0, 29: 1.0, 30: 2.0, 31: 2.0, 32: 3.0, 33: 2.0, 34: 4.0, 37: 2.0, 38: 3.0, 39: 4.0, 41: 2.0, 42: 1.0, 44: 3.0, 45: 1.0, 46: 7.0, 47: 3.0, 48: 7.0, 49: 4.0, 50: 1.0, 51: 5.0, 52: 2.0, 54: 6.0, 55: 1.0, 56: 1.0, 57: 1.0, 58: 1.0, 60: 1.0, 65: 2.0, 66: 2.0, 70: 2.0, 71: 1.0, 72: 3.0, 73: 7.0, 74: 1.0, 75: 3.0, 76: 3.0, 78: 2.0, 79: 1.0, 81: 5.0, 83: 1.0, 84: 1.0, 85: 1.0, 87: 4.0, 88: 1.0, 92: 2.0, 94: 1.0, 100: 2.0, 102: 2.0, 105: 1.0, 109: 3.0, 110: 1.0, 111: 2.0, 115: 2.0, 116: 1.0, 117: 2.0, 118: 1.0, 119: 1.0, 124: 1.0, 125: 2.0, 127: 1.0, 129: 1.0, 140: 3.0, 144: 1.0, 146: 1.0, 149: 1.0, 150: 4.0, 159: 3.0, 160: 2.0, 162: 4.0, 165: 7.0, 169: 1.0, 170: 1.0, 174: 1.0, 176: 1.0, 177: 13.0, 180: 2.0, 181: 1.0, 182: 1.0,