In [1]:
import os
import json
from difflib import unified_diff
import nltk
from pyspark import ml
#nltk.download('punkt')
from nltk import word_tokenize


cwd = os.getcwd()
print(cwd)

C:\Users\u0115374\Documents\PhD\Courses\Big Data\SparkStream


In [2]:
def make_diff(old, new):
    return '\n'.join([ l for l in unified_diff(old.split('\n'), new.split('\n')) if l.startswith('+') or l.startswith('-') ])

In [3]:
'''Function to extract the added pieces of text in the document.'''
def get_added_text(diff):
    edits = []
    for text in diff.split('\n'):
        if text.startswith("+"):
            edits.append(text)
    #print(edits)        
    return edits

In [4]:
from pyspark import SparkContext
sc = SparkContext("local", "example")

In [5]:
file = sc.textFile(cwd+"\Data\\test")

In [6]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('example').getOrCreate()

In [7]:
spark

In [9]:
df = spark.read.format("json").load(cwd+"\Data\\test\\part-*")

In [10]:
print(df.collect())



In [11]:
df.printSchema()

root
 |-- comment: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name_user: string (nullable = true)
 |-- text_new: string (nullable = true)
 |-- text_old: string (nullable = true)
 |-- title_page: string (nullable = true)
 |-- url_page: string (nullable = true)



In [12]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *
udfMake_Diff = udf(make_diff, StringType())
df_new = df.withColumn("diff", udfMake_Diff("text_old", "text_new"))

In [13]:
df_new.select("diff").show(2,False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
tokenizer = RegexTokenizer(inputCol="diff", outputCol="words", pattern="\\W")
tokenizer2 = RegexTokenizer(inputCol="diff", outputCol="words", pattern="\\d")

In [15]:
df_tknz = tokenizer.transform(df_new)

In [16]:
df_tknz.select("words").collect()

[Row(words=['whole', 'foods', 'market', 'inc', 'is', 'an', 'american', 'multinational', 'supermarket', 'chain', 'store', 'chain', 'headquartered', 'in', 'austin', 'texas', 'which', 'exclusively', 'sells', 'products', 'free', 'from', 'hydrogenated', 'fats', 'and', 'artificial', 'colors', 'flavors', 'and', 'preservatives', 'ref', 'cite', 'web', 'url', 'https', 'www', 'wholefoodsmarket', 'com', 'about', 'our', 'products', 'quality', 'standards', 'food', 'ingredient', 'title', 'food', 'ingredient', 'quality', 'standards', 'date', 'march', '12', '2012', 'website', 'whole', 'foods', 'market', 'ref', 'a', 'usda', 'certified', 'organic', 'grocer', 'in', 'the', 'united', 'states', 'the', 'chain', 'is', 'popularly', 'known', 'for', 'its', 'organic', 'food', 'organic', 'selections', 'ref', 'cite', 'web', 'url', 'https', 'www', 'wholefoodsmarket', 'com', 'about', 'our', 'products', 'organic', 'food', 'organic', 'grocer', 'title', 'certified', 'organic', 'grocer', 'date', 'april', '7', '2016', 'web

In [17]:
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
df_filter = remover.transform(df_tknz)

In [18]:
df_filter.select("filtered").collect()
'''TODO: There are still a lot of numbers in there, which might not be so relevant. 
How can we filter this out? spark filter was not working well, maybe add a udf?'''

'TODO: There are still a lot of numbers in there, which might not be so relevant. \nHow can we filter this out? spark filter was not working well, maybe add a udf?'

In [24]:
hashingTFer = HashingTF(inputCol="filtered", outputCol="rawFeatures")
df_featurized = hashingTFer.transform(df_filter)
'''Add numFeatures parameter to reduce dimensionality, now it is 2^20'''

'Add numFeatures parameter to reduce dimensionality, now it is 2^20'

In [25]:
hashingTFer.extractParamMap()

{Param(parent='HashingTF_4f50b28969f1', name='outputCol', doc='output column name.'): 'rawFeatures',
 Param(parent='HashingTF_4f50b28969f1', name='numFeatures', doc='number of features.'): 262144,
 Param(parent='HashingTF_4f50b28969f1', name='binary', doc='If True, all non zero counts are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False.'): False,
 Param(parent='HashingTF_4f50b28969f1', name='inputCol', doc='input column name.'): 'filtered'}

In [26]:
df_featurized.select("rawFeatures").collect()

[Row(rawFeatures=SparseVector(262144, {3778: 2.0, 7367: 2.0, 12781: 2.0, 21300: 8.0, 21872: 2.0, 23574: 2.0, 28294: 6.0, 32764: 8.0, 33182: 4.0, 35119: 4.0, 37834: 4.0, 39688: 2.0, 41129: 2.0, 44738: 2.0, 49185: 4.0, 49564: 2.0, 54288: 2.0, 59177: 6.0, 60697: 2.0, 61868: 2.0, 63624: 2.0, 63836: 16.0, 64841: 2.0, 69752: 4.0, 74318: 6.0, 77099: 2.0, 78722: 2.0, 84463: 2.0, 89074: 2.0, 93003: 4.0, 94472: 12.0, 100534: 8.0, 106211: 2.0, 108772: 2.0, 109706: 2.0, 115275: 2.0, 116282: 2.0, 117162: 2.0, 120993: 2.0, 121133: 8.0, 128082: 8.0, 131881: 2.0, 133480: 6.0, 135524: 2.0, 138364: 6.0, 138836: 2.0, 142884: 2.0, 144134: 6.0, 144684: 6.0, 147946: 4.0, 148807: 2.0, 149762: 2.0, 154736: 2.0, 156000: 2.0, 157061: 2.0, 159586: 2.0, 162130: 2.0, 163502: 2.0, 169800: 2.0, 170698: 6.0, 178003: 14.0, 180708: 4.0, 182909: 4.0, 183858: 4.0, 183984: 2.0, 188978: 2.0, 191260: 2.0, 191774: 2.0, 192137: 2.0, 192828: 2.0, 194478: 4.0, 199937: 8.0, 203001: 2.0, 205325: 10.0, 208952: 2.0, 211332: 2.0, 21

In [27]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(df_featurized)
rescaledData = idfModel.transform(df_featurized)

rescaledData.select("label", "features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
| safe|(262144,[3778,736...|
| safe|(262144,[5381,619...|
| safe|(262144,[5381,822...|
| safe|(262144,[558,1156...|
| safe|(262144,[2722,419...|
+-----+--------------------+



In [28]:
rescaledData.select("features").collect()

[Row(features=SparseVector(262144, {3778: 2.1972, 7367: 2.1972, 12781: 2.1972, 21300: 0.0, 21872: 0.8109, 23574: 2.1972, 28294: 1.0939, 32764: 3.2437, 33182: 4.3944, 35119: 2.7726, 37834: 2.7726, 39688: 2.1972, 41129: 0.8109, 44738: 2.1972, 49185: 1.6219, 49564: 2.1972, 54288: 2.1972, 59177: 4.1589, 60697: 2.1972, 61868: 2.1972, 63624: 2.1972, 63836: 0.0, 64841: 0.8109, 69752: 4.3944, 74318: 6.5917, 77099: 2.1972, 78722: 2.1972, 84463: 2.1972, 89074: 1.3863, 93003: 4.3944, 94472: 13.1833, 100534: 0.0, 106211: 2.1972, 108772: 1.3863, 109706: 2.1972, 115275: 2.1972, 116282: 2.1972, 117162: 2.1972, 120993: 2.1972, 121133: 8.7889, 128082: 1.4586, 131881: 2.1972, 133480: 1.0939, 135524: 2.1972, 138364: 6.5917, 138836: 2.1972, 142884: 2.1972, 144134: 6.5917, 144684: 6.5917, 147946: 2.7726, 148807: 2.1972, 149762: 2.1972, 154736: 2.1972, 156000: 2.1972, 157061: 2.1972, 159586: 2.1972, 162130: 2.1972, 163502: 2.1972, 169800: 1.3863, 170698: 6.5917, 178003: 15.3806, 180708: 4.3944, 182909: 4.39

In [29]:
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="filtered", outputCol="vectors")
model = cv.fit(df_filter)
df_cv = model.transform(df_filter)
df_cv.printSchema()

root
 |-- comment: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name_user: string (nullable = true)
 |-- text_new: string (nullable = true)
 |-- text_old: string (nullable = true)
 |-- title_page: string (nullable = true)
 |-- url_page: string (nullable = true)
 |-- diff: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- vectors: vector (nullable = true)



In [30]:
df_cv.select("vectors").collect()

[Row(vectors=SparseVector(444, {0: 16.0, 1: 6.0, 2: 8.0, 3: 8.0, 4: 4.0, 5: 8.0, 7: 2.0, 8: 8.0, 10: 6.0, 11: 8.0, 13: 10.0, 14: 14.0, 15: 14.0, 17: 2.0, 19: 2.0, 21: 12.0, 25: 6.0, 28: 2.0, 30: 4.0, 43: 8.0, 47: 6.0, 53: 6.0, 55: 2.0, 56: 2.0, 59: 6.0, 60: 6.0, 61: 4.0, 69: 6.0, 70: 4.0, 72: 4.0, 85: 4.0, 89: 4.0, 92: 4.0, 95: 4.0, 108: 2.0, 112: 4.0, 119: 2.0, 132: 2.0, 133: 4.0, 138: 4.0, 140: 2.0, 149: 2.0, 165: 2.0, 168: 2.0, 174: 2.0, 178: 2.0, 179: 2.0, 180: 2.0, 182: 2.0, 191: 2.0, 194: 2.0, 197: 2.0, 208: 2.0, 218: 2.0, 221: 2.0, 227: 2.0, 229: 2.0, 230: 2.0, 233: 2.0, 252: 2.0, 253: 2.0, 254: 2.0, 257: 2.0, 258: 2.0, 269: 2.0, 273: 2.0, 277: 2.0, 287: 2.0, 293: 2.0, 297: 2.0, 301: 2.0, 331: 2.0, 333: 2.0, 336: 2.0, 337: 2.0, 343: 2.0, 357: 2.0, 360: 2.0, 364: 2.0, 367: 2.0, 369: 2.0, 375: 2.0, 384: 2.0, 385: 2.0, 392: 2.0, 396: 2.0, 401: 2.0, 402: 2.0, 407: 2.0, 415: 2.0, 429: 2.0})),
 Row(vectors=SparseVector(444, {0: 4.0, 1: 2.0, 2: 1.0, 3: 1.0, 4: 2.0, 5: 4.0, 10: 3.0, 16:

In [34]:
model.vocabulary[278]

'coast'

In [36]:
idf = IDF(inputCol="vectors", outputCol="features")
idfModel = idf.fit(df_cv)
df_tfidf = idfModel.transform(df_cv)

In [42]:
df_tfidf.printSchema()

root
 |-- comment: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name_user: string (nullable = true)
 |-- text_new: string (nullable = true)
 |-- text_old: string (nullable = true)
 |-- title_page: string (nullable = true)
 |-- url_page: string (nullable = true)
 |-- diff: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- vectors: vector (nullable = true)
 |-- features: vector (nullable = true)



In [41]:
df_tfidf.select("label","features").collect()

[Row(label='safe', features=SparseVector(444, {0: 0.0, 1: 1.0939, 2: 0.0, 3: 0.0, 4: 1.6219, 5: 1.4586, 7: 0.8109, 8: 3.2437, 10: 1.0939, 11: 3.2437, 13: 6.9315, 14: 15.3806, 15: 15.3806, 17: 0.3646, 19: 0.8109, 21: 13.1833, 25: 4.1589, 28: 1.3863, 30: 2.7726, 43: 8.7889, 47: 6.5917, 53: 6.5917, 55: 0.8109, 56: 0.8109, 59: 6.5917, 60: 6.5917, 61: 2.7726, 69: 6.5917, 70: 2.7726, 72: 4.3944, 85: 4.3944, 89: 4.3944, 92: 4.3944, 95: 4.3944, 108: 1.3863, 112: 4.3944, 119: 1.3863, 132: 1.3863, 133: 4.3944, 138: 4.3944, 140: 1.3863, 149: 2.1972, 165: 2.1972, 168: 2.1972, 174: 2.1972, 178: 2.1972, 179: 2.1972, 180: 2.1972, 182: 2.1972, 191: 2.1972, 194: 2.1972, 197: 2.1972, 208: 2.1972, 218: 2.1972, 221: 2.1972, 227: 2.1972, 229: 2.1972, 230: 2.1972, 233: 2.1972, 252: 2.1972, 253: 2.1972, 254: 2.1972, 257: 2.1972, 258: 2.1972, 269: 2.1972, 273: 2.1972, 277: 2.1972, 287: 2.1972, 293: 2.1972, 297: 2.1972, 301: 2.1972, 331: 2.1972, 333: 2.1972, 336: 2.1972, 337: 2.1972, 343: 2.1972, 357: 2.1972, 

In [45]:
df_tfidf.select("vectors")*df_tfidf.select("features")

TypeError: unsupported operand type(s) for *: 'DataFrame' and 'DataFrame'