In [22]:
import os
import json
from difflib import unified_diff
from pyspark import ml
import numpy as np
from pyspark import SparkContext
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import udf
from pyspark.ml import Pipeline, Transformer
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer, VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier, NaiveBayes

cwd = os.getcwd()
print(cwd)

C:\Users\u0115374\Documents\PhD\Courses\Big Data\SparkStream


In [3]:
sc = SparkContext("local", "assignment3")
spark = SparkSession.builder.appName('assignment3').getOrCreate()
spark

In [13]:
dirs = os.listdir('C:\\Users\\u0115374\\Documents\\PhD\\Courses\\Big Data\\Assignment3\\streaming_1a')
dirs

['outputs-1585746250000',
 'outputs-1585746260000',
 'outputs-1585746270000',
 'outputs-1585746280000',
 'outputs-1585746290000',
 'outputs-1585746300000',
 'outputs-1585746310000',
 'outputs-1585746320000',
 'outputs-1585746330000',
 'outputs-1585746340000',
 'outputs-1585746350000',
 'outputs-1585746360000',
 'outputs-1585746370000',
 'outputs-1585746380000',
 'outputs-1585746390000',
 'outputs-1585746400000',
 'outputs-1585746410000',
 'outputs-1585746420000',
 'outputs-1585746430000',
 'outputs-1585746440000',
 'outputs-1585746450000',
 'outputs-1585746460000',
 'outputs-1585746470000',
 'outputs-1585746480000',
 'outputs-1585746490000',
 'outputs-1585746500000',
 'outputs-1585746510000',
 'outputs-1585746520000',
 'outputs-1585746530000',
 'outputs-1585746540000',
 'outputs-1585746550000',
 'outputs-1585746560000',
 'outputs-1585746570000',
 'outputs-1585746580000',
 'outputs-1585746590000',
 'outputs-1585746600000',
 'outputs-1585746610000',
 'outputs-1585746620000',
 'outputs-15

In [20]:
df = spark.read.format("json").load('C:\\Users\\u0115374\\Documents\\PhD\\Courses\\Big Data\\Assignment3\\test\\*')
df.printSchema()

root
 |-- comment: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name_user: string (nullable = true)
 |-- text_new: string (nullable = true)
 |-- text_old: string (nullable = true)
 |-- title_page: string (nullable = true)
 |-- url_page: string (nullable = true)



In [21]:
def make_diff(old, new):
    return '\n'.join([ l for l in unified_diff(old.split('\n'), new.split('\n')) if l.startswith('+') or l.startswith('-') ])

udfMake_Diff = udf(make_diff, StringType())

def add_diff(df):
    df.withColumn("diff", udfMake_Diff("text_old", "text_new"))

In [23]:
class DiffColTransformer(Transformer):
    '''Custom transformer to get udf makeDiff into pipeline.'''
    def __init__(self):
        super(DiffColTransformer, self).__init__()
        
    def _transform(self, df: DataFrame) -> DataFrame:
        df = df.withColumn("diff", udfMake_Diff("text_old", "text_new"))
        return df

In [53]:
train, test = df.randomSplit([0.70, 0.30])

In [55]:
dct = DiffColTransformer()
tokenizer = RegexTokenizer(inputCol="diff", outputCol="words", pattern="\\W")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
cv = CountVectorizer(inputCol="filtered", outputCol="vectors")
idf = IDF(inputCol="vectors", outputCol="features")
label_indexer = StringIndexer(inputCol = "label", outputCol = "target")

nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="target", featuresCol="features")

In [56]:
stages = [dct, tokenizer, remover, cv, idf, label_indexer, nb]

In [60]:
model = Pipeline(stages=stages).fit(dataset = train)

In [50]:
df_test = model.transform(test)

In [63]:
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")

In [64]:
evaluator.evaluate(model.transform(test))

0.9