In [1]:
import mlflow
from pyspark import Row
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.clustering import LDA
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, LongType
from pyspark.sql.functions import col

spark = SparkSession.builder.appName('Project').getOrCreate()

dataset=spark.read.csv("reviews.tbl", inferSchema = True, header = True, sep = '|')

dataset.createTempView("product_reviews")
q="SELECT CASE pr_rating WHEN 1 THEN 'NEG' WHEN 2 THEN 'NEG' WHEN 3 THEN 'NEU' WHEN 4 THEN 'POS' WHEN 5 THEN 'POS' END AS pr_r_rating, pr_content FROM product_reviews WHERE pmod(pr_review_id, 5) IN (1,2,3)"

  from collections import (
  class ResultIterable(collections.Iterable):


In [2]:
df = spark.sql(q).toDF("label", "sentence")
tokenizer = Tokenizer(inputCol="sentence", outputCol="tokens")
wordsData = tokenizer.transform(df)

In [3]:
df.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  NEU|attainments are; ...|
|  NEU|fluffily ironic e...|
|  NEU|dolphins about ov...|
|  NEU|bullshit must sub...|
|  NEG|silent decline or...|
|  NEU|bold platelets ex...|
|  NEU|special sustainab...|
|  POS|quick winner afte...|
|  NEU|frays doze whitho...|
|  NEG|dolphins in place...|
|  NEU|bravely permanent...|
|  POS|ruthlessly risk-f...|
|  NEU|best-performing d...|
|  NEU|orbits need to in...|
|  NEU|idle patience cou...|
|  NEG|busy deny tithes ...|
|  NEU|quick brave notor...|
|  NEU|sheaves will have...|
|  NEG|daringly fluffy f...|
|  NEU|carefully express...|
+-----+--------------------+
only showing top 20 rows



In [4]:
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|              tokens|
+-----+--------------------+--------------------+
|  NEU|attainments are; ...|[attainments, are...|
|  NEU|fluffily ironic e...|[fluffily, ironic...|
|  NEU|dolphins about ov...|[dolphins, about,...|
|  NEU|bullshit must sub...|[bullshit, must, ...|
|  NEG|silent decline or...|[silent, decline,...|
|  NEU|bold platelets ex...|[bold, platelets,...|
|  NEU|special sustainab...|[special, sustain...|
|  POS|quick winner afte...|[quick, winner, a...|
|  NEU|frays doze whitho...|[frays, doze, whi...|
|  NEG|dolphins in place...|[dolphins, in, pl...|
|  NEU|bravely permanent...|[bravely, permane...|
|  POS|ruthlessly risk-f...|[ruthlessly, risk...|
|  NEU|best-performing d...|[best-performing,...|
|  NEU|orbits need to in...|[orbits, need, to...|
|  NEU|idle patience cou...|[idle, patience, ...|
|  NEG|busy deny tithes ...|[busy, deny, tith...|
|  NEU|quick brave notor...|[quick, brave, no...|


In [5]:
remover = StopWordsRemover(inputCol="tokens", outputCol="words")
cleaned = remover.transform(wordsData)

In [6]:
cleaned.show()

+-----+--------------------+--------------------+--------------------+
|label|            sentence|              tokens|               words|
+-----+--------------------+--------------------+--------------------+
|  NEU|attainments are; ...|[attainments, are...|[attainments, are...|
|  NEU|fluffily ironic e...|[fluffily, ironic...|[fluffily, ironic...|
|  NEU|dolphins about ov...|[dolphins, about,...|[dolphins, overst...|
|  NEU|bullshit must sub...|[bullshit, must, ...|[bullshit, must, ...|
|  NEG|silent decline or...|[silent, decline,...|[silent, decline,...|
|  NEU|bold platelets ex...|[bold, platelets,...|[bold, platelets,...|
|  NEU|special sustainab...|[special, sustain...|[special, sustain...|
|  POS|quick winner afte...|[quick, winner, a...|[quick, winner, r...|
|  NEU|frays doze whitho...|[frays, doze, whi...|[frays, doze, whi...|
|  NEG|dolphins in place...|[dolphins, in, pl...|[dolphins, place,...|
|  NEU|bravely permanent...|[bravely, permane...|[bravely, permane...|
|  POS

In [7]:
cv = CountVectorizer(inputCol="words", outputCol="features")
count_vectorizer_model = cv.fit(cleaned)
result = count_vectorizer_model.transform(cleaned)

In [8]:
result.show()

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|              tokens|               words|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  NEU|attainments are; ...|[attainments, are...|[attainments, are...|(3338,[11,21,41,4...|
|  NEU|fluffily ironic e...|[fluffily, ironic...|[fluffily, ironic...|(3338,[6,33,47,71...|
|  NEU|dolphins about ov...|[dolphins, about,...|[dolphins, overst...|(3338,[0,11,23,24...|
|  NEU|bullshit must sub...|[bullshit, must, ...|[bullshit, must, ...|(3338,[1,47,78,26...|
|  NEG|silent decline or...|[silent, decline,...|[silent, decline,...|(3338,[0,2,3,20,4...|
|  NEU|bold platelets ex...|[bold, platelets,...|[bold, platelets,...|(3338,[9,20,21,22...|
|  NEU|special sustainab...|[special, sustain...|[special, sustain...|(3338,[6,11,14,18...|
|  POS|quick winner afte...|[quick, winner, a...|[quick, winner, r...|(3338,[2,9

In [9]:
vocabulary = count_vectorizer_model.vocabulary

In [11]:
print(vocabulary)

['shall', 'must', 'slow', 'silent', 'quiet', 'stealthy', 'ironic', 'busy', 'enticing', 'ruthless', 'quick', 'special', 'blithe', 'regular', 'sly', 'dogged', 'express', 'permanent', 'thin', 'furious', 'fluffy', 'close', 'careful', 'daring', 'bold', 'final', 'even', 'idle', 'pending', 'unusual', 'brave', 'need', 'might', 'may', 'try', 'pinto', 'hockey', 'furiously', 'busily', 'ironically', 'idly', 'quickly', 'blithely', 'carefully', 'never', 'permanently', 'stealthily', 'fluffily', 'thinly', 'bravely', 'evenly', 'slyly', 'boldly', 'doggedly', 'always', 'closely', 'ruthlessly', 'sometimes', 'regularly', 'enticingly', 'slowly', 'silently', 'finally', 'quietly', 'daringly', 'excuses', 'boost', 'maintain', 'detect', 'decoys', 'dazzle', 'hang', 'breach', 'sauternes', 'use', 'serve', 'ideas', 'mold', 'sublate', 'haggle', 'engage', 'wake', 'hinder', 'asymptotes', 'impress', 'dinos', 'platelets', 'print', 'nod', 'thrash', 'solve', 'frets', 'realms', 'poach', 'notornis', 'packages', 'deposits', '