# Bag of words

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType
from pyspark.sql.functions import col, struct
from pyspark.sql.functions import when

In [79]:
# Start a Spark session named "ReadJSON" (or attach to the one already running).
spark = (
    SparkSession.builder
    .appName("ReadJSON")
    .getOrCreate()
)

In [80]:
# Explicit schema: each of the five fields is read as a nullable string.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("asin", "reviewerID", "summary", "overall", "reviewText")
    ]
)

# Load the training reviews (JSON) into a DataFrame using that schema.
df_raw = (
    spark.read
    .schema(schema)
    .json(r"C:\Users\Emma\Downloads\school\Big_Data\project\finaldata\combined_train.json")
)

In [81]:
# sample 1% of data
# A fixed seed keeps the sample, and every count and score derived from it,
# stable across kernel restarts.
df = df_raw.sample(withReplacement=False, fraction=0.01, seed=42)
df.count()

112869

In [84]:
# pre-process functions
import pyspark.sql.functions as F
import re
import nltk
from nltk.corpus import stopwords


def _rating_to_label(rating):
    """Return 1 when the rating parses to a number >= 4.0, otherwise 0."""
    if float(rating) >= 4.0:
        return 1
    return 0


# Spark UDF producing the integer "label" column from the string rating.
labelUDF = F.udf(_rating_to_label, IntegerType())

# Fetch the NLTK stopword corpus and keep the English list as a set
# for fast membership tests.
nltk.download('stopwords')
STOPWORDS = set(stopwords.words("english"))
def remove_reg_stop(text: str, stop_words=None) -> str:
    """Normalize a review: strip punctuation, lower-case it, and drop stop words.

    Args:
        text: Raw review text. ``None`` is treated as an empty review.
        stop_words: Optional collection of lower-case words to remove. When
            omitted, the module-level ``STOPWORDS`` set is used.

    Returns:
        The remaining words joined by single spaces ("" when nothing is left).
    """
    if text is None:
        # A null column value reaches a Python UDF as None; return an empty
        # review instead of failing inside re.sub.
        return ''
    if stop_words is None:
        stop_words = STOPWORDS
    # Every character that is not a word character becomes a space, so only
    # word characters and spaces remain after this step.
    normalized = re.sub(r'[^\w]', ' ', text).lower()
    # split() without arguments already discards the empty strings produced
    # by consecutive spaces.
    kept_words = [word for word in normalized.split() if word not in stop_words]
    return ' '.join(kept_words)
# Wrap remove_reg_stop as a Spark UDF that returns a string column.
cleanTextUDF = F.udf(remove_reg_stop, StringType())

def preProc(df):
    """Turn raw review rows into the (cleanText, label) frame used for modelling.

    Steps: keep one row per (reviewerID, asin) pair, drop rows containing any
    null, derive the integer ``label`` from ``overall`` via ``labelUDF``, and
    clean ``reviewText`` via ``cleanTextUDF``.

    Args:
        df: DataFrame with at least reviewerID, asin, overall and reviewText.

    Returns:
        DataFrame with exactly the columns ``cleanText`` and ``label``.
    """
    complete_rows = df.dropDuplicates(["reviewerID", "asin"]).na.drop()
    labeled = (
        complete_rows
        .withColumn("label", labelUDF(complete_rows["overall"]))
        .select("reviewText", "label")
    )
    cleaned = labeled.withColumn("cleanText", cleanTextUDF(labeled["reviewText"]))
    return cleaned.select("cleanText", "label")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Emma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [85]:
%%time

# Pre-process the sampled training data. count() is the action that makes
# Spark execute the lazy transformations, so the timing covers the real work.
df_labeled_cleaned = preProc(df)
df_labeled_cleaned.count()

CPU times: total: 15.6 ms
Wall time: 15.2 s


112744

In [7]:
# Class balance check: number of rows per label value.
df_labeled_cleaned.groupby("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    1|95146|
|    0|17593|
+-----+-----+



In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Evaluator that scores the "rawPrediction" column by area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# Show which metric the evaluator reports.
evaluator.getMetricName()

'areaUnderROC'

In [9]:
# create tokens from reviews
tk = Tokenizer(inputCol="cleanText", outputCol="tokens")

# create term frequencies for each of the tokens
# numFeatures is an integer parameter, so pass an int rather than the float 1e5.
tf1 = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)

# create tf-idf for each of the tokens
# minDocFreq is a document count, so it is an int as well.
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2)

# create basic logistic regression model
lr = LogisticRegression(maxIter=20)

# create entire pipeline: tokenize -> hashed term frequencies -> tf-idf -> classifier
basic_pipline = Pipeline(stages=[tk, tf1, idf, lr])

In [10]:
# Mark the pre-processed training frame for caching before it is used for fitting.
# cache() is lazy: the data is stored when the next action runs.
df_labeled_cleaned.cache()

DataFrame[cleanText: string, label: int]

In [11]:
%%time

# Fit every stage of basic_pipline on the pre-processed training sample.
model1 = basic_pipline.fit(df_labeled_cleaned)

CPU times: total: 344 ms
Wall time: 3min 46s


In [26]:
# Persist the fitted pipeline. overwrite() lets this cell be re-run; a plain
# save() raises once "bow_model1" already exists.
model1.write().overwrite().save("bow_model1")

In [12]:
# test on testing data

df2_raw = spark.read.schema(schema).json(r"C:\Users\Emma\Downloads\school\Big_Data\project\finaldata\combined_test.json")
# Seeded so the 1% evaluation subset, and therefore the reported AUC, is the
# same on every run.
df2 = df2_raw.sample(withReplacement=False, fraction=0.01, seed=42)
df2_labeled_cleaned = preProc(df2)

In [16]:
# Number of test rows left after pre-processing.
df2_labeled_cleaned.count()

28435

In [13]:
# Score the sampled test set with model1 and report the evaluator's metric.
predictions = model1.transform(df2_labeled_cleaned)
score = evaluator.evaluate(predictions)
print("AUC SCORE: {}".format(score))

AUC SCORE: 0.768647368109351


In [34]:
# Peek at the first rows of the prediction frame, including every intermediate pipeline column.
predictions.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|           cleanText|label|              tokens|         rawFeatures|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                 5 5|    1|              [5, 5]|(100000,[27741],[...|(100000,[27741],[...|[-0.9840254475377...|[0.27209377750342...|       1.0|
|great experience ...|    1|[great, experienc...|(100000,[10447,11...|(100000,[10447,11...|[-7.2627705032405...|[7.00671483686858...|       1.0|
|brmc right place ...|    1|[brmc, right, pla...|(100000,[712,3021...|(100000,[712,3021...|[-38.652144337936...|[1.63526070291485...|       1.0|
|          great yarn|    1|       [great, yarn]|(100000,[6564,607...|(100000,[6564,607...|[-1.0463793950310...|[0.25992096220753.

In [32]:
# test the model with self defined input

def createInput(lst):
    """Build a DataFrame of ad-hoc reviews that can be passed to a fitted pipeline.

    Args:
        lst: List of raw review strings.

    Returns:
        DataFrame with the original ``reviewText`` column plus ``cleanText``,
        the output of ``cleanTextUDF``.
    """
    rows = [(review,) for review in lst]
    reviews = spark.createDataFrame(rows, ["reviewText"])
    return reviews.withColumn("cleanText", cleanTextUDF(reviews["reviewText"]))

# Hand-written reviews used as a quick sanity check of the fitted models.
tmp_reviews = [
    "This is a great product.",
    "I love this item!",
    "Not satisfied with the quality.", 
    "k",
    "nice item however something wrong"
]

In [33]:
# Run the hand-written reviews through model1 and show the predicted label per review.
df_test2 = createInput(tmp_reviews)
pred_test = model1.transform(df_test2)
pred_test.select("cleanText", "prediction").show()

+--------------------+----------+
|           cleanText|prediction|
+--------------------+----------+
|       great product|       1.0|
|           love item|       1.0|
|   satisfied quality|       1.0|
|                   k|       1.0|
|nice item however...|       1.0|
+--------------------+----------+



## using equal label training data

In [19]:
# Balance the classes: keep every label-0 row and down-sample the label-1 rows
# to roughly the same number.
df_labeled_0 = df_labeled_cleaned.where(df_labeled_cleaned["label"] == 0)
df_labeled_1 = df_labeled_cleaned.where(df_labeled_cleaned["label"] == 1)

# Sampling fraction for label 1 = (label-0 rows) / (label-1 rows).
FRAC_1 = df_labeled_0.count() / df_labeled_1.count()
df_labeled_1 = df_labeled_1.sample(False, FRAC_1, 42)

# Stack the two classes back into one training frame.
df_equal_label = df_labeled_0.union(df_labeled_1)

In [20]:
# Total number of rows in the balanced training frame.
df_equal_label.count()

35538

In [23]:
# Verify the balance: rows per label after down-sampling.
df_equal_label.groupby("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|    0|17593|
|    1|17945|
+-----+-----+



In [21]:
# create entire pipeline
# Reuses the stage objects tk, tf1, idf and lr defined earlier.
pipline2 = Pipeline(stages=[tk, tf1, idf, lr])

In [22]:
%%time

# Fit the pipeline on the class-balanced training frame.
model2 = pipline2.fit(df_equal_label)

CPU times: total: 406 ms
Wall time: 38 s


In [27]:
# Persist the fitted pipeline. overwrite() lets this cell be re-run; a plain
# save() raises once "bow_model2" already exists.
model2.write().overwrite().save("bow_model2")

In [24]:
# Score the same sampled test set with model2 and report the evaluator's metric.
# Note: this rebinds `predictions`, replacing the model1 result.
predictions = model2.transform(df2_labeled_cleaned)
score = evaluator.evaluate(predictions)
print("AUC SCORE: {}".format(score))

AUC SCORE: 0.7687397631638292


In [25]:
# Run the hand-written reviews through model2 and show the predicted label per review.
pred_test = model2.transform(df_test2)
pred_test.select("cleanText", "prediction").show()

+--------------------+----------+
|           cleanText|prediction|
+--------------------+----------+
|       great product|       1.0|
|           love item|       1.0|
|   satisfied quality|       1.0|
|                   k|       0.0|
|nice item however...|       0.0|
+--------------------+----------+



In [3]:
# clear everything
import gc

# Stop the Spark session, then force a Python garbage-collection pass.
spark.stop()
gc.collect()

98

## Loading data 10 MB at a time

In [2]:
# Start a Spark session named "ReadJSON_chunks" (or attach to the one already running).
spark = (
    SparkSession.builder
    .appName("ReadJSON_chunks")
    .getOrCreate()
)

In [72]:
# Location of the training reviews (JSON).
file_path = r"C:\Users\Emma\Downloads\school\Big_Data\project\finaldata\combined_train.json"

# Number of lines read from the file per chunk.
chunk_size_line = 20000

# Explicit schema: each of the five fields is read as a nullable string.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("asin", "reviewerID", "summary", "overall", "reviewText")
    ]
)

In [66]:
# Read the first chunk of the training file by hand.
# The handle `f` is left open on purpose: a later cell keeps reading from
# where this one stops.
f = open(file_path, "r")
f_content = ''.join(f.readline() for _ in range(chunk_size_line))

# Dump the chunk to a temporary JSON file.
with open("tmp.json", "w") as outfile:
    outfile.write(f_content)

# Release the chunk text.
del f_content

In [67]:
# Load the temporary chunk file with the shared schema.
# NOTE(review): the chunk was written to the relative path "tmp.json"; this absolute
# path only matches if the kernel's working directory is that project folder — confirm.
df_raw = spark.read.schema(schema).json(r"C:\Users\Emma\Downloads\school\Big_Data\project\big-data-final-project\tmp.json")

In [68]:
# Peek at the first rows of the chunk.
df_raw.show()

+----------+--------------+--------------------+-------+--------------------+
|      asin|    reviewerID|             summary|overall|          reviewText|
+----------+--------------+--------------------+-------+--------------------+
|0486427706|A1274GG1EB2JLJ|The pictures are ...|    5.0|The pictures are ...|
|0486427706|A30X5EGBYAZQQK|       So beautiful!|    5.0|I absolutely love...|
|0486427706|A3U6UNXLAUY6ZV|          Five Stars|    5.0|          I love it!|
|0486427706|A1SAJF5SNM6WJS|          Five Stars|    5.0|MY HUSBAND LOVED ...|
|0486427706| AHJWO3SI0S0OR|          Four Stars|    4.0|                cool|
|0486427706| ALLSNTNR6N6UL|nice pictures, gr...|    5.0|Exactly as descri...|
|0486448789|A3O6CP5TT54LJE|     save your money|    1.0|total waste of mo...|
|0486448789|A216BPGO0ZBR5N|age 5 loved the b...|    5.0|Nephew, age 5 lov...|
|0486448789|A3OJCR7TKQIPQM|          Very Cute!|    5.0|          Very Cute!|
|0486448789|A2DSZOLDOG70GC|This is pretty mu...|    3.0|This is 

In [69]:
# Drop everything Spark has cached and remove the Python reference to the chunk frame.
spark.catalog.clearCache()
del df_raw

In [70]:
# Read the next chunk, continuing from the current position of the
# already-open handle `f`.
f_content = ''.join(f.readline() for _ in range(chunk_size_line))

# Overwrite the temporary JSON file with this chunk.
with open("tmp.json", "w") as outfile:
    outfile.write(f_content)

# Release the chunk text.
del f_content

# Load the chunk and peek at its first rows.
df_raw = (
    spark.read
    .schema(schema)
    .json(r"C:\Users\Emma\Downloads\school\Big_Data\project\big-data-final-project\tmp.json")
)
df_raw.show()

+----------+--------------+--------------------+-------+--------------------+
|      asin|    reviewerID|             summary|overall|          reviewText|
+----------+--------------+--------------------+-------+--------------------+
|B00004T2WP|A360ADK9Q5VTF3|    It's okay but...|    3.0|my son never had ...|
|B00004T2WP|A2W5EPWH46DUCL|                Gift|    5.0|This was a gift t...|
|B00004T2WP|A1I10L12MM57S0|Perfect walk/ride...|    5.0|I was looking for...|
|B00004T2WP|A2R0MYROYFQIXY|           Love this|    5.0|this is so great,...|
|B00004T2WP|A335RCAJ27KI7R|Fun toy once they...|    5.0|This toy is good ...|
|B00004T2WP| A9D25J81K1ZSB|Used through 2 ch...|    5.0|I purchased this ...|
|B00004T2WP|A2FCS6WYO0HCW7|        not rideable|    1.0|Neither of my chi...|
|B00004T2WP|A2ROUV2OWG4TI2|      babies LOVE IT|    5.0|my son was trying...|
|B00004T2WP| AXY7LB6MPS5VM|Perfect for learn...|    5.0|This has been rea...|
|B00004T2WP|A3M424W9HTCTNL|My baby does not ...|    3.0|I bought

In [73]:
# clear everything
import gc

# Stop the Spark session, then force a Python garbage-collection pass.
spark.stop()
gc.collect()

3312

## Putting everything in a loop...

In [2]:
# imports

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, BooleanType, IntegerType
from pyspark.sql.functions import col, struct
from pyspark.sql.functions import when

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.pipeline import PipelineModel

In [2]:
# pre-process functions

import pyspark.sql.functions as F
import re
import nltk
from nltk.corpus import stopwords


def _rating_to_label(rating):
    """Return 1 when the rating parses to a number >= 4.0, otherwise 0."""
    if float(rating) >= 4.0:
        return 1
    return 0


# Spark UDF producing the integer "label" column from the string rating.
labelUDF = F.udf(_rating_to_label, IntegerType())

# Fetch the NLTK stopword corpus and keep the English list as a set
# for fast membership tests.
nltk.download('stopwords')
STOPWORDS = set(stopwords.words("english"))
def remove_reg_stop(text: str, stop_words=None) -> str:
    """Normalize a review: strip punctuation, lower-case it, and drop stop words.

    Args:
        text: Raw review text. ``None`` is treated as an empty review.
        stop_words: Optional collection of lower-case words to remove. When
            omitted, the module-level ``STOPWORDS`` set is used.

    Returns:
        The remaining words joined by single spaces ("" when nothing is left).
    """
    if text is None:
        # A null column value reaches a Python UDF as None; return an empty
        # review instead of failing inside re.sub.
        return ''
    if stop_words is None:
        stop_words = STOPWORDS
    # Every character that is not a word character becomes a space, so only
    # word characters and spaces remain after this step.
    normalized = re.sub(r'[^\w]', ' ', text).lower()
    # split() without arguments already discards the empty strings produced
    # by consecutive spaces.
    kept_words = [word for word in normalized.split() if word not in stop_words]
    return ' '.join(kept_words)
# Wrap remove_reg_stop as a Spark UDF that returns a string column.
cleanTextUDF = F.udf(remove_reg_stop, StringType())

def preProc(df):
    """Turn raw review rows into the (cleanText, label) frame used for modelling.

    Steps: keep one row per (reviewerID, asin) pair, drop rows containing any
    null, derive the integer ``label`` from ``overall`` via ``labelUDF``, and
    clean ``reviewText`` via ``cleanTextUDF``.

    Args:
        df: DataFrame with at least reviewerID, asin, overall and reviewText.

    Returns:
        DataFrame with exactly the columns ``cleanText`` and ``label``.
    """
    complete_rows = df.dropDuplicates(["reviewerID", "asin"]).na.drop()
    labeled = (
        complete_rows
        .withColumn("label", labelUDF(complete_rows["overall"]))
        .select("reviewText", "label")
    )
    cleaned = labeled.withColumn("cleanText", cleanTextUDF(labeled["reviewText"]))
    return cleaned.select("cleanText", "label")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Emma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Start a Spark session named "ReadJSON_chunks" (or attach to the one already running).
spark = (
    SparkSession.builder
    .appName("ReadJSON_chunks")
    .getOrCreate()
)

In [4]:
# Location of the training reviews (JSON).
file_path = r"C:\Users\Emma\Downloads\school\Big_Data\project\finaldata\combined_train.json"

# Number of lines read from the file per chunk.
chunk_size_line = 20000

# Explicit schema: each of the five fields is read as a nullable string.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("asin", "reviewerID", "summary", "overall", "reviewText")
    ]
)

In [5]:
# load testing data

# Read the whole test file (no sampling here) with the shared schema, then rebind
# df_test to its pre-processed form, which has the columns "cleanText" and "label".
df_test = spark.read.schema(schema).json(r"C:\Users\Emma\Downloads\school\Big_Data\project\finaldata\combined_test.json")
df_test = preProc(df_test)

In [6]:
# create pipeline

# Evaluator that scores the "rawPrediction" column (the default metric is used).
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# Stages: tokenize -> hashed term frequencies -> tf-idf -> logistic regression.
tk = Tokenizer(inputCol="cleanText", outputCol="tokens")
# numFeatures and minDocFreq are integer parameters, so pass ints rather than
# the float literals 1e5 and 2.0.
tf1 = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2)
lr = LogisticRegression(maxIter=20)
basic_pipline = Pipeline(stages=[tk, tf1, idf, lr])

In [7]:
%%time

cur_run = 0
file_end = False
while not file_end:
    
    # Read json with file open
    f = open(file_path, "r")
    f_content = ''
    for i in range(chunk_size_line):
        line = f.readline()
        if line:
            f_content += line
        else: # last training when file reach end
            file_end = True
            break 
    # write to a tmp json file
    with open("tmp.json", "w") as outfile:
        outfile.write(f_content)
    
    # create df
    df = spark.read.schema(schema).json(r"C:\Users\Emma\Downloads\school\Big_Data\project\big-data-final-project\tmp.json")
    
    # pre-process
    df_labeled_cleaned = preProc(df)
    
    # training
    if cur_run<0: # skip this for now since model load is not working
        model_path = r"C:\Users\Emma\Downloads\school\Big_Data\project\big-data-final-project\bow_chunk_model_{}".format(cur_run)
        prev_model = PipelineModel.load(model_path)
        stages_steps = prev_model.stages
        model = Pipeline(stages = stages_steps).fit(df_labeled_cleaned) 
    else: # the first time training
        model = basic_pipline.fit(df_labeled_cleaned)
        
    cur_run += 1
    model_name = "bow_chunk_model_{}".format(cur_run)
    #model.save(model_name) # skip since model load is not working
    
    # get score from testing data
    predictions = model.transform(df_test)
    score = evaluator.evaluate(predictions)
    print("{} AUC SCORE: {}".format(model_name, score))

    # free memory
    spark.catalog.clearCache()
    del f_content
    del df
    del df_labeled_cleaned

bow_chunk_model_1 AUC SCORE: 0.6861595761437362
bow_chunk_model_2 AUC SCORE: 0.6861581332194084
bow_chunk_model_3 AUC SCORE: 0.686162518257355
bow_chunk_model_4 AUC SCORE: 0.6861598047388178
bow_chunk_model_5 AUC SCORE: 0.6861611665852446
bow_chunk_model_6 AUC SCORE: 0.6861624778486394
bow_chunk_model_7 AUC SCORE: 0.6861547961699533
bow_chunk_model_8 AUC SCORE: 0.6861608787377532
bow_chunk_model_9 AUC SCORE: 0.6861575371895784
bow_chunk_model_10 AUC SCORE: 0.6861555317360544
bow_chunk_model_11 AUC SCORE: 0.6861635508892369
bow_chunk_model_12 AUC SCORE: 0.6861579915051954
bow_chunk_model_13 AUC SCORE: 0.6861607861460787
bow_chunk_model_14 AUC SCORE: 0.6861575758615918
bow_chunk_model_15 AUC SCORE: 0.6861565341645833
bow_chunk_model_16 AUC SCORE: 0.6861540429121991
bow_chunk_model_17 AUC SCORE: 0.6861565404578753
bow_chunk_model_18 AUC SCORE: 0.6861580958657479
bow_chunk_model_19 AUC SCORE: 0.6861556185557579
bow_chunk_model_20 AUC SCORE: 0.6861550865482063
bow_chunk_model_21 AUC SCORE: 

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "C:\spark\spark-3.5.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Emma\anaconda3\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\spark\spark-3.5.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\spark\spark-3.5.1-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 539, in send_command
    r

ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

In [21]:
## PipelineModel load testing

In [3]:
import os
from pyspark.ml.pipeline import PipelineModel 
# Relative path of the saved pipeline model to load.
data_path = "tmp_model"

In [6]:
# Check that the model path exists before trying to load from it.
os.path.exists(data_path)

True

In [7]:
# Load a previously saved PipelineModel from data_path.
model = PipelineModel.load(data_path)

In [8]:
# Confirm the type of the loaded object.
type(model)

pyspark.ml.pipeline.PipelineModel