In [None]:
import numpy as np
import sparknlp
sparknlp.start()

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.functions import vector_to_array
import pyspark.sql.functions as F

from sklearn.metrics import classification_report, accuracy_score

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel



# Data Engineering

## Filter by Restaurants which is open

In [40]:
review = spark.read.json('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/yelp_academic_dataset_review.json')

                                                                                

In [2]:
business = spark.read.json('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/yelp_academic_dataset_business.json')

23/05/17 18:59:53 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [9]:
business.count()

150346

In [3]:
business = business.filter(col('categories').contains('Restaurants'))

In [4]:
business = business.filter(col('is_open')==1)

In [5]:
business.count()

                                                                                

34987

In [6]:
business_idunique = business.select('business_id').distinct()

In [20]:
review.count()

                                                                                

6990280

In [41]:
review = review.join(business_idunique, 'business_id', 'inner')

In [42]:
review.count()

                                                                                

3773770

In [43]:
user_idunique = review.select('user_id').distinct()

In [44]:
user = spark.read.json('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/yelp_academic_dataset_user.json')

                                                                                

In [45]:
user.count()

                                                                                

1987897

In [46]:
user = user.join(user_idunique, 'user_id', 'inner')

In [47]:
user.count()

                                                                                

1277283

In [7]:
checkin = spark.read.json('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/yelp_academic_dataset_checkin.json')

                                                                                

In [8]:
checkin = checkin.join(business_idunique, 'business_id', 'inner')

In [9]:
checkin.coalesce(1).write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/checkin", format = "parquet",mode='overwrite')

                                                                                

## Function to flatten the dataframe if it is structured.

In [246]:
def flatting_df(df):
    from pyspark.sql.functions import col
    for item in df.dtypes:
        if item[1][:6] == 'struct':
            cols = df.select(item[0]+'.*').columns
            for subitem in cols:
                df = df.withColumn(subitem, col(item[0]+'.'+subitem))
            df = df.drop(item[0])
        else:
            pass
    return df

In [248]:
business = flatting_df(business)

In [249]:
business.coalesce(1).write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/business", format = "parquet",mode='overwrite')

                                                                                

In [49]:
user.coalesce(1).write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/user", format = "parquet")

                                                                                

In [50]:
review.coalesce(1).write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/review", format = "parquet",mode='overwrite')

                                                                                

In [250]:
business.printSchema()

root
 |-- address: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- city: string (nullable = true)
 |-- is_open: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- stars: double (nullable = true)
 |-- state: string (nullable = true)
 |-- AcceptsInsurance: string (nullable = true)
 |-- AgesAllowed: string (nullable = true)
 |-- Alcohol: string (nullable = true)
 |-- Ambience: string (nullable = true)
 |-- BYOB: string (nullable = true)
 |-- BYOBCorkage: string (nullable = true)
 |-- BestNights: string (nullable = true)
 |-- BikeParking: string (nullable = true)
 |-- BusinessAcceptsBitcoin: string (nullable = true)
 |-- BusinessAcceptsCreditCards: string (nullable = true)
 |-- BusinessParking: string (nullable = true)
 |-- ByAppointmentOnly: strin

# Few shot learning

Read the dataset prepared. Labeled by human eyes

In [67]:
trainDataset = spark.read.option('header', True).option('escapeQuotes', 'true').csv('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/Train_Data.csv')

In [3]:
trainDataset.show(10,  truncate=50)

+--------------------------------------------------+--------+
|                                          sentence|Category|
+--------------------------------------------------+--------+
|If you decide to eat here, just be aware it is ...| Service|
|I have been to it's other locations in NJ and n...| Service|
|                                  The food is good|    Food|
|            it takes a very long time to come out.| Service|
|The waitstaff is very young, but usually pleasant.| Service|
|We have just had too many experiences where we ...| Service|
|We usually opt for another diner or restaurant ...| Service|
|I've taken a lot of spin classes over the years...|  Others|
|From the nice, clean space and amazing bikes, t...| Service|
|For anyone who struggles to fit workouts in, th...| Service|
+--------------------------------------------------+--------+
only showing top 10 rows



In [4]:
trainDataset.groupBy("Category") \
    .count() \
    .orderBy(col("Category").desc()) \
    .show(truncate=60)

[Stage 2:>                                                          (0 + 1) / 1]

+------------------------------------------------------------+-----+
|                                                    Category|count|
+------------------------------------------------------------+-----+
|                                                     Service|  213|
|                                                       Price|  157|
|                                                      Others|   29|
|                                                     Opinion|  154|
|                                                        Food|  264|
|                              Atmosphere/DescriptionOfPlaces|  162|
| we walked over to the bar and started looking over the m...|    1|
|                                                           "|    1|
|                                                        null|    1|
+------------------------------------------------------------+-----+



                                                                                

In [68]:
stringIndexer = StringIndexer(inputCol='Category', outputCol = "CategoryIndex").setHandleInvalid("skip")
encoder = OneHotEncoder(inputCol="CategoryIndex", outputCol="CategoryOnehot")
# assembler = VectorAssembler(inputCols=["Category"], outputCol='features')
stages = [stringIndexer, encoder]

In [69]:
prepPipeline = Pipeline().setStages(stages)
pipelineModel = prepPipeline.fit(trainDataset)

                                                                                

In [70]:
dataset = pipelineModel.transform(trainDataset)
dataset = dataset.select('*', vector_to_array('CategoryOnehot').alias('col_onehot'))
dataset.show()

[Stage 132:>                                                        (0 + 1) / 1]

+--------------------+--------+-------------+--------------+--------------------+
|            sentence|Category|CategoryIndex|CategoryOnehot|          col_onehot|
+--------------------+--------+-------------+--------------+--------------------+
|If you decide to ...| Service|          1.0| (7,[1],[1.0])|[0.0, 1.0, 0.0, 0...|
|I have been to it...| Service|          1.0| (7,[1],[1.0])|[0.0, 1.0, 0.0, 0...|
|    The food is good|    Food|          0.0| (7,[0],[1.0])|[1.0, 0.0, 0.0, 0...|
|it takes a very l...| Service|          1.0| (7,[1],[1.0])|[0.0, 1.0, 0.0, 0...|
|The waitstaff is ...| Service|          1.0| (7,[1],[1.0])|[0.0, 1.0, 0.0, 0...|
|We have just had ...| Service|          1.0| (7,[1],[1.0])|[0.0, 1.0, 0.0, 0...|
|We usually opt fo...| Service|          1.0| (7,[1],[1.0])|[0.0, 1.0, 0.0, 0...|
|I've taken a lot ...|  Others|          5.0| (7,[5],[1.0])|[0.0, 0.0, 0.0, 0...|
|From the nice, cl...| Service|          1.0| (7,[1],[1.0])|[0.0, 1.0, 0.0, 0...|
|For anyone who 

                                                                                

In [71]:
indexer_fitted = stringIndexer.fit(trainDataset)
num_categories = len(dataset.first()['col_onehot'])   # 3
cols_expanded = [(F.col('col_onehot')[i].alias(f'{indexer_fitted.labels[i]}')) for i in range(num_categories)]
dataset = dataset.select('sentence', 'Category', *cols_expanded)
dataset.show()

+--------------------+--------+----+-------+------------------------------+-----+-------+------+---+
|            sentence|Category|Food|Service|Atmosphere/DescriptionOfPlaces|Price|Opinion|Others|  "|
+--------------------+--------+----+-------+------------------------------+-----+-------+------+---+
|If you decide to ...| Service| 0.0|    1.0|                           0.0|  0.0|    0.0|   0.0|0.0|
|I have been to it...| Service| 0.0|    1.0|                           0.0|  0.0|    0.0|   0.0|0.0|
|    The food is good|    Food| 1.0|    0.0|                           0.0|  0.0|    0.0|   0.0|0.0|
|it takes a very l...| Service| 0.0|    1.0|                           0.0|  0.0|    0.0|   0.0|0.0|
|The waitstaff is ...| Service| 0.0|    1.0|                           0.0|  0.0|    0.0|   0.0|0.0|
|We have just had ...| Service| 0.0|    1.0|                           0.0|  0.0|    0.0|   0.0|0.0|
|We usually opt fo...| Service| 0.0|    1.0|                           0.0|  0.0|    0.0|  

# Model creation & Evaluation

## Create sentences table for aspect base analysis

In [7]:
review = spark.read.json('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/yelp_academic_dataset_review.json')

                                                                                

In [10]:
def displaypartitions(df):
    #number of records by partition
    num = df.rdd.getNumPartitions()
    print("Partitions:", num)
    df.withColumn("partitionId", spark_partition_id())\
        .groupBy("partitionId")\
        .count()\
        .orderBy(asc("count"))\
        .show(num)

In [11]:
#number of partitions
review.rdd.getNumPartitions()

40

In [12]:
review.count()

                                                                                

6990280

In [9]:
#re-partition the data
review = review.repartition(120)

In [15]:
# review = review.limit(10000)

In [16]:
review.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [177]:
# actual content is inside description column
documentassembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

In [178]:
sentencerDL = SentenceDetectorDLModel\
    .pretrained("sentence_detector_dl", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("sentences")

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]


In [179]:
pipeline_sentence = Pipeline(
    stages = [
        documentassembler,
        sentencerDL
    ])

In [180]:
processed_data = pipeline_sentence.fit(review.select('review_id', 'text')).transform(review.select('review_id', 'text'))

In [181]:
sentence_df = processed_data.select('review_id', 'sentences.result')

In [182]:
sentence_df = sentence_df.select('review_id', explode('result').alias('sentence'))

In [183]:
sentence_df.show()

                                                                                

+--------------------+--------------------+
|           review_id|            sentence|
+--------------------+--------------------+
|KU_O5udG6zpxOg-Vc...|If you decide to ...|
|KU_O5udG6zpxOg-Vc...|We have tried it ...|
|KU_O5udG6zpxOg-Vc...|I have been to it...|
|KU_O5udG6zpxOg-Vc...|The food is good,...|
|KU_O5udG6zpxOg-Vc...|The waitstaff is ...|
|KU_O5udG6zpxOg-Vc...|We have just had ...|
|KU_O5udG6zpxOg-Vc...|We usually opt fo...|
|saUsX_uimxRlCVr67...|       Family diner.|
|saUsX_uimxRlCVr67...|     Had the buffet.|
|saUsX_uimxRlCVr67...|Eclectic assortme...|
|saUsX_uimxRlCVr67...|           All good.|
|saUsX_uimxRlCVr67...|Lots of Mexican c...|
|saUsX_uimxRlCVr67...|Also has a menu w...|
|saUsX_uimxRlCVr67...|Friendly, attenti...|
|saUsX_uimxRlCVr67...|Good place for a ...|
|saUsX_uimxRlCVr67...|Next to the Clari...|
|AqPFMleE6RsU23_au...|Wow!  Yummy, diff...|
|AqPFMleE6RsU23_au...|Our favorite is t...|
|AqPFMleE6RsU23_au...|With 10 different...|
|AqPFMleE6RsU23_au...|          

In [None]:
%%time
# sentence_df.write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/sentence_df", format = "parquet",mode='overwrite')

                                                                                

CPU times: user 827 ms, sys: 332 ms, total: 1.16 s
Wall time: 1h 24min 57s


### Sentence embedding based approach (Deep learning)

#### Preparing the classifier pipeline

In [55]:
# actual content is inside description column
documentassembler = DocumentAssembler()\
    .setInputCol("sentence")\
    .setOutputCol("document")

In [56]:
sentencerDL = SentenceDetectorDLModel\
    .pretrained("sentence_detector_dl", "en") \
    .setInputCols(["document"]) \
    .setOutputCol("sentences")

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[ | ]sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
Download done! Loading the resource.
[ / ]

                                                                                

[ — ]

2023-05-17 06:29:10.555417: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[OK!]


In [57]:
use = UniversalSentenceEncoder.pretrained(lang="en") \
    .setInputCols(["document"])\
    .setOutputCol("sentence_embeddings")

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ — ]Download done! Loading the resource.
[OK!]


In [58]:
# the classes/labels/categories are in category column
classsifier = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("TrueLabel")\
  .setMaxEpochs(5)\
  .setEnableOutputLogs(True)

In [59]:
classifier_pipeline = Pipeline(
    stages = [
        documentassembler,
        sentencerDL,
        use,
        classsifier
    ])

### Cross validation

In [136]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import numpy as np
import pandas as pd

In [132]:
category_list = ['Price', 'Opinion', 'Food', 'Atmosphere/DescriptionOfPlaces', 'Service']

In [120]:
evaluator = MulticlassClassificationEvaluator(labelCol="TrueLabel", predictionCol="Predict")

In [None]:
accuracy_lst = []
for category in category_list:
    counts = dataset.groupBy(category).count().collect()
    ratio_adjust = 1.0 ## ratio of pos to neg in the df_subsample
    if counts[0][1] > counts[1][1]:
        down_class = counts[0][0]
    else:
        down_class = counts[1][0]
    higherBound = counts[0][1]
    treshold_to_filter = int(ratio_adjust * float(counts[1][1]) / counts[0][1] * higherBound)
    randGen = lambda x: np.random.randint(0, higherBound) if x == down_class else -1
    udfRandGen = udf(randGen, IntegerType())
    df_temp= dataset.withColumn("randIndex", udfRandGen(category))
    df_temp = df_temp.filter(df_temp['randIndex'] < treshold_to_filter).drop('randIndex')

    df_temp = df_temp.select('sentence', col(category).alias('TrueLabel'))
    df_temp = df_temp_train.orderBy(rand())
    df_temp = df_temp.withColumn('row_id', monotonically_increasing_id())
    df_with_pattern = df_temp.withColumn("pattern_col", ((col("row_id") % 5))).drop('row_id')
    evaluator = MulticlassClassificationEvaluator(labelCol="TrueLabel", predictionCol="Predict")

    result_lst = []
    for i in range(5):
        df_cv_train = df_with_pattern.filter(col('pattern_col')!=i)
        df_cv_validation = df_with_pattern.filter(col('pattern_col')==i)
        pipeline_model = classifier_pipeline.fit(df_cv_train)
        predicts =  pipeline_model.transform(df_cv_validation)
        result = predicts.select('sentence', col('TrueLabel').cast('double'), col('class.result').getItem(0).alias('Predict').cast('double'))
        result_lst.append(evaluator.evaluate(result, {evaluator.metricName: "accuracy"}))
    accuracy_lst.append(np.max(result_lst))

2023-05-17 07:29:19.299117: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/25286ce868f1_classifier_dl1725063051871141995
2023-05-17 07:29:19.421711: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:29:19.421790: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/25286ce868f1_classifier_dl1725063051871141995
2023-05-17 07:29:20.065113: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:29:21.270139: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/25286ce868f1_classifier_dl1725063051871141995
2023-05-17 07:29:21.493204: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2194103 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 344 - classes: 2
Epoch 1/5 - 0.46s - loss: 3.199205 - acc: 0.8375 - batches: 6
Epoch 2/5 - 0.04s - loss: 2.8590016 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.05s - loss: 1.8845842 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.05s - loss: 2.8478274 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.05s - loss: 2.652283 - acc: 1.0 - batches: 6


2023-05-17 07:30:03.552697: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/a55420346dc1_classifier_dl5744150677812356340
2023-05-17 07:30:03.648407: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:30:03.648477: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/a55420346dc1_classifier_dl5744150677812356340
2023-05-17 07:30:04.285978: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:30:05.509596: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/a55420346dc1_classifier_dl5744150677812356340
2023-05-17 07:30:05.699820: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2147137 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 345 - classes: 2
Epoch 1/5 - 0.46s - loss: 3.1208131 - acc: 0.879125 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.9005305 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.880867 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8806376 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8801804 - acc: 1.0 - batches: 6


2023-05-17 07:30:52.278048: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/06fa2f129502_classifier_dl6691961940849320036
2023-05-17 07:30:52.388711: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:30:52.388774: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/06fa2f129502_classifier_dl6691961940849320036
2023-05-17 07:30:52.988628: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:30:53.890982: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/06fa2f129502_classifier_dl6691961940849320036
2023-05-17 07:30:54.078418: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1800382 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 345 - classes: 2
Epoch 1/5 - 0.44s - loss: 3.951472 - acc: 0.867 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.8804058 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8796283 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8796593 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8796014 - acc: 1.0 - batches: 6


2023-05-17 07:31:31.046210: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/00943aba9984_classifier_dl1188366030923974238
2023-05-17 07:31:31.155176: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:31:31.155221: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/00943aba9984_classifier_dl1188366030923974238
2023-05-17 07:31:31.678140: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:31:32.580258: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/00943aba9984_classifier_dl1188366030923974238
2023-05-17 07:31:32.767273: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1721075 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 338 - classes: 2
Epoch 1/5 - 0.53s - loss: 4.2349815 - acc: 0.8538195 - batches: 6
Epoch 2/5 - 0.04s - loss: 3.707961 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.04s - loss: 3.3810143 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.04s - loss: 2.8853724 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.04s - loss: 2.8849378 - acc: 1.0 - batches: 6


2023-05-17 07:32:16.163960: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/156c2e6b7fe1_classifier_dl5103115996684526604
2023-05-17 07:32:16.292192: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:32:16.292253: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/156c2e6b7fe1_classifier_dl5103115996684526604
2023-05-17 07:32:16.980896: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:32:18.023919: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/156c2e6b7fe1_classifier_dl5103115996684526604
2023-05-17 07:32:18.228954: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2065007 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 345 - classes: 2
Epoch 1/5 - 0.50s - loss: 3.1653018 - acc: 0.87112504 - batches: 6
Epoch 2/5 - 0.04s - loss: 1.8833042 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 2.0019379 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.04s - loss: 2.0279415 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.9837 - acc: 1.0 - batches: 6


2023-05-17 07:32:32.227810: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/f378df31219b_classifier_dl2669998473286466319
2023-05-17 07:32:32.462971: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:32:32.463035: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/f378df31219b_classifier_dl2669998473286466319
2023-05-17 07:32:33.252061: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:32:34.390649: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/f378df31219b_classifier_dl2669998473286466319
2023-05-17 07:32:34.625353: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2397557 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 342 - classes: 2
Epoch 1/5 - 0.47s - loss: 2.9616666 - acc: 0.8752841 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.8825591 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8804243 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8801707 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8800232 - acc: 1.0 - batches: 6


2023-05-17 07:33:21.247558: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/26e732d5d461_classifier_dl4860600403582297606
2023-05-17 07:33:21.438327: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:33:21.438403: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/26e732d5d461_classifier_dl4860600403582297606
2023-05-17 07:33:22.345177: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:33:23.765653: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/26e732d5d461_classifier_dl4860600403582297606
2023-05-17 07:33:24.076110: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2828562 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 346 - classes: 2
Epoch 1/5 - 0.47s - loss: 2.7555704 - acc: 0.8831731 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.8796753 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8796145 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8795933 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8795758 - acc: 1.0 - batches: 6


2023-05-17 07:33:55.643019: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/83b567d08b03_classifier_dl7520877994811051229
2023-05-17 07:33:55.772175: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:33:55.772234: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/83b567d08b03_classifier_dl7520877994811051229
2023-05-17 07:33:56.506163: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:33:57.477272: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/83b567d08b03_classifier_dl7520877994811051229
2023-05-17 07:33:57.703520: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2060515 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 339 - classes: 2
Epoch 1/5 - 0.47s - loss: 2.882184 - acc: 0.8476974 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.9041153 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8796756 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8796237 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8796113 - acc: 1.0 - batches: 6


2023-05-17 07:34:14.738543: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/14781bfd6f16_classifier_dl8871199348187441300
2023-05-17 07:34:14.859797: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:34:14.859839: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/14781bfd6f16_classifier_dl8871199348187441300
2023-05-17 07:34:15.500003: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:34:16.303858: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/14781bfd6f16_classifier_dl8871199348187441300
2023-05-17 07:34:16.510567: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1772035 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 351 - classes: 2
Epoch 1/5 - 0.42s - loss: 3.2837508 - acc: 0.8398186 - batches: 6
Epoch 2/5 - 0.03s - loss: 2.9318101 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8907527 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8800141 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8815874 - acc: 1.0 - batches: 6


2023-05-17 07:34:32.308046: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/279ef1e11a4c_classifier_dl336281442870422033
2023-05-17 07:34:32.426657: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:34:32.426707: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/279ef1e11a4c_classifier_dl336281442870422033
2023-05-17 07:34:33.077108: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:34:34.289738: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/279ef1e11a4c_classifier_dl336281442870422033
2023-05-17 07:34:34.508958: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2200923 microseconds.

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 336 - classes: 2
Epoch 1/5 - 0.64s - loss: 3.76793 - acc: 0.8375 - batches: 6
Epoch 2/5 - 0.04s - loss: 1.8827101 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.04s - loss: 1.879842 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.04s - loss: 1.8797234 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.04s - loss: 1.8797503 - acc: 1.0 - batches: 6


2023-05-17 07:34:52.598632: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/b4b192bdc6ec_classifier_dl1236988260741811410
2023-05-17 07:34:52.864107: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:34:52.864179: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/b4b192bdc6ec_classifier_dl1236988260741811410
2023-05-17 07:34:53.862943: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:34:55.526012: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/b4b192bdc6ec_classifier_dl1236988260741811410
2023-05-17 07:34:55.807773: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 3209153 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 337 - classes: 2
Epoch 1/5 - 0.66s - loss: 3.780781 - acc: 0.7834559 - batches: 6
Epoch 2/5 - 0.04s - loss: 2.8528123 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.04s - loss: 2.8544645 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.05s - loss: 2.8784134 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.04s - loss: 2.8782053 - acc: 1.0 - batches: 6


2023-05-17 07:35:14.532653: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/2f5e3c072535_classifier_dl3443362273855066795
2023-05-17 07:35:14.694724: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:35:14.694810: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/2f5e3c072535_classifier_dl3443362273855066795
2023-05-17 07:35:15.674898: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:35:17.118543: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/2f5e3c072535_classifier_dl3443362273855066795
2023-05-17 07:35:17.406990: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2874350 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 335 - classes: 2
Epoch 1/5 - 0.66s - loss: 4.302308 - acc: 0.775625 - batches: 6
Epoch 2/5 - 0.04s - loss: 2.7450187 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.04s - loss: 2.265615 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.05s - loss: 1.9426563 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.04s - loss: 1.9212146 - acc: 1.0 - batches: 6


2023-05-17 07:35:36.032957: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/68c4a5030b3e_classifier_dl2426705603961964878
2023-05-17 07:35:36.196617: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:35:36.196692: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/68c4a5030b3e_classifier_dl2426705603961964878
2023-05-17 07:35:37.140809: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:35:38.582367: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/68c4a5030b3e_classifier_dl2426705603961964878
2023-05-17 07:35:38.881072: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2848129 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 342 - classes: 2
Epoch 1/5 - 0.62s - loss: 2.9781125 - acc: 0.8508523 - batches: 6
Epoch 2/5 - 0.04s - loss: 2.879434 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.04s - loss: 2.8788576 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.05s - loss: 2.8750696 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.04s - loss: 2.8784819 - acc: 1.0 - batches: 6


2023-05-17 07:35:53.520431: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/975b7c51c2ab_classifier_dl3752615412718921349
2023-05-17 07:35:53.762202: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:35:53.762273: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/975b7c51c2ab_classifier_dl3752615412718921349
2023-05-17 07:35:54.751064: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:35:56.168343: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/975b7c51c2ab_classifier_dl3752615412718921349
2023-05-17 07:35:56.475223: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2954803 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 335 - classes: 2
Epoch 1/5 - 0.64s - loss: 3.694939 - acc: 0.8147917 - batches: 6
Epoch 2/5 - 0.04s - loss: 3.5115767 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.04s - loss: 2.8774567 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.04s - loss: 2.8774326 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.04s - loss: 2.8783672 - acc: 1.0 - batches: 6


2023-05-17 07:36:11.152354: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/816db664a265_classifier_dl2672109889477386494
2023-05-17 07:36:11.321347: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:36:11.321419: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/816db664a265_classifier_dl2672109889477386494
2023-05-17 07:36:12.335600: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:36:13.723824: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/816db664a265_classifier_dl2672109889477386494
2023-05-17 07:36:14.032760: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2880425 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 344 - classes: 2
Epoch 1/5 - 0.45s - loss: 3.1506624 - acc: 0.86875 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.8942703 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8795742 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8795708 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8795712 - acc: 1.0 - batches: 6


2023-05-17 07:36:26.760182: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/fcf9b612526f_classifier_dl2978416113006247314
2023-05-17 07:36:26.886388: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:36:26.886432: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/fcf9b612526f_classifier_dl2978416113006247314
2023-05-17 07:36:27.550057: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:36:28.354245: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/fcf9b612526f_classifier_dl2978416113006247314
2023-05-17 07:36:28.570537: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1810366 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 342 - classes: 2
Epoch 1/5 - 0.46s - loss: 2.960328 - acc: 0.7727273 - batches: 6
Epoch 2/5 - 0.03s - loss: 2.8669329 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 2.8629627 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 2.8799548 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 2.8790376 - acc: 1.0 - batches: 6


2023-05-17 07:36:40.776735: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/f4f0cea78941_classifier_dl4545872352249449418
2023-05-17 07:36:40.905896: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:36:40.905948: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/f4f0cea78941_classifier_dl4545872352249449418
2023-05-17 07:36:41.617472: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:36:42.477914: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/f4f0cea78941_classifier_dl4545872352249449418
2023-05-17 07:36:42.702599: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1925877 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 351 - classes: 2
Epoch 1/5 - 0.49s - loss: 4.379346 - acc: 0.81169355 - batches: 6
Epoch 2/5 - 0.03s - loss: 2.70074 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.9490212 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8978634 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8941174 - acc: 1.0 - batches: 6


2023-05-17 07:36:54.821599: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/0893b77cae75_classifier_dl4115822430597393794
2023-05-17 07:36:55.025794: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:36:55.025843: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/0893b77cae75_classifier_dl4115822430597393794
2023-05-17 07:36:55.802590: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:36:56.716429: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/0893b77cae75_classifier_dl4115822430597393794
2023-05-17 07:36:56.955001: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2133413 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 331 - classes: 2
Epoch 1/5 - 0.45s - loss: 3.6584635 - acc: 0.93125 - batches: 6
Epoch 2/5 - 0.03s - loss: 4.789208 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8863329 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8969954 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 2.013643 - acc: 1.0 - batches: 6


2023-05-17 07:37:12.571949: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/d7753d518c10_classifier_dl2481805851206175588
2023-05-17 07:37:12.714882: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:37:12.714930: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/d7753d518c10_classifier_dl2481805851206175588
2023-05-17 07:37:13.353930: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:37:14.131336: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/d7753d518c10_classifier_dl2481805851206175588
2023-05-17 07:37:14.344974: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1773037 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 346 - classes: 2
Epoch 1/5 - 0.46s - loss: 3.237083 - acc: 0.8300481 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.882628 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8804997 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.879843 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8796847 - acc: 1.0 - batches: 6


2023-05-17 07:37:30.866546: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/2d8eaf273fde_classifier_dl8372816687683041841
2023-05-17 07:37:31.002123: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:37:31.002181: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/2d8eaf273fde_classifier_dl8372816687683041841
2023-05-17 07:37:31.700782: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:37:32.512783: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/2d8eaf273fde_classifier_dl8372816687683041841
2023-05-17 07:37:32.733469: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1866938 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 334 - classes: 2
Epoch 1/5 - 0.45s - loss: 3.2122574 - acc: 0.90446424 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.893239 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8798765 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8797783 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.879628 - acc: 1.0 - batches: 6


2023-05-17 07:37:49.215528: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/55f62a299802_classifier_dl14839582098369240
2023-05-17 07:37:49.347790: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:37:49.347835: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/55f62a299802_classifier_dl14839582098369240
2023-05-17 07:37:50.007215: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:37:50.852180: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/55f62a299802_classifier_dl14839582098369240
2023-05-17 07:37:51.069485: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1853970 microseconds.


Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 321 - classes: 2
Epoch 1/5 - 0.51s - loss: 3.1546447 - acc: 0.925 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.8822103 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8798969 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8809118 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8799262 - acc: 1.0 - batches: 6


2023-05-17 07:38:04.330379: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/6387f869cd22_classifier_dl7773610937426117519
2023-05-17 07:38:04.539249: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:38:04.539306: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/6387f869cd22_classifier_dl7773610937426117519
2023-05-17 07:38:05.282614: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:38:06.152169: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/6387f869cd22_classifier_dl7773610937426117519
2023-05-17 07:38:06.389333: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 2058965 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 362 - classes: 2
Epoch 1/5 - 0.48s - loss: 4.002316 - acc: 0.8559524 - batches: 6
Epoch 2/5 - 0.03s - loss: 2.884264 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 2.8818684 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 2.8800404 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 2.8808157 - acc: 1.0 - batches: 6


2023-05-17 07:38:22.425482: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/6f509364e5ff_classifier_dl5451528369946794924
2023-05-17 07:38:22.555640: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:38:22.555692: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/6f509364e5ff_classifier_dl5451528369946794924
2023-05-17 07:38:23.237652: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:38:24.041154: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/6f509364e5ff_classifier_dl5451528369946794924
2023-05-17 07:38:24.259334: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1833864 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 340 - classes: 2
Epoch 1/5 - 0.47s - loss: 4.060907 - acc: 0.85812503 - batches: 6
Epoch 2/5 - 0.03s - loss: 2.8962553 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 2.872812 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 2.8404753 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 2.853949 - acc: 1.0 - batches: 6


2023-05-17 07:38:41.348138: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/70100bbe0827_classifier_dl6869309762263172835
2023-05-17 07:38:41.480944: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:38:41.480994: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/70100bbe0827_classifier_dl6869309762263172835
2023-05-17 07:38:42.162403: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:38:42.981033: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/70100bbe0827_classifier_dl6869309762263172835
2023-05-17 07:38:43.198820: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1850695 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 356 - classes: 2
Epoch 1/5 - 0.48s - loss: 3.4527605 - acc: 0.87465274 - batches: 6
Epoch 2/5 - 0.03s - loss: 2.7749012 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.9665914 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.9378972 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 2.0726337 - acc: 1.0 - batches: 6


2023-05-17 07:38:58.751125: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/ca1cf1d6384c_classifier_dl6417565887411554472
2023-05-17 07:38:58.867328: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-17 07:38:58.867375: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/ca1cf1d6384c_classifier_dl6417565887411554472
2023-05-17 07:38:59.434731: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-17 07:39:00.285838: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/ca1cf1d6384c_classifier_dl6417565887411554472
2023-05-17 07:39:00.457898: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1706785 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 343 - classes: 2
Epoch 1/5 - 0.49s - loss: 2.5262754 - acc: 0.8513587 - batches: 6
Epoch 2/5 - 0.03s - loss: 1.8891319 - acc: 1.0 - batches: 6
Epoch 3/5 - 0.03s - loss: 1.8823798 - acc: 1.0 - batches: 6
Epoch 4/5 - 0.03s - loss: 1.8806959 - acc: 1.0 - batches: 6
Epoch 5/5 - 0.03s - loss: 1.8799167 - acc: 1.0 - batches: 6


                                                                                

## HashingTF + Logistic regression cross validation

In [140]:
from pyspark.ml import feature

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

In [176]:
lr = LogisticRegression(maxIter=10, regParam=0.01)
indexer = StringIndexer(inputCol='TrueLabel', outputCol="label")
#tokenize the data into words
tokenizer = feature.Tokenizer(inputCol="sentence", outputCol="words")

countVectorizer = CountVectorizer(inputCol=tokenizer.getOutputCol(), outputCol="features")

In [184]:
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01]).build()

# Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr.
pipeline_lr = Pipeline(stages=[indexer, tokenizer, countVectorizer, lr])

crossval = CrossValidator(estimator=pipeline,
                      estimatorParamMaps=paramGrid,
                      evaluator=BinaryClassificationEvaluator(),
                      numFolds=5)

In [203]:
accuracy_lst_lr = []
for category in category_list:
    counts = dataset.groupBy(category).count().collect()
    ratio_adjust = 1.0 ## ratio of pos to neg in the df_subsample
    if counts[0][1] > counts[1][1]:
        down_class = counts[0][0]
    else:
        down_class = counts[1][0]
    higherBound = counts[0][1]
    treshold_to_filter = int(ratio_adjust * float(counts[1][1]) / counts[0][1] * higherBound)
    randGen = lambda x: np.random.randint(0, higherBound) if x == down_class else -1
    udfRandGen = udf(randGen, IntegerType())
    df_temp= dataset.withColumn("randIndex", udfRandGen(category))
    df_temp = df_temp.filter(df_temp['randIndex'] < treshold_to_filter).drop('randIndex')

    df_temp = df_temp.select('sentence', col(category).alias('label'))
    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(df_temp)

    # Make predictions on test documents. cvModel uses the best model found (lrModel).
    predictions = cvModel.transform(df_temp)
    accuracy_lst_lr.append(evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"}))

In [202]:
pd.DataFrame({'category': category_list,
    'accuracy_sentence_embedding':accuracy_lst,
    'accuracy_hashingTF':accuracy_lst_lr})

Unnamed: 0,category,accuracy_sentence_embedding,accuracy_hashingTF
0,Price,0.976471,0.90333
1,Opinion,0.91954,0.898102
2,Food,0.929412,0.887164
3,Atmosphere/DescriptionOfPlaces,0.928571,0.872663
4,Service,0.963855,0.869098


## Run the final model

In [196]:
for category in category_list:
    counts = dataset.groupBy(category).count().collect()
    ratio_adjust = 1.0 ## ratio of pos to neg in the df_subsample
    if counts[0][1] > counts[1][1]:
        down_class = counts[0][0]
    else:
        down_class = counts[1][0]
    higherBound = counts[0][1]
    treshold_to_filter = int(ratio_adjust * float(counts[1][1]) / counts[0][1] * higherBound)
    randGen = lambda x: np.random.randint(0, higherBound) if x == down_class else -1
    udfRandGen = udf(randGen, IntegerType())
    df_temp= dataset.withColumn("randIndex", udfRandGen(category))
    df_temp = df_temp.filter(df_temp['randIndex'] < treshold_to_filter).drop('randIndex')

    df_temp = df_temp.select('sentence', col(category).alias('TrueLabel'))
    df_temp_train, df_temp_test = df_temp.randomSplit([0.8, 0.2])

    print(category + ': Train Test dataset balance')
    df_temp_train.groupBy('TrueLabel').count().show()
    df_temp_test.groupby('TrueLabel').count().show()


    pipeline_model = pipeline_lr.fit(df_temp_train)
    predicts =  pipeline_model.transform(df_temp_test)

    df = predicts.select('sentence', col('TrueLabel').cast('double'), col('prediction').cast('double').alias('Predict')).toPandas()
    pipeline_model = pipeline_lr.fit(df_temp)
    print(category + ': Confusion matrix')
    print(classification_report(df.TrueLabel, df.Predict))
    print(category + ': Accuracy')
    print(accuracy_score(df.TrueLabel, df.Predict))

    sentence_df =  pipeline_model.transform(sentence_df)
    sentence_df = sentence_df.select('*',  col('prediction').cast('double').alias(category)).drop('words','features', 'rawPrediction', 'probability', 'prediction')
#     sentence_df.write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/" + category, format = "parquet",mode='overwrite')

Price: Train Test dataset balance
+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|  141|
|      1.0|  133|
+---------+-----+

+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|   22|
|      1.0|   30|
+---------+-----+

Price: Confusion matrix
              precision    recall  f1-score   support

         0.0       0.86      0.96      0.91        25
         1.0       0.95      0.84      0.89        25

    accuracy                           0.90        50
   macro avg       0.91      0.90      0.90        50
weighted avg       0.91      0.90      0.90        50

Price: Accuracy
0.9


23/05/17 15:07:40 WARN org.apache.spark.ml.feature.StringIndexerModel: Input column TrueLabel does not exist during transformation. Skip StringIndexerModel for this column.


Opinion: Train Test dataset balance
+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|  135|
|      1.0|  125|
+---------+-----+

+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|   25|
|      1.0|   32|
+---------+-----+

Opinion: Confusion matrix
              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85        26
         1.0       0.89      0.89      0.89        35

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

Opinion: Accuracy
0.8688524590163934


23/05/17 15:07:43 WARN org.apache.spark.ml.feature.StringIndexerModel: Input column TrueLabel does not exist during transformation. Skip StringIndexerModel for this column.


Food: Train Test dataset balance
+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|  198|
|      1.0|  206|
+---------+-----+

+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|   51|
|      1.0|   57|
+---------+-----+

Food: Confusion matrix
              precision    recall  f1-score   support

         0.0       0.05      0.05      0.05        57
         1.0       0.05      0.05      0.05        56

    accuracy                           0.05       113
   macro avg       0.05      0.05      0.05       113
weighted avg       0.05      0.05      0.05       113

Food: Accuracy
0.05309734513274336


23/05/17 15:07:46 WARN org.apache.spark.ml.feature.StringIndexerModel: Input column TrueLabel does not exist during transformation. Skip StringIndexerModel for this column.


Atmosphere/DescriptionOfPlaces: Train Test dataset balance
+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|  121|
|      1.0|  135|
+---------+-----+

+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|   36|
|      1.0|   32|
+---------+-----+

Atmosphere/DescriptionOfPlaces: Confusion matrix
              precision    recall  f1-score   support

         0.0       0.10      0.08      0.09        37
         1.0       0.11      0.12      0.11        32

    accuracy                           0.10        69
   macro avg       0.10      0.10      0.10        69
weighted avg       0.10      0.10      0.10        69

Atmosphere/DescriptionOfPlaces: Accuracy
0.10144927536231885


23/05/17 15:07:50 WARN org.apache.spark.ml.feature.StringIndexerModel: Input column TrueLabel does not exist during transformation. Skip StringIndexerModel for this column.


Service: Train Test dataset balance
+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|  156|
|      1.0|  169|
+---------+-----+

+---------+-----+
|TrueLabel|count|
+---------+-----+
|      0.0|   42|
|      1.0|   42|
+---------+-----+

Service: Confusion matrix
              precision    recall  f1-score   support

         0.0       0.12      0.10      0.11        50
         1.0       0.12      0.15      0.13        41

    accuracy                           0.12        91
   macro avg       0.12      0.12      0.12        91
weighted avg       0.12      0.12      0.12        91

Service: Accuracy
0.12087912087912088


23/05/17 15:07:53 WARN org.apache.spark.ml.feature.StringIndexerModel: Input column TrueLabel does not exist during transformation. Skip StringIndexerModel for this column.


#### Sentiment analysis

In [None]:
review_sample = review.limit(10000).cache()

In [None]:
review_sample.groupby('stars').agg(count('*').alias('record_cnt')).orderBy('stars', ascending=False).show()



+-----+----------+
|stars|record_cnt|
+-----+----------+
|  5.0|      4630|
|  4.0|      2227|
|  3.0|       975|
|  2.0|       751|
|  1.0|      1417|
+-----+----------+



                                                                                

In [None]:
review_sample = review_sample.\
withColumn("sentiment_strength",\
           when(col("stars") == 1, "Strong Negative").\
           when(col("stars") == 2, "Negative").\
           when(col("stars") == 3, "Neutral").\
           when(col("stars") == 4, "Positive").\
           when(col("stars") == 5, "Strong Positive").\
           otherwise("No Sentiment")).\
withColumn("label",\
           when(col("stars") == 1, -1).\
           when(col("stars") == 2, 3).\
           when(col("stars") == 3, 0).\
           when(col("stars") == 4, 3).\
           when(col("stars") == 5, 1).\
           otherwise(3))

In [None]:
trainingData = review_sample.filter(review_sample['label'] < 3).select('text', 'label')

In [None]:
trainingData.show(5)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|Service was aweso...|    1|
|Most know I have ...|    1|
|Today's lunch was...|    1|
|Love this place! ...|    1|
|This was easily t...|    0|
+--------------------+-----+
only showing top 5 rows



In [None]:
trainingData.groupby('label').agg(count('*').alias('record_cnt')).orderBy('label', ascending=False).show()

+-----+----------+
|label|record_cnt|
+-----+----------+
|    1|      4630|
|    0|       975|
|   -1|      1417|
+-----+----------+



In [None]:
from pyspark.sql.functions import count
class_counts = trainingData.groupBy('label').agg(count('*').alias('count'))
min_count = class_counts.agg({'count': 'min'}).collect()[0][0]
balanced_train = trainingData.sampleBy('label', fractions={c: min_count / r for c, r in class_counts.collect()})

In [None]:
balanced_train.groupby('label').agg(count('*').alias('record_cnt')).orderBy('label', ascending=False).show()

+-----+----------+
|label|record_cnt|
+-----+----------+
|    1|       937|
|    0|       975|
|   -1|       977|
+-----+----------+



In [None]:
df_temp = balanced_train.select(col('text').alias('sentence'), col('label').alias('TrueLabel'))
df_temp_train, df_temp_test = df_temp.randomSplit([0.8, 0.2])

print('Sentiment: Train Test dataset balance')
df_temp_train.groupBy('TrueLabel').count().show()
df_temp_test.groupby('TrueLabel').count().show()


pipeline_model = classifier_pipeline.fit(df_temp_train)
predicts =  pipeline_model.transform(df_temp_train)

df = predicts.select('sentence', 'TrueLabel', col('class.result').getItem(0).alias('Predict')).toPandas()
df['TrueLabel'] = df['TrueLabel'].astype(str)

print('Sentiment: Confusion matrix')
print(classification_report(df.TrueLabel, df.Predict))
print('Sentiment: Accuracy')
print(accuracy_score(df.TrueLabel, df.Predict))

Sentiment: Train Test dataset balance
+---------+-----+
|TrueLabel|count|
+---------+-----+
|        1|  742|
|        0|  778|
|       -1|  778|
+---------+-----+

+---------+-----+
|TrueLabel|count|
+---------+-----+
|       -1|  199|
|        0|  197|
|        1|  195|
+---------+-----+



2023-05-11 06:22:03.427174: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:43] Reading SavedModel from: /tmp/dc3ecbfb07f4_classifier_dl7990459047020111804
2023-05-11 06:22:03.530966: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:107] Reading meta graph with tags { serve }
2023-05-11 06:22:03.531017: I external/org_tensorflow/tensorflow/cc/saved_model/reader.cc:148] Reading SavedModel debug info (if present) from: /tmp/dc3ecbfb07f4_classifier_dl7990459047020111804
2023-05-11 06:22:04.029944: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:228] Restoring SavedModel bundle.
2023-05-11 06:22:04.857264: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:212] Running initialization op on SavedModel bundle at path: /tmp/dc3ecbfb07f4_classifier_dl7990459047020111804
2023-05-11 06:22:05.035351: I external/org_tensorflow/tensorflow/cc/saved_model/loader.cc:301] SavedModel load for tags { serve }; Status: success: OK. Took 1608195 microsecon

Training started - epochs: 5 - learning_rate: 0.005 - batch_size: 64 - training_examples: 2298 - classes: 3
Epoch 1/5 - 0.52s - loss: 30.533617 - acc: 0.6618996 - batches: 36
Epoch 2/5 - 0.18s - loss: 23.198456 - acc: 0.7836053 - batches: 36
Epoch 3/5 - 0.17s - loss: 23.687298 - acc: 0.80498767 - batches: 36
Epoch 4/5 - 0.17s - loss: 22.948769 - acc: 0.8229372 - batches: 36
Epoch 5/5 - 0.17s - loss: 22.674595 - acc: 0.8345443 - batches: 36


[Stage 90:>                                                         (0 + 1) / 1]

Sentiment: Confusion matrix
              precision    recall  f1-score   support

          -1       0.79      0.92      0.85       778
           0       0.72      0.72      0.72       778
           1       0.91      0.77      0.83       742

    accuracy                           0.80      2298
   macro avg       0.81      0.80      0.80      2298
weighted avg       0.81      0.80      0.80      2298

Sentiment: Accuracy
0.799390774586597


                                                                                

In [None]:
sentence_df =  pipeline_model.transform(sentence_df)
sentence_df = sentence_df.select('*',  col('class.result').getItem(0).alias('Sentiment')).drop('document', 'sentences', 'sentence_embeddings', 'class')

In [58]:
sentence_df.show()

[Stage 215:>                                                        (0 + 1) / 1]

+--------------------+--------------------+-----+-------+----+------------------------------+-------+---------+
|           review_id|            sentence|Price|Opinion|Food|Atmosphere/DescriptionOfPlaces|Service|Sentiment|
+--------------------+--------------------+-----+-------+----+------------------------------+-------+---------+
|R6HLfj8KakZY2RUt-...|For under $170 a ...|  1.0|    0.0| 0.0|                           1.0|    0.0|        0|
|R6HLfj8KakZY2RUt-...|You could be at a...|  0.0|    0.0| 0.0|                           0.0|    0.0|       -1|
|R6HLfj8KakZY2RUt-...|The rooms are lov...|  0.0|    0.0| 0.0|                           1.0|    0.0|        1|
|R6HLfj8KakZY2RUt-...|The restaurant wa...|  0.0|    0.0| 0.0|                           0.0|    0.0|        1|
|R6HLfj8KakZY2RUt-...|       In St. Louis.|  0.0|    1.0| 0.0|                           1.0|    0.0|        1|
|R6HLfj8KakZY2RUt-...|           Who knew?|  0.0|    1.0| 0.0|                           0.0|    0.0|   

                                                                                

In [None]:
calc_table = sentence_df.select('review_id', 
                                col('Price').cast('integer'),
                                col('Opinion').cast('integer'),
                                col('Food').cast('integer'),
                                col('Atmosphere/DescriptionOfPlaces').cast('integer').alias('Atmosphere'),
                                col('Service').cast('integer'),
                                col('Sentiment').cast('integer')                                
                               ).withColumn('Sentence_num', lit(1))

In [None]:
calc_table.show()

[Stage 179:>                                                        (0 + 1) / 1]

+--------------------+-----+-------+----+----------+-------+---------+------------+
|           review_id|Price|Opinion|Food|Atmosphere|Service|Sentiment|Sentence_num|
+--------------------+-----+-------+----+----------+-------+---------+------------+
|p8ag6hXNFQntXv4IZ...|    0|      0|   0|         1|      0|       -1|           1|
|p8ag6hXNFQntXv4IZ...|    0|      1|   0|         0|      0|        1|           1|
|p8ag6hXNFQntXv4IZ...|    0|      1|   0|         0|      1|        1|           1|
|p8ag6hXNFQntXv4IZ...|    0|      0|   0|         0|      1|        1|           1|
|p8ag6hXNFQntXv4IZ...|    0|      0|   0|         1|      1|        1|           1|
|p8ag6hXNFQntXv4IZ...|    0|      0|   0|         1|      1|       -1|           1|
|p8ag6hXNFQntXv4IZ...|    0|      0|   0|         1|      1|        0|           1|
|p8ag6hXNFQntXv4IZ...|    0|      0|   0|         1|      0|       -1|           1|
|p8ag6hXNFQntXv4IZ...|    0|      0|   0|         1|      0|       -1|      

                                                                                

In [None]:
calc_table = calc_table.withColumn('Price_sentiment', calc_table['Price'] * calc_table['Sentiment'])\
                    .withColumn('Opinion_sentiment', calc_table['Opinion'] * calc_table['Sentiment'])\
                    .withColumn('Atmosphere_sentiment', calc_table['Atmosphere'] * calc_table['Sentiment'])\
                    .withColumn('Service_sentiment', calc_table['Service'] * calc_table['Sentiment'])\
                    .withColumn('Food_sentiment', calc_table['Food'] * calc_table['Sentiment'])

In [70]:
calc_table.show()

[Stage 227:>(0 + 40) / 40][Stage 228:> (4 + 4) / 40][Stage 229:> (0 + 2) / 48]

+--------------------+-----+-------+----+----------+-------+---------+------------+---------------+-----------------+--------------------+-----------------+--------------+
|           review_id|Price|Opinion|Food|Atmosphere|Service|Sentiment|Sentence_num|Price_sentiment|Opinion_sentiment|Atmosphere_sentiment|Service_sentiment|Food_sentiment|
+--------------------+-----+-------+----+----------+-------+---------+------------+---------------+-----------------+--------------------+-----------------+--------------+
|KU_O5udG6zpxOg-Vc...|    0|      1|   0|         0|      0|        0|           1|              0|                0|                   0|                0|             0|
|KU_O5udG6zpxOg-Vc...|    0|      1|   1|         0|      0|       -1|           1|              0|               -1|                   0|                0|            -1|
|KU_O5udG6zpxOg-Vc...|    0|      1|   0|         0|      1|       -1|           1|              0|               -1|                   0|  

[Stage 227:>(0 + 40) / 40][Stage 228:> (8 + 3) / 40][Stage 229:> (0 + 3) / 48]  

In [None]:
calc_table.count()

[Stage 227:>(0 + 40) / 40][Stage 228:>(11 + 2) / 40][Stage 229:> (3 + 2) / 48]

In [56]:
calc_table.show()

+--------------------+-----+-------+----+----------+-------+---------+------------+---------------+-----------------+--------------------+-----------------+--------------+
|           review_id|Price|Opinion|Food|Atmosphere|Service|Sentiment|Sentence_num|Price_sentiment|Opinion_sentiment|Atmosphere_sentiment|Service_sentiment|Food_sentiment|
+--------------------+-----+-------+----+----------+-------+---------+------------+---------------+-----------------+--------------------+-----------------+--------------+
|KU_O5udG6zpxOg-Vc...|    0|      1|   0|         0|      0|        0|           1|              0|                0|                   0|                0|             0|
|KU_O5udG6zpxOg-Vc...|    0|      1|   1|         0|      0|       -1|           1|              0|               -1|                   0|                0|            -1|
|KU_O5udG6zpxOg-Vc...|    0|      1|   0|         0|      1|       -1|           1|              0|               -1|                   0|  

                                                                                

In [None]:
%%time
calc_table.write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/calc_table", format = "parquet",mode='overwrite')

                                                                                

CPU times: user 621 ms, sys: 209 ms, total: 831 ms
Wall time: 1h 46s


In [None]:
sentence_groupby = calc_table.groupBy('review_id').agg(sum("Price").alias("Price_total"), 
                                                               sum("Opinion").alias("Opinion_total"), 
                                                               sum("Food").alias("Food_total"),
                                                               sum("Atmosphere").alias("Atmosphere_total"), 
                                                               sum("Service").alias("Service_total"), 
                                                               sum("Sentiment").alias("Sentiment_total"),
                                                               sum("Price_sentiment").alias("Price_sent_total"), 
                                                               sum("Opinion_sentiment").alias("Opinion_sent_total"), 
                                                               sum("Food_sentiment").alias("Food_sent_total"),
                                                               sum("Atmosphere_sentiment").alias("Atmosphere_sent_total"), 
                                                               sum("Service_sentiment").alias("Service_sent_total"),
                                                                sum('Sentence_num').alias('Sentence_num'))

In [76]:
sentence_groupby.show()

[Stage 221:>                (0 + 1) / 1][Stage 346:>                (0 + 1) / 1]

+--------------------+-----------+-------------+----------+----------------+-------------+---------------+----------------+------------------+---------------+---------------------+------------------+------------+
|           review_id|Price_total|Opinion_total|Food_total|Atmosphere_total|Service_total|Sentiment_total|Price_sent_total|Opinion_sent_total|Food_sent_total|Atmosphere_sent_total|Service_sent_total|Sentence_num|
+--------------------+-----------+-------------+----------+----------------+-------------+---------------+----------------+------------------+---------------+---------------------+------------------+------------+
|iwDDm0Rrpe1Ua7O-N...|          2|            4|         0|               3|            4|              7|               0|                 4|              0|                    1|                 4|           9|
|pt7fin3p2m_mlEU0k...|          0|            2|         6|               0|            1|              2|               0|                 1|      

[Stage 221:>                                                        (0 + 1) / 1]

In [None]:
sentence_groupby.show()



+--------------------+-----------+-------------+----------+----------------+-------------+---------------+----------------+------------------+---------------+---------------------+------------------+------------+
|           review_id|Price_total|Opinion_total|Food_total|Atmosphere_total|Service_total|Sentiment_total|Price_sent_total|Opinion_sent_total|Food_sent_total|Atmosphere_sent_total|Service_sent_total|Sentence_num|
+--------------------+-----------+-------------+----------+----------------+-------------+---------------+----------------+------------------+---------------+---------------------+------------------+------------+
|pKqLbblHv_YobnSXZ...|          1|            4|         0|               1|            3|              2|              -1|                 2|              0|                    1|                 1|           6|
|W7DE_3jiFpccb6_2h...|          4|           22|         6|              15|            5|            -18|              -4|               -11|      

                                                                                

In [40]:
sentence_groupby = sentence_groupby.select('review_id','Price_total', 'Opinion_total', 'Food_total', 'Atmosphere_total', 'Service_total',
                                          (col('Sentiment_total') / col('Sentence_num')).alias('Total_sentscore'),
                                          (col('Price_sent_total') / col('Sentence_num')).alias('Price_sentscore'),
                                          (col('Opinion_sent_total') / col('Sentence_num')).alias('Opinion_sentscore'),   
                                          (col('Food_sent_total') / col('Sentence_num')).alias('Food_sentscore'),
                                          (col('Atmosphere_sent_total') / col('Sentence_num')).alias('Atmosphere_sentscore'),
                                          (col('Service_sent_total') / col('Sentence_num')).alias('Service_sentscore'), 'Sentence_num')

In [None]:
%%time
sentence_groupby.show()



+--------------------+-----------+-------------+----------+----------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+
|           review_id|Price_total|Opinion_total|Food_total|Atmosphere_total|Service_total|     Total_sentscore|     Price_sentscore|   Opinion_sentscore|      Food_sentscore|Atmosphere_sentscore|   Service_sentscore|Sentence_num|
+--------------------+-----------+-------------+----------+----------------+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+
|w-0SvNfYJPmrg7phC...|          0|            3|         2|               0|            0| -0.3333333333333333|                 0.0| -0.3333333333333333|                 0.0|                 0.0|                 0.0|           6|
|w7jXrIRAEt9GLGK8u...|          0|            3|         1|               1|    

                                                                                

In [None]:
%%time
sentence_groupby.coalesce(1).write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/sentence_groupby_finalv2", format = "parquet",mode='overwrite')

                                                                                

CPU times: user 711 ms, sys: 235 ms, total: 946 ms
Wall time: 1h 15min 42s


In [35]:
sentence_groupby = spark.read.option('multiline','true').option("quote", "\"").option('escape','\"').option('ignoreLeadingWhiteSpace', 'true').option('header', True).option('escapeQuotes', 'true').\
parquet('gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/aspect_sentiment')

In [52]:
review_idunique = review.select('review_id').distinct()

In [53]:
sentence_groupby = sentence_groupby.join(review_idunique, 'review_id', 'inner')

In [54]:
sentence_groupby.coalesce(1).write.save("gs://msca-bdp-student-gcs/GroupProject_Gr7/yelp_dataset/engineered_data/aspect_sentiment", format = "parquet",mode='overwrite')

                                                                                