In [1]:
import torch

if torch.cuda.is_available():
    print("CUDA is available. GPU can be used.")
    print("Version: ", torch.version.cuda)
    print("Number of GPUs available: ", torch.cuda.device_count())
else:
    print("CUDA is not available. Using CPU instead.")

CUDA is available. GPU can be used.
Version:  12.1
Number of GPUs available:  1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# This is only to setup PySpark and Spark NLP on Colab
# !wget https://setup.johnsnowlabs.com/colab.sh -O - | bash
!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash -s -- -g

--2024-04-15 09:46:02--  https://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2024-04-15 09:46:02--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1191 (1.2K) [text/plain]
Saving to: ‘STDOUT’


2024-04-15 09:46:03 (120 MB/s) - written to stdout [1191/1191]

Installing PySpark 3.2.3 and Spark NLP 5.3.3
setup Colab for PySpark 3.2.3 and Spark NLP 

In [4]:
import sparknlp
import pyspark.sql.functions as F
spark = sparknlp.start(gpu=True)

print("\nSpark NLP version: {}".format(sparknlp.version()))
print("\nApache Spark version: {}".format(spark.version))


Spark NLP version: 5.3.3

Apache Spark version: 3.2.3


In [5]:
from sparknlp.pretrained import ResourceDownloader

ResourceDownloader.showPublicModels(annotator = "SentimentDLModel", lang = "en")

+-------------------------+------+---------+
| Model                   | lang | version |
+-------------------------+------+---------+
| sentimentdl_glove_imdb  |  en  | 2.5.0   |
| sentimentdl_use_twitter |  en  | 2.5.0   |
| sentimentdl_use_imdb    |  en  | 2.5.0   |
| sentimentdl_glove_imdb  |  en  | 2.7.1   |
| sentimentdl_use_imdb    |  en  | 2.7.0   |
| sentimentdl_use_twitter |  en  | 2.7.1   |
+-------------------------+------+---------+



In [6]:
testDataset = spark.read \
    .json("/content/drive/My Drive/Colab Notebooks/All_Beauty_5.json")

In [7]:
testDataset = testDataset.withColumnRenamed("reviewText", "text")
testDataset = testDataset.withColumnRenamed("overall", "label")
testDataset = testDataset.withColumn("label", F.when(testDataset["label"] >=3, 1.0).otherwise(0.0))
testDataset = testDataset.select("label", "text", "reviewerID")

In [8]:
testDataset.show(5)

+-----+--------------------+--------------+
|label|                text|    reviewerID|
+-----+--------------------+--------------+
|  1.0|As advertised. Re...|A3CIUOJXQ5VDQ2|
|  1.0|Like the oder and...|A3H7T87S984REU|
|  0.0|I bought this to ...|A3J034YH7UG4KT|
|  1.0|HEY!! I am an Aqu...|A2UEO5XR3598GI|
|  1.0|If you ever want ...|A3SFRT223XXWF7|
+-----+--------------------+--------------+
only showing top 5 rows



In [9]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import (
    UniversalSentenceEncoder,
    SentimentDLModel
)

documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
print("\ndocumentAssembler finished!")

use = UniversalSentenceEncoder.pretrained("tfhub_use", "en")\
    .setInputCols(["document"])\
    .setOutputCol("sentence_embeddings")
print("\nuseEmbeddings finished!")

sentimentdl = SentimentDLModel.pretrained("sentimentdl_use_twitter", "en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("prediction")
print("\nsentimentdl finished!")

pipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])


documentAssembler finished!
tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]

useEmbeddings finished!
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[OK!]

sentimentdl finished!


In [10]:
import time

In [11]:
# Start the timer
empty_df = spark.createDataFrame([['']]).toDF("text")
print("\nFitting model...")

start_time = time.time()
pipelineModel = pipeline.fit(empty_df)
# End the timer
end_time = time.time()

print("\nModel fitted.")

# Calculate the total time taken
total_time = end_time - start_time
print(f"\nTotal execution time: {total_time} seconds")

# Start the timer
print("\nTesting model...")
start_time = time.time()

preds = pipelineModel.transform(testDataset)
# End the timer
end_time = time.time()

print("\nTesting finished.")

# Calculate the total time taken
total_time = end_time - start_time
print(f"\nTotal execution time: {total_time} seconds")


Fitting model...

Model fitted.

Total execution time: 0.0015628337860107422 seconds

Testing model...

Testing finished.

Total execution time: 0.37982821464538574 seconds


In [12]:
df1 = preds.select("label", "text")
df2 = preds.select(
    F.explode(
      F.arrays_zip(
        preds.document.result,
        preds.prediction.result)).alias("cols")
).select(
    F.expr("cols['0']").alias("text"),
    F.expr("cols['1']").alias("prediction")
)
df2 = df2.withColumn(
    "prediction",
    F.when((df2["prediction"] == "positive") | (df2["prediction"] == "neutral"), 1.0).otherwise(0.0)
)
merge_df = df1.join(df2, (df1.text == df2.text), "inner").drop(df2["text"])

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import Row

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(merge_df)

# Precision, Recall, and F1-Score
precision = evaluator.evaluate(merge_df, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(merge_df, {evaluator.metricName: "recallByLabel"})
f1 = evaluator.evaluate(merge_df, {evaluator.metricName: "f1"})

# Create a DataFrame with the metrics
metrics_df = spark.createDataFrame([
    Row(metric="\nAccuracy", score=accuracy),
    Row(metric="\nPrecision", score=precision),
    Row(metric="\nRecall", score=recall),
    Row(metric="\nF1 Score", score=f1)
])

In [14]:
print("Metrics for Pretrained Pipline\nTwitter Tweets")
metrics_df.show()

Metrics for Pretrained Pipline
Twitter Tweets
+-----------+-------------------+
|     metric|              score|
+-----------+-------------------+
| \nAccuracy| 0.7938681814644775|
|\nPrecision|0.11605481306039588|
|   \nRecall| 0.9038208168642952|
| \nF1 Score| 0.8616076446639912|
+-----------+-------------------+



In [15]:
sentimentdl = SentimentDLModel.pretrained("sentimentdl_use_imdb", "en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("prediction")
print("\nsentimentdl finished!")

pipeline = Pipeline(
      stages = [
          documentAssembler,
          use,
          sentimentdl
      ])

sentimentdl_use_imdb download started this may take some time.
Approximate size to download 12 MB
[OK!]

sentimentdl finished!


In [16]:
# Start the timer
empty_df = spark.createDataFrame([['']]).toDF("text")
print("\nFitting model...")

start_time = time.time()
pipelineModel = pipeline.fit(empty_df)
# End the timer
end_time = time.time()

print("\nModel fitted.")

# Calculate the total time taken
total_time = end_time - start_time
print(f"\nTotal execution time: {total_time} seconds")

# Start the timer
print("\nTesting model...")
start_time = time.time()

preds = pipelineModel.transform(testDataset)
# End the timer
end_time = time.time()

print("\nTesting finished.")

# Calculate the total time taken
total_time = end_time - start_time
print(f"\nTotal execution time: {total_time} seconds")


Fitting model...

Model fitted.

Total execution time: 0.0003399848937988281 seconds

Testing model...

Testing finished.

Total execution time: 0.10997390747070312 seconds


In [17]:
df1 = preds.select("label", "text")
df2 = preds.select(
    F.explode(
      F.arrays_zip(
        preds.document.result,
        preds.prediction.result)).alias("cols")
).select(
    F.expr("cols['0']").alias("text"),
    F.expr("cols['1']").alias("prediction")
)
df2 = df2.withColumn(
    "prediction",
    F.when((df2["prediction"] == "positive") | (df2["prediction"] == "neutral"), 1.0).otherwise(0.0)
)
merge_df = df1.join(df2, (df1.text == df2.text), "inner").drop(df2["text"])

In [18]:
accuracy = evaluator.evaluate(merge_df)
precision = evaluator.evaluate(merge_df, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(merge_df, {evaluator.metricName: "recallByLabel"})
f1 = evaluator.evaluate(merge_df, {evaluator.metricName: "f1"})

metrics_df = spark.createDataFrame([
    Row(metric="\nAccuracy", score=accuracy),
    Row(metric="\nPrecision", score=precision),
    Row(metric="\nRecall", score=recall),
    Row(metric="\nF1 Score", score=f1)
])

In [19]:
print("Metrics for Pretrained Pipline\nIMDB Reviews (w/o Glove)")
metrics_df.show()

Metrics for Pretrained Pipline
IMDB Reviews (w/o Glove)
+-----------+--------------------+
|     metric|               score|
+-----------+--------------------+
| \nAccuracy|0.033460431094856434|
|\nPrecision|0.029610531661393023|
|   \nRecall|  0.9986824769433466|
| \nF1 Score|0.009602969181568641|
+-----------+--------------------+



In [20]:
from sparknlp.annotator import (
    SentenceDetector,
    Tokenizer,
    WordEmbeddingsModel,
    SentenceEmbeddings
)

documentAssembler = DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")
print("\ndocumentAssembler finished!")

sentencer = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")
print("\nsentencer finished!")

tokenizer = Tokenizer() \
            .setInputCols(["sentence"]) \
            .setOutputCol("tokens")
print("\ntokenizer finished!")

use = WordEmbeddingsModel.pretrained("glove_100d")\
    .setInputCols(['document','tokens'])\
    .setOutputCol('word_embeddings')
print("\nWordEmbeddingsModel finished!")

sentence_embeddings = SentenceEmbeddings() \
    .setInputCols(["document", "word_embeddings"]) \
    .setOutputCol("sentence_embeddings") \
    .setPoolingStrategy("AVERAGE")
print("\nsentence_embeddings finished!")

sentimentdl = SentimentDLModel.pretrained("sentimentdl_glove_imdb", "en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("prediction")
print("\nsentimentdl finished!")

pipeline = Pipeline(
      stages = [
          documentAssembler,
          sentencer,
          tokenizer,
          use,
          sentence_embeddings,
          sentimentdl
      ])


documentAssembler finished!

sentencer finished!

tokenizer finished!
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]

WordEmbeddingsModel finished!

sentence_embeddings finished!
sentimentdl_glove_imdb download started this may take some time.
Approximate size to download 8.7 MB
[OK!]

sentimentdl finished!


In [21]:
# Start the timer
empty_df = spark.createDataFrame([['']]).toDF("text")
print("\nFitting model...")

start_time = time.time()
pipelineModel = pipeline.fit(empty_df)
# End the timer
end_time = time.time()

print("\nModel fitted.")

# Calculate the total time taken
total_time = end_time - start_time
print(f"\nTotal execution time: {total_time} seconds")

# Start the timer
print("\nTesting model...")
start_time = time.time()

preds = pipelineModel.transform(testDataset)
# End the timer
end_time = time.time()

print("\nTesting finished.")

# Calculate the total time taken
total_time = end_time - start_time
print(f"\nTotal execution time: {total_time} seconds")


Fitting model...

Model fitted.

Total execution time: 0.09493184089660645 seconds

Testing model...

Testing finished.

Total execution time: 0.24004030227661133 seconds


In [22]:
df1 = preds.select("label", "text")
df2 = preds.select(
    F.explode(
      F.arrays_zip(
        preds.document.result,
        preds.prediction.result)).alias("cols")
).select(
    F.expr("cols['0']").alias("text"),
    F.expr("cols['1']").alias("prediction")
)
df2 = df2.withColumn(
    "prediction",
    F.when((df2["prediction"] == "positive") | (df2["prediction"] == "neutral"), 1.0).otherwise(0.0)
)
merge_df = df1.join(df2, (df1.text == df2.text), "inner").drop(df2["text"])

In [23]:
accuracy = evaluator.evaluate(merge_df)
precision = evaluator.evaluate(merge_df, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(merge_df, {evaluator.metricName: "recallByLabel"})
f1 = evaluator.evaluate(merge_df, {evaluator.metricName: "f1"})

metrics_df = spark.createDataFrame([
    Row(metric="Accuracy", score=accuracy),
    Row(metric="Precision", score=precision),
    Row(metric="Recall", score=recall),
    Row(metric="F1 Score", score=f1)
])

In [24]:
print("Metrics for Pretrained Pipline\nIMDB Reviews (w/ Glove)")
metrics_df.show()

Metrics for Pretrained Pipline
IMDB Reviews (w/ Glove)
+---------+--------------------+
|   metric|               score|
+---------+--------------------+
| Accuracy|  0.0496459419500428|
|Precision|0.030136986301369864|
|   Recall|                 1.0|
| F1 Score| 0.04114126508468206|
+---------+--------------------+



In [25]:
# Stop the Spark session
spark.stop()