In [1]:
import torch

if torch.cuda.is_available():
    print("CUDA is available. GPU can be used.")
    print("Version: ", torch.version.cuda)
    print("Number of GPUs available: ", torch.cuda.device_count())
else:
    print("CUDA is not available. Using CPU instead.")

CUDA is available. GPU can be used.
Version:  11.7
Number of GPUs available:  4


In [2]:
import pandas as pd
import numpy as np
import time
from pyspark.sql.functions import when, rand
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [3]:
import sparknlp
import findspark as fs
fs.init('/home/jdu5sq/spark-3.4.1-bin-hadoop3')
fs.find()

'/home/jdu5sq/spark-3.4.1-bin-hadoop3'

In [4]:
data_path = "/home/jdu5sq/Documents/MSDS/DS5110/Project/"

In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("GPU Spark NLP") \
    .master("local[*]") \
    .config("spark.driver.memory", "16G") \
    .config("spark.executor.memory", "12G") \
    .config("spark.executor.instances", "4") \
    .config("spark.task.cpus", "1") \
    .config("spark.task.resource.gpu.amount", "0.5") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.resource.gpu.amount", "1") \
    .config("spark.executor.resource.gpu.discoveryScript", data_path+"/getGpusResources.sh") \
    .config("spark.driver.resource.gpu.amount", "1") \
    .config("spark.driver.resource.gpu.discoveryScript", data_path+"/getGpusResources.sh") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.0,com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.2.0") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.kryoserializer.buffer.max", "2000M") \
    .config("spark.driver.maxResultSize", "0") \
    .getOrCreate()

print("Spark initialised.")

:: loading settings :: url = jar:file:/sfs/qumulo/qhome/jdu5sq/spark-3.4.1-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jdu5sq/.ivy2/cache
The jars for the packages stored in: /home/jdu5sq/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
com.johnsnowlabs.nlp#spark-nlp-gpu_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6d31f7e8-b2fa-492e-9a86-51ebd8e30b9a;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.2.0 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.uni

Spark initialised.


In [6]:
print("Starting dataset making...")

schema = StructType([
    StructField("label", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("text", StringType(), True)
])

trainDataset = spark.read \
    .option("header", False) \
    .schema(schema) \
    .csv("debugger_train.csv")

print("Finished getting dataset.")

Starting dataset making...
Finished getting dataset.


In [7]:
debugDataset = trainDataset.withColumn("label", when(trainDataset["label"] == 2, 1).otherwise(0))

In [8]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import (
    UniversalSentenceEncoder,
    SentimentDLApproach
)

In [9]:
# spark.sparkContext.setLogLevel("DEBUG")

In [10]:
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")
print("documentAssembler finished!")

useEmbeddings = UniversalSentenceEncoder.pretrained() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence_embeddings")
print("useEmbeddings finished!")

sentimentdl = SentimentDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("sentiment") \
    .setLabelColumn("label") \
    .setbatchSize(32) \
    .setlr(1e-3) \
    .setMaxEpochs(5) \
    .setEnableOutputLogs(True)
print("sentimentdl finished!")

documentAssembler finished!
tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ]tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
Download done! Loading the resource.
[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ / ]

[Stage 0:>                                                          (0 + 0) / 1]

[ | ]

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/jdu5sq/.local/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jdu5sq/.local/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/apps/software/standard/core/jupyterlab/3.6.3-py3.11/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


[OK!]


KeyboardInterrupt: 

In [None]:
pipeline = Pipeline() \
    .setStages(
      [
        documentAssembler,
        useEmbeddings,
        sentimentdl
      ]
    )

In [None]:
# Start the timer
start_time = time.time()

pipelineModel = pipeline.fit(debugDataset)

print("Model fitted.")

# End the timer
end_time = time.time()

# Calculate the total time taken
total_time = end_time - start_time
print(f"Total execution time: {total_time} seconds")

# cat ~/annotator_logs/SentimentDLApproach_12faa854e3b3.log

print("Starting logs.")

!cat ~/annotator_logs/SentimentDLApproach_12faa854e3b3.log

In [None]:
spark.stop()