In [1]:
import torch

# Report whether PyTorch can reach a CUDA runtime, along with the CUDA version
# it was built against and the number of visible devices.
if not torch.cuda.is_available():
    print("CUDA is not available. Using CPU instead.")
else:
    print("CUDA is available. GPU can be used.")
    print("Version: ", torch.version.cuda)
    print("Number of GPUs available: ", torch.cuda.device_count())

CUDA is available. GPU can be used.
Version:  11.7
Number of GPUs available:  4


In [2]:
import pandas as pd
import numpy as np
import time
from pyspark.sql.functions import when, rand
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [3]:
import sparknlp
import findspark as fs
# Register the local Spark 3.4.1 distribution with findspark. `fs.find()` is
# kept as the last expression so the resolved Spark home is shown as the
# cell's output.
fs.init(spark_home='/home/jdu5sq/spark-3.4.1-bin-hadoop3')
fs.find()

'/home/jdu5sq/spark-3.4.1-bin-hadoop3'

In [4]:
# Absolute project directory (note the trailing slash). The Spark session
# configuration looks for the GPU discovery script getGpusResources.sh under it.
data_path = "/home/jdu5sq/Documents/MSDS/DS5110/Project/"

In [5]:
from pyspark.sql import SparkSession

def start_spark_session():
    """Create (or return) the local Spark session used for GPU Spark NLP training.

    The session is built by hand so the GPU scheduling options can be set.
    ``spark.jars.packages`` is a launch-time setting, so the Spark NLP GPU
    artifact is requested here: a later ``sparknlp.start(gpu=True)`` only gets
    this already-running session back and cannot add the jar any more.

    Reads the module-level ``data_path`` to locate ``getGpusResources.sh``.

    Returns:
        SparkSession: the active session.
    """
    # Tolerate a trailing slash on data_path instead of emitting "dir//script".
    gpu_discovery_script = data_path.rstrip("/") + "/getGpusResources.sh"
    # Keep the JVM artifact in lock-step with the installed Python package.
    spark_nlp_gpu_package = (
        f"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:{sparknlp.version()}"
    )

    builder = (
        SparkSession.builder
        .appName("GPU Spark NLP")
        .master("local[10]")
        .config("spark.driver.memory", "16G")
        .config("spark.executor.memory", "12G")
        .config("spark.executor.instances", "4")
        .config("spark.task.cpus", "1")
        .config("spark.task.resource.gpu.amount", "0.25")
        .config("spark.executor.resource.gpu.amount", "1")
        .config("spark.executor.resource.gpu.discoveryScript", gpu_discovery_script)
        .config("spark.driver.resource.gpu.amount", "1")
        .config("spark.driver.resource.gpu.discoveryScript", gpu_discovery_script)
        # Settings the Spark NLP docs list for manually built sessions.
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "2000M")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.jars.packages", spark_nlp_gpu_package)
    )
    return builder.getOrCreate()

# Build the custom GPU-configured session first.
spark = start_spark_session()
# NOTE(review): a session already exists at this point, so this call gets that
# session back instead of launching a new one (see the "Using an existing Spark
# session" warning in this cell's output); only runtime SQL configurations from
# sparknlp.start can still take effect.
sparknlp.start(gpu=True)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/12 05:19:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable




24/04/12 05:19:45 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [6]:
print("Starting dataset making...")

# Column names and types are declared explicitly (the file is read with
# header=False, so its first line is treated as data): an integer label
# followed by two string columns, all nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("label", IntegerType()),
        ("title", StringType()),
        ("text", StringType()),
    )
])

trainDataset = spark.read.csv("debugger_train.csv", schema=schema, header=False)

print("Finished getting dataset.")

Starting dataset making...
Finished getting dataset.


In [7]:
# Binarise the label column: rows whose label equals 2 become 1, every other
# row becomes 0. This rebinds `trainDataset`, so running the cell a second time
# would map every label to 0.
trainDataset = trainDataset.withColumn(
    "label",
    when(trainDataset.label == 2, 1).otherwise(0),
)
# debugDataset = trainDataset.orderBy(rand()).limit(100)

In [None]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import (
    UniversalSentenceEncoder,
    SentimentDLApproach
)

In [None]:
# Stage 1: wrap the raw "text" column into Spark NLP document annotations.
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

# Stage 2: sentence-level embeddings from the default pretrained
# Universal Sentence Encoder.
useEmbeddings = UniversalSentenceEncoder.pretrained() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence_embeddings")

# Stage 3: trainable sentiment classifier on top of the embeddings.
# The setters are camel-cased `setBatchSize` / `setLr`; the previous
# `setbatchSize` / `setlr` spellings do not exist and raised AttributeError.
sentimentdl = SentimentDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("sentiment") \
    .setLabelColumn("label") \
    .setBatchSize(32) \
    .setLr(1e-3) \
    .setMaxEpochs(5) \
    .setEnableOutputLogs(True)

# Chain the three stages in the order they consume each other's output columns.
pipeline = Pipeline() \
    .setStages(
      [
        documentAssembler,
        useEmbeddings,
        sentimentdl
      ]
    )

In [None]:
# Start the timer
start_time = time.time()

pipelineModel = pipeline.fit(debugDataset)

print("Model fitted.")

# End the timer
end_time = time.time()

# Calculate the total time taken
total_time = end_time - start_time
print(f"Total execution time: {total_time} seconds")

# cat ~/annotator_logs/SentimentDLApproach_12faa854e3b3.log

print("Starting logs.")

!cat ~/annotator_logs/SentimentDLApproach_12faa854e3b3.log

In [None]:
# Stop the Spark session now that the run is finished.
spark.stop()