In [None]:
import pandas as pd
import numpy as np
import time
from pyspark.sql.functions import when

In [None]:
import findspark as fs

# Point findspark at the local Spark distribution so that pyspark becomes
# importable in this kernel, then display the Spark home that is in effect.
fs.init(spark_home='/home/jdu5sq/spark-3.4.1-bin-hadoop3')
fs.find()

In [None]:
import sparknlp

# Spark settings handed to the Spark NLP session builder.
params = {
    "spark.driver.cores": "4",
    "spark.driver.memory": "8G",
    "spark.executor.memory": "8G",
    "spark.master": "local[4]",
}

# Start (or reuse) a Spark session with Spark NLP loaded, GPU mode requested.
spark = sparknlp.start(params=params, gpu=True)

In [None]:
# Read the training CSV; the file is loaded without treating the first row
# as a header, so column names are assigned separately afterwards.
trainDataset = spark.read.csv("train.csv", header=False)

In [None]:
header_names = ["label", "title", "text"]

# This cell overwrites `trainDataset`, so it must be safe to run more than
# once: a second pass would compare the already-binarised labels (0/1) with 2
# and silently turn every label into 0. Only rename and binarise while the
# frame does not yet carry the final column names.
if trainDataset.columns != header_names:
    # Give the header-less CSV columns meaningful names.
    trainDataset = trainDataset.toDF(*header_names)
    # Collapse the raw label to a binary target: 2 -> 1, anything else -> 0.
    trainDataset = trainDataset.withColumn(
        "label",
        when(trainDataset["label"] == 2, 1).otherwise(0),
    )

In [None]:
# Quick sanity check: print the first five rows of the prepared training set.
trainDataset.show(n=5)

In [None]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import (
    UniversalSentenceEncoder,
    SentimentDLApproach
)

In [None]:
%%time
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

useEmbeddings = UniversalSentenceEncoder.pretrained() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence_embeddings")

sentimentdl = SentimentDLApproach() \
    .setInputCols(["sentence_embeddings"]) \
    .setOutputCol("sentiment") \
    .setLabelColumn("label") \
    .setMaxEpochs(5) \
    .setEnableOutputLogs(True)

pipeline = Pipeline() \
    .setStages(
      [
        documentAssembler,
        useEmbeddings,
        sentimentdl
      ]
    )

pipelineModel = pipeline.fit(trainDataset)

!cat ~/annotator_logs/SentimentDLApproach_12faa854e3b3.log

In [None]:
# Shut down the Spark session started earlier in this notebook; any further
# Spark work requires starting a new session first.
spark.stop()