In [1]:
# Point this process at the Java 11 installation through the JAVA_HOME
# environment variable; this runs before the Spark session is created below.
# NOTE(review): absolute, machine-specific path - adjust on other hosts.
import os
JAVA_HOME = '/home/nlplab/.jdk/jdk-11.0.19+7'
os.environ['JAVA_HOME'] = JAVA_HOME

# Import the necessary modules
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf, col, lower, regexp_replace, when
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml import Pipeline, PipelineModel
from nltk.corpus import stopwords

# Start (or reuse) a Spark session named "SentimentAnalysisTFIDF" with the
# master set to local[*]. Driver, executor and off-heap memory are each
# configured to 100g, with off-heap memory enabled.
spark = (
    SparkSession.builder
    .appName("SentimentAnalysisTFIDF")
    .master("local[*]")
    .config("spark.driver.memory", "100g")
    .config("spark.executor.memory", "100g")
    .config("spark.memory.offHeap.enabled","true")
    .config("spark.memory.offHeap.size","100g")
    .getOrCreate()
)



In [2]:
# Load the review dataset from 'test.csv'.
# Only two columns are kept: 'review/score' (the rating, which is mapped to a
# binary label at the end of the chain) and 'review/text' (the review body).
print('READ DATASET...')
data = (
    spark.read.csv(
        'test.csv',
        inferSchema=True,
        header=True,
        multiLine=True,
        quote='"',
        escape='"',
    )
    .select(
        'review/score',
        # Remove every character that is not an ASCII letter or whitespace,
        # then lower-case what is left.
        lower(regexp_replace('review/text', "[^a-zA-Z\\s]", "")).alias('review/text'),
    )
    # Discard rows where either selected column is null.
    .dropna()
    # Binarise the score in one pass: 1, 2, 3 -> 0 and 4, 5 -> 1.
    .replace({1: 0, 2: 0, 3: 0, 4: 1, 5: 1}, subset=["review/score"])
)

READ DATASET...


In [3]:
# Text pre-processing pipeline: split 'review/text' into tokens, then filter
# the tokens against NLTK's English stop-word list into the 'words' column.
# NOTE(review): 'review/text' has had every non-letter character stripped
# when the data was loaded, so stop words that contain an apostrophe
# presumably never match their stripped form in the text - confirm whether
# the stop-word list should be normalised the same way.
tokenizer = Tokenizer(inputCol="review/text", outputCol="tokens")
remover = StopWordsRemover(stopWords=stopwords.words('english'), inputCol="tokens", outputCol="words")

# Keep the unfitted pipeline under its own name so `pipeline` is only ever
# bound to one kind of object.
preprocess_pipeline = Pipeline(stages=[tokenizer, remover])

# `pipeline` is the fitted PipelineModel, exactly as it was at the end of the
# original cell.
pipeline = preprocess_pipeline.fit(data)

# A plain .save() raises when the target directory already exists, which made
# this cell fail on every run after the first. Overwrite explicitly so the
# notebook survives Restart & Run All.
pipeline.write().overwrite().save('pipelines/preprocess/tokenizer_stopwordremover')