In [1]:
# Environment setup: select the Java 11 runtime and the Spark installation.
# Both variables are overwritten unconditionally, before findspark is
# initialised.
import os

SPARK_HOME = "/opt/spark"
os.environ.update({
    "JAVA_HOME": "/home/team1/.jdk/jdk-11.0.19+7",
    "SPARK_HOME": SPARK_HOME,
})

# Initialise findspark against the same Spark installation.
import findspark
findspark.init(SPARK_HOME)

# Import the necessary modules
from pyspark.sql import SparkSession
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import udf, col, lower, regexp_replace, when
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover, NGram, CountVectorizer, VectorAssembler
from pyspark.ml.classification import NaiveBayes,LogisticRegression, LinearSVC, MultilayerPerceptronClassifier
from nltk.stem.snowball import SnowballStemmer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, PipelineModel

# Build (or reuse) the Spark session, running locally on all available cores.
# NOTE(review): driver, executor and off-heap memory are each set to 100g;
# confirm the host actually has that much memory available.
spark_settings = {
    "spark.driver.memory": "100g",
    "spark.executor.memory": "100g",
    "spark.memory.offHeap.enabled": "true",
    "spark.memory.offHeap.size": "100g",
}

session_builder = SparkSession.builder.appName("SentimentAnalysisTFIDF").master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [2]:
# Load the review dataset and turn the rating into a binary label.
# Only two columns are used: 'review/score' (expected to be a rating from
# 1 to 5) and 'review/text' (the review body).
print('READ DATASET...')
data = spark.read.csv('part2_900k.csv', inferSchema=True, header=True, multiLine=True, quote='"', escape='"')

# Remove every character that is not an ASCII letter or whitespace, then
# lowercase what is left. The cleaned text keeps the original column name.
data = data.select(
    'review/score',
    lower(regexp_replace('review/text', "[^a-zA-Z\\s]", "")).alias('review/text'),
)

# Drop rows where either selected column is null.
data = data.dropna()

# Collapse the five ratings into two labels with a single replace call:
# 1, 2, 3 -> 0 and 4, 5 -> 1. One mapping avoids stacking five separate
# replace steps and makes the whole mapping visible at a glance.
# NOTE(review): this assumes inferSchema reads 'review/score' as a numeric
# column; confirm with data.printSchema().
data = data.replace({1: 0, 2: 0, 3: 0, 4: 1, 5: 1}, subset=["review/score"])

READ DATASET...


In [3]:
# Debug check: run `which python` in a shell to print the path of the
# `python` executable it resolves.
!which python

/workspace/nlplab/Bigdata/venv/bin/python


In [3]:
# Apply the previously saved preprocessing pipeline to the cleaned reviews.
print('PREPROCESSING...')

# Load the already-fitted pipeline from disk. Judging by its path it holds the
# tokenizer and stop-word remover stages, so nothing is fitted in this cell.
preprocess = PipelineModel.load('pipelines/preprocess/tokenizer_stopwordremover')

# NOTE(review): `data` is rebound to the transformed frame, so this cell is
# meant to run once per kernel; running it again would feed the already
# transformed frame back into the pipeline.
data = preprocess.transform(data)

# The train/test split lives in its own cell right after this one.

PREPROCESSING...


In [4]:
# Randomly partition the preprocessed data into train and test sets using
# weights 0.9 / 0.1; the fixed seed is passed so the split can be repeated.
split_weights = [0.9, 0.1]
train, test = data.randomSplit(split_weights, seed=42)

In [None]:
# Fit and persist the n-gram -> term-frequency -> IDF feature pipeline for a
# single n-gram size.
NGRAM = 2

# Per-size settings: both lists are indexed with (n-gram size - 1).
VOCABSIZE = [2**13, 2**12, 2**10]   # CountVectorizer vocabulary size
MINDOCFRED = [50, 5, 5]             # minDocFreq passed to IDF (existing name kept)

# Take the n-gram size from NGRAM so there is a single value to change.
i = NGRAM

# Column names derived from the n-gram size; each stage reads the previous
# stage's output column.
grams_col = "{0}_grams".format(i)
tf_col = "{0}_tf".format(i)
tfidf_col = "{0}_tfidf".format(i)

# NOTE(review): "words" is presumably the token column produced by the
# preprocessing pipeline; confirm against its output schema.
ngrams = NGram(n=i, inputCol="words", outputCol=grams_col)

cv = CountVectorizer(vocabSize=VOCABSIZE[i-1], inputCol=grams_col, outputCol=tf_col)

idf = IDF(inputCol=tf_col, outputCol=tfidf_col, minDocFreq=MINDOCFRED[i-1])

# Fit on the training split only; `pipeline` ends up bound to the fitted model.
pipeline = Pipeline(stages=[ngrams, cv, idf]).fit(train)

# write().overwrite() replaces an existing saved model, so re-running this
# cell no longer fails (after the expensive fit) because the directory exists.
pipeline.write().overwrite().save('./pipelines/preprocess/{0}-gram_idf'.format(i))