# **II. Text Pre-processing**

For sentiment analysis, we first need to pre-process the review text.

**STEP 1. Remove punctuation, digits, and tab/line-break characters**

In [None]:
def remove_punct(text):
    """Replace punctuation, digits, CR, tab and newline characters with spaces.

    Every matched character is swapped for exactly one space, so the result
    has the same length as ``text`` and may contain runs of spaces.
    """
    # Character class: all of string.punctuation (escaped) plus 0-9, \r, \t, \n
    char_class = '[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]'
    return re.sub(char_class, " ", text)

**STEP 2. Classify reviews as positive or negative sentiment: a star rating of 3 or higher is labeled 1 (positive), anything lower is labeled 0 (negative).**

In [None]:
def convert_rating(rating):
    """Map a star rating to a binary sentiment label.

    Args:
        rating: Star rating as a number or a string that ``int()`` accepts.
            A float is truncated toward zero by ``int()`` before comparing.

    Returns:
        1 if the integer rating is 3 or higher, otherwise 0.

    Raises:
        TypeError, ValueError: If ``rating`` cannot be converted by ``int()``.
    """
    # Compare the converted value rather than the raw argument, so a numeric
    # string such as "4" is handled instead of failing on str >= int.
    star_rating = int(rating)
    return 1 if star_rating >= 3 else 0

# Wrap the plain-Python helpers as Spark UDFs so they can be applied to DataFrame columns
pun_remove = udf(lambda x: remove_punct(x))
convert = udf(lambda x: convert_rating(x))

# Clean the review text and derive the sentiment label, capped at 1,000,000 rows.
# Each derived column is aliased explicitly rather than being looked up afterwards
# under Spark's auto-generated "<lambda>(...)" column name, which ties the code to
# how the UDF happens to be defined.
processed_review = df_review.select(
    'review_id',
    pun_remove('text').alias('text'),
    # udf() with no explicit return type produces strings, so cast to get an integer label
    convert('stars').cast(IntegerType()).alias('label'),
).limit(1000000)

In [None]:
# Preview the first 5 processed reviews (review_id, cleaned text, label)
processed_review.show(n=5)

+--------------------+--------------------+-----+
|           review_id|                text|label|
+--------------------+--------------------+-----+
|KU_O5udG6zpxOg-Vc...|If you decide to ...|    1|
|BiTunyQ73aT9WBnpR...|I ve taken a lot ...|    1|
|saUsX_uimxRlCVr67...|Family diner  Had...|    1|
|AqPFMleE6RsU23_au...|Wow   Yummy  diff...|    1|
|Sx8TMOWLNuJBWer-0...|Cute interior and...|    1|
+--------------------+--------------------+-----+
only showing top 5 rows



**STEP 3. Tokenize comments and remove stop words**

In [None]:
# tokenize reviews
# RegexTokenizer splits on runs of whitespace and discards empty tokens. The plain
# Tokenizer produces an empty-string token for every extra space left behind where
# punctuation was replaced by a blank, and those empty tokens would otherwise flow
# into the vocabulary built from 'words_new'.
from pyspark.ml.feature import RegexTokenizer

tokenize = RegexTokenizer(inputCol="text", outputCol="words", pattern=r"\s+")
tokenized_review = tokenize.transform(processed_review)

# remove stop words
remove_stopword = StopWordsRemover(inputCol='words', outputCol='words_new')
tokenized_review = remove_stopword.transform(tokenized_review)

In [None]:
# Preview the first 5 reviews with their token lists ('words') and stop-word-filtered tokens ('words_new')
tokenized_review.show(n=5)

+--------------------+--------------------+-----+--------------------+--------------------+
|           review_id|                text|label|               words|           words_new|
+--------------------+--------------------+-----+--------------------+--------------------+
|KU_O5udG6zpxOg-Vc...|If you decide to ...|    1|[if, you, decide,...|[decide, eat, , a...|
|BiTunyQ73aT9WBnpR...|I ve taken a lot ...|    1|[i, ve, taken, a,...|[ve, taken, lot, ...|
|saUsX_uimxRlCVr67...|Family diner  Had...|    1|[family, diner, ,...|[family, diner, ,...|
|AqPFMleE6RsU23_au...|Wow   Yummy  diff...|    1|[wow, , , yummy, ...|[wow, , , yummy, ...|
|Sx8TMOWLNuJBWer-0...|Cute interior and...|    1|[cute, interior, ...|[cute, interior, ...|
+--------------------+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



**STEP 4. Count vectorization and TF-IDF (term frequency–inverse document frequency)**

In [None]:
# Count vectorization: fit on the stop-word-filtered tokens ('words_new')
# and write the resulting term-count vectors to 'tf'
CountVec = CountVectorizer().setInputCol('words_new').setOutputCol('tf')
CountVec_model = CountVec.fit(tokenized_review)
count_vectorized_reviews = CountVec_model.transform(tokenized_review)

In [None]:
# TF-IDF: fit IDF on the 'tf' count vectors and write the re-weighted vectors to 'tf_idf'
tf_idf = IDF(inputCol='tf', outputCol='tf_idf')
tf_idf_model = tf_idf.fit(count_vectorized_reviews)
tf_idf_review = tf_idf_model.transform(count_vectorized_reviews)

In [None]:
# Preview the first 5 reviews with the added 'tf' and 'tf_idf' vector columns
tf_idf_review.show(n=5)