#### This notebook preprocesses the Sentiment140 text data for the sentiment analysis model that will classify events as `success` or `failure`

In [1]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
import os

In [2]:
# Spark configuration for this notebook's driver.
conf = pyspark.SparkConf()
# Disabled; presumably only needed when the Spark UI is served behind a JupyterHub proxy.
# conf.set('spark.ui.proxyBase', '/user/' + os.environ['JUPYTERHUB_USER'] + '/proxy/4041')
conf.set('spark.driver.memory', '3g')
conf.set('spark.ui.showConsoleProgress', False)

# SQLContext.getOrCreate is deprecated since Spark 3.0; SparkSession is the
# supported entry point and offers the same `spark.read` API used in this notebook.
# getOrCreate() also makes this cell safe to re-run, whereas constructing a
# second SparkContext directly raises an error.
spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/03 21:44:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the Sentiment140 CSV (first row holds the column names) and keep
# only the two columns used in this notebook.
sentiment140 = (
    spark.read
    .csv('../data-processed/sentiment140_data.csv', header=True)
    .select('text', 'label')
)

In [4]:
sentiment140.show()

+--------------------+-----+
|                text|label|
+--------------------+-----+
|@switchfoot http:...|    0|
|is upset that he ...|    0|
|@Kenichan I dived...|    0|
|my whole body fee...|    0|
|@nationwideclass ...|    0|
|@Kwesidei not the...|    0|
|         Need a hug |    0|
|@LOLTrish hey  lo...|    0|
|@Tatiana_K nope t...|    0|
|@twittera que me ...|    0|
|spring break in p...|    0|
|I just re-pierced...|    0|
|@caregiving I cou...|    0|
|@octolinz16 It it...|    0|
|@smarrison i woul...|    0|
|@iamjazzyfizzle I...|    0|
|Hollis' death sce...|    0|
|about to file taxes |    0|
|@LettyA ahh ive a...|    0|
|@FakerPattyPattz ...|    0|
+--------------------+-----+
only showing top 20 rows



## Data Preparation

We will take the following steps to prepare the data for our model:
1. Lowercase the text
2. Remove stopwords from text
3. Remove punctuation from the text, since it is noise that carries no meaningful information
4. Remove usernames, emojis, URLs etc.
5. Replace contractions (not yet implemented in this notebook)
6. Tokenize the text
7. Perform stemming and lemmatization on text

#### Lowercase the text

In [5]:
# Overwrite the 'text' column with its lowercased form.
sentiment140 = sentiment140.withColumn('text', F.lower(F.col('text')))

In [6]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|@switchfoot http:...|    0|
|is upset that he ...|    0|
|@kenichan i dived...|    0|
|my whole body fee...|    0|
|@nationwideclass ...|    0|
|@kwesidei not the...|    0|
|         need a hug |    0|
|@loltrish hey  lo...|    0|
|@tatiana_k nope t...|    0|
|@twittera que me ...|    0|
+--------------------+-----+
only showing top 10 rows



#### Remove stopwords from text

In [7]:
import nltk
# Fetch the NLTK stopword corpus; the output below shows it is skipped when already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/priyangshupal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords

In [9]:
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [10]:
# Build the stopword set once. Calling stopwords.words('english') inside the
# comprehension rebuilt the whole list for every single word and then did a
# linear list search on it.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def removeStopwords(text):
  """Drop English stopwords from a string.

  Args:
    text: input string (or None for a null cell); it is split on whitespace.

  Returns:
    The remaining words joined by single spaces, or None when text is None.
  """
  # Spark passes null cells to Python UDFs as None; keep them null instead of crashing.
  if text is None:
    return None
  return " ".join(word for word in text.split() if word not in ENGLISH_STOPWORDS)

# Spark UDF wrapper (F.udf defaults to a string return type).
removeStopwordsUDF = F.udf(removeStopwords)

In [11]:
# Overwrite 'text' with the stopword-free version produced by the UDF.
sentiment140 = sentiment140.withColumn('text', removeStopwordsUDF(F.col('text')))

In [12]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|@switchfoot http:...|    0|
|upset can't updat...|    0|
|@kenichan dived m...|    0|
|whole body feels ...|    0|
|@nationwideclass ...|    0|
|@kwesidei whole crew|    0|
|            need hug|    0|
|@loltrish hey lon...|    0|
|     @tatiana_k nope|    0|
|@twittera que mue...|    0|
+--------------------+-----+
only showing top 10 rows



#### Remove punctuation

In [13]:
import string

In [14]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuations(text):
    """Strip all ASCII punctuation characters from a string.

    Args:
        text: input string, or None for a null cell.

    Returns:
        The string with every character of string.punctuation removed,
        or None when text is None.
    """
    # Spark passes null cells to Python UDFs as None; keep them null instead of crashing.
    if text is None:
        return None
    return text.translate(_PUNCTUATION_TABLE)

# Spark UDF wrapper; the string return type is spelled out rather than left to the default.
removePunctuationsUDF = F.udf(remove_punctuations, returnType=T.StringType())

In [16]:
# Overwrite 'text' with the punctuation-free version produced by the UDF.
sentiment140 = sentiment140.withColumn('text', removePunctuationsUDF(F.col('text')))

In [17]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|switchfoot httptw...|    0|
|upset cant update...|    0|
|kenichan dived ma...|    0|
|whole body feels ...|    0|
|nationwideclass n...|    0|
| kwesidei whole crew|    0|
|            need hug|    0|
|loltrish hey long...|    0|
|       tatianak nope|    0|
| twittera que muera |    0|
+--------------------+-----+
only showing top 10 rows



#### Remove usernames, emojis, URLs etc.

In [18]:
import emoji
import re

In [19]:
# Raw strings: `\.` and `\s` are regex escapes, not Python string escapes.
# Non-raw literals with these sequences trigger an invalid-escape warning.
# Matches tokens starting with "www." or with "http://" / "https://".
url_regex = r'((www\.[^\s]+)|(https?://[^\s]+))'
# Matches "@" followed by one or more non-whitespace characters.
username_regex = r'@[^\s]+'

In [20]:
def remove_urls(text):
  """Delete every substring matching url_regex.

  Returns None when text is None (a null cell) instead of raising.
  """
  if text is None:
    return None
  return re.sub(url_regex, '', text)

def remove_usernames(text):
  """Delete every substring matching username_regex (@-mentions).

  Returns None when text is None (a null cell) instead of raising.
  """
  if text is None:
    return None
  return re.sub(username_regex, '', text)

def remove_emojis(text):
  """Pass the text through emoji.demojize.

  NOTE(review): despite the name, demojize replaces each emoji with its
  textual alias rather than deleting it - confirm this is the intent.

  Returns None when text is None (a null cell) instead of raising.
  """
  if text is None:
    return None
  return emoji.demojize(text)

# Spark UDF wrappers (F.udf defaults to a string return type).
remove_urlsUDF = F.udf(remove_urls)
remove_usernamesUDF = F.udf(remove_usernames)
remove_emojisUDF = F.udf(remove_emojis)

In [21]:
# Apply URL, username and emoji handling to 'text', in that order.
# NOTE(review): punctuation was already stripped in an earlier cell, so the
# '@' and '://' these regexes rely on are gone and nothing is removed here
# (the preview below still shows 'switchfoot httptw...'). This step likely
# needs to run before punctuation removal.
sentiment140 = (
    sentiment140
    .withColumn('text', remove_urlsUDF(F.col('text')))
    .withColumn('text', remove_usernamesUDF(F.col('text')))
    .withColumn('text', remove_emojisUDF(F.col('text')))
)

In [22]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|switchfoot httptw...|    0|
|upset cant update...|    0|
|kenichan dived ma...|    0|
|whole body feels ...|    0|
|nationwideclass n...|    0|
| kwesidei whole crew|    0|
|            need hug|    0|
|loltrish hey long...|    0|
|       tatianak nope|    0|
| twittera que muera |    0|
+--------------------+-----+
only showing top 10 rows



#### Tokenizing the text

In [23]:
from nltk.tokenize import RegexpTokenizer

In [24]:
# Raw string: `\w` is a regex escape, not a Python string escape.
# Tokens are maximal runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

In [25]:
def tokenize(text):
    """Split a string into tokens with the notebook-level RegexpTokenizer.

    Args:
        text: input string, or None for a null cell.

    Returns:
        A list of token strings, or None when text is None.
    """
    # Spark passes null cells to Python UDFs as None; keep them null instead of crashing.
    if text is None:
        return None
    return tokenizer.tokenize(text)

# The function returns a list, so declare array<string>. F.udf defaults to
# StringType, which stores the list's string rendering instead of a real array.
tokenizeUDF = F.udf(tokenize, T.ArrayType(T.StringType()))

In [26]:
# Overwrite 'text' with the token list produced by the UDF.
sentiment140 = sentiment140.withColumn('text', tokenizeUDF(F.col('text')))

In [27]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|[switchfoot, http...|    0|
|[upset, cant, upd...|    0|
|[kenichan, dived,...|    0|
|[whole, body, fee...|    0|
|[nationwideclass,...|    0|
|[kwesidei, whole,...|    0|
|         [need, hug]|    0|
|[loltrish, hey, l...|    0|
|    [tatianak, nope]|    0|
|[twittera, que, m...|    0|
+--------------------+-----+
only showing top 10 rows



#### Stemming and lemmatizing the text

In [28]:
from nltk.stem import PorterStemmer

In [29]:
# Porter stemmer instance shared by the stemming step; uses the class imported above.
stemmer = PorterStemmer()

In [30]:
from nltk.tokenize import word_tokenize

In [31]:
def stemming(text):
  """Apply the Porter stemmer to every token.

  Args:
    text: iterable of token strings, or None for a null cell.

  Returns:
    A list with the stem of each token, or None when text is None.
  """
  # Spark passes null cells to Python UDFs as None; keep them null instead of crashing.
  if text is None:
    return None
  return [stemmer.stem(word) for word in text]

# The function returns a list, so declare array<string>. F.udf defaults to
# StringType, which stores the list's string rendering instead of a real array.
stemmingUDF = F.udf(stemming, T.ArrayType(T.StringType()))

In [32]:
# Overwrite 'text' with the stemmed tokens produced by the UDF.
sentiment140 = sentiment140.withColumn('text', stemmingUDF(F.col('text')))

In [33]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|[switchfoot, http...|    0|
|[upset, cant, upd...|    0|
|[kenichan, dive, ...|    0|
|[whole, bodi, fee...|    0|
|[nationwideclass,...|    0|
|[kwesidei, whole,...|    0|
|         [need, hug]|    0|
|[loltrish, hey, l...|    0|
|    [tatianak, nope]|    0|
|[twittera, que, m...|    0|
+--------------------+-----+
only showing top 10 rows



In [34]:
# WordNet lemmatizer instance used by lemmatize() below.
lemmatizer = nltk.WordNetLemmatizer()

In [35]:
# Fetch the WordNet and Open Multilingual WordNet (omw-1.4) corpora;
# the output below shows they are skipped when already up to date.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/priyangshupal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/priyangshupal/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [36]:
def lemmatize(text):
    """Apply the WordNet lemmatizer to every token.

    NOTE(review): this runs on already-stemmed tokens, and the 10-row
    preview after this step is identical to the one after stemming -
    confirm the step adds value in this position.

    Args:
        text: iterable of token strings, or None for a null cell.

    Returns:
        A list with the lemma of each token, or None when text is None.
    """
    # Spark passes null cells to Python UDFs as None; keep them null instead of crashing.
    if text is None:
        return None
    return [lemmatizer.lemmatize(word) for word in text]

# The function returns a list, so declare array<string>. F.udf defaults to
# StringType, which stores the list's string rendering instead of a real array.
lemmatizeUDF = F.udf(lemmatize, T.ArrayType(T.StringType()))

In [37]:
# Overwrite 'text' with the lemmatized tokens produced by the UDF.
sentiment140 = sentiment140.withColumn('text', lemmatizeUDF(F.col('text')))

In [38]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|[switchfoot, http...|    0|
|[upset, cant, upd...|    0|
|[kenichan, dive, ...|    0|
|[whole, bodi, fee...|    0|
|[nationwideclass,...|    0|
|[kwesidei, whole,...|    0|
|         [need, hug]|    0|
|[loltrish, hey, l...|    0|
|    [tatianak, nope]|    0|
|[twittera, que, m...|    0|
+--------------------+-----+
only showing top 10 rows



#### Remove rows with a null label

In [48]:
# Keep only the rows whose label is present (rows with a null label are dropped).
sentiment140 = sentiment140.where(F.col('label').isNotNull())

In [None]:
# Persist the prepared dataset as Parquet. The default save mode raises if the
# target path already exists, so a second end-to-end run of the notebook would
# fail here; 'overwrite' replaces the previous output instead.
sentiment140.write.mode('overwrite').parquet("../data-processed/sentiment140_model_data.parquet")