In [49]:
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
import os

In [50]:
conf = pyspark.SparkConf()
conf.set('spark.ui.proxyBase', '/user/' + os.environ['JUPYTERHUB_USER'] + '/proxy/4041')
conf.set('spark.driver.memory','8g')
conf.set('spark.ui.showConsoleProgress', False)
try:
    sc = pyspark.SparkContext(conf=conf)
    spark = pyspark.SQLContext.getOrCreate(sc)
except:
    print('Spark context already exists, continuing with', sc)

Spark context already exists, continuing with <SparkContext master=local[*] appName=pyspark-shell>


In [51]:
# sentiment140 = spark.read.csv('../data-processed/sentiment140_data.csv', header=True).select(['text', 'label'])
sentiment140 = spark.read.csv('../data/reduced_data.csv', header=True).select(['text', 'label'])

In [52]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|I LOVE @Health4Ua...|    1|
|im meeting up wit...|    1|
|@DaRealSunisaKim ...|    1|
|Being sick can be...|    1|
|@LovesBrooklyn2 h...|    1|
|@ProductOfFear Yo...|    1|
|@r_keith_hill Tha...|    1|
|@KeepinUpWKris I ...|    1|
|@tommcfly ah, con...|    1|
|@e4VoIP I RESPOND...|    1|
|crazy day of scho...|    1|
|@naughtyhaughty H...|    1|
|@nileyjileyluver ...|    1|
|@soundwav2010 At ...|    1|
|@LutheranLucciol ...|    1|
|Just added tweeti...|    1|
|@michellardi i re...|    1|
|@nicolerichie: yo...|    1|
|Catching Up on Em...|    1|
|Dancing around th...|    1|
+--------------------+-----+
only showing top 20 rows



## Data Preparation

We will follow the following steps to prepare the data for our model
1. Lowercase the text
2. Remove stopwords from text
3. Remove punctuations from text since that is noise and meaningful information cannot be learned from them
4. Remove usernames, emojis, urls etc.
5. Replace contractions
6. Tokenize the text
7. Perform stemming and lemmatization on text

#### Lowercase the text

In [53]:
sentiment140 = sentiment140.withColumn('text', F.lower(F.col('text')))

In [54]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|i love @health4ua...|    1|
|im meeting up wit...|    1|
|@darealsunisakim ...|    1|
|being sick can be...|    1|
|@lovesbrooklyn2 h...|    1|
|@productoffear yo...|    1|
|@r_keith_hill tha...|    1|
|@keepinupwkris i ...|    1|
|@tommcfly ah, con...|    1|
|@e4voip i respond...|    1|
+--------------------+-----+
only showing top 10 rows



#### Remove stopwords from text

In [55]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
from nltk.corpus import stopwords

In [57]:
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [58]:
def removeStopwords(text):
  return " ".join([word for word in text.split() if word not in stopwords.words('english')])

removeStopwordsUDF = F.udf(removeStopwords)

In [59]:
sentiment140 = sentiment140.withColumn('text', removeStopwordsUDF(F.col('text')))

In [60]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|love @health4uand...|    1|
|im meeting one be...|    1|
|@darealsunisakim ...|    1|
|sick really cheap...|    1|
|@lovesbrooklyn2 e...|    1|
|@productoffear te...|    1|
|@r_keith_hill tha...|    1|
|@keepinupwkris je...|    1|
|@tommcfly ah, con...|    1|
|@e4voip responded...|    1|
+--------------------+-----+
only showing top 10 rows



#### Remove punctuations

In [61]:
import string

In [62]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [63]:
def remove_punctuations(text):
    # Make a translation table that maps all punctuation characters to None
    translator = str.maketrans("", "", string.punctuation)

    # Apply the translation table to the input string
    return text.translate(translator)

removePunctuationsUDF = F.udf(remove_punctuations)

In [64]:
sentiment140 = sentiment140.withColumn('text', removePunctuationsUDF(F.col('text')))

In [65]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|love health4uandp...|    1|
|im meeting one be...|    1|
|darealsunisakim t...|    1|
|sick really cheap...|    1|
|lovesbrooklyn2 ef...|    1|
|productoffear tel...|    1|
|rkeithhill thans ...|    1|
|keepinupwkris jea...|    1|
|tommcfly ah congr...|    1|
|e4voip responded ...|    1|
+--------------------+-----+
only showing top 10 rows



#### Remove emails, emojis, urls etc.

In [66]:
import emoji
import re

In [67]:
url_regex = '((www\.[^\s]+)|(https?://[^\s]+))'
username_regex = '@[^\s]+'

In [68]:
def remove_urls(text):
  return re.sub(url_regex, '', text)

def remove_usernames(text):
  return re.sub(username_regex, '', text)

def remove_emojis(text):
  return emoji.demojize(text)

remove_urlsUDF = F.udf(remove_urls)
remove_usernamesUDF = F.udf(remove_usernames)
remove_emojisUDF = F.udf(remove_emojis)

In [69]:
sentiment140 = sentiment140.withColumn('text', remove_urlsUDF(F.col('text')))
sentiment140 = sentiment140.withColumn('text', remove_usernamesUDF(F.col('text')))
sentiment140 = sentiment140.withColumn('text', remove_emojisUDF(F.col('text')))

#### Tokenizing, stemming, and lemmatizing the text

In [70]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

In [71]:
def tokenize_stem_lemmatize(text):
    tokenizer = RegexpTokenizer('\w+')
    tokenized_words = tokenizer.tokenize(text)
    
    # Stemming logic
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in tokenized_words]
    
    # Lemmatizing logic
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos = 'a') for word in stemmed_words]
    
    return ' '.join(lemmatized_words)

tokenize_stem_lemmatizeUDF = F.udf(tokenize_stem_lemmatize)

In [72]:
sentiment140 = sentiment140.withColumn('text', tokenize_stem_lemmatizeUDF(F.col('text')))

In [73]:
sentiment140.show(10)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|love health4uandp...|    1|
|im meet one besti...|    1|
|darealsunisakim t...|    1|
|sick realli cheap...|    1|
|lovesbrooklyn2 ef...|    1|
|productoffear tel...|    1|
|rkeithhil than re...|    1|
|keepinupwkri jeal...|    1|
|tommcfli ah congr...|    1|
|e4voip respond st...|    1|
+--------------------+-----+
only showing top 10 rows



#### Removing rows with null labels

In [26]:
sentiment140 = sentiment140.filter(~F.col('label').isNull())

In [34]:
# sentiment140.toPandas().to_csv('../data-processed/sentiment140_model_data.csv')

Storing the dataframe in a parquet file. This will be used to train the model

In [45]:
import shutil
try:
    sentiment140.write.parquet("../data-processed/sentiment140_model_data.parquet")
except:
    print('Directory already exists at path...\nDeleting the directory')
    shutil.rmtree('../data-processed/sentiment140_model_data.parquet')
    sentiment140.write.parquet("../data-processed/sentiment140_model_data.parquet")

Directory already exists at path...
Deleting the directory
