## Import Libraries

In [1]:
import json
import re
import pandas as pd
import os
import demoji
demoji.download_codes()

from wordsegment import load, segment
from autocorrect import Speller
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 


import nltk
nltk.download('wordnet')
nltk.download('punkt')

Downloading emoji data ...
... OK (Got response in 0.24 seconds)
Writing emoji data to /home/jovyan/.demoji/codes.json ...
... OK


[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Contractions List

This list of contractions was sourced from this site: https://gist.github.com/nealrs/96342d8231b75cf4bb82

Feel free to add to this list as you see fit

In [2]:
# Lookup table mapping an English contraction to its expanded form.
# Keys are matched against whitespace-separated words of a tweet by
# remove_contractions. Note that the first-person entries ("I'd", "I'm", ...)
# are capitalised while every other key is lowercase.
cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  # Fixed: these two expansions previously repeated the pronoun ("you you will").
  "you'll": "you will",
  "you'll've": "you will have",
  "you're": "you are",
  "you've": "you have"
}

## Paths

Note that the DATA_PROCESSED_TWEETS_PATH was created from pulling the tweets. Makes a "refined" folder used to store the output in the directory.

In [3]:
DATA_PROCESSED_TWEETS_PATH = '../data/processed/full-tweets'

## Functions

Here are all the functions used for preprocessing. All functions take in the entire tweet as an input with remove_shortwords taking a second parameter. Quick description for each:

<b>make_lowercase</b> : Makes all the words lowercase for consistency

<b>remove_url</b> : Removes URLs found in the tweet

<b>remove_html</b>: Replaces some common HTML entities (&amp;amp;, &amp;lt;, &amp;gt;) with their plain-word equivalents. Feel free to add more to this as you encounter more

<b>remove_newline</b> : Removes \n from tweets

<b>replace_emojis</b> : Will replace an emoji with a description of what the emoji is

<b>remove_emojis</b> : Entirely removes emojis from a tweet. Use either this or replace_emojis, not both

<b>remove_mentions</b> : Removes @ followed by words, usually will be another account which is not needed for sentiment

<b>remove_hashtags_completely</b> : Removes all hashtags and replaces them with a space

<b>remove_hashtag_symbol</b> : Replaces "#" with a space. Use only this or remove_hashtags_completely

<b>remove_noncharacters</b> : Keeps only a-z characters as well as spaces, _ , and apostrophes

<b>remove_contractions</b> : Separates out contractions based on the dictionary above

<b>split_words</b> : If it is obvious that there are multiple words strung together with no spaces, this will split them apart. Must use load() before running this

<b>spelling_correction</b> : Does a quick spell-check on words.  Must set spell = Speller()  before running this

<b>remove_stopwords</b> : Removes common words like "the" that aren't useful for sentiment analysis. The list used is gensim's STOPWORDS; see https://www.nltk.org/book/ch02.html for background on what counts as a stopword.

<b>remove_shortwords</b> : Removes words whose length is less than or equal to the length specified (only words strictly longer than it are kept).

<b>make_stem</b>: Typically removes "ed", "s", "es" from the ends of words to find the root of a word. "Created" should become "Create". Could try running spellcheck after this step to correct words. Must set ps = PorterStemmer()  before running this

In [4]:
def make_lowercase(tweet):
    """Return a copy of ``tweet`` with all cased characters lowercased."""
    lowered = tweet.lower()
    return lowered

def remove_url(tweet):
    """Delete every run of non-whitespace characters that begins with "http"."""
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', tweet)

def remove_html(tweet):
    """Swap the HTML entities &amp;, &lt; and &gt; for plain English words.

    The substitutions are applied one after another, in the order listed.
    """
    substitutions = (
        ('&amp;', 'and'),
        ('&lt;', 'less than'),
        ('&gt;', 'greater than'),
    )
    for entity, wording in substitutions:
        tweet = tweet.replace(entity, wording)
    return tweet

def remove_newline(tweet):
    """Turn each newline character in ``tweet`` into a single space."""
    lines = tweet.split('\n')
    return ' '.join(lines)

def replace_emojis(tweet):
    """Substitute emojis using demoji.replace_with_desc, with ' ' as the separator."""
    described = demoji.replace_with_desc(tweet, sep=' ')
    return described

def remove_emojis(tweet):
    """Strip emojis from ``tweet`` by replacing them with the empty string via demoji.

    Use either this or replace_emojis on a given tweet, not both.
    """
    stripped = demoji.replace(tweet, '')
    return stripped

def remove_mentions(tweet):
    """Replace each "@" followed by word characters with a single space."""
    mention_pattern = re.compile(r"@(\w+)", flags=re.MULTILINE)
    return mention_pattern.sub(' ', tweet)

def remove_hashtags_completely(tweet):
    """Replace each "#" followed by word characters (symbol and text) with a single space.

    Use either this or remove_hashtag_symbol, not both.
    """
    hashtag_pattern = re.compile(r"#(\w+)", flags=re.MULTILINE)
    return hashtag_pattern.sub(' ', tweet)

def remove_hashtag_symbol(tweet):
    """Replace every "#" character with a space, keeping the hashtag's text."""
    fragments = tweet.split('#')
    return ' '.join(fragments)

def remove_noncharacters(tweet):
    """Drop every character that is not an ASCII letter, space, underscore or apostrophe.

    The typographic apostrophe (U+2019) is first normalised to a plain "'".
    Without this it would be deleted here, turning "don’t" into "dont", which
    a later contraction lookup can no longer recognise.

    Parameters
    ----------
    tweet : str
        Text to filter.

    Returns
    -------
    str
        ``tweet`` restricted to the characters a-z, A-Z, space, "_" and "'".
    """
    tweet = tweet.replace('’', "'")
    return re.sub(r"[^a-zA-Z_ ']", "", tweet)

def remove_contractions(tweet, contractions=None):
    """Expand contractions in ``tweet`` word by word.

    The tweet is split on whitespace and each word is looked up in the
    contraction table; words without an entry are kept unchanged. The result
    is re-joined with single spaces.

    The lookup first tries the word exactly as written and then falls back to
    a case-insensitive match. The fallback matters because some table keys are
    capitalised ("I'm", "I've", ...) while tweets are usually lowercased before
    this step, so an exact match alone would never expand "i'm". When the word
    itself is entirely lowercase, the expansion found by the fallback is
    lowercased too, so already-lowercased text stays lowercase.

    Parameters
    ----------
    tweet : str
        Text whose contractions should be expanded.
    contractions : dict, optional
        Mapping of contraction -> expansion. Defaults to the module-level
        ``cList``.

    Returns
    -------
    str
        The tweet with known contractions replaced by their expansions.
    """
    if contractions is None:
        contractions = cList

    # Typographic apostrophes would otherwise never match the table keys.
    tweet = tweet.replace('’', "'")

    expanded = []
    for word in tweet.split():
        replacement = contractions.get(word)
        if replacement is None:
            lowered = word.lower()
            replacement = contractions.get(lowered)
            if replacement is None:
                # Covers keys stored with a leading capital, e.g. "I'm".
                replacement = contractions.get(lowered.capitalize())
            if replacement is not None and word == lowered:
                replacement = replacement.lower()
        expanded.append(word if replacement is None else replacement)
    return ' '.join(expanded)

# wordsegment's load() must have been called before using this
def split_words(tweet):
    """Run wordsegment's segment() on ``tweet`` and join the pieces with spaces."""
    pieces = segment(tweet)
    return ' '.join(pieces)

# requires the global `spell = Speller()` to exist before this is called
def spelling_correction(tweet):
    """Pass each whitespace-separated word through ``spell`` and re-join with spaces."""
    corrected_words = map(spell, tweet.split())
    return ' '.join(corrected_words)

def remove_stopwords(tweet):
    """Tokenise with gensim's simple_preprocess and drop tokens found in gensim's STOPWORDS.

    The surviving tokens are returned joined by single spaces.
    """
    stopword_set = gensim.parsing.preprocessing.STOPWORDS
    kept = [
        token
        for token in gensim.utils.simple_preprocess(tweet)
        if token not in stopword_set
    ]
    return ' '.join(kept)

# a length of 2 or 3 is the recommended setting
def remove_shortwords(tweet, length):
    """Tokenise with gensim's simple_preprocess and keep tokens strictly longer than ``length``.

    Tokens whose length equals ``length`` are removed as well. The surviving
    tokens are returned joined by single spaces.
    """
    long_enough = (
        token
        for token in gensim.utils.simple_preprocess(tweet)
        if len(token) > length
    )
    return ' '.join(long_enough)

# requires the global `ps = PorterStemmer()` to exist before this is called
def make_stem(tweet):
    """Tokenise with gensim's simple_preprocess and stem every token with ``ps``.

    The stemmed tokens are returned joined by single spaces.
    """
    tokens = gensim.utils.simple_preprocess(tweet)
    return ' '.join(ps.stem(token) for token in tokens)

In [5]:
# Global helpers the preprocessing functions rely on.
ps = PorterStemmer()   # used by make_stem
spell = Speller()      # used by spelling_correction
load()                 # wordsegment data, needed by split_words

## Example

In [None]:
# Just loading in Tweets 0 for now

# Read the shard as UTF-8 explicitly: tweets contain emoji and other non-ASCII
# text, and the platform's default encoding is not guaranteed to be UTF-8.
with open(DATA_PROCESSED_TWEETS_PATH + "/tweets-0.jsonl", 'r', encoding='utf-8') as json_file:
    json_list = list(json_file)

# One JSON document per line; blank lines (e.g. a trailing newline at the end
# of the file) are skipped instead of crashing json.loads.
result = [json.loads(json_str) for json_str in json_list if json_str.strip()]

# Raw tweets keyed by tweet id
tweet_dict = {record["id"]: record["full_text"] for record in result}

# Fully preprocessed tweets keyed by tweet id
tweet_final = {}

for k, v in tweet_dict.items():
    v = make_lowercase(v)
    v = remove_url(v)
    v = remove_html(v)
    v = remove_newline(v)
    v = replace_emojis(v)
    v = remove_mentions(v)
    v = remove_hashtag_symbol(v)
    v = remove_noncharacters(v)
    v = remove_contractions(v)
    v = split_words(v)
    v = remove_stopwords(v)
    v = remove_shortwords(v, 2)
    v = make_stem(v)
    v = spelling_correction(v)
    tweet_final[k] = v

In [None]:
# Display the raw tweets (tweet id -> full_text) loaded in the example above
tweet_dict

In [None]:
# Display the preprocessed tweets (tweet id -> cleaned text) from the example above
tweet_final

# Process With PySpark
Process the whole dataset with PySpark

In [6]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, LongType

In [7]:
# Input directory of the STS corpus and output directory for its sanitized form
STS_PATH          = '../data/sts'
STS_PROCESED_PATH = '../data/processed/sts'

# Output names for the sanitized STS train / test sets
STS_PROCESED_TRAIN_FILENAME = 'sts_train'
STS_PROCESED_TEST_FILENAME = 'sts_test'

# Output location for the sanitized COVID-19 tweets.
# Fixed: the directory was misspelled "proccesed", which would have written the
# output outside the ../data/processed tree used by every other path here.
COVID_PROCESSED_PATH = '../data/processed/full-tweets-sanitized'

COVID_PROCESSED_FILENAME = 'tweets-santized'

In [8]:
# Create (or reuse) a Spark session with master 'local' and app name 'local'
spark = (
    SparkSession.builder
    .master('local')
    .appName('local')
    .getOrCreate()
)

## Load Data
We load the STS corpus and COVID-19 Tweets dataset.

In [9]:
# STS corpus: training and test CSV files inside STS_PATH
STS_TRAIN_FILENAME = 'training.1600000.processed.noemoticon.csv'
STS_TEST_FILENAME  = 'testdata.manual.2009.06.14.csv'
STS_TRAIN_PATH     = '/'.join((STS_PATH, STS_TRAIN_FILENAME))
STS_TEST_PATH      = '/'.join((STS_PATH, STS_TEST_FILENAME))

# Directory holding the COVID-19 tweet .jsonl files
COVID_DATA_PATH = '../data/processed/full-tweets'

In [10]:
# Column layout of the header-less STS CSV files; every column is nullable.
sts_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('label', IntegerType()),
        ('id', LongType()),
        ('date', StringType()),
        ('query', StringType()),
        ('user', StringType()),
        ('text', StringType()),
    )
])

In [11]:
# STS train / test sets: header-less CSV parsed with the explicit sts_schema
df_sts_train_raw = spark.read.schema(sts_schema).option('header', False).csv(STS_TRAIN_PATH)

df_sts_test_raw = spark.read.schema(sts_schema).option('header', False).csv(STS_TEST_PATH)

# COVID-19 tweets: every .jsonl file in COVID_DATA_PATH, schema inferred by Spark
df_covid_raw = spark.read.json('/'.join((COVID_DATA_PATH, '*.jsonl')))

In [12]:
# Keep only the columns needed downstream
df_sts_train_selected_cols = df_sts_train_raw.select(['id', 'text', 'label'])
df_sts_test_selected_cols = df_sts_test_raw.select(['id', 'text', 'label'])

# COVID tweets: restrict to rows whose lang is 'en' before projecting
df_covid_english = df_covid_raw.where(df_covid_raw['lang'] == 'en')
df_covid_selected_cols = df_covid_english.select(['id', 'full_text'])

## Register Functions as UDF
Register our functions above as PySpark UDFs

In [13]:
from pyspark.sql.functions import udf

In [14]:
def sanitize(v):
    """Apply the lightweight cleaning steps to one tweet.

    Steps, in order: lowercase, strip URLs, replace HTML entities, replace
    newlines, replace emojis with descriptions, strip mentions, strip the
    hashtag symbol, drop non-letter characters, expand contractions.

    Not applied here (available as separate functions): split_words,
    remove_stopwords, remove_shortwords(v, 2), make_stem, spelling_correction.

    Parameters
    ----------
    v : str or None
        Raw tweet text.

    Returns
    -------
    str or None
        The cleaned text, or None when ``v`` is None.
    """
    # A missing text value reaches this function as None when it is used as a
    # UDF; pass it through instead of raising AttributeError on `.lower()` and
    # aborting the whole job.
    if v is None:
        return None

    v = make_lowercase(v)
    v = remove_url(v)
    v = remove_html(v)
    v = remove_newline(v)
    v = replace_emojis(v)
    v = remove_mentions(v)
    v = remove_hashtag_symbol(v)
    v = remove_noncharacters(v)
    v = remove_contractions(v)

    return v

# Wrap sanitize as a Spark UDF that returns a string column
sanitize_udf = udf(sanitize, returnType=StringType())

## Process Data
Process data using UDF and save output to file.

In [15]:
# STS training set: id, sanitized text (as 'cleaned_text') and label
df_sts_train_santized = df_sts_train_selected_cols.select(
    'id',
    sanitize_udf('text').alias('cleaned_text'),
    'label',
)

In [16]:
# STS test set: id, sanitized text (as 'cleaned_text') and label
df_sts_test_santized = df_sts_test_selected_cols.select(
    'id',
    sanitize_udf('text').alias('cleaned_text'),
    'label',
)

In [18]:
# COVID-19 tweets: id and sanitized full_text (as 'cleaned_text')
df_sts_covid_santized = df_covid_selected_cols.select(
    'id',
    sanitize_udf('full_text').alias('cleaned_text'),
)

In [20]:
# Write the sanitized STS training set as CSV from a single partition
(
    df_sts_train_santized
    .repartition(1)
    .write
    .csv('/'.join((STS_PROCESED_PATH, STS_PROCESED_TRAIN_FILENAME)))
)

In [19]:
# Write the sanitized STS test set as CSV from a single partition
(
    df_sts_test_santized
    .repartition(1)
    .write
    .csv('/'.join((STS_PROCESED_PATH, STS_PROCESED_TEST_FILENAME)))
)

In [None]:
# Write the sanitized COVID-19 tweets as CSV from a single partition
(
    df_sts_covid_santized
    .repartition(1)
    .write
    .csv('/'.join((COVID_PROCESSED_PATH, COVID_PROCESSED_FILENAME)))
)