In [2]:
import pandas as pd
import numpy as np
import re
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
import enchant
from enchant.checker import SpellChecker
import enchant

In [3]:
import sys
print("Python version")
print (sys.version)

Python version
3.7.7 (tags/v3.7.7:d7c567b08f, Mar 10 2020, 10:41:24) [MSC v.1900 64 bit (AMD64)]


In [4]:
df = pd.read_csv('../data/raw_data.csv')

# Making the text more uniform

Remove urls

In [5]:
df['preproc'] = df.body.replace(r'http\S+', '', regex=True)

Change acronyms to words for uniformity. 
* https://www.netlingo.com/acronyms.php
* https://blog.adioma.com/internet-acronyms-intro-list-infographic/

In [6]:
from acronyms_smileys import acronyms

In [7]:
df.preproc = df.preproc.str.lower()
# the ’ was causing issues, took a while to notice
df.preproc = df.preproc.str.replace('’', '\'')

In [9]:
df.preproc = df.preproc.apply(lambda x: ' '.join(acronyms.get(word, word) for word in x.split()))

Replace negations with "not"

In [10]:
negations = ['don\'t', 'aint' 'aren\'t', 'couldn\'t','didn\'t', 
             'doesn\'t', 'hadn\'t', 'hasn\'t', 'haven\'t', 'isn\'t', 
             'mightn\'t', 'mustn\'t', 'needn\'t', 'shouldn\'t', 'wasn\'t', 
             'weren\'t', 'won\'t', 'wouldn\'t', 'nor', 'not', 'cant', 'dont',
            'arent', 'couldnt', 'didnt', 'doesnt', 'hadnt', 'hasnt', 'havent',
            'isnt', 'mightnt', 'mustnt', 'neednt', 'shouldnt', 'wasnt',
            'werent', 'wont', 'wouldnt']
regx = r'\b(?:{})\b'.format('|'.join(negations))
df.preproc = df.preproc.str.replace(regx, 'not')

In [11]:
# Remove negations from stop list, add two missing contractions
stopwords_list = stopwords.words('english')
stopwords_list = [el for el in stopwords_list if el not in negations]
missing_words = ['i\'m', 'i\'d']
stopwords_list.extend(missing_words)

10 most commonly used words

In [12]:
pd.Series(' '.join(df.preproc).lower().split()).value_counts()[:10]

this    432
the     426
i       373
is      302
you     277
to      263
a       241
not     197
and     185
of      172
dtype: int64

In [13]:
def get_common_stopwords(stop_words, n=5):
    most_freq_words = pd.Series(' '.join(df.preproc).lower().split()).value_counts()[:int(n*2)].keys().to_numpy()
    common_stopwords = [i for i in most_freq_words if i in stop_words]
    return common_stopwords[0:n]

Remove most common stop words

In [14]:
common_stopwords = get_common_stopwords(stopwords_list)
df.preproc = df.preproc.apply(lambda x: ' '.join([word for word in x.split() if word not in (common_stopwords)]))

In [15]:
df.head(5)

Unnamed: 0,body,positive,negative,neutral,rated,comment_id,video_id,date,preproc
0,*stretched past the 10 minute mark for ad reve...,0,1,0,1,UgztP4lVR-Epv5HlSXN4AaABAg,ItYOdWRo0JY,2020-01-10T20:24:33Z,"*stretched past 10 minute mark for ad revenue,..."
1,Big time scam,0,1,0,1,UgzZubyLG5FtZu7qlal4AaABAg,ItYOdWRo0JY,2019-08-01T15:45:49Z,big time scam
2,I’d recycle his face,0,1,0,1,UgzSMwb88ntjkYHQYaN4AaABAg,ItYOdWRo0JY,2019-06-19T04:00:06Z,i'd recycle his face
3,God dang you are a twat,0,1,0,1,UgwS5xQLLnyIzNUS4bp4AaABAg,ItYOdWRo0JY,2019-06-16T00:49:53Z,god dang are a twat
4,Why didn't you give it away to one of the 3 fa...,0,1,0,1,Ugz1EJ6K2F0CYUX5NrN4AaABAg,ItYOdWRo0JY,2019-03-31T00:50:58Z,why not give it away to one of 3 fans that rec...


List of emojis defined from https://en.wikipedia.org/wiki/List_of_emoticons , https://emojipedia.org/people/

Other references:
* https://www.urbandictionary.com/define.php?term=%F0%9F%92%80
* https://www.urbandictionary.com/define.php?term=%F0%9F%94%A5

In [29]:
import emoji
# list of tagged emoticons from above links
from acronyms_smileys import smileys
from acronyms_smileys import sent_acronyms
# for removing untagged emoji
import demoji
#demoji.download_codes()

In [36]:
def replace_repeating_emoji(text):
    uniques = set()
    final_string = list()
    text_arr = [item for item in emoji.get_emoji_regexp().split(text) if not item == '']
    for e in text_arr:
        # for some reason even though it is defined as '❤', when its 
        # imported, it gets loaded as '❤❤'
        if e == '❤':
            e = '❤❤'
        if not bool(emoji.get_emoji_regexp().search(e)):
            final_string.append(smileys.get(e, e))
        else:
            if e not in uniques:
                uniques.add(e)
                final_string.append(smileys.get(e, e))
    return ' '.join(final_string)

In [39]:
# find repeating emoticons and remove repetitions, and tag emoticons
df.preproc = df.preproc.apply(lambda x: replace_repeating_emoji(x) if (bool(emoji.get_emoji_regexp().search(x)) and bool(re.search(r'(.)\1', x))) else x)

In [40]:
# remove untagged emoticons
df.preproc = df.preproc.apply(lambda x : demoji.replace(x, ''))

Set sentiment on acronyms (such as 'lol')

In [41]:
df.preproc = df.preproc.apply(lambda x: ' '.join(sent_acronyms.get(word, word) for word in x.split()))

Remove hashtags

In [42]:
df.preproc = df.preproc.apply(lambda x: ' '.join([word for word in x.split() if '#' not in word]))

In [43]:
df.loc[df.preproc.str.contains('#')]

Unnamed: 0,body,positive,negative,neutral,rated,comment_id,video_id,date,preproc


Remove punctuation

In [44]:
import string
df.preproc = df.preproc.str.replace('[{}]'.format(string.punctuation), '')

In [45]:
df.head(5)

Unnamed: 0,body,positive,negative,neutral,rated,comment_id,video_id,date,preproc
0,*stretched past the 10 minute mark for ad reve...,0,1,0,1,UgztP4lVR-Epv5HlSXN4AaABAg,ItYOdWRo0JY,2020-01-10T20:24:33Z,stretched past 10 minute mark for ad revenue see
1,Big time scam,0,1,0,1,UgzZubyLG5FtZu7qlal4AaABAg,ItYOdWRo0JY,2019-08-01T15:45:49Z,big time scam
2,I’d recycle his face,0,1,0,1,UgzSMwb88ntjkYHQYaN4AaABAg,ItYOdWRo0JY,2019-06-19T04:00:06Z,id recycle his face
3,God dang you are a twat,0,1,0,1,UgwS5xQLLnyIzNUS4bp4AaABAg,ItYOdWRo0JY,2019-06-16T00:49:53Z,god dang are a twat
4,Why didn't you give it away to one of the 3 fa...,0,1,0,1,Ugz1EJ6K2F0CYUX5NrN4AaABAg,ItYOdWRo0JY,2019-03-31T00:50:58Z,why not give it away to one of 3 fans that rec...


Remove repeating vowels and consonants

In [46]:
# https://stackoverflow.com/questions/46701245/how-to-replace-multiple-consecutive-repeating-characters-into-1-character-in-pyt
df.preproc = df.preproc.apply(lambda x: ' '.join([re.sub(r'[^\w\s]|(.)(?=\1)', '', word) for word in x.split()]))

Tag any sequence of "ha" or "ah" (for example, "ahaha" or "haha") as a "laugh"

In [47]:
df.preproc = df.preproc.apply(lambda x: ' '.join([re.sub(r'([ha]+[ah]+).*\1', r'laugh', word) for word in x.split()]))

Remove numbers

In [48]:
df.preproc = df.preproc.str.replace('\d+', '')

In [49]:
# 0 - negative, 1 - positive
df['rating'] = df.positive

Write preprocessed column and the rating to a file

In [54]:
# copy relevant columns for later to file write
data_no_trans_stem = df.filter(['comment_id', 'preproc', 'rating'], axis=1)
data_no_trans_stem.columns = ['comment_id', 'body', 'rating']
data_trans = df.filter(['comment_id', 'preproc', 'rating'], axis=1)
data_trans.columns = ['comment_id', 'body', 'rating']
data_stem = df.filter(['comment_id', 'preproc', 'rating'], axis=1)
data_stem.columns = ['comment_id', 'body', 'rating']
data_trans_stem = df.filter(['comment_id', 'preproc', 'rating'], axis=1)
data_trans_stem.columns = ['comment_id', 'body', 'rating']

Fix as many spelling errors as possible

In [56]:
spell_checker = SpellChecker("en_UK","en_US")
def correct_error(body):
    spell_checker.set_text(body)
    for err in spell_checker:
        if len(err.suggest())>0: 
            sug = err.suggest()[0]
            err.replace(sug)
    return spell_checker.get_text()
data_trans.body = data_trans.body.apply(lambda row: correct_error(row))
data_trans_stem.body = data_trans_stem.body.apply(lambda row: correct_error(row))
data_trans.body = data_trans.body.str.lower()
data_trans_stem.body = data_trans_stem.body.str.lower()

Stem the words

In [59]:
snows = nltk.stem.SnowballStemmer('english')
data_stem.body = data_stem.body.apply(lambda x: ' '.join([snows.stem(word) for word in x.split()]))
data_trans_stem.body = data_trans_stem.body.apply(lambda x: ' '.join([snows.stem(word) for word in x.split()]))

In [60]:
data_no_trans_stem.to_csv('../data/preproc_no_trans_stem.csv', index=False)
data_trans.to_csv('../data/preproc_trans.csv', index=False)
data_stem.to_csv('../data/preproc_stem.csv', index=False)
data_trans_stem.to_csv('../data/preproc_trans_stem.csv', index=False)

To do:
* profanity tags