In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [72]:
# Load the raw dataset from a CSV file in the current working directory.
data = pd.read_csv("train-balanced-sarcasm.csv")

In [73]:
# Column drops left disabled: 'date' and 'created_utc' stay in the frame.
#del data['date']
#del data['created_utc']
# Report how many rows were loaded.
print("Number of Entries:",len(data))

Number of Entries: 1010826


In [74]:
# Discard rows where either text column holds a float rather than a string
# (presumably NaN placeholders for missing text - TODO confirm).
for text_column in ("comment", "parent_comment"):
    holds_float = data[text_column].apply(lambda value: isinstance(value, float))
    data = data[~holds_float]
print("Number of Entries left:", len(data))

Number of Entries left: 1010773


In [75]:
# Keep a stratified 20% sample of the rows: test_size=0.8 routes 80% of them
# to the split that is thrown away, and stratify=y keeps the label
# proportions in the part that is retained.
X = data.drop(columns='label')
y = data['label']
X_sample, _, y_sample, _ = train_test_split(
    X, y, test_size=0.8, stratify=y, random_state=42
)
# Re-attach the label as the last column and renumber the rows from 0.
data = pd.concat([X_sample, y_sample], axis=1).reset_index(drop=True)

### Data Cleaning
Expand contractions, convert to lower case, and remove links, email addresses and usernames

In [76]:
#expanding contractions into their full forms
import contractions
# Run contractions.fix over both text columns, replacing each in place.
for text_column in ("comment", "parent_comment"):
    data[text_column] = data[text_column].apply(contractions.fix)


In [77]:
#change to lower case
# Lower-case every entry of both text columns.
for text_column in ("comment", "parent_comment"):
    data[text_column] = data[text_column].apply(str.lower)

In [78]:
# Preview the first five rows after lower-casing.
data.head()

Unnamed: 0,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,label
0,"yeah, they only have captain america, iron man...",RedBarrel,marvelstudios,1,1,0,2015-11,2015-11-12 19:18:43,kind of defeats the ideas of a shared universe...,1
1,just pretend there is nothing wrong with this ...,Swopyx,FIFA,7,-1,-1,2016-11,2016-11-19 10:24:32,something sketchy going on in wl? i have playe...,0
2,i thought i had killed somebody by administeri...,fathompin,exmormon,12,12,0,2016-05,2016-05-10 22:33:28,want to repent to this sub for stupid and misg...,1
3,"it is cool, as long as you keep one tire on yo...",mahacctissoawsum,videos,2,2,0,2012-09,2012-09-20 01:32:52,"but there is a double line, he cannot legally ...",1
4,"no, i am pretty sure it is an ar-15.",reg55000,The_Donald,1,1,0,2016-06,2016-06-14 06:41:03,top left is definitely ak-47.,1


In [79]:
import re

#function to remove links from text
def remove_links(text):
    """Strip links and bracketed spans from ``text``.

    Three substitutions run in order:

    1. A URL token wrapped in ``(``, ``)``, ``+`` or ``*`` characters (for
       example ``(https://example.com)``) is deleted together with its
       delimiters.
    2. Every remaining ``http://`` or ``https://`` URL is replaced by a space.
    3. Every span that runs from an opening ``(`` or ``[`` to the next ``)``
       or ``]`` on the same line is replaced by a space.
       NOTE(review): this also removes bracketed text that is not a link -
       confirm that is intended.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace left behind is not collapsed.
    """
    # Raw string literals keep ``\S`` as a regex escape; in a plain literal it
    # is an invalid string escape that newer Python versions warn about.
    without_wrapped_urls = re.sub(r"[(+*)]\S*https?:\S*[(+*)]", "", text)
    without_urls = re.sub(r"https?://\S+", " ", without_wrapped_urls)
    without_bracketed = re.sub(r"[\(\[].*?[\)\]]", " ", without_urls)
    return without_bracketed

#function to remove email addresses
def remove_emails(text):
    """Delete email-like tokens from ``text``.

    A token is removed when it is a run of non-whitespace characters that
    contains an ``@`` with at least one character on each side of it.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matching token replaced by the empty string.
    """
    email_token = re.compile(r'\S+@\S+')
    return email_token.sub('', text)

#function to remove usernames
def remove_usernames(text):
    """Delete ``@name`` mentions from ``text``.

    A mention is an ``@`` followed by one or more word characters; the ``@``
    and those characters are removed, everything else is kept.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every mention replaced by the empty string.
    """
    mention = re.compile(r'@\w+')
    return mention.sub('', text)

def remove_links_emails_usernames(text):
    """Run the three text cleaners over ``text`` in a fixed order.

    Links are removed first, then email-like tokens, then ``@name`` mentions.
    Emails go before mentions, so the mention pattern never sees the ``@``
    of an address that has already been deleted.

    Args:
        text: The string to clean.

    Returns:
        The string after all three cleaners have been applied.
    """
    for cleaner in (remove_links, remove_emails, remove_usernames):
        text = cleaner(text)
    return text

# Strip links, emails and usernames from both text columns.
for text_column in ("comment", "parent_comment"):
    data[text_column] = data[text_column].apply(remove_links_emails_usernames)

In [80]:
# Preview the first five rows after removing links, emails and usernames.
data.head()

Unnamed: 0,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,label
0,"yeah, they only have captain america, iron man...",RedBarrel,marvelstudios,1,1,0,2015-11,2015-11-12 19:18:43,kind of defeats the ideas of a shared universe...,1
1,just pretend there is nothing wrong with this ...,Swopyx,FIFA,7,-1,-1,2016-11,2016-11-19 10:24:32,something sketchy going on in wl? i have playe...,0
2,i thought i had killed somebody by administeri...,fathompin,exmormon,12,12,0,2016-05,2016-05-10 22:33:28,want to repent to this sub for stupid and misg...,1
3,"it is cool, as long as you keep one tire on yo...",mahacctissoawsum,videos,2,2,0,2012-09,2012-09-20 01:32:52,"but there is a double line, he cannot legally ...",1
4,"no, i am pretty sure it is an ar-15.",reg55000,The_Donald,1,1,0,2016-06,2016-06-14 06:41:03,top left is definitely ak-47.,1


### Data Pre-Processing
https://www.analyticsvidhya.com/blog/2021/06/text-preprocessing-in-nlp-with-python-codes/   
https://exchange.scale.com/public/blogs/preprocessing-techniques-in-nlp-a-guide  
Tokenize text, remove stopwords, lemmatize tokens

In [81]:
#Word tokenizer (splits each text into word and punctuation tokens)
import nltk
#nltk.download('punkt')  # uncomment on a fresh environment: word_tokenize needs the Punkt tokenizer data

def tokenize_text(text):
    """Split ``text`` into tokens using ``nltk.word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for ``text``.
    """
    return nltk.word_tokenize(text)

# Tokenize each text column into a new companion "*_tokens" column.
for text_column, tokens_column in (
    ("comment", "comment_tokens"),
    ("parent_comment", "parent_comment_tokens"),
):
    data[tokens_column] = data[text_column].apply(tokenize_text)

In [82]:
# Spell-checking step, disabled by wrapping the code in a string literal.
# As the last expression of the cell, the string is only echoed as output;
# none of the statements inside it are executed.
"""
#spellchecking
#%pip install pyspellchecker
from spellchecker import SpellChecker
spell = SpellChecker()
data['comment_tokens'] = data['comment_tokens'].apply(lambda tokens: [spell.correction(token) for token in tokens])
data['parent_comment_tokens'] = data['parent_comment_tokens'].apply(lambda tokens: [spell.correction(token) for token in tokens])
"""

"\n#spellchecking\n#%pip install pyspellchecker\nfrom spellchecker import SpellChecker\nspell = SpellChecker()\ndata['comment_tokens'] = data['comment_tokens'].apply(lambda tokens: [spell.correction(token) for token in tokens])\ndata['parent_comment_tokens'] = data['parent_comment_tokens'].apply(lambda tokens: [spell.correction(token) for token in tokens])\n"

In [83]:
#remove stop words
nltk.download('stopwords')
from nltk.corpus import stopwords

# Start from NLTK's English stop-word list, held as a set for fast lookups.
stop_words = set(stopwords.words('english'))

# Negations can flip the meaning of a text, so take them out of the
# stop-word set and let them survive the filter below.
to_remove = ["no","not"]
for word in to_remove:
    stop_words.remove(word)


def drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, in original order."""
    return [token for token in tokens if token not in stop_words]


for tokens_column in ("comment_tokens", "parent_comment_tokens"):
    data[tokens_column] = data[tokens_column].apply(drop_stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dxcas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [84]:
#Lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return a list with the lemmatized form of every token, in order.

    NOTE(review): no part-of-speech tag is passed to ``lemmatize``, so the
    lemmatizer's default part of speech is used for every token.
    """
    return [lemmatizer.lemmatize(token) for token in tokens]


for tokens_column in ("comment_tokens", "parent_comment_tokens"):
    data[tokens_column] = data[tokens_column].apply(lemmatize_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dxcas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [85]:
# Preview the first five rows, now including the two token columns.
data.head()

Unnamed: 0,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,label,comment_tokens,parent_comment_tokens
0,"yeah, they only have captain america, iron man...",RedBarrel,marvelstudios,1,1,0,2015-11,2015-11-12 19:18:43,kind of defeats the ideas of a shared universe...,1,"[yeah, ,, captain, america, ,, iron, man, ,, s...","[kind, defeat, idea, shared, universe, differe..."
1,just pretend there is nothing wrong with this ...,Swopyx,FIFA,7,-1,-1,2016-11,2016-11-19 10:24:32,something sketchy going on in wl? i have playe...,0,"[pretend, nothing, wrong, game, like, 95, %, s...","[something, sketchy, going, wl, ?, played, eve..."
2,i thought i had killed somebody by administeri...,fathompin,exmormon,12,12,0,2016-05,2016-05-10 22:33:28,want to repent to this sub for stupid and misg...,1,"[thought, killed, somebody, administering, pro...","[want, repent, sub, stupid, misguided, thing, ..."
3,"it is cool, as long as you keep one tire on yo...",mahacctissoawsum,videos,2,2,0,2012-09,2012-09-20 01:32:52,"but there is a double line, he cannot legally ...",1,"[cool, ,, long, keep, one, tire, side, .]","[double, line, ,, not, legally, pas, !]"
4,"no, i am pretty sure it is an ar-15.",reg55000,The_Donald,1,1,0,2016-06,2016-06-14 06:41:03,top left is definitely ak-47.,1,"[no, ,, pretty, sure, ar-15, .]","[top, left, definitely, ak-47, .]"


In [86]:
# Persist the processed frame to a pickle file in the working directory.
data.to_pickle('processed_train.pkl')