In [1]:
import pandas as pd

In [36]:
# Load the training CSV; the path is relative to the working directory.
RAW_CSV_PATH = "train-balanced-sarcasm.csv"
data = pd.read_csv(RAW_CSV_PATH)

In [37]:
# Optional column drops, currently disabled:
#del data['date']
#del data['created_utc']
row_count = len(data)
print("Number of Entries:", row_count)

Number of Entries: 1010826


In [38]:
# Drop every row whose comment or parent comment is a float instead of text.
# NOTE(review): these are presumably the NaN placeholders pandas uses for
# empty cells - confirm against the raw CSV.
def _is_float(value):
    """Return True when ``value`` is a Python float."""
    return isinstance(value, float)


comment_is_float = data["comment"].apply(_is_float)
data = data[~comment_is_float]
# The parent-comment mask is built on the already-filtered frame.
parent_is_float = data["parent_comment"].apply(_is_float)
data = data[~parent_is_float]
print("Number of Entries left:", len(data))

Number of Entries left: 1010773


### Data Cleaning
Expand contractions, convert to lower case, and remove links, bracketed or parenthesized text, email addresses and usernames

In [39]:
# Expand contractions in both text columns with contractions.fix
import contractions

for column in ("comment", "parent_comment"):
    data[column] = data[column].apply(contractions.fix)


In [40]:
# Lower-case both text columns using the vectorized string accessor
# instead of a per-element Python lambda.
for column in ("comment", "parent_comment"):
    data[column] = data[column].str.lower()

In [41]:
# Preview the first five rows after contraction expansion and lower-casing.
data.head(n=5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,nc and nh.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"yeah, i get that argument. at this point, i wo..."
1,0,you do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,the blazers and mavericks (the wests 5 and 6 s...
2,0,"they were underdogs earlier today, but since g...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,they are favored to win.
3,0,"this meme is not funny none of the ""new york n...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass do not kill my buzz
4,0,i could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,yep can confirm i saw the tool they use for th...


In [42]:
import re

#function to remove links from text
def remove_links(text):
    """Strip URLs and bracketed spans from ``text``.

    Three substitutions run in order:

    1. A run of non-whitespace characters that starts and ends with one of
       ``( ) + *`` and contains ``http:`` or ``https:`` is deleted.
    2. Every remaining ``http://`` or ``https://`` URL, up to the next
       whitespace, is replaced by a single space.
    3. Every shortest span opened by ``(`` or ``[`` and closed by ``)`` or
       ``]`` on the same line is replaced by a single space. This removes
       any parenthesized or bracketed text, not only links.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    wrapped_link_free = re.sub(r"[(+*)]\S*https?:\S*[(+*)]", "", text)
    # Raw string: "\S" inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions.
    url_free = re.sub(r"https?://\S+", " ", wrapped_link_free)
    return re.sub(r"[\(\[].*?[\)\]]", " ", url_free)

#function to remove email addresses
def remove_emails(text):
    """Delete every run of non-whitespace characters that contains an ``@``
    with at least one non-whitespace character on each side.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all such runs removed (nothing is inserted in their place).
    """
    email_like = re.compile(r'\S+@\S+')
    return email_like.sub('', text)

#function to remove usernames
def remove_usernames(text):
    """Delete every ``@`` that is immediately followed by word characters,
    together with those word characters.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all ``@name`` mentions removed.
    """
    mention = re.compile(r'@\w+')
    return mention.sub('', text)

def remove_links_emails_usernames(text):
    """Run ``remove_links``, ``remove_emails`` and ``remove_usernames`` on
    ``text``, in that order, and return the result.
    """
    cleaned = text
    for scrub in (remove_links, remove_emails, remove_usernames):
        cleaned = scrub(cleaned)
    return cleaned

# Strip links, email addresses and usernames from both text columns.
for column in ('comment', 'parent_comment'):
    data[column] = data[column].apply(remove_links_emails_usernames)

In [43]:
# Preview the first five rows after link, email and username removal.
data.head(n=5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,nc and nh.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"yeah, i get that argument. at this point, i wo..."
1,0,you do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,the blazers and mavericks did not even carry...
2,0,"they were underdogs earlier today, but since g...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,they are favored to win.
3,0,"this meme is not funny none of the ""new york n...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass do not kill my buzz
4,0,i could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,yep can confirm i saw the tool they use for th...


### Data Pre-Processing
https://www.analyticsvidhya.com/blog/2021/06/text-preprocessing-in-nlp-with-python-codes/   
https://exchange.scale.com/public/blogs/preprocessing-techniques-in-nlp-a-guide  
Tokenize text, remove stopwords, lemmatize tokens

In [2]:
#Word tokenizer
import nltk
#nltk.download('punkt')

def tokenize_text(text):
    """Split ``text`` into a list of tokens with ``nltk.word_tokenize``."""
    return nltk.word_tokenize(text)

# Create the token columns. The stop-word and lemmatization cells further
# down read "comment_tokens" and "parent_comment_tokens", so leaving these
# lines commented out makes a fresh Restart & Run All fail with a KeyError.
# NOTE(review): word_tokenize needs NLTK tokenizer data; the download call at
# the top of this cell is commented out - confirm the data is installed.
data["comment_tokens"] = data["comment"].apply(tokenize_text)
data["parent_comment_tokens"] = data["parent_comment"].apply(tokenize_text)

In [45]:
# Disabled spell-checking step: the code below is wrapped in a string literal,
# so none of it executes; the cell only echoes the string as its output.
"""
#spellchecking
#%pip install pyspellchecker
from spellchecker import SpellChecker
spell = SpellChecker()
data['comment_tokens'] = data['comment_tokens'].apply(lambda tokens: [spell.correction(token) for token in tokens])
data['parent_comment_tokens'] = data['parent_comment_tokens'].apply(lambda tokens: [spell.correction(token) for token in tokens])
"""

"\n#spellchecking\n#%pip install pyspellchecker\nfrom spellchecker import SpellChecker\nspell = SpellChecker()\ndata['comment_tokens'] = data['comment_tokens'].apply(lambda tokens: [spell.correction(token) for token in tokens])\ndata['parent_comment_tokens'] = data['parent_comment_tokens'].apply(lambda tokens: [spell.correction(token) for token in tokens])\n"

In [46]:
#remove stop words
nltk.download('stopwords')
from nltk.corpus import stopwords

# Start from NLTK's English stop-word list.
stop_words = set(stopwords.words('english'))

# Keep these words in the text: dropping them can change its meaning.
to_remove = ["no","not"]
for word in to_remove:
    stop_words.remove(word)


def _drop_stop_words(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    return [token for token in tokens if token not in stop_words]


for column in ("comment_tokens", "parent_comment_tokens"):
    data[column] = data[column].apply(_drop_stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dxcas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
#Lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Lemmatize each token with ``lemmatizer.lemmatize``, using its default arguments."""
    return [lemmatizer.lemmatize(token) for token in tokens]


for column in ("comment_tokens", "parent_comment_tokens"):
    data[column] = data[column].apply(_lemmatize_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dxcas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [48]:
# Preview the first five rows, including the two token columns.
data.head(n=5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,comment_tokens,parent_comment_tokens
0,0,nc and nh.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"yeah, i get that argument. at this point, i wo...","[nc, nh, .]","[yeah, ,, get, argument, ., point, ,, would, p..."
1,0,you do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,the blazers and mavericks did not even carry...,"[know, west, team, play, west, team, east, tea...","[blazer, maverick, not, even, carry, good, eno..."
2,0,"they were underdogs earlier today, but since g...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,they are favored to win.,"[underdog, earlier, today, ,, since, gronk, 's...","[favored, win, .]"
3,0,"this meme is not funny none of the ""new york n...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass do not kill my buzz,"[meme, not, funny, none, ``, new, york, nigga,...","[deadass, not, kill, buzz]"
4,0,i could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,yep can confirm i saw the tool they use for th...,"[could, use, one, tool, .]","[yep, confirm, saw, tool, use, ., made, boy, e..."


In [None]:
# Write the processed frame to a pickle file in the working directory.
PROCESSED_PICKLE_PATH = 'processed_train.pkl'
data.to_pickle(PROCESSED_PICKLE_PATH)