In [1]:
#Adapted from Ben's preprocessing work pushed on 14 Oct, 2020

import pandas as pd
import string
from datetime import datetime
import unidecode
from word2number import w2n
import gensim

import nltk
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")


In [2]:
def year_splitter(date_time):
    """Return the year of a timestamp string as text.

    Args:
        date_time: timestamp formatted as '%Y-%m-%d %H:%M:%S'.

    Returns:
        The year as a string, e.g. '2016'.

    Raises:
        ValueError: if ``date_time`` does not match the expected layout
            (raised by ``datetime.strptime``).
    """
    parsed = datetime.strptime(date_time, '%Y-%m-%d %H:%M:%S')
    return str(parsed.year)

def month_extractor(date_time):
    """Return the month of a timestamp string as two-digit text.

    Args:
        date_time: timestamp formatted as '%Y-%m-%d %H:%M:%S'.

    Returns:
        The zero-padded month, e.g. '09'.

    Raises:
        ValueError: if ``date_time`` does not match the expected layout.
    """
    parsed = datetime.strptime(date_time, '%Y-%m-%d %H:%M:%S')
    return '{:02d}'.format(parsed.month)

def day_extractor(date_time):
    """Return the day of month of a timestamp string as two-digit text.

    Args:
        date_time: timestamp formatted as '%Y-%m-%d %H:%M:%S'.

    Returns:
        The zero-padded day of month, e.g. '01'.

    Raises:
        ValueError: if ``date_time`` does not match the expected layout.
    """
    parsed = datetime.strptime(date_time, '%Y-%m-%d %H:%M:%S')
    # day is always 1..31, so zfill pads exactly like a ':02' format spec
    return str(parsed.day).zfill(2)

def hour_extractor(date_time):
    """Return the hour (24h clock) of a timestamp string as two-digit text.

    Args:
        date_time: timestamp formatted as '%Y-%m-%d %H:%M:%S'.

    Returns:
        The zero-padded hour, e.g. '00' or '23'.

    Raises:
        ValueError: if ``date_time`` does not match the expected layout.
    """
    parsed = datetime.strptime(date_time, '%Y-%m-%d %H:%M:%S')
    return format(parsed.hour, '02d')

def remove_accented_chars(text):
    """Return ``text`` after passing it through ``unidecode.unidecode``.

    Used as the first cleaning step to turn accented characters into their
    standard counterparts.
    """
    return unidecode.unidecode(text)

def remove_punctuation(text):
    """Replace punctuation and non-printable characters with spaces.

    Every character that is in ``string.punctuation`` or is not in
    ``string.printable`` becomes a single space; all other characters
    (including existing whitespace) are kept. The result therefore has the
    same length as the input.
    """
    return ''.join(
        ' ' if (ch not in string.printable or ch in string.punctuation) else ch
        for ch in text
    )

def convert_numbers(text):
    """Replace number words in ``text`` with their digit form, token by token.

    The text is split on whitespace and each token is passed to
    ``w2n.word_to_num`` on its own, so a number spelled over several words is
    not combined into one value. Tokens that cannot be converted are kept
    unchanged.

    Args:
        text: input string.

    Returns:
        The tokens joined by single spaces, with no leading or trailing
        space (an empty string when ``text`` has no tokens).
    """
    converted = []
    for token in text.split():
        try:
            converted.append(str(w2n.word_to_num(token)))
        except Exception:
            # Best-effort conversion: anything w2n rejects is kept verbatim.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate during long .apply() runs.
            converted.append(token)

    # join() puts separators only between tokens; the previous
    # `word < len(token)` check was always true and left a trailing space.
    return ' '.join(converted)

def remove_numbers(text):
    """Return ``text`` with every digit character (per ``str.isdigit``) removed.

    Surrounding whitespace is left untouched, so removing a whole number can
    leave two adjacent spaces behind.
    """
    kept = []
    for ch in text:
        if ch.isdigit():
            continue
        kept.append(ch)
    return ''.join(kept)

def check_all_uppercase(text):
    """Return True when ``text`` is purely alphabetic and entirely upper case.

    An empty string, or any string containing a non-letter, yields False.
    """
    if not text.isalpha():
        return False
    return text.isupper()

def my_lower(text):
    """Lower-case every token except fully upper-case alphabetic ones.

    The text is split on whitespace; tokens for which
    ``check_all_uppercase`` is true (e.g. 'NC') are kept as they are, all
    other tokens are lower-cased. Tokens are re-joined with single spaces and
    no leading/trailing space.
    """
    return ' '.join(
        tok if check_all_uppercase(tok) else tok.lower()
        for tok in text.split()
    )

def remove_single_word(text):
    """Drop every single-character token from ``text``.

    Args:
        text: input string; it is split on whitespace.

    Returns:
        The tokens longer than one character, joined by single spaces with
        no trailing space. An empty string is returned when no token
        survives, so callers can still filter on ``== ""``.
    """
    # join() places separators only between tokens; the previous
    # `word < len(token)` check was always true and appended a space after
    # every kept token, leaving a trailing space in the stored text.
    return ' '.join(tok for tok in text.split() if len(tok) > 1)
    
# Tokens that change_negation_words() rewrites to "not". They look like
# negative contractions after punctuation has been replaced by spaces
# (e.g. "don't" -> "don t"), plus the apostrophe-free spellings and "cannot".
# Built once as a frozenset so each lookup is O(1).
_NEGATION_WORDS = frozenset([
    'ain', 'aren', "arent", "cant", 'couldn', "couldnt", 'didn', "didnt", 'doesn', "doesnt", "don", "dont", 'hadn',
    "hadnt", 'hasn', "hasnt", 'haven', "havent", 'isn', "isnt", 'ma', 'mightn', "mightnt", 'mustn', "mustnt",
    'needn', "neednt", 'shan', "shant", 'shouldn', "shouldnt", 'wasn', "wasnt", 'weren', "werent", 'won', "wont",
    'wouldn', "wouldnt", "cannot",
])


def change_negation_words(text):
    """Replace each negation token in ``text`` with the word 'not'.

    Args:
        text: input string; it is split on whitespace and matched
            case-sensitively against the negation token set.

    Returns:
        The tokens joined by single spaces, with negation tokens replaced
        by 'not' and no trailing space.
    """
    # join() avoids the trailing space the old always-true
    # `word < len(token)` separator check produced.
    return ' '.join(
        'not' if tok in _NEGATION_WORDS else tok
        for tok in text.split()
    )

# Lazily-built cache of the stopword set; filled on first use by
# _stopword_set() so the NLTK corpus is read once rather than once per row.
_STOPWORD_SET = None


def _stopword_set():
    """Return the cached stopword set, building it on the first call."""
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        # "I" is added in upper case, presumably because my_lower() keeps
        # fully upper-case tokens as they are, so the list's 'i' would not
        # match it.
        _STOPWORD_SET = frozenset(stopwords.words('english') + ["I"])
    return _STOPWORD_SET


def remove_stopwords(text):
    """Drop every stopword token from ``text``.

    The stopword set is NLTK's English list plus "I", used unfiltered;
    matching is case-sensitive.

    Args:
        text: input string; it is split on whitespace.

    Returns:
        The remaining tokens joined by single spaces (empty string when
        nothing remains).
    """
    # NOTE(review): the list below is inert reference text carried over from
    # the original cell. It appears to be an earlier hand-written stopword
    # list from which 'not' and 'no' had been removed; the code does NOT use
    # it, so confirm whether 'not'/'no' are meant to be kept.
    #
    # ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "youre", "youve", "youll", "youd", 'your',
    #     'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "shes", 'her', 'hers', 'herself', 'it',
    #     'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    #     'that', "thatll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    #     'having', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    #     'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    #     'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    #     'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other',
    #     'some', 'such', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don',
    #     "dont", 'should', "shouldve", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', "theyre"]
    stop = _stopword_set()
    return ' '.join(tok for tok in text.split() if tok not in stop)

In [3]:
# Load the raw dataset from the current working directory.
# NOTE(review): the second half of this notebook reads
# 'data/train-balanced-sarcasm.csv' instead — confirm which location is correct.
comments = pd.read_csv('train-balanced-sarcasm.csv')

In [4]:
# discard the 'author' and 'date' columns, then every row that still has a missing value
comments = comments.drop(columns=['author', 'date']).dropna()

In [5]:
# extracting the year, month, day and hour that the comment was made (kept as
# zero-padded strings). May be useful in the future
comments['year'] = comments['created_utc'].map(year_splitter)
comments['month'] = comments['created_utc'].map(month_extractor)
comments['day'] = comments['created_utc'].map(day_extractor)
comments['hour'] = comments['created_utc'].map(hour_extractor)

In [6]:
# Show the frame, which now carries the year/month/day/hour columns.
comments

Unnamed: 0,label,comment,subreddit,score,ups,downs,created_utc,parent_comment,year,month,day,hour
0,0,NC and NH.,politics,2,-1,-1,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...",2016,10,16,23
1,0,You do know west teams play against west teams...,nba,-4,-1,-1,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...,2016,11,01,00
2,0,"They were underdogs earlier today, but since G...",nfl,3,3,0,2016-09-22 21:45:37,They're favored to win.,2016,09,22,21
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,-1,-1,2016-10-18 21:03:47,deadass don't kill my buzz,2016,10,18,21
4,0,I could use one of those tools.,MaddenUltimateTeam,6,-1,-1,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...,2016,12,30,17
...,...,...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,reddit.com,2,2,0,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,...",2009,04,25,00
1010822,1,"whatever you do, don't vote green!",climate,1,1,0,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...,2009,05,14,22
1010823,1,Perhaps this is an atheist conspiracy to make ...,atheism,1,1,0,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...,2009,01,11,00
1010824,1,The Slavs got their own country - it is called...,worldnews,1,1,0,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...,2009,01,23,21


Preprocessing with stopword removal (for EDA)

In [11]:
# Build the cleaned comment text in one chained pass; the steps run top to bottom.
comments['cleaned comment'] = (
    comments['comment']
    .apply(remove_accented_chars)   # converting accented characters to be standard
    .apply(remove_punctuation)      # punctuation / non-printable characters become spaces
    .apply(convert_numbers)         # convert number words to digits where possible
    .apply(remove_numbers)          # strip every digit from each comment
    .apply(my_lower)                # lower-case tokens, except fully upper-case alphabetic ones
    .apply(remove_stopwords)        # drop stopword tokens
    .apply(remove_single_word)      # drop single-character tokens
)


In [12]:
# Show the frame with the new 'cleaned comment' column.
comments

Unnamed: 0,label,comment,subreddit,score,ups,downs,created_utc,parent_comment,year,month,day,hour,cleaned comment
0,0,NC and NH.,politics,2,-1,-1,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...",2016,10,16,23,NC NH
1,0,You do know west teams play against west teams...,nba,-4,-1,-1,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...,2016,11,01,00,know west teams play west teams east teams right
2,0,"They were underdogs earlier today, but since G...",nfl,3,3,0,2016-09-22 21:45:37,They're favored to win.,2016,09,22,21,underdogs earlier today since gronk announceme...
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,-1,-1,2016-10-18 21:03:47,deadass don't kill my buzz,2016,10,18,21,meme funny none new york nigga ones
4,0,I could use one of those tools.,MaddenUltimateTeam,6,-1,-1,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...,2016,12,30,17,could use tools
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,reddit.com,2,2,0,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,...",2009,04,25,00,sure iran korea technology create pig bird hum...
1010822,1,"whatever you do, don't vote green!",climate,1,1,0,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...,2009,05,14,22,whatever vote green
1010823,1,Perhaps this is an atheist conspiracy to make ...,atheism,1,1,0,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...,2009,01,11,00,perhaps atheist conspiracy make christians loo...
1010824,1,The Slavs got their own country - it is called...,worldnews,1,1,0,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...,2009,01,23,21,slavs got country called kosovo


In [14]:
# doing the same for the parent comments, again as one chained pass
comments['cleaned parent comment'] = (
    comments['parent_comment']
    .apply(remove_accented_chars)   # converting accented characters to be standard
    .apply(remove_punctuation)      # punctuation / non-printable characters become spaces
    .apply(convert_numbers)         # convert number words to digits where possible
    .apply(remove_numbers)          # strip every digit from each comment
    .apply(my_lower)                # lower-case tokens, except fully upper-case alphabetic ones
    .apply(remove_stopwords)        # drop stopword tokens
    .apply(remove_single_word)      # drop single-character tokens
)



In [15]:
# keep only the rows whose cleaned comment AND cleaned parent comment are non-empty
comments = comments.loc[
    comments["cleaned comment"].ne("") & comments["cleaned parent comment"].ne("")
]

In [16]:
# Show the frame after rows with an empty cleaned text were filtered out.
comments

Unnamed: 0,label,comment,subreddit,score,ups,downs,created_utc,parent_comment,year,month,day,hour,cleaned comment,cleaned parent comment
0,0,NC and NH.,politics,2,-1,-1,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...",2016,10,16,23,NC NH,yeah get argument prefer lived NC well
1,0,You do know west teams play against west teams...,nba,-4,-1,-1,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...,2016,11,01,00,know west teams play west teams east teams right,blazers mavericks wests seed even carry good e...
2,0,"They were underdogs earlier today, but since G...",nfl,3,3,0,2016-09-22 21:45:37,They're favored to win.,2016,09,22,21,underdogs earlier today since gronk announceme...,favored win
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,-1,-1,2016-10-18 21:03:47,deadass don't kill my buzz,2016,10,18,21,meme funny none new york nigga ones,deadass kill buzz
4,0,I could use one of those tools.,MaddenUltimateTeam,6,-1,-1,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...,2016,12,30,17,could use tools,yep confirm saw tool use made boy easports MUT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,reddit.com,2,2,0,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,...",2009,04,25,00,sure iran korea technology create pig bird hum...,calling engineered pathogen reports virus bits...
1010822,1,"whatever you do, don't vote green!",climate,1,1,0,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...,2009,05,14,22,whatever vote green,move typical recent nothing approach CO emissi...
1010823,1,Perhaps this is an atheist conspiracy to make ...,atheism,1,1,0,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...,2009,01,11,00,perhaps atheist conspiracy make christians loo...,screw disabled got get church time
1010824,1,The Slavs got their own country - it is called...,worldnews,1,1,0,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...,2009,01,23,21,slavs got country called kosovo,always unsettled hear lot jewish people say is...


In [17]:
# Write the stopword-free frame to CSV without the index column.
# NOTE(review): this writes under 'data/', while the raw file at the top of the
# notebook was read from the working directory — confirm the intended layout
# (the 'data' directory must already exist).
comments.to_csv("data/cleaned-train-balanced-sarcasm-1.csv", index = False)

Preprocessing without stopword removal (possibly for training)

In [18]:
# Reload the raw dataset so the second pipeline starts from untouched data.
# NOTE(review): this reads from 'data/', whereas the first load in this notebook
# used 'train-balanced-sarcasm.csv' in the working directory — confirm which path is correct.
comments = pd.read_csv('data/train-balanced-sarcasm.csv')

In [19]:
# discard the 'author' and 'date' columns, then every row that still has a missing value
comments = comments.drop(columns=['author', 'date']).dropna()

In [20]:
# extracting the year, month, day and hour that the comment was made (kept as
# zero-padded strings). May be useful in the future
comments['year'] = comments['created_utc'].map(year_splitter)
comments['month'] = comments['created_utc'].map(month_extractor)
comments['day'] = comments['created_utc'].map(day_extractor)
comments['hour'] = comments['created_utc'].map(hour_extractor)

In [5]:
# Build the cleaned comment text in one chained pass (stopwords are kept here).
comments['cleaned comment'] = (
    comments['comment']
    .apply(remove_accented_chars)   # converting accented characters to be standard
    .apply(remove_punctuation)      # punctuation / non-printable characters become spaces
    .apply(convert_numbers)         # convert number words to digits where possible
    .apply(remove_numbers)          # strip every digit from each comment
    .apply(my_lower)                # lower-case tokens, except fully upper-case alphabetic ones
    .apply(remove_single_word)      # drop single-character tokens
)

In [6]:
# doing the same for the parent comments, again as one chained pass (stopwords are kept)
comments['cleaned parent comment'] = (
    comments['parent_comment']
    .apply(remove_accented_chars)   # converting accented characters to be standard
    .apply(remove_punctuation)      # punctuation / non-printable characters become spaces
    .apply(convert_numbers)         # convert number words to digits where possible
    .apply(remove_numbers)          # strip every digit from each comment
    .apply(my_lower)                # lower-case tokens, except fully upper-case alphabetic ones
    .apply(remove_single_word)      # drop single-character tokens
)

In [7]:
# keep only the rows whose cleaned comment AND cleaned parent comment are non-empty
comments = comments.loc[
    comments["cleaned comment"].ne("") & comments["cleaned parent comment"].ne("")
]

In [8]:
# Show the frame produced by the stopword-preserving pipeline.
# NOTE(review): the saved output below has no year/month/day/hour columns, so
# the date-extraction cell was apparently not executed in the run that produced
# it — restart the kernel and run all cells before relying on the exported CSV.
comments

Unnamed: 0,label,comment,subreddit,score,ups,downs,created_utc,parent_comment,cleaned comment,cleaned parent comment
0,0,NC and NH.,politics,2,-1,-1,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...",NC and NH,yeah get that argument at this prefer is she l...
1,0,You do know west teams play against west teams...,nba,-4,-1,-1,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...,you do know west teams play against west teams...,the blazers and mavericks the wests and seed d...
2,0,"They were underdogs earlier today, but since G...",nfl,3,3,0,2016-09-22 21:45:37,They're favored to win.,they were underdogs earlier today but since gr...,they re favored to win
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,-1,-1,2016-10-18 21:03:47,deadass don't kill my buzz,this meme isn funny none of the new york nigga...,deadass don kill my buzz
4,0,I could use one of those tools.,MaddenUltimateTeam,6,-1,-1,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...,could use of those tools,yep can confirm saw the tool they use for that...
...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,reddit.com,2,2,0,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,...",sure that iran and korea have the technology t...,no is calling this an engineered pathogen but ...
1010822,1,"whatever you do, don't vote green!",climate,1,1,0,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...,whatever you do don vote green,in move typical of their recent do nothing app...
1010823,1,Perhaps this is an atheist conspiracy to make ...,atheism,1,1,0,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...,perhaps this is an atheist conspiracy to make ...,screw the disabled ve got to get to church on ...
1010824,1,The Slavs got their own country - it is called...,worldnews,1,1,0,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...,the slavs got their own country it is called k...,ve always been unsettled by that hear lot of j...


In [9]:
# Write the stopword-preserving frame to CSV without the index column.
# NOTE(review): this writes to the working directory, while the first cleaned
# file is written under 'data/' — confirm the intended output location.
comments.to_csv("cleaned-train-balanced-sarcasm-2.csv", index = False)