In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize.casual import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

In [2]:
dataset = pd.read_csv('data/sarcasm_v2.csv')
print(dataset.head())

  Corpus Label             ID  \
0    GEN  sarc  GEN_sarc_0000   
1    GEN  sarc  GEN_sarc_0001   
2    GEN  sarc  GEN_sarc_0002   
3    GEN  sarc  GEN_sarc_0003   
4    GEN  sarc  GEN_sarc_0004   

                                          Quote Text  \
0  First off, That's grade A USDA approved Libera...   
1  watch it. Now you're using my lines. Poet has ...   
2  Because it will encourage teens to engage in r...   
3  Obviously you missed the point. So sorry the t...   
4  This is pure paranoia. What evidence do you ha...   

                                       Response Text  
0  Therefore you accept that the Republican party...  
1  More chattering from the peanut gallery? Haven...  
2  Yep, suppressing natural behavior is always th...  
3  I guess we all missed your point Justine, what...  
4  Evidence, I dont need no sticking evidence. Th...  


In [3]:
print(dataset['Label'].value_counts())
print(dataset['Corpus'].value_counts())

notsarc    2346
sarc       2346
Name: Label, dtype: int64
GEN    3260
RQ      850
HYP     582
Name: Corpus, dtype: int64


In [5]:
Y = dataset['Label'].values
quotes = dataset['Quote Text'].values
responses = dataset['Response Text'].values 

print(type(Y))
print(type(quotes))
print(type(responses))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [7]:
#Two examples of potential tokenizers to use
print(TweetTokenizer().tokenize(quotes[0])) #tweet tokenizer to recogonize potential emoticons and 
print(word_tokenize(quotes[0])) #more standard tokenizer on punctuation and words

['First', 'off', ',', "That's", 'grade', 'A', 'USDA', 'approved', 'Liberalism', 'in', 'a', 'nutshell', '.']
['First', 'off', ',', 'That', "'s", 'grade', 'A', 'USDA', 'approved', 'Liberalism', 'in', 'a', 'nutshell', '.']


In [8]:
def preprocess_text(text, tokenizer, stopwords=stopwords.words("english"), stemmer=PorterStemmer()):
    '''
    This function will remove stopwords from the text and perform stemming. Return tokenized sentences. 
    
    Params:
    text -- string we are looking at 
    tokenizer -- string of either 'twitter' or 'word' to specify which tokenizer to use
    stopwords -- list of stopwords to remove, default is the NLTK stopwords list
    stemmer -- stemming function to use, default is the PorterStemmer from NLTK
    
    Returns:
    cleaned_text -- text with removed stopwords and applied stemming
    
    '''
    #remove stopwords 
    cleaned_text =  ' '.join([word for word in text.split() if word not in stopwords])
        
    #perform stemming
    if(tokenizer == 'twitter'):
        tokens = TweetTokenizer().tokenize(cleaned_text)
        stemmed_tokens = [stemmer.stem(i) for i in tokens]
    elif(tokenizer == 'word'):
        tokens = word_tokenize(cleaned_text)
        stemmed_tokens = [stemmer.stem(i) for i in tokens]
    
    return stemmed_tokens

In [9]:
for i in range(quotes.shape[0]):
    quotes[i] = preprocess_text(quotes[i], 'twitter')
for i in range(responses.shape[0]):
    responses[i] = preprocess_text(responses[i], 'twitter')

In [11]:
#let's make sure we get what we expect: tokenized sentences with no stopwords and removed stems
print(quotes[0])
print(responses[0])

['first', 'off', ',', "that'", 'grade', 'A', 'usda', 'approv', 'liber', 'nutshel', '.']
['therefor', 'accept', 'republican', 'parti', 'almost', 'whole', '"', 'grade', 'A', 'usda', 'approv', 'liber', '.', '"', 'about', 'time', 'did', '.']


In [None]:
#See Github README for next steps