In [1]:
import pandas as pd
import numpy as np
import itertools
from nltk.corpus import stopwords
from nltk.tokenize.casual import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases
from gensim.models.phrases import Phraser



In [12]:
#get all our filepaths
filepath=r'C:\Users\Jonathan\Desktop\Data Science 2\SARC-train.csv'
filenameAFINN = r'C:\Users\Jonathan\Desktop\Data Science 2\AFINN-111.txt'
filenameEmotion = r'C:\Users\Jonathan\Desktop\Data Science 2\hashtag-emotion-0.2.txt'

#read in data and split into labels, quotes and their responses. dropna(axis=0) removes every row that has a
#missing value in any column, which takes care of the NA responses
dataset = pd.read_csv(filepath)
dataset=dataset.dropna(axis=0)
Y = dataset['Label'].values
quotes = dataset['Quote Text'].values
responses = dataset['Response Text'].values

#read in the AFINN and hashtag-emotion lexicons as 2d string arrays: one row per line, columns split on tabs.
#The files are opened in with-blocks so the handles are closed once read. Blank lines are skipped because they
#would otherwise turn into one-element rows and the result would no longer be a rectangular 2d array
with open(filenameAFINN) as afinnFile:
    afinnMat=np.array([ws.strip().split('\t') for ws in afinnFile if ws.strip()])
with open(filenameEmotion) as emotFile:
    emotMat=np.array([ws.strip().split('\t') for ws in emotFile if ws.strip()])

In [132]:
quotes

array(["First off, That's grade A USDA approved Liberalism in a nutshell.",
       "watch it. Now you're using my lines. Poet has always been an easy target, I will agree. ;)",
       'Because it will encourage teens to engage in riskier behavior. Abstinence until marriage is still the best way.',
       ...,
       'Sorry, I expanded my definitions here and was not polite enough to inform the rest of you :) I was not meaning to say JUST that nothing would be done about global warming (though I see now that it certainly appears that way). I am saying nothing will be done PERIOD. No preparation, nothing. We will wait until it is too late, act surprised when it happens, and then blame either the mexicans or the chinese. It is what we do.',
       'What we are left with are gods that have considerably little interaction with the world that we know.',
       '"They don\'t want state oversight," Wollmer said, "but they\'re more than willing to take state dollars."'], dtype=object)

In [15]:
def preprocess_text(text, tokenizer, stopwords=stopwords.words("english"), stemming=False, stemmer=PorterStemmer()):
    '''
    Remove stopwords from a string, tokenize it and optionally stem the tokens.
    
    Stopwords are removed before tokenization: the text is split on whitespace and every piece that is
    exactly equal to a stopword is dropped. The comparison is case-sensitive and punctuation attached to a
    word is not stripped first, so e.g. "The" or "the," are kept when only "the" is in the stopword list.
    
    Params:
    text -- string we are looking at 
    tokenizer -- string of either 'twitter' (NLTK TweetTokenizer) or 'word' (NLTK word_tokenize) to specify 
                which tokenizer to use
    stopwords -- collection of stopwords to remove, default is the NLTK english stopwords list
    stemming -- whether or not to perform stemming
    stemmer -- object with a stem(token) method, default is the PorterStemmer from NLTK
    
    Returns:
    tokens -- list of tokens of the stopword-free text; the tokens are stemmed when stemming is True
    
    Raises:
    ValueError -- if tokenizer is neither 'twitter' nor 'word'
    
    '''
    #pick the tokenizer first so that an unsupported name fails with a clear message instead of an
    #UnboundLocalError at the return statement
    if(tokenizer == 'twitter'):
        tokenize = TweetTokenizer().tokenize
    elif(tokenizer == 'word'):
        tokenize = word_tokenize
    else:
        raise ValueError("tokenizer must be 'twitter' or 'word', got %r" % (tokenizer,))
    
    #remove stopwords. A set gives constant time membership tests instead of scanning the list for every word
    stopword_set = set(stopwords)
    cleaned_text = ' '.join([word for word in text.split() if word not in stopword_set])
    
    tokens = tokenize(cleaned_text)
    
    #perform stemming
    if(stemming):
        return [stemmer.stem(token) for token in tokens]
    return tokens

In [16]:
#replace every raw string in quotes and responses by its list of tokens, in place. Entries that are no longer
#strings were already tokenized by an earlier run of this cell, so they are left alone; without this check
#re-running the cell would pass token lists to preprocess_text and crash on text.split()
for documents in (quotes, responses):
    for i in range(documents.shape[0]):
        if isinstance(documents[i], str):
            documents[i] = preprocess_text(documents[i], 'twitter')

In [18]:
def embed(corpus,sizeArg=300,windowArg=7,ngrams=3):
    
    '''
    This function trains a word2vec model on our corpus, optionally after merging frequent word sequences
    into single phrase tokens, and returns the learned word vectors as a matrix.
    
    Params:
    corpus -- our collection of tokenized documents (each document is a sequence of string tokens). It is 
            iterated several times, so it has to be re-iterable (a list or array, not a one-shot generator)
    sizeArg -- dimension of embedding. Default is 300. This is on the larger side of the spectrum because
            our corpus is fairly large (standard embedding size is between 100 and 300).
    windowArg -- context window used in embedding. Default is 7, which is a little large. We do this 
                because sarcasm has a very complex structure and consequently prediction of a token
                will require knowledge of a lot of the nearby tokens
    ngrams -- target phrase length. 1 means no phrase detection; n > 1 runs n-1 stacked phrase detection 
                passes (2 -> bigrams, 3 -> bigrams then trigrams, ...). Values above 5 are treated as 5 since
                there is a negligible amount of phrases longer than that. Default is three since not many 
                phrases are longer than three words
    
    Returns: embedding_matrix -- a float array of shape (vocabulary size, sizeArg). Row i holds the vector of 
                model.wv.index2word[i]; the vocabulary itself is not returned
    
    Raises: ValueError -- if ngrams is smaller than 1
    '''
    
    if ngrams < 1:
        raise ValueError("ngrams must be at least 1, got %r" % (ngrams,))
    
    #Every pass learns which adjacent token pairs to merge (Phrases), wraps the result in the faster Phraser
    #and rewrites the corpus with it. The next pass has to be trained on, and applied to, the output of the
    #previous one: a later phraser only knows pairs built on top of the earlier merges, so applying it to
    #the raw corpus would throw those earlier merges away. With ngrams == 1 the loop does not run at all
    training_corpus = corpus
    for _ in range(min(ngrams,5)-1):
        phraser = Phraser(Phrases(training_corpus))
        
        #materialise the transformed corpus so later passes and the word2vec epochs do not redo the merging
        training_corpus = list(phraser[training_corpus])
    
    model=Word2Vec(training_corpus,size=sizeArg,window=windowArg,min_count=5)
    
    #convert the wv word vectors into a numpy matrix, rows in the order of model.wv.index2word
    embedding_matrix = np.zeros((len(model.wv.vocab), sizeArg))
    for row, word in enumerate(model.wv.index2word):
        embedding_matrix[row] = model.wv[word]
    
    return embedding_matrix

In [19]:
def lexical_features(corpus,valence,emotion):
    
    '''
    This function will create all our lexical vectors.
    
    Params:
    corpus -- our collection of tokenized documents (each document is a sequence of string tokens)
    valence -- a matrix with 2 columns; words in one column and their corresponding AFINN rating in the other. 
                Note every element in this 2d numpy array is a string.
    emotion -- a matrix with 3 columns; emotion in the first column, a word associated with it in the second,
                and its correlation in the third. Note every element in this 2d numpy array is a string.
                
    Returns: (lexiconMat, emotions)
    lexiconMat -- array of shape (len(corpus), len(emotions)+1). Column k (k < len(emotions)) is the sum of the 
                correlations with emotions[k] over all tokens of the document; the last column is the sum of 
                the valence ratings of all tokens of the document
    emotions -- sorted array of the unique emotion names, i.e. the labels of the first len(emotions) columns
    
    '''
    
    #here we get all the unique emotions in the NRC hashtag database and remember which column each one owns
    emotions=np.unique(emotion[:,0])
    emotionColumn={name: col for col, name in enumerate(emotions.tolist())}
    valenceColumn=len(emotions)
    
    #index both lexicons by word once, so every token costs a dictionary lookup instead of a scan over the
    #whole lexicon. Scores stay as strings and are only converted for words that actually occur in the corpus.
    #If a word is listed more than once in the valence lexicon the first rating is used
    valenceScores={}
    for word, score in zip(valence[:,0].tolist(), valence[:,1].tolist()):
        valenceScores.setdefault(word, score)
    
    #a word can correlate with several emotions, so each word maps to a list of (column, score) pairs
    emotionScores={}
    for name, word, score in zip(emotion[:,0].tolist(), emotion[:,1].tolist(), emotion[:,2].tolist()):
        emotionScores.setdefault(word, []).append((emotionColumn[name], score))
    
    #this matrix will hold lexical features for each document. The first len(emotions) features represent 
    #the presence of a certain emotion in the document. 'Presence' in the document is measured using the 
    #sum of each word's correlation with a given emotion. The next feature holds the sum of the valence 
    #values of all the words in the document
    lexiconMat=np.zeros((len(corpus),len(emotions)+1))
    
    for i, document in enumerate(corpus):
        for token in document:
            
            #make our token lowercase since the lookups are exact matches against the lexicon words. 
            #Capitalization of a word shouldn't change its valence or the emotion it elicits
            word=token.lower()
            
            #add the rating of the word to the valence feature if it is in the AFINN database
            if word in valenceScores:
                lexiconMat[i,valenceColumn]+=float(valenceScores[word])
            
            #add the full correlation value to the feature of every emotion the word is listed with in the 
            #hashtag-emotion database (words that are not listed contribute nothing)
            for col, score in emotionScores.get(word, ()):
                lexiconMat[i,col]+=float(score)
                
    #we return emotions here as well so that we have a key for which column in lexiconMat corresponds to which emotion
    return (lexiconMat,emotions)

In [20]:
#train the word2vec embeddings on the tokenized quotes, using the default embedding settings
embeddings=embed(corpus=quotes)

#lexFeatures is the (feature matrix, emotion labels) tuple returned by lexical_features
lexFeatures=lexical_features(corpus=quotes,valence=afinnMat,emotion=emotMat)

In [120]:
#write the embedding matrix to disk as a .npy file
embeddingsPath=r'C:\Users\Jonathan\Desktop\Data Science 2\word2vec.npy'
np.save(embeddingsPath,embeddings)

#write the lexical feature matrix (first element of the lexFeatures tuple) to disk as a .npy file
lexicalFeaturesPath=r'C:\Users\Jonathan\Desktop\Data Science 2\lexicalFeatures.npy'
np.save(lexicalFeaturesPath,lexFeatures[0])