In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
import pickle
import textblob
import re
import string
import os
from textblob import TextBlob
from processing_comment import ProcessingComment
from stop_words import get_stop_words
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, \
                                            CountVectorizer

## Feature engineering

### Loading data

In [2]:
# Read the sarcasm corpus; the path is relative to the notebook's working
# directory.
data = pd.read_csv('data/train-balanced-sarcasm.csv')

In [3]:
# Drop every row that has a missing value in any column, then renumber the
# index from 0 so that row labels coincide with row positions.
data = data.dropna().reset_index(drop=True)

In [4]:
data.tail()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
1010768,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,..."
1010769,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...
1010770,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...
1010771,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...
1010772,1,"values, as in capitalism .. there is good mone...",frogking,politics,2,2,0,2009-01,2009-01-24 06:20:14,Why do the people who make our laws seem unabl...


In [5]:
print(len(data.author.unique()))
print(data.shape)

256560
(1010773, 11)


In [6]:
np.sum(data.label)

505368

In [7]:
# number of authors with more than four comments vs. authors with any comment
comments_per_author = data.author.value_counts()
print(np.sum(comments_per_author > 4),
      np.sum(comments_per_author > 0))

45338 256560


### textblob

In [None]:
# Pick the comment text by column name: a positional column index silently
# selects a different column whenever the column layout of the CSV changes.
first_try = TextBlob(data['comment'].iloc[1])

In [None]:
first_try.sentences

## sentiment

In [None]:
def sentiment_spread(comment):
    '''Summarise the TextBlob sentiment of a single comment.

    Returns a list ``[first, second, max, min, std]``: the first two
    entries of ``sentiment_assessments`` (polarity and subjectivity in
    TextBlob's result) followed by the maximum, minimum and standard
    deviation of the second field of every individual assessment.
    When TextBlob reports no assessment the three statistics are taken
    over a single 0.
    '''
    overall = TextBlob(comment).sentiment_assessments
    # fall back to one neutral value so the statistics stay defined
    scores = [assessment[1] for assessment in overall[2]] or [0]

    spread = np.std(scores)
    highest = np.max(scores)
    lowest = np.min(scores)

    return [overall[0], overall[1], highest, lowest, spread]

In [None]:
# Select the comment text by column name instead of by column position.
sentiment_spread(data['comment'].iloc[1])

In [None]:
first_try.sentiment_assessments

## bag of words + lexical cues

In [19]:
# Address both text columns by name; positional indices (1 and -1) depend on
# the exact column layout of the loaded file.
comments, parent_comments = data['comment'], data['parent_comment']

In [22]:
comments

0                                                 NC and NH.
1          You do know west teams play against west teams...
2          They were underdogs earlier today, but since G...
3          This meme isn't funny none of the "new york ni...
4                            I could use one of those tools.
5          I don't pay attention to her, but as long as s...
6              Trick or treating in general is just weird...
7                            Blade Mastery+Masamune or GTFO!
8          You don't have to, you have a good build, buy ...
9                          I would love to see him at lolla.
10         I think a significant amount would be against ...
11                            Damn I was hoping God was real
12                                      They have an agenda.
13                                               Great idea!
14         Ayy bb wassup, it makes a bit more sense in co...
15                                             what the fuck
16                      

### prototyping

In [23]:
testing = TextBlob(comments[10])

In [24]:
comments[13]

'Great idea!'

In [None]:
# toy sentence used to try out the three punctuation patterns
sample = ' aa bb cc ... dd . ee .. ff !?! r ?'
pat1 = r'[.?!",]+'    # any run of . ? ! " ,
pat2 = r'[!?]+'       # runs of ! and ? only
pat3 = r'\.{3,10}'    # ellipses: three to ten dots
match1, match2, match3 = (re.findall(pattern, sample)
                          for pattern in (pat1, pat2, pat3))

In [None]:
print(match1, match2, match3)
print(len(match1), len(match2), 
      len(match3)) # to get overall number of punctuation

In [None]:
# Join the first 10,000 comments with a space: without a separator the end of
# one comment and the start of the next can form an emoticon that neither
# comment contains.
one_string = ' '.join(map(str, comments[:10000]))
# show only a preview instead of dumping the whole text into the notebook
print(one_string[:1000])

In [None]:
# Emoticon pattern: one "eyes" character (: x X ; =), an optional "-" and one
# "mouth" character () D P p X /). The single capturing group wraps the whole
# match, so re.findall returns the complete emoticon.
emoji_pattern = r'((?::|x|X|;|=)(?:-)?(?:\)|D|P|p|X|/))'

In [None]:
emoji_match = re.findall(emoji_pattern, one_string)
all_emoji = [emoji for emoji in emoji_match 
             if emoji not in ['XP', 'xp', 'Xp']]

In [None]:
all_emoji

In [None]:
for word in testing.tags:
    if len(word[0]) > 2:
        print(word)

In [None]:
print(testing.lower()) # decapitalize
print(len(testing)) # get overall length

In [None]:
testing.word_counts

In [None]:
testing.words[8].lemmatize('v')

In [None]:
# first letter of the tag -> one-letter class (verb, noun, adjective, adverb);
# words whose tag starts with any other letter are not printed
short_tags = {'V': 'v', 'N': 'n', 'J': 'a', 'R': 'r'}
for word in testing.tags:
    short_tag = short_tags.get(word[1][:1])
    if short_tag is not None:
        word = (word[0], short_tag)
        print(word)

In [None]:
# keep only the purely alphabetic words
clean_words = [word for word in testing.words if word.isalpha()]

In [None]:
stops = list(get_stop_words('en'))         #About 900 stopwords
nltk_words = list(stopwords.words('english')) #About 150 stopwords
stops.extend(nltk_words)
filtered_words = [word for word in testing.words 
                  if word not in stops]

In [None]:
filtered_words

In [None]:
cap_pat = r'[A-Z]+'
cap_pat2 = r'[A-Z]'

In [None]:
caps = re.findall(cap_pat, comments[11])
caps2 = re.findall(cap_pat2, comments[11])

In [None]:
print(len(caps2))
# length of the longest run of capitals, 0 when there is none
# (idea: use something like length of comment / number of capitals)
max_len = max((len(run) for run in caps), default=0)
print(max_len)

### actual functions after prototyping

In [None]:
def only_bow_features(data):
    ''' Creates a raw text for BoW and PoS tags.

    Parameters
    ----------
    data : iterable of str
        The comments to process (pass a list/Series, not a bare string).

    Returns
    -------
    bow : list of str
        One space-joined string of cleaned tokens per comment.
    pos : list of str
        One space-joined string of PoS tags per comment, taken from the
        original (not lower-cased) text.

    Cleaning per comment: lower-case, stem every word, keep purely
    alphabetic stems, drop stems found in either stop word list and drop
    stems of one or two characters. The index of every 10,000th comment
    is printed as a progress counter.
    '''
    # both stop word lists in one set: membership is checked for every
    # token, so a list would be scanned linearly each time
    stops = set(get_stop_words('en'))
    stops.update(stopwords.words('english'))

    bow = []
    pos = []
    for j, comment in enumerate(data):
        # counter
        if j % 10000 == 0:
            print(j)

        text = TextBlob(comment)

        # stems are compared with the stop words after stemming
        stems = (word.stem() for word in text.lower().words)
        kept = [stem for stem in stems
                if stem.isalpha() and stem not in stops and len(stem) > 2]

        # pos tags
        tags = [tagged[1] for tagged in text.pos_tags]

        bow.append(' '.join(map(str, kept)))
        pos.append(' '.join(map(str, tags)))

    return bow, pos

In [5]:
def bow_and_lexical_features(data):
    ''' Creates a raw text for BoW, PoS tags and a set of lexical features.

    Parameters
    ----------
    data : iterable of str
        The comments to process (pass a list/Series, not a bare string).

    Returns
    -------
    lexical : list of list of float
        Six ratios per comment, each a count divided by the number of
        characters in the comment: runs of capitals, single capitals,
        runs of ``. ? ! " ,``, runs of ``! ?``, ellipses (3 to 10 dots)
        and emoticons. An empty comment yields six zeros.
    bow : list of str
        One space-joined string of cleaned tokens per comment.
    pos : list of str
        One space-joined string of PoS tags per comment, taken from the
        original (not lower-cased) text.

    Cleaning per comment: lower-case, stem every word, keep purely
    alphabetic stems, drop stems found in either stop word list and drop
    stems of one or two characters. The index of every 10,000th comment
    is printed as a progress counter.
    '''
    # final output
    lexical = []
    bow = []
    pos = []

    # both stop word lists in one set for constant-time membership tests
    stops = set(get_stop_words('en'))
    stops.update(stopwords.words('english'))

    # patterns for matching, compiled once instead of once per comment
    count_patterns = [
        re.compile(r'[A-Z]+'),       # runs of capitals
        re.compile(r'[A-Z]'),        # single capitals
        re.compile(r'[.?!",]+'),     # runs of punctuation
        re.compile(r'[!?]+'),        # runs of ! and ?
        re.compile(r'\.{3,10}'),     # ellipses
    ]
    emoji_pattern = re.compile(r'((?::|x|X|;|=)(?:-)?(?:\)|D|P|p|X|/))')
    # matches of the emoticon pattern that are not counted as emoticons
    not_emoji = {'XP', 'xp', 'Xp'}

    for j, comment in enumerate(data):
        # counter
        if j % 10000 == 0:
            print(j)

        ###### lexical cues
        counts = [len(pattern.findall(comment))
                  for pattern in count_patterns]
        counts.append(sum(1 for emoji in emoji_pattern.findall(comment)
                          if emoji not in not_emoji))

        # normalise by length; an empty comment has nothing to count and
        # would otherwise divide by zero
        length = len(comment)
        if length:
            len_vars = [count / length for count in counts]
        else:
            len_vars = [0.0] * len(counts)

        ##### comment cleaning
        text = TextBlob(comment)

        # stems are compared with the stop words after stemming
        stems = (word.stem() for word in text.lower().words)
        kept = [stem for stem in stems
                if stem.isalpha() and stem not in stops and len(stem) > 2]

        # pos tags
        tags = [tagged[1] for tagged in text.pos_tags]

        lexical.append(len_vars)
        bow.append(' '.join(map(str, kept)))
        pos.append(' '.join(map(str, tags)))

    return lexical, bow, pos

#### a bit of prototyping again

In [None]:
text = TextBlob(comments[2247])

In [None]:
text.tags

In [None]:
# keep only the tag of every (word, tag) pair and join them into one string
pos_tags = [tag for _, tag in text.pos_tags]
pos_list = ' '.join(map(str, pos_tags))

In [None]:
# The function iterates over its argument, so a single comment has to be
# wrapped in a list; a bare string would be processed character by character.
lexical, bow, pos = bow_and_lexical_features([comments[2249]])

#### actual functions again

In [None]:
def create_bow_classic(array_bow, array_pos):
    '''Turn cleaned text and PoS strings into raw count matrices.

    Each input is fitted with its own ``CountVectorizer``; the result is
    the pair ``(text_matrix, pos_matrix)``.
    '''
    text_matrix = CountVectorizer().fit_transform(array_bow)
    tag_matrix = CountVectorizer().fit_transform(array_pos)

    return text_matrix, tag_matrix

In [18]:
def create_bow(array_bow, array_pos):
    '''Turn cleaned text and PoS strings into TF-IDF matrices.

    Each input is fitted with its own ``TfidfVectorizer``; the result is
    the pair ``(text_matrix, pos_matrix)``.
    '''
    text_matrix = TfidfVectorizer().fit_transform(array_bow)
    tag_matrix = TfidfVectorizer().fit_transform(array_pos)

    return text_matrix, tag_matrix

In [9]:
def create_bow2(array_bow):
    '''Fit a fresh ``TfidfVectorizer`` on the cleaned text and return
    the resulting TF-IDF matrix.'''
    return TfidfVectorizer().fit_transform(array_bow)

## similarity measures

In [None]:
# Load the spaCy pipeline used for all similarity features.
# NOTE(review): an earlier version of this comment said the vanilla model that
# comes with spaCy is enough, but the model requested here is the large
# English one (`en_core_web_lg`) — confirm which model is intended.
nlp = spacy.load('en_core_web_lg')

#### prototyping

In [None]:
tokens_parent = nlp(parent_comments[7])

In [None]:
len(tokens_parent[0])

In [None]:
tokens = nlp(comments[7])

In [None]:
tokens.similarity(nlp(parent_comments[7]))

In [None]:
def comment_similarity(comment, parent_comment):
    '''Return what ``comment.similarity(parent_comment)`` reports.'''
    return comment.similarity(parent_comment)

In [None]:
comment_similarity(tokens, tokens_parent)

In [None]:
for token in tokens:
    print(token.text, token.has_vector, 
          token.vector_norm, token.tag_)

In [None]:
tokens[4].text == tokens[8].text

#### actual function

In [None]:
def _pairwise_similarities(words):
    '''Distinct similarity scores over all ordered pairs of ``words``.

    A pair is scored only when its second member is longer than one
    character.
    '''
    scores = set()
    for k, word in enumerate(words):
        for i, other_word in enumerate(words):
            if i != k and len(other_word) > 1:
                scores.add(word.similarity(other_word))
    return list(scores)


def get_similarity(tokens):
    '''Computes maximum and minimum similarity within a comment.

    Parameters
    ----------
    tokens : iterable of tokens
        A comment transformed by spaCy; every token must offer ``text``,
        ``tag_``, ``similarity()`` and ``len()``.

    Returns
    -------
    list
        ``[max_noun, min_noun, max_verb, min_verb]`` - a flat list with the
        largest and smallest pairwise similarity among the nouns and
        among the verbs. A group with fewer than two usable words
        contributes 0 for both values.

    Only the first occurrence of every token text is considered; tokens
    are grouped by the first letter of their tag (``V`` verbs, ``N`` nouns),
    everything else is ignored.
    '''
    seen_texts = set()
    list_nouns = []
    list_verbs = []
    for token in tokens:
        if token.text in seen_texts:
            continue
        seen_texts.add(token.text)
        tag = token.tag_
        if tag.startswith('V'):
            list_verbs.append(token)
        elif tag.startswith('N'):
            list_nouns.append(token)

    # if no verb/noun pair is present fall back to a single 0
    similarity_nouns = _pairwise_similarities(list_nouns) or [0]
    similarity_verbs = _pairwise_similarities(list_verbs) or [0]

    return [np.max(similarity_nouns), np.min(similarity_nouns),
            np.max(similarity_verbs), np.min(similarity_verbs)]

In [None]:
get_similarity(tokens)

In [None]:
get_similarity(tokens_parent)

## wrapper functions

In [None]:
'''
sentiment_spread - one comment
bow_and_lexical_features - vector of comments
create_bow - output from bow_and_lexical_features
get_similarity - tokens, comment transformed by nlp() command
'''

In [None]:
def sentiment_all_data(data):
    '''Run sentiment_spread() on every comment and parent comment.

    Input is the whole data set; the result is a pair of lists
    ``(comment_sent, parent_sent)`` with one entry per row. Every row
    whose index is a multiple of 1000 is printed as a progress counter.
    '''
    per_row = []
    for index, row in data.iterrows():
        if index % 1000 == 0:
            print(index)
        per_row.append((sentiment_spread(row['comment']),
                        sentiment_spread(row['parent_comment'])))

    comment_sent = [own for own, _ in per_row]
    parent_sent = [parent for _, parent in per_row]
    return comment_sent, parent_sent

In [None]:
# the sentiment_all_data() function applied to both parent and original
# comments and the result saved for later use
orig_sentiment, par_sentiment = sentiment_all_data(data)

# `with` closes every file, so the pickles are flushed to disk and no handle
# is left open
for file_name, payload in (("orig_sentiment", orig_sentiment),
                           ("par_sentiment", par_sentiment)):
    with open(file_name, "wb") as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def lex_bow_pos_all(data):
    '''Run bow_and_lexical_features() on both text columns of the data set.

    Returns ``(lexical, lexical_par, bow, pos, bow_par, pos_par)`` where the
    ``_par`` values belong to the parent comments.
    '''
    (lexical, bow, pos), (lexical_par, bow_par, pos_par) = (
        bow_and_lexical_features(data[column])
        for column in ('comment', 'parent_comment'))

    return lexical, lexical_par, bow, pos, bow_par, pos_par

In [None]:
def lex_bow_pos_all_2(bow, pos, bow_par, pos_par):
    '''Build the TF-IDF matrices for comments and parent comments.

    Passes the cleaned text and PoS tags of the comments, then those of
    the parent comments, to create_bow() and returns
    ``(bow_matrix, pos_matrix, bow_matrix_par, pos_matrix_par)``.
    '''
    own_text, own_tags = create_bow(bow, pos)
    parent_text, parent_tags = create_bow(bow_par, pos_par)

    return own_text, own_tags, parent_text, parent_tags

In [None]:
def bow_pos_all(data):
    ''' Gets cleaned text and PoS from data set

    Runs only_bow_features() on the ``comment`` and ``parent_comment``
    columns of ``data`` and returns ``(bow, pos, bow_par, pos_par)``: the
    cleaned text and the PoS tag strings of the comments, followed by
    those of the parent comments. The result is the input expected by
    bow_pos_all_2().
    '''
    # work on the argument itself rather than on notebook globals
    bow, pos = only_bow_features(data['comment'])
    bow_par, pos_par = only_bow_features(data['parent_comment'])

    return bow, pos, bow_par, pos_par

In [None]:
def bow_pos_all_2(bow, pos, bow_par, pos_par):
    '''Build the raw count matrices for comments and parent comments.

    Passes the cleaned text and PoS tags of the comments, then those of
    the parent comments, to create_bow_classic() and returns
    ``(bow_matrix, pos_matrix, bow_matrix_par, pos_matrix_par)``.
    '''
    own_text, own_tags = create_bow_classic(bow, pos)
    parent_text, parent_tags = create_bow_classic(bow_par, pos_par)

    return own_text, own_tags, parent_text, parent_tags

In [None]:
# the lex_bow_pos_all() function applied to both parent and original
# comments and the result saved for later use
lexical, lexical_parent, bow, pos, bow_parent, \
        pos_parent = lex_bow_pos_all(data)

# `with` closes every file, so the pickles are flushed to disk and no handle
# is left open
for file_name, payload in (("lexical", lexical),
                           ("lexical_parent", lexical_parent),
                           ("bow", bow),
                           ("pos", pos),
                           ("bow_parent", bow_parent),
                           ("pos_parent", pos_parent)):
    with open(file_name, "wb") as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# the lex_bow_pos_all_2() function applied to both parent and original
# comments and the result saved for later use
bow_matrix, pos_matrix, bow_matrix_par, pos_matrix_par = \
    lex_bow_pos_all_2(bow, pos, bow_parent, pos_parent)

# `with` closes every file, so the pickles are flushed to disk and no handle
# is left open
for file_name, payload in (("bow_matrix", bow_matrix),
                           ("pos_matrix", pos_matrix),
                           ("bow_matrix_par", bow_matrix_par),
                           ("pos_matrix_par", pos_matrix_par)):
    with open(file_name, "wb") as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# bow_pos_all() applied to the whole data set; the four results are only kept
# in memory here and are consumed by the next cell (nothing is written to
# disk in this cell)
bow_cl, pos_cl, bow_parent_cl, pos_parent_cl = bow_pos_all(data)

In [None]:
# the bow_pos_all_2() function applied to cleaned data from par and orig
# comments and the result saved for later use
bow_matrix_cl, pos_matrix_cl, bow_matrix_par_cl, pos_matrix_par_cl = \
    bow_pos_all_2(bow_cl, pos_cl, bow_parent_cl, pos_parent_cl)

# `with` closes every file, so the pickles are flushed to disk and no handle
# is left open
for file_name, payload in (("bow_matrix_cl", bow_matrix_cl),
                           ("pos_matrix_cl", pos_matrix_cl),
                           ("bow_matrix_par_cl", bow_matrix_par_cl),
                           ("pos_matrix_par_cl", pos_matrix_par_cl)):
    with open(file_name, "wb") as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

### similarity

In [None]:
def similarity_all(data):
    '''Run the similarity features over the entire data set.

    Every comment and parent comment is passed through ``nlp``; the
    result is ``(comment_sim, parent_sim, to_par_sim)`` holding, per row,
    get_similarity() of the comment, get_similarity() of the parent and
    the similarity of the comment to its parent. Every row whose index
    is a multiple of 1000 is printed as a progress counter.
    '''
    per_row = []
    for index, row in data.iterrows():
        if index % 1000 == 0:
            print(index)
        comment = nlp(row['comment'])
        par_comment = nlp(row['parent_comment'])
        per_row.append((get_similarity(comment),
                        get_similarity(par_comment),
                        comment.similarity(par_comment)))

    comment_sim = [entry[0] for entry in per_row]
    parent_sim = [entry[1] for entry in per_row]
    to_par_sim = [entry[2] for entry in per_row]
    return comment_sim, parent_sim, to_par_sim

In [None]:
# the similarity_all() function applied to cleaned data from par and orig
# comments and the result saved for later use
orig_simil, par_simil, com_to_par_sim = similarity_all(data)

# `with` closes every file, so the pickles are flushed to disk and no handle
# is left open
for file_name, payload in (("orig_simil", orig_simil),
                           ("par_simil", par_simil),
                           ("com_to_par_sim", com_to_par_sim)):
    with open(file_name, "wb") as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

## user embeddings based measure

In [None]:
from scipy.spatial.distance import cosine

### prototyping

In [None]:
list_[0]['Always_the_NewGuy'][0].reshape(-1, )

In [None]:
cosine(list_[0]['Always_the_NewGuy'],
       list_[0]['Always_smooth'])

In [None]:
# Load one pickled batch of user embeddings into `list_`.
# NOTE(review): the two cells above already read `list_`, so on a fresh
# kernel this cell has to be run before them.
# NOTE(review): pickle.load can execute arbitrary code - only open files from
# a trusted source.
with open("data/user_embeddings/AmazingAlo",'rb') as df:
    list_ = pickle.load(df)

In [None]:
list_[0]['Always_smooth']

In [None]:
# nlp() is given one text at a time everywhere else in this notebook, so pass
# a single comment rather than a Series slice
smth = nlp(data['comment'].iloc[5])

In [None]:
np.sum(smth.tensor, axis=0).shape

In [None]:
data.loc[555, 'author']

In [None]:
sub_index = list(data[data['author'] == 'Trumpbart'].index.values)

In [None]:
test_data = data.iloc[:10, :]

In [None]:
test_data.loc[sub_index, :]

In [None]:
# column vector with one slot per row of `data`
test_np = np.zeros((data.shape[0], 1))
# NOTE(review): the right-hand side is a 2x1 array, so this presumably relies
# on `sub_index` selecting exactly two rows - confirm
test_np[sub_index] = np.array((1, 3)).reshape((-1, 1))
sum(test_np)

### actual function

In [None]:
def user_deviation(d):
    '''Cosine distance between each comment and its author's embedding.

    Parameters
    ----------
    d : DataFrame
        Data set with the columns ``author`` and ``comment``.

    Returns
    -------
    numpy array of shape ``(len(d), 1)``
        Row ``i`` holds ``scipy.spatial.distance.cosine`` (a distance, i.e.
        one minus the cosine similarity) between the summed spaCy tensor
        of the i-th comment and the embedding of its author. Rows whose
        author has no embedding stay 0.

    Every file in ``data/user_embeddings`` is unpickled in turn and its
    name printed; ``list_[j]`` of each file is read as a mapping from user
    name to embedding.
    '''
    user_simil = np.zeros((d.shape[0], 1))

    # row positions of every author's comments, gathered once instead of
    # filtering the whole frame again for each user; positions (not index
    # labels) are what the numpy result array has to be addressed with
    positions_by_author = d.groupby('author', sort=False).indices
    comments = d['comment']

    embeddings_dir = 'data/user_embeddings'
    for dict_ in os.listdir(embeddings_dir):
        # NOTE(review): pickle.load can execute arbitrary code - only keep
        # trusted files in this directory
        with open(os.path.join(embeddings_dir, dict_), 'rb') as df:
            list_ = pickle.load(df)
        print(dict_)

        for j in range(len(list_)):
            for user, embedding in list_[j].items():
                positions = positions_by_author.get(user)
                if positions is None:
                    # this user wrote none of the comments in `d`
                    continue
                simil = [cosine(np.sum(nlp(comment).tensor, axis=0),
                                embedding)
                         for comment in comments.iloc[positions]]
                user_simil[positions] = np.array(simil).reshape((-1, 1))

    return user_simil

In [None]:
user_similarity = user_deviation(data)

In [None]:
# `with` closes the file, so the pickle is flushed to disk and no handle is
# left open
with open("user_similarity", "wb") as handle:
    pickle.dump(user_similarity, handle, protocol=pickle.HIGHEST_PROTOCOL)

## subreddit

### prototyping

In [5]:
# share of label sum over label count within every subreddit
labels_by_subreddit = data.groupby('subreddit')['label']
prevalence = labels_by_subreddit.sum() / labels_by_subreddit.count()

In [8]:
prevalence.index

Index(['07Scape', '0x10c', '0x3642', '100DaysofKeto', '100pushups',
       '100thieves', '1022', '10cloverfieldlane', '10pm', '112263Hulu',
       ...
       'zombiemanic', 'zombies', 'zooeydeschanel', 'zookeeperbattle',
       'zoology', 'zoophilia', 'zootopia', 'zweiteliga', 'zyramains', 'zyzz'],
      dtype='object', name='subreddit', length=14876)

### actual function

In [14]:
def prevalence_vector_ordered(d):
    '''Calculates the prevalence of sarcasm and returns ordered vector

    Parameters
    ----------
    d : DataFrame
        Data set with the columns ``subreddit`` and ``label``.

    Returns
    -------
    numpy array of shape ``(len(d), 1)``
        Row ``i`` holds the prevalence (sum of ``label`` divided by the
        number of labels) of the subreddit the i-th row belongs to, in
        the row order of ``d``. Rows without a subreddit get 0.
    '''
    labels_by_subreddit = d.groupby('subreddit')['label']
    prevalence = labels_by_subreddit.sum() / labels_by_subreddit.count()

    # look every row's subreddit up in one vectorised pass instead of
    # filtering the whole frame once per subreddit; the result follows the
    # row order of `d` whatever its index looks like
    per_row = d['subreddit'].map(prevalence).astype(float)
    # a missing subreddit belongs to no group
    per_row = np.where(d['subreddit'].isna(), 0.0, per_row)

    return np.asarray(per_row, dtype=float).reshape((-1, 1))

In [15]:
smth = prevalence_vector_ordered(data)

In [16]:
smth.shape

(1010773, 1)

In [18]:
# `with` closes the file, so the pickle is flushed to disk and no handle is
# left open
with open("subreddit_prevalence", "wb") as handle:
    pickle.dump(smth, handle, protocol=pickle.HIGHEST_PROTOCOL)