# Rhetorical Question or Not?

## Imports and func defs

In [9]:
import numpy as np
import preprocessor as tweetproc
import re

# import torch
# import torch.autograd as autograd
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim


# torch.manual_seed(1)

MODAL_VERBS =  set(["would", "will", "shall", "should", "can", "could", "may", "might", "ought", "must"])
NEGATIONS = None

def load_negations(filename=None):
    if filename is None:
        filename = '../aligned_antonym_detect/negations.txt'
    global NEGATIONS
    with open(filename, 'r') as f: 
        ants = f.readlines()
        ants = [a.strip() for a in ants]
        NEGATIONS = set(ants)

def load_data(rq_file, not_rq_file):
    """
    Files contain tweet text on each line
    returns samples, labels ([0,1] is RQ, [1,0] is not RQ)
    """
    with open(rq_file, 'r') as f:
        rq = f.readlines()
        rq = [l.strip() for l in rq]
    with open(not_rq_file, 'r') as f:
        not_rq = f.readlines()
        not_rq = [l.strip() for l in not_rq]
    print("RQ: {}\nNot RQ: {}".format(len(rq), len(not_rq)))
    rq_labels = [[0, 1]]*len(rq)
    not_rq_labels  = [[1, 0]]*len(not_rq)
    x = rq + not_rq
    y = rq_labels + not_rq_labels
    print("Total samples: {}".format(len(x)))
    return x, y

def parse_embeddings(filename, embedding_dim=100):
    """
    format: word 0.3 0.8 0.1 ... \n
    """
    with open(filename, 'r') as f:
        lines = f.readlines()
        lines = [l.strip() for l in lines]
    print('Found {} lines in embedding file {}'.format(len(lines)-1, filename))
    
    embeddings = {}
    for line in lines[1:]: # first line is info
        toks = line.split(' ')
        assert len(toks) == (embedding_dim+1), "Embedding length invalid: {}".format(len(toks))
        vec = toks[1:]
        vec = [float(v) for v in vec]
        embeddings[toks[0]] = np.array(vec)
    
    return embeddings

def visualize(x, y, n=10):
    for i in range(0, len(x), len(x)/n):
        print np.argmax(y[i]), x[i],'\n'
        
def remove_taboo(x):
    taboo_list = ['#sarcasm', '#Sarcasm', '#SARCASM', '#sarcastic', '&amp;']
    for taboo in taboo_list:
        if taboo in x:
            x = x.replace(taboo, '')
    return x

def replace_identifiers(x):
    replace_list = [('$MENTION$', 'TOUSER'),
                    ('$URL$', 'URL'),
                    ('$NUMBER$', 'NUMBER')]
    for tok, sub in replace_list:
        x = x.replace(tok, sub)
    return x

def contains_digit(w):
    return bool(re.search(r'\d', w))

def contains_alpha(w):
    return bool(re.search(r'[a-zA-Z]',w))     




## Load embeddings, vocab, negations

In [10]:
load_negations()
print 'Num modal verbs: ', len(MODAL_VERBS)
print 'Num negations: ', len(NEGATIONS)

emb = parse_embeddings('../tweet_embeddings.txt')
vocab = list(emb.keys())
vocab.sort()
print('Vocab size: {}'.format(len(vocab)))
sample_emb = emb['😊']
print 'embedding dim: ', sample_emb.shape
print 'sample embedding: ', sample_emb

Num modal verbs:  10
Num negations:  74
Found 105698 lines in embedding file ../tweet_embeddings.txt
Vocab size: 105698
sample embedding:  [-0.091999 -0.090327 -0.116107  0.122594 -0.179668 -0.270206 -0.223126
 -0.50453   0.062491  0.070141 -0.084368 -0.105765 -0.214917  0.19564
 -0.115645  0.163886  0.019436  0.300362  0.168865  0.030936  0.274668
  0.049551  0.180642  0.354748 -0.203798  0.50992   0.162908  0.085134
  0.100754 -0.27362   0.067393  0.126837 -0.227833  0.112674  0.061513
 -0.185428  0.303224 -0.329283  0.081322 -0.01414  -0.51638  -0.077393
  0.054275  0.232466  0.022979 -0.235335  0.28811   0.538704  0.381486
 -0.027811 -0.006613  0.223376  0.227421  0.017996  0.093377 -0.13187
  0.140716  0.061507  0.073246 -0.063014 -0.187818  0.124506  0.229867
 -0.099627 -0.008644  0.295628 -0.058905  0.081961  0.459971  0.340804
 -0.267998 -0.453933  0.009051  0.182248  0.070721  0.027988  0.028843
  0.207732  0.242723 -0.111458 -0.156547  0.152425 -0.056641 -0.091101
  0.067743 

## Load tweets and do processing

In [11]:
rq_file = 'rq.txt.dedup'
not_rq_file = 'not_rq.txt.dedup'

tweets,y = load_data(rq_file, not_rq_file)
y_integer = [np.argmax(y_i) for y_i in y ] # store integer labels instead of one-hot vec
y_integer = np.array(y_integer)

tweets = [remove_taboo(t) for t in tweets]
visualize(tweets, y)

RQ: 3402
Not RQ: 4081
Total samples: 7483
1 @cmigbear You’re not going to give Amazon access to your house?!?   

1 @scottsauls "Hunting"?  What are we animals or something!? #Offended  

1 @business More contentious than Melanoma's coat? Yes or no??? #snark  #fashionfauxpas 

1 @JanineZeeCheng But the weather was SOOOOO bad!!  Why can't that fine, clearly-concerned gentleman make sure his date stays warm?!? #mansplaining  

1 I can still use this in my car right?  https://t.co/B0OOAl74Mb 

0 @Dr_Ehmad Govt. is at fault. They made a mistake. No? 

0 this guy walked up to me and was coming way to close  he shaked my hand and wouldn’t let it go?¿ he also asked for my number and I told him no but he didn’t accept it 

0 RT @jlist: Which hairstyle do you like? https://t.co/hZojrnfXpm 

0 #MAGA Dear @USAGSessions, why are you dragging your feet on Hillary, Obama,  DNC, while we spare no expense to find  dirt on Trump? 

0 RT @Brother_Kin: 🔴LIVE!  👉Mystery game stream is a go! Game 1: Now Pl

In [12]:
def replace_special(tweets):
    """
    URL: p.OPT.URL
    Mention: p.OPT.MENTION
    Hashtag: p.OPT.HASHTAG
    Reserved Words: p.OPT.RESERVED
    Emoji: p.OPT.EMOJI
    Smiley: p.OPT.SMILEY
    Number: p.OPT.NUMBER
    """
    tweetproc.set_options(tweetproc.OPT.URL, tweetproc.OPT.MENTION)
    tweets = [replace_identifiers(tweetproc.tokenize(t)) for t in tweets]
    return tweets

In [13]:
tweets = replace_special(tweets)
visualize(tweets, y, n = 100)

1 TOUSER You’re not going to give Amazon access to your house?!? 

1 Starting to see TOUSER philosophy at work now. TOUSER with 14 9 at halftime. Not even sure why he's on the floor?! 

1 TOUSER Well Sean, if you look at BOTH sides of da science I'm sure there's a "deal" we can make with nature right? #climateChangeIsReal 

1 Oh really? Who would've guessed Glad to see research examining validating what many students of Color share everyday #RacismInEd URL 

1 RT TOUSER: Going out on a limb, but could it be because she is a black woman? #ImpeachTrump URL 

1 TOUSER Alright. Who’s paying you guys to say that now? 

1 UNC gets away with academic fraud, now Duke gets away with this. Is there no justice? #seriously #almostasbad URL 

1 Wait, we learned NOTHING from the TOUSER after game 1!!??? WTF, 

1 TOUSER TOUSER or just kidding? More like a lifetime ban on cabinet members flying commercial anywhere. 

1 We all know Misha's OTP is Trutin. Or is it Pump? TOUSER 😂😂😂 #Collusion #TrumpRussi

## Tokenize

In [14]:
import nltk
def tokenize_tweets(tweets):
    tweet_tokens = [nltk.word_tokenize(t.decode('utf-8')) for t in tweets]
    return tweet_tokens

In [15]:
tweet_tokens = tokenize_tweets(tweets)
visualize(tweet_tokens, y, n=100)

1 [u'TOUSER', u'You', u'\u2019', u're', u'not', u'going', u'to', u'give', u'Amazon', u'access', u'to', u'your', u'house', u'?', u'!', u'?'] 

1 [u'Starting', u'to', u'see', u'TOUSER', u'philosophy', u'at', u'work', u'now', u'.', u'TOUSER', u'with', u'14', u'9', u'at', u'halftime', u'.', u'Not', u'even', u'sure', u'why', u'he', u"'s", u'on', u'the', u'floor', u'?', u'!'] 

1 [u'TOUSER', u'Well', u'Sean', u',', u'if', u'you', u'look', u'at', u'BOTH', u'sides', u'of', u'da', u'science', u'I', u"'m", u'sure', u'there', u"'s", u'a', u'``', u'deal', u"''", u'we', u'can', u'make', u'with', u'nature', u'right', u'?', u'#', u'climateChangeIsReal'] 

1 [u'Oh', u'really', u'?', u'Who', u'would', u"'ve", u'guessed', u'Glad', u'to', u'see', u'research', u'examining', u'validating', u'what', u'many', u'students', u'of', u'Color', u'share', u'everyday', u'#', u'RacismInEd', u'URL'] 

1 [u'RT', u'TOUSER', u':', u'Going', u'out', u'on', u'a', u'limb', u',', u'but', u'could', u'it', u'be', u'because', u

## Extract handcrafted features

In [23]:
def is_negation(w):
    global NEGATIONS
    if w.lower() in NEGATIONS:
        return True
    else:
        return False

def is_modal(w):
    global MODAL_VERBS
    if w.lower() in MODAL_VERBS:
        return True
    else:
        return False

def handcrafted_features(tok_tweets):
    """
    Returns samples,2 matrix
    1st col: MODAL present in tweet then 1
    2nd col: NEGATION present in tweet then 1
    3rd col: position of 1st question mark / total num words
    """
    f = np.zeros((len(tok_tweets), 3))
    for i,toks in enumerate(tok_tweets):
        features_set = 0
        for tok in toks:
            if is_modal(tok):
                f[i][0] = 1
                features_set += 1
            if is_negation(tok):
                f[i][1] = 1
                features_set += 1
            if features_set == 2:
                break
        try:
            first_pos = toks.index('?') + 1.0
            f[i][2] = first_pos / len(toks)
        except ValueError:
            f[i][2] = 1

    return f

In [24]:
hand_feats = handcrafted_features(tweet_tokens)

In [28]:
print hand_feats.shape
hand_feats[:10]

(7483, 3)


array([[ 0.        ,  1.        ,  0.875     ],
       [ 0.        ,  1.        ,  0.45454545],
       [ 1.        ,  0.        ,  0.63333333],
       [ 0.        ,  0.        ,  0.375     ],
       [ 0.        ,  1.        ,  0.4375    ],
       [ 0.        ,  0.        ,  0.95      ],
       [ 0.        ,  0.        ,  0.42857143],
       [ 0.        ,  1.        ,  0.67567568],
       [ 1.        ,  0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.67741935]])

## Convert to embedding

In [29]:
def tweets2emb(tweet_tokens, emb):  
    """
    Convert list of tokenized tweets to sum of embedding
    tweet_tokens: list of list of words
    emb: embedding table
    
    returns (num_samples x emb_dim)
    """
    x = []
    for toks in tweet_tokens:
        x_i = np.zeros((1,100))
        for tok in toks:
            # handle captilized words
            if tok.upper() != tok:
                tok = tok.lower()

            if tok in emb:
                x_i += emb[tok]

            else:
                if tok[0] == '#':
                    x_i += emb['HASHTAG']

                elif tok == 'TOUSER':
                    x_i += emb['@touser']

                elif tok == 'TOUSER':
                    x_i += emb['@touser']

                elif contains_digit(tok):
                    x_i += emb['NUMBER']

                elif not contains_alpha(tok):
                    x_i += emb['CHARACTER']

                else:
                    x_i += emb['UNKNOWN']     
        x.append(x_i)
    x = np.concatenate(x, axis=0)
    return x

In [30]:
x = tweets2emb(tweet_tokens, emb)
print x.shape
print y_integer.shape

(7483, 100)
(7483,)


## Concatenate embeddings and handcrafted features

In [32]:
x = np.concatenate([x, hand_feats], axis=1)
print x.shape

(7483, 103)


# Classify with SVM

In [33]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support as score

x_train, x_test, y_train, y_test = train_test_split(
    x, y_integer, test_size=0.33, random_state=42)

In [44]:
clf = svm.SVC(class_weight='balanced') # default is rbf kernel
clf.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [45]:
# clf.score(x_test, y_test)
pred = clf.predict(x_test)
precision, recall, f1, _ = score(y_test, pred)
precisiont, recallt, f1t, _ = score(y_test, pred, average='macro')
print 'Score\tNot RQ  \tRQ  \t        Overall'
print 'precis\t{}\t{}\t{}'.format(precision[0], precision[1], precisiont)
print 'recall\t{}\t{}\t{}'.format(recall[0], recall[1], recallt)
print 'f1\t{}\t{}\t{}'.format(f1[0], f1[1], f1t)

Score	Not RQ  	RQ  	        Overall
precis	0.756026296567	0.740236148955	0.748131222761
recall	0.783497350492	0.709312445605	0.746404898048
f1	0.769516728625	0.724444444444	0.746980586534


#### Try linear svm

In [42]:
lin_clf = svm.SVC(kernel='linear', class_weight='balanced') # default is rbf kernel
lin_clf.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [43]:
pred_lin = lin_clf.predict(x_test)
precision, recall, f1, _ = score(y_test, pred_lin)
precisiont, recallt, f1t, _ = score(y_test, pred_lin, average='macro')
print 'Score\tNot RQ  \tRQ  \t        Overall'
print 'precis\t{}\t{}\t{}'.format(precision[0], precision[1], precisiont)
print 'recall\t{}\t{}\t{}'.format(recall[0], recall[1], recallt)
print 'f1\t{}\t{}\t{}'.format(f1[0], f1[1], f1t)

Score	Not RQ  	RQ  	        Overall
precis	0.734678624813	0.701413427562	0.718046026187
recall	0.7441332324	0.691035683203	0.717584457801
f1	0.739375705152	0.696185883384	0.717780794268


## [WIP] LSTM

In [8]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

In [9]:
class RQClassifier(nn.Module):

    def __init__(self):
        super(RQClassifier, self).__init__()


    def forward(self, sentence):


IndentationError: expected an indented block (<ipython-input-9-317c0a85bdff>, line 7)

In [None]:
# model = RQClassifier(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
# loss_function = nn.NLLLoss()
# optimizer = optim.SGD(model.parameters(), lr=0.1)

# # See what the scores are before training
# # Note that element i,j of the output is the score for tag j for word i.
# inputs = prepare_sequence(training_data[0][0], word_to_ix)
# tag_scores = model(inputs)
# print(tag_scores)

# for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
#     for sentence, tags in training_data:
#         # Step 1. Remember that Pytorch accumulates gradients.
#         # We need to clear them out before each instance
#         model.zero_grad()

#         # Also, we need to clear out the hidden state of the LSTM,
#         # detaching it from its history on the last instance.
#         model.hidden = model.init_hidden()

#         # Step 2. Get our inputs ready for the network, that is, turn them into
#         # Variables of word indices.
#         sentence_in = prepare_sequence(sentence, word_to_ix)
#         targets = prepare_sequence(tags, tag_to_ix)

#         # Step 3. Run our forward pass.
#         tag_scores = model(sentence_in)

#         # Step 4. Compute the loss, gradients, and update the parameters by
#         #  calling optimizer.step()
#         loss = loss_function(tag_scores, targets)
#         loss.backward()
#         optimizer.step()

# # See what the scores are after training
# inputs = prepare_sequence(training_data[0][0], word_to_ix)
# tag_scores = model(inputs)
# # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
# #  for word i. The predicted tag is the maximum scoring tag.
# # Here, we can see the predicted sequence below is 0 1 2 0 1
# # since 0 is index of the maximum value of row 1,
# # 1 is the index of maximum value of row 2, etc.
# # Which is DET NOUN VERB DET NOUN, the correct sequence!
# print(tag_scores)

## Load test data, clean it, evaluate on it

In [46]:
rq_file2 = 'rq_test.txt'
not_rq_file2 = 'notrq_test.txt'

tweets_raw2,y2 = load_data(rq_file2, not_rq_file2)
y_integer2 = [np.argmax(y_i) for y_i in y2] # store integer labels instead of one-hot vec
y_integer2 = np.array(y_integer2)

tweets2 = [remove_taboo(t) for t in tweets_raw2]
visualize(tweets2, y2)

RQ: 26
Not RQ: 30
Total samples: 56
1 Real good friend, A+ to you ? 

1 Did you know there are people in the world that actually WANT Hillary Clinton for president?! #depressingfact 

1 Isn't it cool how the state spends our tax money on building new jails instead of schools? #facepalm 

1 Oh weird, the NFL referees screwed up? That's NEVER happened before. 

1 I sprayed perfume in my eye! I am now a genius!! ? 

1 Glad people keep their promises these days ? 

0 Alabama is awesome. 

0 An afternoon of some shopping!! Luca had a blast. #bestfriends http://t.co/sG8DPHiizq 

0 ToUser never too many Italians at one party.. 

0 Because being on #tmt is more important than getting an education.. Good shit nigga. Glad your priorities are on point. 

0 As much as I love some world history I really wanna read about 95 theses of a religion I'm not apart off 

0 So excited to get the bus back to aberdeen tonight. 



In [47]:
tweets2 = replace_special(tweets2)
visualize(tweets2, y2)

1 Real good friend, A+ to you ? 

1 Did you know there are people in the world that actually WANT Hillary Clinton for president?! #depressingfact 

1 Isn't it cool how the state spends our tax money on building new jails instead of schools? #facepalm 

1 Oh weird, the NFL referees screwed up? That's NEVER happened before. 

1 I sprayed perfume in my eye! I am now a genius!! ? 

1 Glad people keep their promises these days ? 

0 Alabama is awesome. 

0 An afternoon of some shopping!! Luca had a blast. #bestfriends URL 

0 ToUser never too many Italians at one party.. 

0 Because being on #tmt is more important than getting an education.. Good shit nigga. Glad your priorities are on point. 

0 As much as I love some world history I really wanna read about 95 theses of a religion I'm not apart off 

0 So excited to get the bus back to aberdeen tonight. 



In [48]:
tweets2 = tokenize_tweets(tweets2)
visualize(tweets2, y2)

1 [u'Real', u'good', u'friend', u',', u'A+', u'to', u'you', u'?'] 

1 [u'Did', u'you', u'know', u'there', u'are', u'people', u'in', u'the', u'world', u'that', u'actually', u'WANT', u'Hillary', u'Clinton', u'for', u'president', u'?', u'!', u'#', u'depressingfact'] 

1 [u'Is', u"n't", u'it', u'cool', u'how', u'the', u'state', u'spends', u'our', u'tax', u'money', u'on', u'building', u'new', u'jails', u'instead', u'of', u'schools', u'?', u'#', u'facepalm'] 

1 [u'Oh', u'weird', u',', u'the', u'NFL', u'referees', u'screwed', u'up', u'?', u'That', u"'s", u'NEVER', u'happened', u'before', u'.'] 

1 [u'I', u'sprayed', u'perfume', u'in', u'my', u'eye', u'!', u'I', u'am', u'now', u'a', u'genius', u'!', u'!', u'?'] 

1 [u'Glad', u'people', u'keep', u'their', u'promises', u'these', u'days', u'?'] 

0 [u'Alabama', u'is', u'awesome', u'.'] 

0 [u'An', u'afternoon', u'of', u'some', u'shopping', u'!', u'!', u'Luca', u'had', u'a', u'blast', u'.', u'#', u'bestfriends', u'URL'] 

0 [u'ToUser', u'never', 

In [49]:
hand_feats2 = handcrafted_features(tweets2)
x_test2 = tweets2emb(tweets2,emb)
print hand_feats2.shape
print x_test2.shape

(56, 3)
(56, 100)


In [51]:
# concat embedding and handcrafted features
x_test2 = np.concatenate([x_test2, hand_feats2], axis=1)
print x_test2.shape

(56, 103)


In [52]:
pred2 = clf.predict(x_test2)
precision, recall, f1, support = score(y_integer2, pred2)
precisiont, recallt, f1t, support = score(y_integer2, pred2, average='macro')
print 'Score\tNot RQ  \tRQ  \t        Overall'
print 'precis\t{}\t{}\t{}'.format(precision[0], precision[1], precisiont)
print 'recall\t{}\t{}\t{}'.format(recall[0], recall[1], recallt)
print 'f1\t{}\t{}\t{}'.format(f1[0], f1[1], f1t)

Score	Not RQ  	RQ  	        Overall
precis	0.6	0.571428571429	0.585714285714
recall	0.7	0.461538461538	0.580769230769
f1	0.646153846154	0.510638297872	0.578396072013


In [53]:
print 'Gold Pred Tweet'
for i in range(pred2.shape[0]):
    print '{} {} {}'.format(y_integer2[i], pred2[i], tweets_raw2[i])

Gold Pred Tweet
1 0 Real good friend, A+ to you ?
1 1 You know what? Josh Freeman looks really good on this drive.
1 1 After the game last night, seems to me like Fernando Torres is BACK!!!! ?? ?? #notreally #CFC
1 0 Mr shey why are you so cool????
1 0 Before the weekend gets started im gonna get started in this homework ?
1 1 Did you know there are people in the world that actually WANT Hillary Clinton for president?! #depressingfact
1 1 Did Holliday just get a homerun? I couldn't tell..
1 1 Is Thanksgiving today? I would have never known if it weren't for social media..... Close call.
1 1 Adios Wendy's see you tomorrow same time ? #work #don'twannago #justgotoff #bedtime http://t.co/8fz6o4Kbac
1 0 I'm starting to enjoy cardio now ? ? http://t.co/KTWKvBEzFk
1 1 Isn't it cool how the state spends our tax money on building new jails instead of schools? #facepalm
1 1 Adios Wendy's see you tomorrow same time ? #work #don'twannago #justgotoff #bedtime http://t.co/8fz6o4Kbac
1 0 Glad people