## An RNN classifier for rumoreval that incorporates the scalar and binary features generated for this project. Please go down the cells in order for full understanding.

In [7]:
# Put ./rnn on the module search path before importing from it
# (presumably train, model and FileReader live there -- confirm).
import sys
sys.path.insert(1, "./rnn")
import train
import model
from FileReader import FileReader
# NOTE: this binds the FileReader class itself (no parentheses), not an instance;
# get_dataframe() is therefore invoked on the class.
classInstance = FileReader
classInstance.get_dataframe() #IMPORTANT:  saves a pickle to output/simple or output/full. 
# Re-importing sys is a no-op; the inserts below extend the search path so the
# feature-extraction modules can be imported by bare name.
import sys
sys.path.insert(1, "./feature-extraction/embed-extractor")
#from EmbedExtractor import EmbedExtractor
sys.path.insert(1, "./feature-extraction/vulgar-extractor")
from VulgarExtractor import VulgarExtractor
sys.path.insert(1, "./feature-extraction/twitter-parser")
from TwitterParser import TwitterParser
import classifiers
import pandas as pd
import numpy as np
import json

saving data to output..


## From here, we can pick the attributes that we want to include in our model. All currently available attributes are listed in the cell below.

In [5]:
# Feature columns fed to the classifier, in the order they appear in each
# feature vector. Edit this list to include or exclude attributes.

# Binary / scalar features derived from the tweet itself.
# (Fixed: the original had a '.' instead of ',' after 'containsURL',
# which is a SyntaxError.)
attributes = [
    'isVulgar',
    'containsAdjective',
    'containsEmoji',
    'containsURL',
    'containsAbbreviation',
    'wordCount',
]

# One count column per part-of-speech tag known to the Twitter parser.
attributes += ['num_' + tag for tag in TwitterParser.tagset]

# Features describing the replies to the tweet.
attributes += [
    'num_replies',
    're_has_?',
    're_has_NOT',
    're_has_correct',
    're_has_credib',
    're_has_data',
    're_has_detail',
    're_has_fabricat',
    're_has_lie',
    're_has_proof',
    're_has_source',
    're_has_witness',
]

# ['opinion', 'user_default_profile',
#  'user_favourites_count', 'user_followers_count', 'user_friends_count', 'user_geo_enabled', 'user_listed_count', 
#  'user_statuses_count', 'user_verified', 'user_created']

Additionally, we can now load the word embeddings.

In [None]:
#pre-pca word embeddings: 'feature-extraction/embed-extractor/word_embedding_vectors.pickle'
#post-pca word embeddings: 'feature-extraction/embed-extractor/word_embedding_vectors_pca.pickle'

# pickle was used here without ever being imported, so this cell raised a
# NameError on a fresh kernel.
import pickle

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load embedding pickles that were produced by this project.
with open('feature-extraction/embed-extractor/word_embedding_vectors.pickle', 'rb') as pickle_file:
    ee = pickle.load(pickle_file)

## From here we can define functions that normalize and transform our data suitable for the classifier.

## ******Note that you can toggle on/off inclusion of word embeddings through here. Please read code and look for comment that mentions word embeddings.******

In [18]:
def normalize(column_name, df):
    """Standardise one column of ``df`` in place to zero mean and unit variance.

    The previous implementation subtracted the standard deviation from every
    value (``x - std``), which only shifts the column and does not normalise
    it. This version applies the usual z-score, ``(x - mean) / std``.

    Parameters
    ----------
    column_name : str
        Name of the column to standardise.
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.

    Returns
    -------
    None
    """
    column = df[column_name]
    centered = column - column.mean()
    std = column.std()

    # A constant column has std == 0 and a single-row column has std == NaN;
    # dividing would produce inf/NaN features, so only centre in those cases.
    if pd.isna(std) or std == 0:
        df[column_name] = centered
    else:
        df[column_name] = centered / std

def labels_and_vectors(file, index=0, include_embeddings=False):
    """Load a pickled dataframe and turn it into classifier-ready arrays.

    The function derives the tweet-level features (vulgarity flag, POS-based
    flags and counts), maps the textual classification to integer labels and
    collects the columns named in the module-level ``attributes`` list into
    one feature vector per row.

    Parameters
    ----------
    file : str
        Path of the pickled dataframe (read with ``pd.read_pickle``). It must
        provide at least the ``text`` and ``classification`` columns plus any
        column listed in ``attributes`` that is not computed here.
    index : int, optional
        When ``1``, the dataframe index is returned in place of the labels.
    include_embeddings : bool, optional
        WORD EMBEDDINGS toggle. When True, the vector returned by
        ``ee.tweetVec`` for each raw tweet text is appended to that row's
        feature vector; this requires the module-level ``ee`` object to be
        loaded. The default (False) leaves embeddings out and does not touch
        ``ee`` at all.

    Returns
    -------
    tuple
        ``(labels, values)`` or, when ``index == 1``, ``(df.index, values)``.
        ``labels`` and ``values`` are numpy arrays.
    """
    df = pd.read_pickle(file)

    # Vulgarity flag, computed on the raw tweet text.
    wordlist = VulgarExtractor.vulgarWords("feature-extraction/vulgar-extractor/badwords.txt")
    df['isVulgar'] = df['text'].map(lambda text: VulgarExtractor.containsVulgar(text, wordlist))

    # WORD EMBEDDINGS: only computed on request. They are taken from the raw
    # text, i.e. before URLs and user mentions are replaced below.
    word_embeddings = None
    if include_embeddings:
        word_embeddings = [ee.tweetVec(text) for text in df['text']]

    # POS-tag every tweet (newlines stripped first).
    textlist = [txt.replace('\n', '') for txt in df['text'].tolist()]
    df['POS'] = TwitterParser.tag(textlist)

    # Replace URLs and user mentions with placeholder tokens.
    def _mask_token(word, tag):
        if tag == 'U':
            return 'someurl'
        if tag == '@':
            return '@someuser'
        return word

    df['text'] = [
        ' '.join(_mask_token(word, tag) for word, tag in tagged_sent)
        for tagged_sent in df['POS']
    ]

    # Scalar and binary features derived from the tagged tweets.
    df['wordCount'] = [TwitterParser.word_count(tagged) for tagged in df['POS']]
    df['posCounts'] = [TwitterParser.pos_counts(tagged) for tagged in df['POS']]
    df['containsAdjective'] = [TwitterParser.contains_adjectives(tagged) for tagged in df['POS']]
    df['containsURL'] = [TwitterParser.contains_url(tagged) for tagged in df['POS']]
    df['containsEmoji'] = [TwitterParser.contains_emoji(tagged) for tagged in df['POS']]
    df['containsAbbreviation'] = [TwitterParser.contains_abbreviation(tagged) for tagged in df['POS']]

    # Spread the per-tweet POS count vector over one column per tag and pass
    # each of those columns through normalize().
    for i, tag in enumerate(TwitterParser.tagset):
        column_name = 'num_' + tag
        df[column_name] = [pos_counts[i] for pos_counts in df['posCounts']]
        normalize(column_name, df)

    # Change "true"/"false"/"unverified" to numeric values, as in the earlier cells.
    df.loc[df.classification == 'true', 'classification'] = 1
    df.loc[df.classification == 'false', 'classification'] = 2
    df.loc[df.classification == 'unverified', 'classification'] = 0

    # Labels as a numpy array.
    labels = np.array(df['classification'].tolist())

    # Feature values as a list of lists, in the column order of `attributes`.
    values = df[attributes].values.tolist()

    # WORD EMBEDDINGS: append each tweet's embedding to its feature vector.
    if include_embeddings:
        for row, embedding in zip(values, word_embeddings):
            row.extend(embedding)

    values = np.array(values)
    if index == 1:
        return df.index, values

    return labels, values


## From here, you can set where your training and target data comes from (we just "call it" dev_values here). Please maintain the index variable seen in the second line.

In [20]:
# Locations of the pickled training data and of the data to predict on.
TRAIN_PICKLE = 'output/simple/train_data_simple.pickle'
GOLDTEST_PICKLE = 'output/simple/goldtest_data_simple.pickle'

# Training split: integer labels plus feature vectors.
tr_labels, tr_values = labels_and_vectors(TRAIN_PICKLE)

# Target split: index=1 returns the dataframe index instead of the labels,
# so each prediction can later be keyed by its row id.
indices, dev_values = labels_and_vectors(GOLDTEST_PICKLE, index=1)

## Now we can get onto the RNN part!

In [21]:
## convert to tensor Variable here
import torch
import torch.nn as nn
import helpers
from torch.autograd import Variable



In [22]:
# Training configuration.
SEED = 0              # fixed seed so weight initialisation is repeatable across runs
n_epochs = 100        # number of full passes over the training data
hidden_size = 20      # hidden_size argument passed to model.RNN
learning_rate = 0.001

# Seed torch before the model is built; without it every run of the notebook
# starts from different random weights and the reported score is not reproducible.
torch.manual_seed(SEED)

# Input size is the length of one feature vector; three output classes
# (unverified / true / false).
decoder = model.RNN(len(tr_values[0]), hidden_size=hidden_size, output_size=3)
decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)

# NOTE(review): criterion is not referenced by any cell visible in this
# notebook -- confirm whether train.train defines its own loss.
criterion = nn.NLLLoss()


Further transformation of data such that we can make it suitable for the torch library.

In [23]:
# Training feature vectors as plain Python lists.
tvs = [arr.tolist() for arr in tr_values]

# Feature vectors to predict on. These must come from dev_values: the original
# built this list from tr_values, so predictions were made on training rows
# and then paired with the ids of the target set.
dvs = [arr.tolist() for arr in dev_values]

# Training labels as plain Python ints.
tls = [l.item() for l in tr_labels]

A function that simply turns the vectors into tensors.

In [25]:
def vector_to_tensor(vector):
    """Convert one feature vector into a float tensor of shape ``(1, len(vector))``.

    The width is taken from ``vector`` itself. The previous version sized the
    tensor from the global ``tvs[0]`` and copied values one element at a time,
    which tied this helper to training data defined in another cell.

    Parameters
    ----------
    vector : sequence of numbers
        The feature values of a single example.

    Returns
    -------
    torch.Tensor
        A float32 tensor with a leading dimension of size 1.
    """
    return torch.tensor(vector, dtype=torch.float32).unsqueeze(0)


Similar to the pytorch tutorial for RNNs, this is a function that goes through the training data.

In [26]:
def train_all():
    """Run one pass over the training data, one example at a time.

    Each training vector in ``tvs`` is paired with its label in ``tls``,
    handed to ``train.train`` together with ``decoder``, and followed by one
    ``decoder_optimizer.step()``.

    NOTE(review): no ``zero_grad()`` call and no use of ``criterion`` are
    visible here -- presumably ``train.train`` resets gradients and computes
    the loss itself; confirm against the ``train`` module.
    """
    for position in range(len(tls)):
        # Input features as a float tensor.
        features = Variable(torch.FloatTensor(vector_to_tensor(tvs[position])))
        # Target class as a one-element long tensor.
        target = Variable(torch.LongTensor([tls[position]]))

        _output, _loss = train.train(target, features, decoder)
        decoder_optimizer.step()


By executing this script, we train the model and get the label predictions.

In [27]:
# Numeric class id -> label string expected by the scorer. This is the inverse
# of the mapping applied to the 'classification' column in labels_and_vectors.
LABEL_NAMES = {0: 'unverified', 1: 'true', 2: 'false'}


def _label_name(class_id):
    """Return the label string for ``class_id``; raise if it is not a known class.

    The original used three independent ``if`` statements, so an unexpected
    value was silently dropped and every later prediction would have been
    paired with the wrong id.
    """
    for known_id, name in LABEL_NAMES.items():
        if class_id == known_id:
            return name
    raise ValueError('unexpected predicted class: {!r}'.format(class_id))


# Train for the configured number of epochs.
for _ in range(n_epochs):
    train_all()

# Predict a class for every vector in dvs.
predictions = []
for d in dvs:
    line_tensor = Variable(torch.FloatTensor(vector_to_tensor(d)))
    predictions.append(decoder.predict(line_tensor))

# The predicted class id is read from position [0][0] of each prediction.
ps = [_label_name(p[0][0]) for p in predictions]

# Map each row id to a (label, 1) pair; the second element is always 1.
pred_dict = {index: (ps[i], 1) for i, index in enumerate(indices)}

## Output to json and scoring script. The first argument is the gold set and the second argument is the predictions you should have generated.

In [30]:
# Write the id -> (label, 1) predictions to the JSON file read by the scorer.
RNN_PREDICTIONS_PATH = 'output/rnn/goldtest_rnn.json'

with open(RNN_PREDICTIONS_PATH, 'w') as json_file:
    json.dump(pred_dict, json_file)

In [31]:
# Run the scoring script: the first argument is the gold reference file, the
# second is the predictions JSON written by the previous cell.
!python3 scorer/score.py data/semeval2017-task8-dataset/goldtest/subtaskb.json output/rnn/goldtest_rnn.json

28 entries in reference file
matching entry: 775057555865206784
matching entry: 498486826269548545
matching entry: 774991078265094144
matching entry: 524948206023880704
matching entry: 769988636754505729
matching entry: 544315472075042818
matching entry: 524936793633083394
matching entry: 553561170637238272
matching entry: 763098277986209792
matching entry: 498293668655423488
matching entry: 524941720249978880
matching entry: 553553288625672192
matching entry: 764927075522260992
matching entry: 544274544174071809
matching entry: 758159624122097664
matching entry: 552788945017516032
matching entry: 500280249629036544
matching entry: 580352273001410560
matching entry: 500298588992593920
matching entry: 524923293711998976
matching entry: 553480082996879360
matching entry: 544294893146091520
matching entry: 767725956706414592
matching entry: 581153923987206146
matching entry: 581359544682614784
matching entry: 524961721744900097
matching entry: 768859780240773121