In [26]:
import sys
sys.path.insert(1, "./feature-extraction/twitter-features")
from EmbedExtractor import EmbedExtractor
from VulgarExtractor import VulgarExtractor
from TwitterParser import TwitterParser
import classifiers
import pandas as pd
import numpy as np
import json


In [5]:
# Notebook-wide EmbedExtractor instance; labels_and_vectors() calls ee.tweetVec()
# on every tweet, so this cell must run before that function is used.
ee = EmbedExtractor()

In [22]:
def normalize(column_name, df):
    """Standardize ``df[column_name]`` in place (zero mean, unit variance).

    The previous implementation subtracted the column's standard deviation
    from every value, which only shifts the column by a constant and leaves
    its scale untouched. This version applies the usual z-score,
    ``(x - mean) / std``, using pandas' default sample standard deviation.

    Parameters
    ----------
    column_name : str
        Name of the numeric column to standardize.
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.

    Returns
    -------
    None
    """
    column = df[column_name]
    mean = column.mean()
    std = column.std()
    if std > 0:
        df[column_name] = (column - mean) / std
    else:
        # std is 0 (constant column) or NaN (fewer than two rows): there is no
        # spread to scale by, so only center to avoid dividing by zero/NaN.
        df[column_name] = column - mean

# Builds the labels and feature vectors for the given data.
# To experiment with including or excluding particular features, this is the place to do it.

def _mask_urls_and_mentions(tagged_sents):
    """Rebuild each tagged sentence as text, replacing URL and @-mention tokens with placeholders."""
    masked_sents = []
    for tagged_sent in tagged_sents:
        tokens = []
        for word, tag in tagged_sent:
            if tag == 'U':
                tokens.append('someurl')
            elif tag == '@':
                tokens.append('@someuser')
            else:
                tokens.append(word)
        masked_sents.append(' '.join(tokens))
    return masked_sents


def labels_and_vectors(file, index=0):
    """Load a pickled tweet DataFrame and build its feature matrix.

    Each feature row is the concatenation of: the vulgarity flag, the
    adjective/URL/emoji/abbreviation flags, the word count, one ``num_<tag>``
    column per entry of ``TwitterParser.tagset`` (each passed through
    ``normalize``), and the tweet embedding returned by ``ee.tweetVec``.
    This is the place to add or drop features.

    Parameters
    ----------
    file : str
        Path of a pickle readable by ``pandas.read_pickle``. The frame must
        have a ``text`` column and, unless ``index == 1``, a
        ``classification`` column.
    index : int, default 0
        When ``1``, return the frame's index instead of the labels
        (presumably tweet IDs -- confirm against the pickled data).

    Returns
    -------
    tuple
        ``(labels, values)`` by default, where ``labels`` is a NumPy array
        with 'true'/'false'/'unverified' encoded as 1/0/2 (any other value is
        left unchanged), or ``(df.index, values)`` when ``index == 1``.
        ``values`` is a NumPy array with one feature row per tweet.
    """
    # NOTE(security): unpickling can execute arbitrary code -- only load trusted files.
    df = pd.read_pickle(file)

    # Plain comprehension instead of DataFrame.applymap, which is deprecated
    # in recent pandas releases.
    wordlist = VulgarExtractor.vulgarWords("badwords.txt")
    df['isVulgar'] = [VulgarExtractor.containsVulgar(text, wordlist) for text in df['text']]

    # Embeddings are computed from the raw text, before URLs/mentions are masked below.
    word_embeddings = [ee.tweetVec(text) for text in df['text']]

    textlist = [txt.replace('\n', '') for txt in df['text'].tolist()]
    df['POS'] = TwitterParser.tag(textlist)

    # The masked text is stored on the frame but is not part of the feature vector.
    df['text'] = _mask_urls_and_mentions(df['POS'])

    df['wordCount'] = [TwitterParser.word_count(tagged) for tagged in df['POS']]
    df['containsAdjective'] = [TwitterParser.contains_adjectives(tagged) for tagged in df['POS']]
    df['containsURL'] = [TwitterParser.contains_url(tagged) for tagged in df['POS']]
    df['containsEmoji'] = [TwitterParser.contains_emoji(tagged) for tagged in df['POS']]
    df['containsAbbreviation'] = [TwitterParser.contains_abbreviation(tagged) for tagged in df['POS']]

    attributes = ['isVulgar', 'containsAdjective', 'containsURL', 'containsEmoji', 'containsAbbreviation', 'wordCount']

    # One column per POS tag; position i of each pos_counts entry is read as
    # the count for TwitterParser.tagset[i].
    pos_count_list = [TwitterParser.pos_counts(tagged) for tagged in df['POS']]
    for i, tag in enumerate(TwitterParser.tagset):
        column_name = 'num_' + tag
        df[column_name] = [pos_counts[i] for pos_counts in pos_count_list]
        normalize(column_name, df)
        attributes.append(column_name)

    # Feature rows as a list of lists, each extended with its tweet embedding.
    values = df[attributes].values.tolist()

    # To use the tweet ID as a feature, uncomment:
    # for row, tweet_id in zip(values, df.index):
    #     row.append(int(tweet_id))

    for row, embedding in zip(values, word_embeddings):
        row.extend(embedding)
    values = np.array(values)

    if index == 1:
        # Labels are not needed here, so the 'classification' column is not touched.
        return df.index, values

    # Changes "true"/"false"/"unverified" to numeric values, just like in the earlier cells.
    label_ids = {'true': 1, 'false': 0, 'unverified': 2}
    labels = np.array([label_ids.get(value, value) for value in df['classification']])

    return labels, values
    

In [23]:
# Labels (or indices) and feature rows come back in matching order; keep each pair together.
tr_labels, tr_values = labels_and_vectors(
    file='output/simple/train_data_simple.pickle',
)
indices, dev_values = labels_and_vectors(
    file='output/simple/dev_data_simple.pickle',
    index=1,
)



In [24]:
# Swap in a different classifier here.
predictions, probabilities = classifiers.naive_bayes(tr_values, tr_labels, dev_values)

# Numeric class ids are turned back into their string labels; ids outside the
# table are skipped, exactly as an if-chain without an else branch would.
id_to_label = {0: 'false', 1: 'true', 2: 'unverified'}
ps = [id_to_label[p] for p in predictions if p in id_to_label]

# Pair each predicted label with the probability reported for that prediction.
# NOTE(review): indexing the probability row by the class id assumes the
# classifier's probability columns are ordered 0, 1, 2 -- confirm in classifiers.
pred_probs_pairs = [
    [ps[pos], probabilities[pos][pred]]
    for pos, pred in enumerate(predictions)
]

# Key every pair by its tweet ID (called reference_id in the score.py file).
pred_dict = {
    tweet_id: pred_probs_pairs[pos]
    for pos, tweet_id in enumerate(indices)
}

In [27]:
# Save the predictions as JSON at the path the scorer reads.
with open('output/classifier_output/tr_test.json', 'w') as outfile:
    outfile.write(json.dumps(pred_dict))

In [28]:
# Run the scorer script on the dev reference file and the predictions JSON
# written to output/classifier_output/tr_test.json.
!python3 scorer/score.py semeval2017-task8-dataset/traindev/rumoureval-subtaskB-dev.json output/classifier_output/tr_test.json

25 entries in reference file
matching entry: 580325090367315968
matching entry: 580348081100734464
matching entry: 580324027715063808
matching entry: 580319184652890113
matching entry: 580333909008871424
matching entry: 580321156508577792
matching entry: 580320684305416192
matching entry: 580333763512705025
matching entry: 580340476949086208
matching entry: 580339825649291264
matching entry: 580360165540642816
matching entry: 580322453928431617
matching entry: 580882341880446977
matching entry: 580326222107951104
matching entry: 581473088249958400
matching entry: 580371845997682688
matching entry: 580331561398108160
matching entry: 581047170637381632
matching entry: 581293286268129280
matching entry: 580332109782466561
matching entry: 580319078155468800
matching entry: 580339547269144576
matching entry: 581386094337474560
matching entry: 581063377226637312
matching entry: 580323060533764097
25 matched entries in submission
25 entries in reference file
veraci