## Uses basic classifiers (Naive Bayes, SVM, decision tree, random forest) to predict labels for RumourEval (SemEval-2017 Task 8), using binary and scalar features along with word embeddings

In [1]:
import sys
import os
sys.path.insert(1, "./feature-extraction/embed-extractor")
from FileReader import FileReader
from EmbedExtractor import EmbedExtractor
sys.path.insert(1, "./feature-extraction/vulgar-extractor")
from VulgarExtractor import VulgarExtractor
sys.path.insert(1, "./feature-extraction/twitter-parser")
from TwitterParser import TwitterParser
sys.path.insert(1, "./feature-extraction/opinion-extractor/")
from OpinionExtractor import OpinionExtractor
from sklearn import decomposition
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
import pickle
import classifiers
import pandas as pd
import numpy as np
import json
from datetime import datetime


## Running this cell generates the files needed by the rest of this notebook. It collects all of the data of the training, dev, and goldtest sets as pandas DataFrames and saves them as pickle files.

In [None]:
# NOTE(review): FileReader is bound here as the class object itself, not an
# instance. The call below only works if get_dataframe can be invoked on the
# class (e.g. a static/class method) -- confirm against FileReader.py.
classInstance = FileReader
# Build the dataset DataFrames. According to the original note, this call also
# writes a pickle to output/simple or output/full as a side effect.
df_list = classInstance.get_dataframe() #IMPORTANT:  saves a pickle to output/simple or output/full. 

## This cell will generate the pre-PCA word embeddings and the post-PCA word embeddings.

In [None]:
# Run this cell to generate the pre-PCA and post-PCA word embeddings
def chunkIt(seq, num, pad_start=4700):
    """Split ``seq`` into ``num`` consecutive, nearly equal-sized chunks.

    Chunk boundaries are computed with integer arithmetic
    (``i * len(seq) // num``), so the chunks always cover ``seq`` exactly
    once and there are never more than ``num`` of them. Accumulating a
    float step can drift and mis-split or add a spurious trailing chunk.

    Trailing zero padding is trimmed: chunking stops at the first chunk
    that lies beyond position ``pad_start`` and equals a list of
    ``len(seq) // num`` zeros.

    Args:
        seq: Sliceable sequence with a length (a list of numbers in this
            notebook).
        num: Positive integer number of chunks to aim for.
        pad_start: Position in ``seq`` after which an all-zero chunk is
            treated as padding. Defaults to 4700, the value previously
            hard-coded here.

    Returns:
        A list of slices of ``seq``; empty when ``seq`` is empty.

    Raises:
        ValueError: If ``num`` is not positive.
    """
    total = len(seq)
    # Kept from the original cell: the length of every vector is echoed.
    print(total)
    if num <= 0:
        raise ValueError("num must be a positive integer")
    if total == 0:
        return []

    zeros = [0] * (total // num)
    out = []
    for i in range(num):
        start = i * total // num
        stop = (i + 1) * total // num
        chunk = seq[start:stop]
        # "i * total > pad_start * num" is the exact integer form of
        # "i * (total / num) > pad_start"; the cheap test goes first so the
        # list comparison only happens in the padding region.
        if i * total > pad_start * num and chunk == zeros:
            break
        out.append(chunk)
    return out

# ---- Step 1: raw (pre-PCA) embedding vector for every tweet ---------------
# Comment this step out to reuse an existing word_embedding_vectors.pickle.
ee = EmbedExtractor()
tweet2vec = {}

for d in ["train", "dev", "test"]:
    with open('./output/full/' + d + '_data_full.json', 'r') as f:
        j = json.load(f)

    # Each JSON file maps a key to a record carrying the tweet text.
    for key in j:
        tweet2vec[key] = ee.tweetVec(j[key]['text'])

pickleFile = "feature-extraction/embed-extractor/word_embedding_vectors.pickle"
# Context manager so the pickle is flushed and the handle closed right away.
with open(pickleFile, "wb") as f:
    pickle.dump(tweet2vec, f)
# ---------------------------------------------------------------------------


# ---- Step 2: reduce each embedding vector with PCA ------------------------
word_embeddings_pca = {}
word_embeddings = pd.read_pickle(pickleFile)

# Cut every flat vector into (up to) 30 rows so PCA sees a 2-D matrix.
word_embeddings_chunked = {
    key: chunkIt(vector, 30) for key, vector in word_embeddings.items()
}

for key, chunks in word_embeddings_chunked.items():
    pca = decomposition.PCA(n_components=24)
    try:
        # Building the array inside the try also covers ragged chunk lists,
        # which recent NumPy versions reject with ValueError.
        x = np.array(chunks)
        x_std = StandardScaler().fit_transform(x)
        # Only the fitted singular values are used, so fit() is enough.
        pca.fit(x_std)
    except ValueError:
        # Entries that cannot be reduced (e.g. too few rows for 24
        # components) are reported and left out of the PCA pickle.
        print("UH OH")
    else:
        word_embeddings_pca[key] = pca.singular_values_

with open("feature-extraction/embed-extractor/word_embedding_vectors_pca.pickle", "wb") as f:
    pickle.dump(word_embeddings_pca, f)


## A list of functions for data normalization. Should be ignored.

In [None]:
def create_opinion_column(df, strongly_subj_list):
    """Add the opinion feature to ``df`` via ``OpinionExtractor``.

    Thin wrapper around ``OpinionExtractor.add_opinion_column``. Nothing is
    returned, so the helper presumably modifies ``df`` in place -- confirm
    against OpinionExtractor.py.

    Args:
        df: DataFrame holding the tweets.
        strongly_subj_list: Strongly subjective words, passed through
            unchanged to the extractor.
    """
    #add a binary column where opinion == 1 if the tweet text contains a strongly subjective word
    #global strongly_subj_list
    OpinionExtractor.add_opinion_column(df, strongly_subj_list)

def extract_user_column(row):
    """Append every key of ``row['user']`` to the global ``user_labels`` list.

    Meant to be used with ``DataFrame.apply(..., axis=1)``; duplicates are
    kept, and nothing is returned.
    """
    global user_labels

    user_labels.extend(row['user'].keys())

def update_user_column(row):
    """Append this row's user values to each list in the global ``user_vals``.

    Every key of ``user_vals`` is looked up in ``row['user']`` after dropping
    its first five characters (the ``'user_'`` prefix added by
    ``create_user_features``). Fields missing from the row are recorded as
    ``np.nan`` so all lists stay the same length.
    """
    global user_vals
    profile = row['user']

    for label, collected in user_vals.items():
        collected.append(profile.get(label[5:], np.nan))
            
def convert_date(row):
    """Turn ``row['user_created_at']`` into an integer of the form YYYYMMDD.

    The timestamp is split on whitespace; the second and third tokens are
    parsed as abbreviated month name and day, the last token as the year.
    """
    tokens = row['user_created_at'].split()
    month, day, year = tokens[1], tokens[2], tokens[-1]

    parsed = datetime.strptime('{} {} {}'.format(month, day, year), '%b %d %Y')

    return parsed.year * 10000 + parsed.month * 100 + parsed.day

def convert_to_int(row, col):
    """Return the value stored under ``col`` in ``row``, cast with ``int()``."""
    raw_value = row[col]
    return int(raw_value)

def normalize_column(df, col):
    """Standardise ``df[col]`` in place to zero mean and unit variance.

    Uses the population standard deviation (``np.std``). A column whose
    standard deviation is zero (all values equal) is only centred, so it
    becomes all zeros rather than NaN/inf from a division by zero.

    Args:
        df: DataFrame to modify.
        col: Name of the numeric column to standardise.
    """
    values = np.asarray(df[col].tolist())
    centered = values - np.mean(values)
    std = float(np.std(values))
    # Skip the scaling step for constant columns (std == 0).
    if std > 0:
        centered = centered / std

    df[col] = centered
    
def create_user_features(df):
    """Expand the per-row ``user`` dicts of ``df`` into numeric feature columns.

    The work is done through the module-level globals ``user_labels`` and
    ``user_vals``, which the row callbacks ``extract_user_column`` and
    ``update_user_column`` fill in; both globals are overwritten on every call.

    Args:
        df: DataFrame with a ``user`` column of dicts. The dicts must provide
            ``created_at`` and the fields named in ``col_list`` below.

    Returns:
        A new DataFrame: the input columns plus the ``user_*`` columns listed
        in ``col_list``, indexed like the input.
    """

    global user_labels
    global user_vals
    # Pass 1: collect every key seen in any user dict, then de-duplicate and
    # prefix with 'user_'.
    user_labels = []
    df.apply(extract_user_column, axis = 1)
    user_labels = ['user_' + label for label in set(user_labels)]
    user_vals = {label:[] for label in user_labels}
    
    # Pass 2: one value per row and label (np.nan where a user lacks the key).
    df.apply(update_user_column, axis = 1) 
    
    user_df = pd.DataFrame(user_vals)
    # Account creation date as an integer (see convert_date).
    user_df['user_created'] = user_df.apply(convert_date, axis = 1)

    col_list = ['user_default_profile', 
                'user_favourites_count', 'user_followers_count', 'user_friends_count', 'user_geo_enabled',
                'user_listed_count', 'user_statuses_count', 'user_verified','user_created']

    # Cast the selected columns to int row by row.
    for col in col_list:
        user_df[col] = user_df.apply(lambda x : convert_to_int(x, col), axis = 1)

    # normalize_column(user_df, col)
    # Keep only the selected feature columns.
    user_df = user_df[col_list]    

    # Count-like columns and the creation date go through normalize_column;
    # the remaining columns of col_list are left as plain ints.
    norm_list = ['user_favourites_count', 'user_followers_count', 'user_friends_count',
                'user_listed_count', 'user_statuses_count','user_created']

    for col in norm_list:
        normalize_column(user_df, col)

    # user_df has a fresh positional index, so df is reset to match before the
    # column-wise concat, and its original index is restored afterwards.
    # NOTE(review): set_index('index') assumes df's index is unnamed, so that
    # reset_index() produces a column called 'index' -- confirm.
    df = pd.concat([df.reset_index(), user_df], axis = 1)
    df = df.set_index('index')

    return df    

## We can now load the word embeddings. These can come from the following:

    PRE-PCA EMBEDDINGS:
     'feature-extraction/embed-extractor/word_embedding_vectors.pickle'
    POST-PCA EMBEDDINGS:
     'feature-extraction/embed-extractor/word_embedding_vectors_pca.pickle'

In [None]:
# Load the pickled embedding mapping used by labels_and_vectors (ee[key]).
# Note that this rebinds ``ee``, which the embedding-generation cell used for
# the EmbedExtractor instance. Swap the path for the *_pca.pickle file to use
# the post-PCA embeddings instead.
embedding_pickle_path = 'feature-extraction/embed-extractor/word_embedding_vectors.pickle'
with open(embedding_pickle_path, 'rb') as pickle_file:
    ee = pickle.load(pickle_file)

## From here, we can pick the attributes that we want to include in our model. All current attributes mentioned in cell below.

In [None]:
# Feature columns fed to the classifier, in order.
# Binary / scalar features computed from the tweet text:
attributes = ['isVulgar', 'containsAdjective', 'containsEmoji', 'containsURL', 'containsAbbreviation', 'wordCount']
# One count column per tag in TwitterParser.tagset.
attributes.extend('num_' + tag for tag in TwitterParser.tagset)

# Reply-based features.
attributes = attributes + ['num_replies', 're_has_?', 're_has_NOT', 're_has_correct',
 're_has_credib', 're_has_data', 're_has_detail', 're_has_fabricat', 're_has_lie', 're_has_proof', 
                  're_has_source', 're_has_witness']

# ['opinion', 'user_default_profile',
#  'user_favourites_count', 'user_followers_count', 'user_friends_count', 'user_geo_enabled', 'user_listed_count', 
#  'user_statuses_count', 'user_verified', 'user_created']

## From here we can define functions that normalize and transform our data suitable for the classifier.

## ******Note that you can toggle on/off inclusion of word embeddings through here. Please read code and look for comment that mentions word embeddings.******

In [None]:
def normalize(column_name, df):
    """Standardise ``df[column_name]`` in place to zero mean and unit variance.

    The previous implementation subtracted the standard deviation from every
    value, which neither centres nor scales the column. This version computes
    ``(x - mean) / std`` with the population standard deviation, matching
    ``normalize_column``. A constant column (std of zero) is only centred, so
    it becomes all zeros instead of NaN.

    Args:
        column_name: Name of the numeric column to standardise.
        df: DataFrame to modify.
    """
    column = df[column_name]
    centered = column - column.mean()
    std = column.std(ddof=0)
    # Skip the scaling step for constant columns (std == 0).
    if std > 0:
        centered = centered / std
    df[column_name] = centered

# builds the labels and vectorizations of given data
#if you want to fool around with including/excluding certain features and whatnot, this is the place to do it

def labels_and_vectors(file, index=0):
    """Load a pickled tweet DataFrame and turn it into classifier inputs.

    Relies on the module-level names ``ee`` (mapping from DataFrame index
    value to embedding vector) and ``attributes`` (list of feature columns).

    Steps, in order:
      1. flag vulgar tweets (``isVulgar``);
      2. POS-tag the text and replace tokens tagged ``'U'`` / ``'@'`` with the
         placeholders ``'someurl'`` / ``'@someuser'`` in ``df['text']``;
      3. add word-count, POS-count and contains-* columns plus one
         ``num_<tag>`` column per tag in ``TwitterParser.tagset``;
      4. add the opinion column and the user feature columns;
      5. map the ``classification`` strings to integers
         (true -> 1, false -> 0, unverified -> 2).

    Args:
        file: Path of the pickled DataFrame to load.
        index: When equal to 1, return the DataFrame index instead of the
            labels as the first element.

    Returns:
        ``(labels, values)``, or ``(df.index, values)`` when ``index == 1``.
        ``values`` is a NumPy array built from ``df[attributes]``, one row per
        tweet. Word embeddings are NOT part of ``values`` unless the
        commented-out block near the end is re-enabled.
    """
    df = pd.read_pickle(file)
    
    # Vulgarity flag, computed on the raw text before it is rewritten below.
    # NOTE(review): DataFrame.applymap is deprecated in recent pandas releases
    # in favour of DataFrame.map -- check the pandas version in use.
    wordlist = VulgarExtractor.vulgarWords("feature-extraction/vulgar-extractor/badwords.txt") 
    dftext = df[['text']]
    result = dftext.applymap(lambda x: VulgarExtractor.containsVulgar(x,wordlist))
    df['isVulgar'] = result

    # Look up the precomputed embedding of every row by its index value.
    word_embeddings = [ee[key] for key in df.index]
    # word_embeddings = [ee.tweetVec(tagged_line) for tagged_line in df['text']]
    # Newlines are stripped before the texts are handed to the tagger.
    textlist = [txt.replace('\n','') for txt in df['text'].tolist()]
    tagged_sents = TwitterParser.tag(textlist)
    df['POS'] = tagged_sents

    # Rebuild each tweet from its (word, tag) pairs, masking URLs and mentions.
    processed_sents = []
    for tagged_sent in df['POS']:
        processed_words = []
        for word, tag in tagged_sent:
            if tag == 'U':
                processed_words.append('someurl')
            elif tag == '@':
                processed_words.append('@someuser')
            else:
                processed_words.append(word)
        sent = ' '.join(processed_words)
        processed_sents.append(sent)
    df['text'] = processed_sents

    # Per-tweet features derived from the POS-tagged tokens.
    word_counts = [TwitterParser.word_count(tagged_line) for tagged_line in df['POS']]
    pos_count_list = [TwitterParser.pos_counts(tagged_line) for tagged_line in df['POS']]
    contains_adjs = [TwitterParser.contains_adjectives(tagged_line) for tagged_line in df['POS']]
    contains_urls = [TwitterParser.contains_url(tagged_line) for tagged_line in df['POS']]
    contains_emojis = [TwitterParser.contains_emoji(tagged_line) for tagged_line in df['POS']]
    contains_abbrevs = [TwitterParser.contains_abbreviation(tagged_line) for tagged_line in df['POS']]

    df['wordCount'] = word_counts
    df['posCounts'] = pos_count_list
    df['containsAdjective'] = contains_adjs
    df['containsURL'] = contains_urls
    df['containsEmoji'] = contains_emojis
    df['containsAbbreviation'] = contains_abbrevs
    df['wordEmbedding'] = word_embeddings

    
    # Unpack posCounts into one 'num_<tag>' column per tag; entry i of each
    # posCounts value is taken as the count for the i-th tag of the tagset.
    for i, tag in enumerate(TwitterParser.tagset):
        tag_counts = []
        for pos_counts in df['posCounts']:
            tag_counts.append(pos_counts[i])
        column_name = 'num_' + tag
        df[column_name] = tag_counts
        normalize(column_name, df)
    
    # global strongly_subj_list
    strongly_subj_list = OpinionExtractor.initialize_subjectivity()
    create_opinion_column(df, strongly_subj_list)
    # create_user_features returns a new DataFrame, hence the rebinding.
    df = create_user_features(df)    
        
    # Changes "true"/"false"/"unverified" to numeric values: 1 / 0 / 2
    df.loc[df.classification == 'true', 'classification'] = 1
    df.loc[df.classification == 'false', 'classification'] = 0
    df.loc[df.classification == 'unverified', 'classification'] = 2
   
    # getting the labels     
    labels = df['classification']
    labels = [l for l in labels]
    labels = np.array(labels)

    # getting the values as a list of lists
    values = df[attributes].values.tolist()
    # NOTE(review): unused unless the embedding block below is re-enabled.
    word_embedding_values = df['wordEmbedding'].values.tolist()


#     #Below puts the tweet ID as a feature. Comment this out if you aren't using tweetID
#     for i,index in enumerate(df.index):
#         dev_values[i].append(int(index))


##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## UNCOMMENT THIS IN ORDER TO INCORPORATE WORD_EMBEDDINGS AGAIN
    #word_embedding_values = df['wordEmbedding'].values.tolist()
    #for i,d in enumerate(word_embedding_values):
     #   values[i].extend(d)
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    values = np.array(values)
    # index == 1: return the row ids instead of the labels, so that
    # predictions can be keyed by id.
    if index == 1:
        return df.index, values
    
    
    return labels, values

    

## From here, you can set where your training and target data comes from (we just "call it" dev_values here). Please maintain the index variable seen in the second line.

In [None]:
# Training set: integer class labels plus the feature matrix.
tr_labels, tr_values = labels_and_vectors('output/full/train_data_full.pickle')
# Target set: index=1 makes labels_and_vectors return the DataFrame index
# (used later to key the predictions) instead of the labels. The features are
# named dev_values even though they are read from the goldtest pickle here.
indices, dev_values = labels_and_vectors('output/full/goldtest_data_full.pickle', index=1)

## By executing this script, we train the model and get the label predictions.

## NOTE: This is where you should experiment with different classifiers.

There is currently:

classifiers.naive_bayes(tr_values, tr_labels, dev_values)

classifiers.svm_classifier(tr_values, tr_labels, dev_values)

classifiers.decision_tree_classifier(tr_values, tr_labels, dev_values)

classifiers.random_forest(tr_values, tr_labels, dev_values)


In [None]:
#change classifier here
predictions, probabilities = classifiers.random_forest(tr_values, tr_labels, dev_values, 80, 3, "gini")
ps = []

# need to convert the numerical predictions back into their string values
for i, p in enumerate(predictions):
    if p == 0:
        ps.append('false')
    if p == 1:
        ps.append('true')
    if p == 2:
        ps.append('unverified')

# creates pairings of the prediction and the probability of the prediction
pred_probs_pairs = [[ps[i], probabilities[i][predictions[i]]] for i in range(len(predictions))] 
#attaches the tweetID (called reference_id in the score.py file)
pred_dict = {index:pred_probs_pairs[i] for i,index in enumerate(indices)}

## Output to json and scoring script. The first argument is the gold set and the second argument is the predictions you should have generated.

In [None]:
output_dir = './output/classifier_output/'
# makedirs(exist_ok=True) creates any missing parent directory and is a no-op
# when the directory already exists. This replaces the bare try/except around
# os.stat, which hid unrelated errors and failed when ./output was missing.
os.makedirs(output_dir, exist_ok=True)

# Write the {tweet id: [label, probability]} predictions for the scorer.
with open('output/classifier_output/goldtest_nb.json', 'w') as outfile:
    json.dump(pred_dict, outfile)

In [None]:
!python3 scorer/score.py data/semeval2017-task8-dataset/goldtest/subtaskb.json output/classifier_output/goldtest_nb.json