In [1]:
import ujson as json
import io
import glob
import os
from collections import Counter
from collections import defaultdict
import pandas as pd 
from copy import copy
from gensim import corpora
from gensim import models
from gensim.matutils import corpus2dense
from twitter_dm.nlp.Tokenize import extract_tokens_twokenize_and_regex as do_tokenize
import os
from datetime import datetime 
from collections import Counter
from vaderSentiment.vaderSentiment import sentiment
import numpy as np 
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import io
from multiprocessing import Pool
from functools import partial
from numpy.random import choice

np.set_printoptions(threshold=10000,
                    linewidth=100,
                    formatter={'float_kind':lambda x: "%.8f" % x })


In [2]:
np.__version__

'1.11.3'

In [5]:
import gensim
gensim.__version__

'0.12.2'

In [7]:
import sklearn
sklearn.__version__

'0.18.2'

# Load in Data

In [2]:
train_tweets = pd.read_csv("./data/annotation_results_full.csv",dtype={"tid":"str"})
test_tweets = pd.read_csv("./data/test_tweets.csv",dtype={"tid":"str"})
print test_tweets.shape
print train_tweets.shape

(254, 22)
(10609, 24)


In [3]:
# Make sure candidate interaction columns are the same for training and test
tweet_info = pd.concat((train_tweets[~train_tweets.tid.duplicated()],test_tweets),axis=0)
tweet_info.loc[tweet_info.candidate_interaction == "NONE","candidate_interaction"] = "No Mention of Target"
tweet_info.loc[tweet_info.candidate_interaction == "TextMention",
                                                        "candidate_interaction"] = "Plaintext Mention"

# Add feature as to whether or not the tweet had a link
tweet_info['has_link'] = tweet_info.tweet_text.apply(lambda x: '{{link}}' in x)

# User Features

In [4]:
def get_user_features(uids_to_featurize,user_to_tid,tweet_info):
    
    # clean user features, add political party
    user_features = pd.DataFrame(user_to_tid.items(),columns=['uid','tid'])
    user_features['uid'] = user_features.uid.astype(int)
    user_affil = pd.get_dummies(tweet_info[['tid','voter_pol_affil']], prefix='pol', columns=['voter_pol_affil'],drop_first=True)
    tweet_info['voter_race'] = tweet_info.voter_ethnicity.map({"B":"B","H":"H","N":"O","O":"O","U":"O","W":"0_White","A":"A"})
    user_race = pd.get_dummies(tweet_info[['tid','voter_race']], prefix='race', columns=['voter_race'],drop_first=True)
    user_gender = pd.get_dummies(tweet_info[['tid','voter_gender']], prefix='gend', columns=['voter_gender'],drop_first=True)
    user_features = pd.merge(user_features,user_affil,on="tid")
    user_features = pd.merge(user_features,user_race,on="tid")
    user_features = pd.merge(user_features,user_gender,on="tid")
    print user_features.shape
    return user_features

In [5]:
uids_to_featurize = train_tweets.uid.unique().tolist() + test_tweets.uid.unique().tolist()
user_to_tid = {}
for i, row in train_tweets.iterrows():
    user_to_tid[row['uid']] = row['tid']
for i, row in test_tweets.iterrows():
    user_to_tid[row['uid']] = row['tid']
    
user_features = get_user_features(uids_to_featurize,user_to_tid,tweet_info)

(816, 9)


# Tweet Features

In [6]:
## Get tweet features
TERM_SEP = "\n\n\n#####$$$$$\n\n\n#####$$$$$"
stopwords = {"{{","}}","...", '', '""', "a","and","the"}
sent_names = ['e_tot','p_tot','a_tot','has_link','tot_sent']
def featurize_particular_tweet_set(tweet_data,sent_words,sent_values,n_below=5,dictionary=None):
    tweet_texts = []
    sent_features = []
    for k, v in tweet_data.iterrows():

        # Create bag of words from all tweets seen by annotators
        tokens = []
        char_ngrams = []
        for tok_set in v["all_text"].split(TERM_SEP):
            tokens += do_tokenize(tok_set,stopwords,do_arabic_stemming=False,gram_list=[2,3])
            ng_tok = do_tokenize(tok_set,stopwords,do_arabic_stemming=False)
            for k in [3,4,5]:
                for tok in ng_tok:
                    char_ngrams += [tok[i:i+k] for i in range(len(tok)-k-1)]
        tokens += char_ngrams
        tweet_texts.append(tokens)
        
        # Create sentiment features from single tweet, b/c we have target info
        single_tweet_tokens = do_tokenize(v['tweet_text'],stopwords,do_arabic_stemming=False)
        sent_arr_tweet = np.array([sent_values[word] for word in set(single_tweet_tokens) & sent_words])
        sent_tot = sentiment(v['tweet_text'])['compound']
        if len(sent_arr_tweet):
            sent_feats = (sent_arr_tweet.sum(axis=0)/len(single_tweet_tokens)).tolist()
            if v['candidate'] == 'Donald Trump':
                sent_feats = [-1*x for x in sent_feats]
        else:
            sent_feats = [0] * (len(sent_names)-2)
        sent_features.append(sent_feats + [v['has_link'], sent_tot])


    if not dictionary:
        dictionary = corpora.Dictionary(tweet_texts)
        # get rid of words appearing < K times (K == 10)
        dictionary.filter_extremes(no_below=n_below)
        dictionary.compactify()
    
    # score using IDF
    tfidf = models.TfidfModel([dictionary.doc2bow(text) for text in tweet_texts])
    text_mat = corpus2dense([tfidf[dictionary.doc2bow(text)] for text in tweet_texts],num_terms=len(dictionary)).T
    return text_mat, np.array(sent_features), dictionary

def get_tweet_features(tweet_info,
                       dictionary_tweet=None,
                       dictionary_pol=None,
                       dictionary_prev=None):
    sent_values = {}
    sent_words = set()
    for x in io.open("data/clean_epa_terms.txt"):
        x_spl = x.split("\t")
        word = x_spl[0]
        sent_values[word] = [float(x_spl[1]),float(x_spl[2]),float(x_spl[3])]
        sent_words.add(word)
        
    tweet_info['description'] = tweet_info.description.fillna("")
    tweet_info['all_text'] = tweet_info.apply(lambda x : TERM_SEP.join([x['prev1'],x['prev2'],
                                                              x['pol_prev1'],x['pol_prev2'],
                                                              x['tweet_text'],
                                                              x['description'].decode("utf8")]),axis=1)
    # Get features 
    (word_features_tweet, 
     sent_features_tweet, 
     dictionary_tweet) = featurize_particular_tweet_set(tweet_info,sent_words,sent_values,10,dictionary_tweet)

    # Concat w/ candidate interaction features
    tweet_features = np.concatenate((word_features_tweet, sent_features_tweet,
                                 pd.get_dummies(tweet_info.candidate_interaction,drop_first=True).values),axis=1)

    # get column names
    colnames = []
    all_token_features = []
    ordered_tokens = [dictionary_tweet[i] for i in range(len(dictionary_tweet))]
    colnames += ['t_'+x for x in ordered_tokens] + sent_names
    colnames += pd.get_dummies(tweet_info.candidate_interaction,drop_first=True).columns.tolist()

    # tack on uid and tid
    tweet_features = pd.DataFrame(tweet_features, columns=colnames)
    tweet_features['tid'] = tweet_info.tid.tolist()
    tweet_features['uid'] = tweet_info.uid.tolist()

    return tweet_features, dictionary_tweet

In [7]:
tweet_features,dictionary_tweet = get_tweet_features(tweet_info)

# Construct Feature Matrix

In [8]:
features = pd.merge(tweet_features,user_features,on="tid")

In [9]:
tweet_features = tweet_features.columns.tolist()[:-2]
user_features = ['race_A','race_B','race_H','race_O','gend_M','gend_U']

In [10]:
feature_names = tweet_features + user_features

# Methods for Prediction

In [11]:
y_map = {-1 : "Trump", 1 : "Clinton", 0 : "A_None"}
y_list = ['A_None','Clinton','Trump']

def my_mode(x):
    c_res = Counter(x).most_common()
    maxv = c_res[0][1]
    vals = [c_res[0][0]]
    for x in c_res[1:]:
        if x[1] == maxv:
            vals.append(x[0])
        else:
            break
    if len(vals) == 1:
        return vals[0]
    else:
        try:
            vals.remove(0)
        except:
            pass
        if len(vals) == 1:
            return vals[0]
        else:
            return 0

        
def set_annotations_array_indices(data, candidate):
    df = pd.DataFrame([x-1 for x in data],columns=['final annotation'])
    df['candidate'] = candidate
    return set_annotations(df)

def set_annotations(data,field='final annotation'):
    annotations = [x[field] if x['candidate'] == 'Hillary Clinton' else x[field]*-1 for i,x in data.iterrows()]
    return [y_map[x] for x in annotations]

def get_annotation_index(a):
    if a == 'A_None':
        return 0
    if a == 'Clinton':
        return 1
    if a == 'Trump':
        return 2
    return -1000

def get_pred_from_index(a):
    if a == 1:
        return 1
    elif a == 2:
        return -1
    return a

def get_evaluations(testY, testX, model):
    preds = model.predict(testX)
    prob_a_preds = model.predict_proba(testX)    
    no_avg = metrics.f1_score(testY, preds,average=None)
    print no_avg
    return { "ll" : metrics.log_loss(testY, prob_a_preds ,labels=['A_None','Clinton','Trump']),
             "f1_clint" : no_avg[1],
             "f1_trump" : no_avg[2],
             "f1_avg": (no_avg[1]+no_avg[2])/2.,
             "predictions": preds,
             "probabilities": prob_a_preds
           }

def get_from_sampling(train_tweet_ann, features, feature_cols,n=5):
    df = (train_tweet_ann.groupby(['tid','trinary']).uid
          .count()
          .reset_index()
          .pivot("tid","trinary","uid")
          .fillna(0)
          .reset_index())

    df.columns = ['tid','-1','0','1']
    df = pd.merge(df,features,on="tid")
    df = pd.merge(df, train_tweet_ann[['tid','candidate']].drop_duplicates(),how='left')
    print 'df shape: ', df.shape
    ys = []
    obs = []
    for i, x in df.iterrows():
        # set y_i
        prob_values = (x.loc[['-1','0','1']].astype(float).values / 
                       x.loc[['-1','0','1']].astype(float).values.sum())

        ys += set_annotations_array_indices(
                np.random.choice(3,size=n,p=prob_values.tolist()),
                x['candidate'])
        #print prob_values, ys[-1], x['candidate']
        obs += [x[feature_cols].tolist()] * n
    y = np.array(ys)
    X = np.array(obs)
    return X, y


# Majority Vote Functions

In [12]:

def run_majority_vote_models(context,
                             train_tweets,
                             test_tweets,
                             features,
                             feature_cols,
                             feature_cols_type="",
                             do_sampling=False,
                             add_manual=False,
                             max_depth=15,
                             n_estimators=2000,
                             n_to_sample=5,
                             class_weights=None,
                             return_classifier=False):
    
    print context
    
    # pick tweets based on context
    if context == 'All':
        train_tweet_ann = train_tweets
    else:
        train_tweet_ann = train_tweets[train_tweets.context == context]
    
    # get modal agreement score / gold standard label
    if do_sampling:
        train_tweet_annotations = train_tweet_ann
    else:
        train_tweet_annotations = (train_tweet_ann
                                   .groupby(['tid','uid','candidate'])
                                   .trinary.apply(my_mode)
                                   .reset_index())
    train_tweet_annotations.loc[:,'final annotation'] = train_tweet_annotations.trinary
    
    # merge in features
    train_annotations = pd.merge(train_tweet_annotations,features,on="tid")
    
    # construct test data
    test_annotations = pd.merge(test_tweets,features,on="tid")
    testX = test_annotations[feature_cols]
    testY = set_annotations(test_annotations)
    
    # Run model
    if do_sampling:
        X, y = get_from_sampling(train_tweet_ann, features, features,n_to_sample)
    else:
        y = set_annotations(train_annotations)
        X = train_annotations[feature_cols]
        
        
       
    if class_weights == "OUR_AUTO":
        y_labs = Counter(y)
        test_labs = Counter(testY)
        class_weights = {k : float(test_labs[k])/y_labs[k] for k in y_list}
    
    classifier=RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators,
                                     class_weight=class_weights)

    
    res = classifier.fit(X,y)
    evals = get_evaluations(testY,testX, res)

    ret_v = evals.values() + [context, feature_cols_type,do_sampling,
            add_manual,max_depth,n_estimators,
            str(class_weights["Clinton"])+"_"+str(class_weights["A_None"]) if class_weights else 1,res,
            Counter(res.predict(testX)), evals.keys()]
    if not return_classifier:
        return ret_v
    else:
        return ret_v + [classifier]
    
def run_func_majority_vote(x):
    return run_majority_vote_models(train_tweets=train_tweets,
                                     test_tweets=test_tweets,
                                     features=features,
                                     do_sampling=False,
                                     n_to_sample=5,
                                     **x)

# Constance

# Model Functions

In [13]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    out = e_x / e_x.sum()
    return out

# Compute LL
def compute_loglikelihood(alphas,gammas,classifier_predictions,
                          tweet_to_context_to_annotations_map,n_contexts):
    log_likelihood = 0
    for tw_it in range(N_TWEETS):
        # for each possible "true" tweet lable
        y_sum = 0
        for y_it in range(N_QUESTION_ANSWERS):
            # for each context
            m_prod = 1
            for m_it in range(n_contexts):
                s_sum = 0
                # for each possible "true" context value
                for s_it in range(N_QUESTION_ANSWERS):
                    # for each annotation
                    ann_v = 1
                    for ann_id, ann, conf in tweet_to_context_to_annotations_map[tw_it][m_it]:
                        ann_v *= alphas[ann_id,s_it,ann] * conf
                    s_sum += gammas[m_it,y_it,s_it] * ann_v
                m_prod *= s_sum
            y_sum += m_prod*classifier_predictions[tw_it,y_it]
        log_likelihood += np.log(y_sum)
    return log_likelihood

    
def get_p_s_im_a(y_it, alphas, gammas, context_to_annotations_map,n_contexts):
    p_s_im_a = np.zeros((n_contexts,N_QUESTION_ANSWERS))
    for m_it in range(n_contexts):
        for s_it in range(N_QUESTION_ANSWERS):
            # probability of s_im
            p_s_given_y_gamma = gammas[m_it,y_it,s_it]
            # p(y_im for all j | s_im, alpha)
            p_yj_given_s_alpha = 1
            for ann_id, ann, conf in context_to_annotations_map[m_it]:
                p_yj_given_s_alpha *= alphas[ann_id,s_it,ann] *conf
                
            # set the value for this expectation
            p_s_im_a[m_it,s_it] = (p_s_given_y_gamma * p_yj_given_s_alpha)
    return p_s_im_a

def e_step(classifier_predictions,alphas,gammas,y_prior,tweet_to_context_to_annotations_map,n_contexts):
    
    prob_latent = np.zeros((N_TWEETS,N_QUESTION_ANSWERS) + tuple([N_QUESTION_ANSWERS]*n_contexts))

    # for each tweet
    for tw_it in range(N_TWEETS):
        # for each possible "true" tweet label
        for y_it in range(N_QUESTION_ANSWERS):
            # get the classifier probability
            p_y_given_x_w = classifier_predictions[tw_it, y_it]
            
            prob_mat = np.zeros(tuple([N_QUESTION_ANSWERS]*n_contexts))
            p_s_im_a = get_p_s_im_a(y_it, alphas, gammas, tweet_to_context_to_annotations_map[tw_it],n_contexts)
            for index, x in np.ndenumerate(prob_mat):
                prob_mat[index] = np.sum(np.log([ p_s_im_a[v_ind,val] for v_ind,val in enumerate(index)]))
                
                
                
            # normalize the expectation over possible combinations of y_i, s_im
            prob_latent[tw_it,y_it] = np.log(p_y_given_x_w) + prob_mat + np.log(y_prior[y_it])
        prob_latent[tw_it] -= np.log(np.exp(prob_latent[tw_it]).sum())
        prob_latent[tw_it] = np.exp(prob_latent[tw_it]) 
    return prob_latent

def construct_marginals_over_latent_for_alpha(prob_latent,n_contexts):
    marginals = np.zeros((N_TWEETS,n_contexts,N_QUESTION_ANSWERS,N_QUESTION_ANSWERS))
    for tw_it in range(N_TWEETS):
        for m_it in range(n_contexts):
            for y_it in range(N_QUESTION_ANSWERS):
                marginals[tw_it,m_it,y_it,:] = prob_latent[tw_it,y_it].sum(axis=tuple([i for i in range(n_contexts) if i != m_it]))
    return marginals

def m_step_alpha(prob_latent,annotator_to_annotations_map,n_contexts,n_annotators,alpha_prior):
    alphas = np.zeros((n_annotators,N_QUESTION_ANSWERS,N_QUESTION_ANSWERS))
    marginals = construct_marginals_over_latent_for_alpha(prob_latent,n_contexts)
    # update alphas
    # for each annotator
    for ann_it in range(n_annotators):
        # get all of their annotations
        annotations = annotator_to_annotations_map[ann_it]

        for cell_s_it in range(N_QUESTION_ANSWERS):
            for cell_q_it in range(N_QUESTION_ANSWERS):
                numerator = 0
                denominator = 0

                # calculate their "expected" answer for each annotation
                for ann_tweet, ann_context, ann_value, ann_conf in annotations:
                    # sum over all possible y
                    for y_it in range(N_QUESTION_ANSWERS):  
                        v = marginals[ann_tweet,ann_context, y_it, cell_s_it]
                        if ann_value == cell_q_it:
                            numerator += v
                        denominator += v

                # set their alpha for this combination
                #print numerator, denominator
                numerator +=  alpha_prior[cell_s_it,cell_q_it]
                alphas[ann_it,cell_s_it,cell_q_it] = float(numerator) / (denominator + alpha_prior[cell_s_it,:].sum())
    return alphas
                
                
                
def m_step_gamma(prob_latent,n_contexts,gamma_prior):
    gammas = np.zeros((n_contexts,N_QUESTION_ANSWERS,N_QUESTION_ANSWERS)) 
    # update gammas
    # for each context
    for m_it in range(n_contexts): 
        # for all cominations of the latent variables
        for y_it in range(N_QUESTION_ANSWERS):
            for s_it in range(N_QUESTION_ANSWERS):
                axis_to_sum_over = tuple([0] + [i+1 for i in range(n_contexts) if i != m_it])
                numerator = prob_latent[:,y_it].sum(axis=axis_to_sum_over)[s_it] + gamma_prior[y_it,s_it]
                denominator = prob_latent[:, y_it].sum(axis=axis_to_sum_over).sum() + gamma_prior[y_it,:].sum()
                gammas[m_it,y_it,s_it] = numerator / float(denominator)
    return gammas


def m_step_classifier(prob_latent,n_contexts,n_samples_per_obs,max_depth,n_estimators,manual_labels,y_list,X):
    y_probs = prob_latent.sum(axis=tuple([x+2 for x in range(n_contexts)]))
    obs = []
    ys = []
    for q_it in range(N_TWEETS):
        nsamp = n_samples_per_obs
        vals = np.random.choice(N_QUESTION_ANSWERS,size=nsamp,p=y_probs[q_it,:]).tolist()
        ys += [y_list[m] for m in vals]
        obs += [X[q_it,:].tolist()] * n_samples_per_obs
    
    
    res = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators,n_jobs=10).fit(np.array(obs),np.array(ys))
    classifier_predictions = res.predict_proba(X)
    
    # Hack for structural ablations that deteriorate. 
    y_counter = Counter(ys)
    if len(y_counter) != 3:
        if 'A_None' not in y_counter:
            classifier_predictions = np.concatenate((np.zeros((len(classifier_predictions),1)),
                                                     classifier_predictions),
                                                    axis=1)
    
    return classifier_predictions, res

In [14]:

def run_model(train_tweets,test_tweets, features,  y_list,
              feature_cols, feature_cols_name, contexts_to_consider,
              alpha_initializer, alpha_prior, gamma_initializer, gamma_prior,
              do_manual,n_samples_per_obs,max_depth, n_estimators,use_conf_scaling,y_prior):
    
    context_names_info = 'all' if len(contexts_to_consider) == 6 else " ".join(contexts_to_consider)
    
    params_str_info = [feature_cols_name, context_names_info, 
                       str(alpha_initializer[0]), #alpha_prior[0,0],
                       str(gamma_initializer[0]), # gamma_prior[0,0],
                       do_manual,max_depth,n_estimators,n_samples_per_obs,use_conf_scaling]
    
    # get subset of tweets for model
    model_train_tweets = train_tweets[train_tweets.context.isin(contexts_to_consider)]
    
    # create test data
    test_annotations = pd.merge(test_tweets,features,on="tid")
    testX = test_annotations[feature_cols]
    testY = set_annotations(test_annotations)

    ## Construct maps to array indices
    annotator_map = {a : i for i,a in enumerate(set(model_train_tweets.annotator.tolist()))}
    tid_map =  {t : i for i,t in enumerate(set(model_train_tweets.tid.tolist()))}
    context_map =  {c : i for i,c in enumerate(set(model_train_tweets.context.tolist()))}

    # Constants for iterators
    n_contexts = len(context_map)
    n_annotators = len(annotator_map)

    # Constants/variables for algorithm
    old_ll = -9999999.
    new_ll = old_ll + EM_STOPPING_LL_THRESHOLD + 10
    em_iter = 0

    # Construct the X vector
    X = [None] * N_TWEETS
    for i, row in features.iterrows():
        if row['tid'] in tid_map:
            X[tid_map[row['tid']]] = row[feature_cols]
    X = np.array(X)

    # Generate initial values
    tweet_to_context_to_annotations_map = defaultdict(lambda: defaultdict(list))
    annotator_to_annotations_map = defaultdict(list)

    tweet_to_candidate_map = {}
    for i, row in model_train_tweets.iterrows():
        tw_id = tid_map[row['tid']]
        context_id = context_map[row['context']]
        annotator_id = annotator_map[row['annotator']]
        value = row['trinary']
        tweet_to_candidate_map[tw_id] = row['candidate']
        if use_conf_scaling:
            conf = [.25,.5,1][abs(row['value'] - 3)]
        else:
            conf= 1

        # make sure alignment is right across candidates
        if row['candidate'] == "Donald Trump":
            value *= -1
        # make values indices ensuring 0 is the base case, 1 is clinton, 2 is trump
        if value == -1:
            value = 2

        tweet_to_context_to_annotations_map[tw_id][context_id].append((annotator_id,value,conf))
        annotator_to_annotations_map[annotator_id].append((tw_id,context_id,value,conf))

    # Initialize Variables
    alphas = np.array([ np.array(alpha_initializer).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS))
                        for i in range(n_annotators) ])

    gammas = np.array([ np.array(gamma_initializer).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS))
                       for i in range(n_contexts) ])

    initial_predictions = [None] * N_TWEETS
    for tweet, context_anns in tweet_to_context_to_annotations_map.items():
        options = [0] * N_QUESTION_ANSWERS
        for context, anns in context_anns.items():
            for a in anns:
                options[a[1]] += 1
        initial_predictions[tweet] = np.array(options)/float(sum(options))

    classifier_predictions = np.array(initial_predictions)

    # set manual annotations ... for future work on semi-supervision with gold labels
    manual_labels = {}
        
    ## Run Algorithm
    intermediate_results = []
    while (new_ll-old_ll > EM_STOPPING_LL_THRESHOLD):
        # E step
        prob_latent = e_step(classifier_predictions,alphas,gammas,y_prior,tweet_to_context_to_annotations_map,n_contexts)

        # M step alpha
        alphas = m_step_alpha(prob_latent,annotator_to_annotations_map,n_contexts,n_annotators,alpha_prior)

        # M step gamma
        gammas = m_step_gamma(prob_latent,n_contexts,gamma_prior)

        classifier_predictions, model = m_step_classifier(prob_latent,n_contexts,n_samples_per_obs,
                                                          max_depth,n_estimators,manual_labels,y_list,X)

        old_ll = new_ll
        new_ll = compute_loglikelihood(alphas,gammas,classifier_predictions,
                                       tweet_to_context_to_annotations_map,n_contexts)
        em_iter += 1
        evals = get_evaluations(testY,testX, model)
        intermediate_results.append([em_iter,evals]+params_str_info+
                                    [prob_latent.sum(axis=tuple([x+2 for x in range(n_contexts)]))])
        print evals['f1_avg'], new_ll

    return (params_str_info+[em_iter,evals.values(),model], 
            intermediate_results, gammas, alphas, classifier_predictions,tid_map,context_map)


# Run Model

In [15]:
def run_func(x):
    return run_model(train_tweets=train_tweets, test_tweets=test_tweets, features=features,y_list=y_list,**x)

N_TWEETS = len(train_tweets.tid.unique())
N_QUESTION_ANSWERS = 3
EM_STOPPING_LL_THRESHOLD = 25

# Validation

In [18]:
# read in validation tweets
validation_tweets = pd.read_csv("data/validation_tweets.tsv",dtype={"tid":"str"})
validation_tweets = validation_tweets.rename(columns={"FINAL_ANNOTATION" : "final annotation"})
validation_tweets['has_link'] = validation_tweets.tweet_text.apply(lambda x: '{{link}}' in x)

In [19]:
# create user features for validation tweets
uids_to_featurize = validation_tweets.uid.unique().tolist() 
user_to_tid = {}
for i, row in validation_tweets.iterrows():
    user_to_tid[row['uid']] = row['tid']
user_validation_features = get_user_features(uids_to_featurize,user_to_tid,validation_tweets)

(318, 11)


In [20]:
# create tweet features for validation tweets
validation_tweets.loc[validation_tweets.candidate_interaction == "NONE",
                                                        "candidate_interaction"] = "No Mention of Target"
validation_tweets.loc[validation_tweets.candidate_interaction == "TextMention",
                                                        "candidate_interaction"] = "Plaintext Mention"


(validation_tweet_features,
 dictionary_tweet) = get_tweet_features(validation_tweets,dictionary_tweet,None,None)

In [21]:
user_validation_features['tid'] = user_validation_features.tid.astype("str")
validation_tweet_features['tid'] = validation_tweet_features.tid.astype("str")

In [22]:
validation_features = pd.merge(validation_tweet_features,user_validation_features,on="tid")

In [23]:
features['pol_Independent'] = 0
features['pol_na'] = 0
validation_tweets = validation_tweets.drop("has_link",axis=1)

In [24]:
features_w_validation = pd.concat((features,validation_features))

In [25]:
# Run the baseline prediction models
val_params_set = []
for context in train_tweets.context.unique().tolist() + ['All']:
    val_params_set.append({"context":context})

def run_func_majority_vote_validation(x):
    return run_majority_vote_models(train_tweets=train_tweets,
                                    test_tweets = validation_tweets,
                                    features=features_w_validation,
                                    feature_cols=feature_names,
                                    feature_cols_type="final_feature_set",
                                    do_sampling=False,
                                    n_estimators=3000,
                                    add_manual=False,
                                    max_depth=30,
                                    n_to_sample=0,
                                    class_weights="OUR_AUTO",
                                     **x)

np.random.seed(0)
modal_prediction_results_validation = []
for param in val_params_set:
    modal_prediction_results_validation.append(run_func_majority_vote_validation(param))

poltweet


  'precision', 'predicted', average, warn_for)


[0.00000000 0.84407484 0.55944056]
prevtweet
[0.00000000 0.84453782 0.58108108]
polparty
[0.00000000 0.83368421 0.53691275]
partialuser
[0.00000000 0.84188912 0.52554745]
fulluser
[0.00000000 0.81799591 0.42962963]
none
[0.00000000 0.82258065 0.39682540]
All
[0.00000000 0.83966245 0.57333333]


In [26]:
val_modal_results_df = pd.DataFrame([[x[1]]+x[3:6] + x[6:-3] + [x[-2]['A_None'],x[-2]['Trump']] 
                                     for x in modal_prediction_results_validation],
                                columns= ['ll','f1_clint','f1_avg','f1_trump',
                                          'context','feature_set','do_sampling','manual',
                                          'max_depth','n_estimators','weights','n_none','n_trump'])
val_modal_results_df.sort_values("f1_avg",ascending=False).head(15)

Unnamed: 0,ll,f1_clint,f1_avg,f1_trump,context,feature_set,do_sampling,manual,max_depth,n_estimators,weights,n_none,n_trump
1,0.650287,0.844538,0.712809,0.581081,prevtweet,final_feature_set,False,False,30,3000,0.669902912621_0.101694915254,0,49
6,0.622822,0.839662,0.706498,0.573333,All,final_feature_set,False,False,30,3000,0.614243323442_0.136363636364,0,51
0,0.613064,0.844075,0.701758,0.559441,poltweet,final_feature_set,False,False,30,3000,0.570247933884_0.193548387097,0,44
2,0.62666,0.833684,0.685298,0.536913,polparty,final_feature_set,False,False,30,3000,0.583098591549_0.255319148936,0,50
3,0.711638,0.841889,0.683718,0.525547,partialuser,final_feature_set,False,False,30,3000,0.726315789474_0.075,0,38
4,0.693789,0.817996,0.623813,0.42963,fulluser,final_feature_set,False,False,30,3000,0.704081632653_0.0821917808219,0,36
5,0.724261,0.822581,0.609703,0.396825,none,final_feature_set,False,False,30,3000,0.734042553191_0.0685714285714,2,27


In [27]:
alpha_init = [.98,0.01,0.01,0.01,.98,0.01,0.01,0.01,.98]
gamma_init =[.4,.3,.3,.2,.75,.05,.2,.05,.75]
alpha_prior = 0
gamma_prior = 0

ap = np.array(alpha_init).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS))*alpha_prior
gp = np.array(gamma_init).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS))*gamma_prior

In [28]:
# Full Model
np.random.seed(0)
model_result = run_model(train_tweets=train_tweets,
                          test_tweets=validation_tweets, 
                          features=features_w_validation, 
                          y_list=y_list,
                          feature_cols=feature_names,
                          feature_cols_name="final_feature_set",
                          contexts_to_consider=['polparty','poltweet','prevtweet','none','fulluser','partialuser'],
                          alpha_prior=ap, alpha_initializer=alpha_init,
                          gamma_prior=gp, gamma_initializer=gamma_init,
                          do_manual=False,
                          n_samples_per_obs=10,
                          max_depth=30,
                          n_estimators=3000,
                          use_conf_scaling=False,
                          y_prior = [.01,.495,.495])

[0.00000000 0.85344828 0.65000000]
0.751724137931 -5899.841733
[0.00000000 0.85344828 0.65000000]
0.751724137931 -5747.08511901
[0.00000000 0.85097192 0.64150943]
0.746240678104 -5681.03936085
[0.12500000 0.85529158 0.63694268]
0.746117125917 -5652.68152299
[0.12500000 0.86274510 0.67080745]
0.766776275728 -5637.98632087


In [29]:
# Model - No Context 

# Note the y-prior difference -> the Log-loss on these models below up if we didn't do this.
# Because its fair to say that we could have optimized these ablations as well (they are, in a sense, 
# separate models), we chose better y-priors for them to make a more fair comparison.

train_tweets_copy_all_contexts_same = train_tweets.copy()
train_tweets_copy_all_contexts_same['context'] = 'ALL_SAME'

np.random.seed(0)

model_res_no_context = run_model(train_tweets=train_tweets_copy_all_contexts_same,
                      test_tweets=validation_tweets, 
                      features=features_w_validation, 
                      y_list=y_list,
                      feature_cols=feature_names,
                      feature_cols_name="final_feature_set",
                      contexts_to_consider=["ALL_SAME"],
                      alpha_prior=np.array(alpha_init).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS)),
                      alpha_initializer=alpha_init,
                      gamma_prior=np.array(gamma_init).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS)),
                      gamma_initializer=gamma_init,
                      do_manual=False,
                      n_samples_per_obs=10,
                      max_depth=30,
                      n_estimators=3000,
                      use_conf_scaling=False,
                      y_prior = [.1,.45,.45])

[0.00000000 0.85161290 0.64150943]
0.746561168594 -6060.39340612
[0.00000000 0.85224839 0.63694268]
0.744595534582 -5976.40273798
[0.00000000 0.85407725 0.64556962]
0.749823436736 -5958.71048625


In [30]:
# Model - One Context 
np.random.seed(0)

model_res_one_context = run_model(train_tweets=train_tweets,
                      test_tweets=validation_tweets, 
                      features=features_w_validation, 
                      y_list=y_list,
                      feature_cols=feature_names,
                      feature_cols_name="final_feature_set",
                      contexts_to_consider=["poltweet"],
                      alpha_prior=np.array(alpha_init).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS)),
                      alpha_initializer=alpha_init,
                      gamma_prior=np.array(gamma_init).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS)),
                      gamma_initializer=gamma_init,
                      do_manual=False,
                      n_samples_per_obs=10,
                      max_depth=30,
                      n_estimators=3000,
                      use_conf_scaling=False,
                      y_prior = [.1,.45,.45])

[0.00000000 0.84810127 0.60000000]
0.724050632911 -498.038355669
[0.00000000 0.85052632 0.60402685]
0.727276580714 -453.428815318
[0.00000000 0.85232068 0.61333333]
0.732827004219 -423.108463813
[0.00000000 0.85052632 0.60402685]
0.727276580714 -413.036364162


In [31]:
# Model - One Annotator
train_tweets_copy_all_annotators_same = train_tweets.copy()
train_tweets_copy_all_annotators_same['annotator'] = 'ANNOTATOR'
np.random.seed(0)

model_res_no_ann = run_model(train_tweets=train_tweets_copy_all_annotators_same,
                      test_tweets=validation_tweets, 
                      features=features_w_validation, 
                      y_list=y_list,
                      feature_cols=feature_names,
                      feature_cols_name="final_feature_set",
                      contexts_to_consider=['polparty','poltweet','prevtweet','none','fulluser','partialuser'],
                      alpha_prior=np.array(alpha_init).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS)),
                      alpha_initializer=alpha_init,
                      gamma_prior=np.array(gamma_init).reshape((N_QUESTION_ANSWERS,N_QUESTION_ANSWERS)),
                      gamma_initializer=gamma_init,
                      do_manual=False,
                      n_samples_per_obs=10,
                      max_depth=30,
                      n_estimators=3000,
                      use_conf_scaling=False,
                      y_prior = [.1,.45,.45])

[0.12500000 0.85529158 0.63694268]
0.746117125917 -6937.37173774
[0.11764706 0.86026201 0.67080745]
0.765534731075 -6813.38360958
[0.11764706 0.85217391 0.64150943]
0.746841673503 -6763.13174349
[0.10526316 0.85152838 0.64150943]
0.746518909121 -6731.4186799
[0.19047619 0.85274725 0.65000000]
0.751373626374 -6710.66308483


In [37]:
# F1 Predictions, write out
val_annotations = set_annotations(validation_tweets)

modal = [zip(val_annotations,x[2], [x[6]]*len(x[2])) for x in modal_prediction_results_validation]
non_modal = [zip(val_annotations,x[0][-2][2], [y]*len(x[0][-2][2])) 
                 for y,x in [("Full",model_result),
                             ("Pol Tweet Context",model_res_one_context),
                             ("One Context",model_res_no_context),
                             ("One Annotator",model_res_no_ann)]]
results_df = pd.DataFrame([y for x in modal + non_modal for y in x],columns=['actual','predicted','model'])
results_df.to_csv("results/model_results.csv",index=False)

In [38]:
# Log Loss Predictions, write out
val_annotations = set_annotations(validation_tweets)

modal = [zip(val_annotations,x[0][:,1],x[0][:,2],[x[6]]*len(x[0])) for x in modal_prediction_results_validation]
non_modal = [zip(val_annotations,x[0][-2][0][:,1],x[0][-2][0][:,2], [y]*len(x[0][-2][0])) 
                 for y,x in [("Full",model_result),
                             ("Pol Tweet Context",model_res_one_context),
                             ("One Context",model_res_no_context),
                             ("One Annotator",model_res_no_ann)]]
results_df = pd.DataFrame([y for x in modal + non_modal for y in x],columns=['actual','clinton_prob','trump_prob','model'])
results_df.to_csv("results/model_results_ll.csv",index=False)

In [34]:
# gamma
for k, v in model_result[-1].items():
    r = [(k,y_list[i], y_list[j], model_result[-5][v][i,j]) for i in range(3) for j in range(3)]
    for x in r:
        print ", ".join([str(y) for y in x])

none, A_None, A_None, 0.999998082164
none, A_None, Clinton, 1.91781604772e-06
none, A_None, Trump, 2.01110368158e-11
none, Clinton, A_None, 0.23590827035
none, Clinton, Clinton, 0.761269289358
none, Clinton, Trump, 0.00282244029141
none, Trump, A_None, 0.291378123205
none, Trump, Clinton, 0.00807223635941
none, Trump, Trump, 0.700549640435
partialuser, A_None, A_None, 0.99528868331
partialuser, A_None, Clinton, 0.00471126178026
partialuser, A_None, Trump, 5.49097049215e-08
partialuser, Clinton, A_None, 0.220455391771
partialuser, Clinton, Clinton, 0.770076540135
partialuser, Clinton, Trump, 0.0094680680947
partialuser, Trump, A_None, 0.202557451929
partialuser, Trump, Clinton, 0.0171579614647
partialuser, Trump, Trump, 0.780284586606
fulluser, A_None, A_None, 0.972443995475
fulluser, A_None, Clinton, 3.12456443242e-07
fulluser, A_None, Trump, 0.0275556920689
fulluser, Clinton, A_None, 0.206686465974
fulluser, Clinton, Clinton, 0.787116780343
fulluser, Clinton, Trump, 0.00619675368298
f