In [18]:
import numpy as np 
import io
import sys
import codecs
from user import *
import glob
import cPickle as pickle
import os
from twitter_dm.utility.general_utils import read_grouped_by_newline_file
from twitter_dm.utility.general_utils import tab_stringify_newline as tsn
from collections import defaultdict, Counter
from textunit import TextUnit
from constraints import get_id_and_value_map
from constraints import IDENTITY_PREFIX, SENTWORD_PREFIX
from math import log
from vaderSentiment.vaderSentiment import sentiment
from pred_models import *
from stat_util import *
from math import log
from vaderSentiment.vaderSentiment import sentiment
from twitter_dm.nlp.Tokenize import extract_tokens_twokenize_and_regex

# Additive smoothing constant added to every per-user identity count.
SMOOTHING_PARAM = 1

# Wrap stdout in a UTF-8 stream writer so text written to it is encoded as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)


def float_formatter(x):
    """Format a number with exactly six digits after the decimal point."""
    return "%.6f" % x


# Make printed numpy arrays use the fixed six-decimal float format above.
np.set_printoptions(formatter={'float_kind': float_formatter},
                    linewidth=100,
                    threshold=10000)

# Directory that the files read and written by this notebook are resolved against.
output_dir = "../output/run_4/"

In [19]:
# Identity vocabulary: one identity phrase per line. The line position becomes
# the numeric suffix of the identity's id (IDENTITY_PREFIX + position).
# The file is opened in a `with` block so the handle is closed deterministically.
with io.open("../data/identity_data/final_identities_list.txt") as identity_file:
    identities = [line.strip() for line in identity_file]
identity_to_id = {identity: IDENTITY_PREFIX + str(i) for i, identity in enumerate(identities)}
id_to_identity = {v: k for k, v in identity_to_id.items()}

# Mapping between integer indices and identity ids, stored as tab-separated
# "<index>\t<id>" rows.
with io.open(os.path.join(output_dir, "index_to_id_final.tsv")) as index_file:
    index_to_id = {int(row.split("\t")[0]): row.strip().split("\t")[1]
                   for row in index_file}
id_to_index = {v: k for k, v in index_to_id.items()}

# Identity ids laid out so that position == index; indices that do not appear
# in the TSV keep the placeholder None.
ids_in_index_order = [None] * len(identities)
for k, v in index_to_id.items():
    ids_in_index_order[k] = v

In [20]:
# Materialize the result of load_users into a list so it can be indexed and
# measured with len() by later cells.
# NOTE(review): load_users is not defined in this notebook; presumably it comes
# from the `from user import *` star import — confirm.
users = list(load_users(output_dir))

loading user data...
 ... 
 loaded! 


In [21]:
# Sanity check: display how many users were loaded.
len(users)

44886

In [22]:
# Write the user ids, one per line, in the same order as `users`.
# The `with` block guarantees the file is flushed and closed even if a write fails.
with io.open(os.path.join(output_dir, "user_uids.txt"), "w") as dout:
    for u in users:
        dout.write(unicode(u.uid) + "\n")

In [23]:
import pandas as pd

# Build the inputs used by the simple sentiment baseline models.

# Two-column frame (identity id, index) used to attach an index to each identity.
i2ind = pd.DataFrame(id_to_index.items())
i2ind.columns = ['iden_id', 'index']

simple_sent_path = os.path.join(output_dir, "user_to_identity_to_simple_sent_val.csv")
simple_sent_data = pd.read_csv(simple_sent_path)

# Mean of `val` per identity, joined with that identity's index.
overall_simple_sent = (simple_sent_data[['iden_id', 'val']]
                       .groupby("iden_id")
                       .mean()
                       .reset_index())
overall_simple_sent = pd.merge(overall_simple_sent, i2ind, on='iden_id')

# Dense vector of the per-identity means; identities absent from the CSV stay 0.
# After the merge each row is (iden_id, val, index).
simple_sent_vector = np.zeros(len(i2ind))
for _iden_id, mean_val, vector_index in overall_simple_sent.values.tolist():
    simple_sent_vector[vector_index] = mean_val

# Nested lookup built from every CSV row whose identity is in id_to_index.
# Columns are addressed by position (1, 2, 3); presumably user id, identity id
# and value — TODO confirm against the CSV header.
user_simple_sent_data = defaultdict(dict)
for record in simple_sent_data.values.tolist():
    iden = record[2]
    if iden not in id_to_index:
        continue
    user_simple_sent_data[record[1]][iden] = record[3]

In [24]:
# Stack each user's training and test vectors into two matrices with one row
# per user and one column per identity.
count_shape = (len(users), len(identity_to_id))
training_user_counts = np.zeros(count_shape)
test_user_counts = np.zeros(count_shape)
for row, user in enumerate(users):
    training_user_counts[row] = user.training_vector
    test_user_counts[row] = user.test_vector

In [25]:
# Spark setup: configure INFO-level logging with a timestamped format, then
# start a SparkContext with master "local[*]".
import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')
from pyspark import SparkContext, SparkConf

conf = SparkConf()
conf.setMaster("local[*]")
conf.setAppName("My app")
# Value passed to Spark's "spark.local.dir" setting.
conf.set("spark.local.dir", "/usr1/kjoseph/spark_tmp")
sc = SparkContext(conf=conf)

In [30]:
def _mask_identity(tokens, identity_name):
    """Return the tweet text with mentions of ``identity_name`` replaced by a placeholder.

    Single tokens equal (case-insensitively) to the identity or its plural
    (identity + "s") become the word 'identity'. For multi-word identities the
    joined text additionally has the phrase replaced by "compound identity".
    """
    surface_forms = (identity_name, identity_name + "s")
    masked = " ".join('identity' if tok.lower() in surface_forms else tok
                      for tok in tokens)
    if ' ' in identity_name:
        # Replace the plural first: replacing the singular first would consume
        # the prefix of every plural mention and leave a stray trailing "s".
        masked = masked.replace(identity_name + 's', "compound identity")
        masked = masked.replace(identity_name, "compound identity")
    return masked


def run_user(data):
    """Score one user's held-out tweets under every association and sentiment model.

    ``data`` is an 11-element sequence:
        uid, test_vector, training_vector, test_raw_text,
        test_deflection_strings, test_identities_per_tweet,
        u_phi, u_eta, simple_assoc, simple_sent, usimpsent_uid
    (``uid`` is unpacked but not used here.)

    Besides ``data`` the function reads these notebook-level names, which must
    exist by the time the function is shipped to Spark: simple_sent_vector,
    id_to_index, id_to_identity, ids_in_index_order, sent_index_to_ids,
    n_identities, log_eq, rsv, pv and SMOOTHING_PARAM.

    Returns a one-element list ``[[spot, perpl, n_obs]]`` (shaped for flatMap):
        spot   -- Counter: model name -> summed position of the true identity
                  when identities are ordered by descending probability
        perpl  -- Counter: model name -> summed log-probability
        n_obs  -- number of (tweet, identity) observations scored
    """
    (uid, test_vector, training_vector, test_raw_text, test_deflection_strings,
     test_identities_per_tweet, u_phi, u_eta, simple_assoc, simple_sent, usimpsent_uid) = data

    perpl = Counter()
    spot = Counter()
    n_obs = 0

    # Sentiment models: the shared baseline passed in with the record, plus a
    # per-user variant that additionally receives this user's own sentiment info.
    user_simple_sent = SimpleSent("u_sent", simple_sent_vector, id_to_index, rescale_value=rsv,
                                  user_sent_info=usimpsent_uid, power_val=pv)
    sent_models = [simple_sent, user_simple_sent]

    # Association models: additively smoothed per-user frequencies and the
    # softmax of the user's eta vector, next to the shared baseline.
    user_assoc = SimpleMult("user_assoc",
                            (training_vector + SMOOTHING_PARAM) /
                            (training_vector.sum() + n_identities * SMOOTHING_PARAM),
                            div_by_sum=False)
    our_assoc = SimpleMult("our_assoc", softmax(u_eta))

    # Log-probability of the test counts under each associative model
    for m in [simple_assoc, user_assoc, our_assoc]:
        perpl[m.name()] += (test_vector * m.log_prob()).sum()

    # Per-user values keyed by the ids in sent_index_to_ids
    user_values = {v: u_phi[s_iter] for s_iter, v in enumerate(sent_index_to_ids)}
    our_sent = OurSent(ids_in_index_order, user_values)

    for tw_iter, test_text in enumerate(test_raw_text):
        # Rewrite "uv." references in the deflection string to "self.uv."
        test_deflection_str = test_deflection_strings[tw_iter].replace("uv.", "self.uv.")

        # Tokenization does not depend on the identity, so it is done at most
        # once per tweet, and only if the tweet has an identity to score.
        tokens = None

        for identity in test_identities_per_tweet[tw_iter]:
            n_obs += 1
            index_of_identity = id_to_index[identity]

            if tokens is None:
                tokens = list(extract_tokens_twokenize_and_regex(test_text.decode("utf8"), [], [],
                                                                 make_lowercase=False,
                                                                 do_lemmatize=False,
                                                                 remove_possessive=False,
                                                                 do_arabic_stemming=False))

            # Sentiment of the tweet with this identity replaced by a placeholder
            id_test_text = _mask_identity(tokens, id_to_identity[identity])
            test_sent = sentiment(id_test_text)['compound']

            for m in sent_models:
                name = m.name()
                probs = m.compute_prob(test_sent)
                try:
                    perpl[name] += log(probs[index_of_identity])
                except ValueError:
                    # math.log rejects non-positive probabilities; such an
                    # observation adds nothing to the log-probability sum
                    # (it is still counted in n_obs and in the rank sum).
                    pass
                spot[name] += np.where((-probs).argsort() == index_of_identity)[0][0]

            mentions_identity = (identity + 'e' in test_deflection_str or
                                 identity + 'p' in test_deflection_str or
                                 identity + 'a' in test_deflection_str)
            if not len(test_deflection_str) or not mentions_identity:
                # The deflection string says nothing about this identity:
                # fall back to the uniform log-probability and the middle rank.
                perpl['our_sent'] += log_eq
                spot['our_sent'] += n_identities / 2.
            else:
                se_prob = our_sent.compute_prob(identity, test_deflection_str)
                try:
                    perpl['our_sent'] += log(se_prob[index_of_identity])
                except ValueError:
                    # Same best-effort handling as above for non-positive probabilities.
                    pass
                spot['our_sent'] += np.where((-se_prob).argsort() == index_of_identity)[0][0]

    return [[spot, perpl, n_obs]]

In [31]:
# Load the 'ids_to_index' / 'index_to_ids' entries of the sentiment model file.
# The file is read in binary mode inside a `with` block so the handle is closed.
# NOTE(review): `msgpack` is not imported explicitly in this notebook;
# presumably it arrives through one of the star imports — confirm.
with open(os.path.join(output_dir, "sent_res_final", "0_sent_basic.mpack"), "rb") as mpack_file:
    sent_model_info = msgpack.load(mpack_file)
sent_ids_to_index = sent_model_info['ids_to_index']
sent_index_to_ids = sent_model_info['index_to_ids']

# Values forwarded to SimpleSent as power_val / rescale_value.
pv = 1
rsv = 1
n_identities = len(id_to_index)
# Log-probability of an identity under a uniform distribution over all identities.
log_eq = log(1. / float(n_identities))

# Write the sentiment-model ids, one per line, in index order.
with io.open(os.path.join(output_dir, "sent_ids_list.txt"), "w") as dout:
    for s in sent_index_to_ids:
        dout.write(unicode(s) + "\n")

# One row per (iteration, model): summed log-probability per observation and
# summed rank per observation. The results file is opened only once the inputs
# above have loaded, and is closed even if an iteration fails.
with open(os.path.join(output_dir, "results.tsv"), "w") as of:
    for iteration in ['300']:  # extend with e.g. '600','700','800','900' to score other iterations
        eta = np.load(os.path.join(output_dir, "assoc_res_final", iteration + "_assoc_eta.npy"))
        phi = np.load(os.path.join(output_dir, "sent_res_final", iteration + "_sent_phi.npy"))

        simp_sent = SimpleSent("sent_basic", simple_sent_vector, id_to_index,
                               rescale_value=rsv, power_val=pv)
        simp_assoc = SimpleMult("simp_assoc", training_user_counts.sum(axis=0))

        # One record per user, in the field order that run_user unpacks.
        data = [[u.uid,
                 u.test_vector,
                 u.training_vector,
                 u.test_raw_text,
                 u.test_deflection_strings,
                 u.test_identities_per_tweet,
                 phi[i], eta[i], simp_assoc, simp_sent,
                 user_simple_sent_data[int(u.uid)]] for i, u in enumerate(users)]

        d = sc.parallelize(data, 512).flatMap(run_user).collect()

        # Sum the per-user counters and observation counts.
        perpl = Counter()
        spot_dat = Counter()
        n_tot_obs = 0
        for spot, ppl, nobs in d:
            perpl.update(ppl)
            spot_dat.update(spot)
            n_tot_obs += nobs

        for k, v in perpl.items():
            line = tsn([iteration, k, v / float(n_tot_obs), spot_dat[k] / float(n_tot_obs)])
            of.write(line)
            print(line)

300	our_assoc	-4.36447039092	0.0

300	simp_assoc	-4.86241343742	0.0

300	user_assoc	-4.47382903111	0.0

300	sent_basic	-5.71403519446	134.885093981

300	u_sent	-5.68876381475	127.770838345

300	our_sent	-10.4174328436	126.30917173



In [32]:
# Shut down the SparkContext created earlier in the notebook.
sc.stop()