In [11]:
from operator import add
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from pyspark import SparkContext, SparkConf

import numpy as np
def float_formatter(x):
    """Render a float with exactly six digits after the decimal point."""
    return "%.6f" % x


# Print numpy arrays with fixed six-decimal floats, 100-character lines, and
# no summarisation until an array exceeds 10000 elements.
np.set_printoptions(
    formatter={'float_kind': float_formatter},
    linewidth=100,
    threshold=10000,
)

# Directory that the cells below read their inputs from and write results to.
output_dir = "../output/run_4/"

In [12]:
# Load up Spark with a local master that uses every available core.
conf = (SparkConf().setMaster("local[*]").setAppName("My app"))
# getOrCreate returns the context already running in this kernel if there is
# one, so re-running this cell no longer fails with "Cannot run multiple
# SparkContexts at once"; when none is running it builds one from `conf`.
sc = SparkContext.getOrCreate(conf=conf)

In [13]:
import os
from vaderSentiment.vaderSentiment import sentiment
def get_sentiment_for_line(line):
    """Yield one ``((uid, identity), score)`` pair per identity on a line.

    ``line`` is a tab-separated record with three fields: a uid, a
    comma-separated list of identities, and the text. The text is scored once
    with ``sentiment(..., alpha_normalizer=15.)`` and the ``'compound'`` entry
    of the result is emitted for every identity listed on the line.

    Raises:
        ValueError: if the line has fewer than three tab-separated fields.
    """
    # maxsplit=2 keeps tab characters that occur inside the text itself in the
    # text field, instead of failing the three-way unpack (and with it the
    # whole Spark job). Lines with exactly three fields parse as before.
    uid, identities, text = line.strip().split("\t", 2)
    sent = sentiment(text, alpha_normalizer=15.)['compound']
    for identity in identities.split(","):
        yield ((uid, identity), sent)

# Read the raw training text; 600 is passed as textFile's second positional
# argument (minPartitions in PySpark).
raw_text = sc.textFile(os.path.join(output_dir,"training_raw_text_fin.txt"),600)

# Fold the scores of each (uid, identity) key into a running (sum, count)
# pair: the first function adds one score within a partition, the second
# merges the partial pairs from different partitions.
_sent_sum_count = raw_text.flatMap(get_sentiment_for_line).aggregateByKey(
    (0,0),
    lambda acc, score: (acc[0] + score, acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]))

# Turn each (sum, count) into a mean and bring all results back to the driver.
sent_per_uid_identity = _sent_sum_count.mapValues(
    lambda sum_count: float(sum_count[0])/float(sum_count[1])).collect()

In [14]:
# Stop the SparkContext: the aggregated results were already collected into
# sent_per_uid_identity, and the cells below use pandas only.
sc.stop()

In [15]:
import pandas as pd
# One row per (uid, identity) key with its mean sentiment value. The column
# names go into the constructor (rather than being assigned to df.columns
# afterwards) so that an empty result still yields a frame with these three
# columns instead of a length-mismatch ValueError.
df = pd.DataFrame([(uid, iden_id, val) for (uid, iden_id), val in sent_per_uid_identity],
                  columns=['uid','iden_id','val'])
# Written with pandas' default settings, so the row index is included in the
# file exactly as before.
df.to_csv(os.path.join(output_dir,"user_to_identity_to_simple_sent_val.csv"))

In [None]:
import numpy as np
# Map the integer in the first tab-separated column to the string in the
# second column. The with-block closes the file handle (the original left it
# open), and whitespace-only lines are skipped so a blank line (for example
# at the end of the file) does not abort the load.
with open(os.path.join(output_dir,"index_to_identity_final.tsv")) as index_file:
    index_to_identity = {int(line.split("\t")[0]) : line.strip().split("\t")[1]
                         for line in index_file if line.strip()}

from constraints import IDENTITY_PREFIX
# Read the identity list, one entry per line, closing the file afterwards
# (the original left the handle open). No line is skipped: each identity's
# position in the file determines the id it is paired with below.
with open("../data/identity_data/final_identities_list.txt") as identities_file:
    identities = [x.strip() for x in identities_file]

# Pair every identity with an id made of IDENTITY_PREFIX plus its 0-based
# position in the list.
identity_to_id = pd.DataFrame([(identity,IDENTITY_PREFIX+str(i)) for i, identity in enumerate(identities)],
                              columns=['identity','iden_id'])

In [None]:
# Average the per-user values for each identity id, then attach the identity
# string that belongs to each id.
res = (df.groupby("iden_id")['val'].mean()
         .reset_index()
         .rename(columns={'val': 'mean_val'})
         .merge(identity_to_id, on='iden_id'))

In [None]:
import pandas as pd
import msgpack
import seaborn as sns
# Apply seaborn's default plot styling to the figures below.
sns.set()
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# The msgpack file is binary, so it is opened in 'rb' mode (text mode can
# corrupt the bytes or fail to decode them) and closed by the with-block.
with open(os.path.join(output_dir,"sent_res_final/0_sent_basic.mpack"), 'rb') as mpack_file:
    sent_model_info = msgpack.load(mpack_file)
# Given a path, np.load opens the .npy file in binary mode and closes it
# itself, so no handle is left open here either.
x = np.load(os.path.join(output_dir,"sent_res_final/499_mu.npy"))
# Keep only the entries of x whose id in index_to_ids ends with 'e'; the id
# with that last character removed becomes iden_id.
# NOTE(review): the meaning of the 'e' suffix is not visible here - confirm.
my_res = pd.DataFrame([(sent_model_info['index_to_ids'][i][:-1], z)
                       for i,z in enumerate(x) if sent_model_info['index_to_ids'][i].endswith('e')],
                      columns=['iden_id','my_res'])

In [None]:
# Join the mean VADER value and the model value for each identity id.
# iden_id is the only column the two frames share, so naming it explicitly
# gives the same result as the implicit join.
d = pd.merge(res, my_res, on='iden_id')

# Scatter plot with a fitted regression line. The arguments are passed by
# keyword because current seaborn releases no longer accept x, y and data
# positionally; the keyword form works on old and new versions alike.
sns.regplot(x='mean_val', y='my_res', data=d)

In [None]:
from sklearn.preprocessing import scale
# Absolute gap between the two scores after each has been passed through
# sklearn's scale(), stored as a new column on d.
d['diffv'] = np.abs(scale(d['mean_val']) - scale(d['my_res']))

In [None]:
# Show the row(s) of d for one specific identity.
d.loc[d['identity'] == 'thot']

In [17]:
# Keep the two scaled scores as their own columns so they can be read next
# to diffv in the table below.
d['scale_vader'] = scale(d.mean_val)
d['scale_my'] = scale(d.my_res)
# Largest disagreements first. DataFrame.sort was removed from pandas;
# sort_values is its replacement and returns the same ordering.
d.sort_values('diffv',ascending=False)

Unnamed: 0,iden_id,mean_val,identity,my_res,diffv,scale_vader,scale_my
199,i_278,0.783361,best friend,1.946802,2.713278,4.046961,1.333683
162,i_244,0.058594,alcoholic,-2.042480,2.488524,0.051773,-2.436751
308,i_98,0.056920,gangster,-2.042217,2.479052,0.042550,-2.436502
278,i_70,0.038896,pimp,-2.090867,2.425678,-0.056805,-2.482483
207,i_285,0.035869,addict,-1.883085,2.212606,-0.073493,-2.286100
10,i_107,0.048778,homeless,-1.653495,2.066773,-0.002332,-2.069106
83,i_173,-0.054217,firefighter,2.082671,2.032177,-0.570078,1.462099
153,i_236,0.498631,friend,1.097798,1.946167,2.477422,0.531255
187,i_267,0.069151,goon,-1.188658,1.739740,0.109971,-1.629769
261,i_55,-0.063634,arab,-1.956306,1.733314,-0.621990,-2.355304


In [None]:
# Distribution of the per-identity mean VADER compound score.
sns.distplot(d.mean_val)

In [None]:
# Distribution of the per-identity values loaded from 499_mu.npy.
sns.distplot(d.my_res)

In [None]:
# Distribution of the absolute difference between the two scaled scores.
sns.distplot(d.diffv)