In [None]:
import pandas as pd
import numpy
import re
import subprocess
import os
import ipyparallel

In [None]:
# Load the tab-separated coder export and keep only the columns used below.
data = pd.read_csv('data/sensitive/coder1_all.tsv', sep='\t').loc[
    :, ['uni', 'Participant', 'Start', 'Excerpt Copy']
]

In [None]:
# Show the column labels that remain after the selection above (as an array).
data.columns.values

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
def get_q(txt):
    """Extract the question text from a coded excerpt.

    Searches ``txt`` for the first ``Question: <text>;`` span and returns
    ``<text>``. The match is non-greedy, so it stops at the first semicolon
    following the marker.

    Parameters
    ----------
    txt : str
        Raw excerpt text. Anything that is not a string (for example the
        float NaN pandas produces for an empty cell) is treated as having
        no question.

    Returns
    -------
    str
        The captured question, or ``''`` when the marker is absent or the
        input is not a string.
    """
    # re.search raises TypeError on non-strings; a missing cell should simply
    # yield "no question" rather than abort the whole .apply().
    if not isinstance(txt, str):
        return ''
    m = re.search('Question: (.+?);', txt)
    return m.group(1) if m else ''

def get_txt(txt):
    """Extract the answer text from a coded excerpt.

    Returns everything that follows the first ``Answer: `` marker in ``txt``,
    including any further lines, with trailing line terminators removed.

    Parameters
    ----------
    txt : str
        Raw excerpt text. Anything that is not a string (for example the
        float NaN pandas produces for an empty cell) is treated as having
        no answer.

    Returns
    -------
    str
        The captured answer, or ``''`` when the marker is absent or the
        input is not a string.
    """
    # re.search raises TypeError on non-strings; treat them as "no answer".
    if not isinstance(txt, str):
        return ''
    # DOTALL (not MULTILINE) is what lets '.' cross newlines. MULTILINE only
    # changes '^'/'$', which this pattern does not use, so with it a
    # multi-line answer was silently cut off at its first line break.
    m = re.search('Answer: (.+)', txt, re.DOTALL)
    if not m:
        return ''
    # Drop trailing line terminators so single-line excerpts give the same
    # result whether or not the cell ends with a newline.
    return m.group(1).rstrip('\r\n')

# Parse each raw excerpt into its question and answer parts, then keep only
# the identifying columns plus the two parsed fields.
excerpts = data['Excerpt Copy']
data = data.assign(
    Question=excerpts.apply(get_q),
    Answer=excerpts.apply(get_txt),
)
data = data[['uni', 'Participant', 'Start', 'Question', 'Answer']]
data.head()

In [None]:
#data = pd.read_csv('data/just_answers.tsv', sep='\t')

In [None]:
# Sanity check: store the Python type of every Answer value in a scratch
# column and tally how often each type occurs.
data['tmp'] = data['Answer'].apply(type)
data['tmp'].value_counts()

In [None]:
# NOTE(review): from here on only the first 20 rows are processed. This looks
# like a trial-run limit -- confirm before running on the full data set.
data = data.head(20)

# Write each answer into its own file, named <uni>_<Participant>_<Start>_.txt.
out_dir = 'data/sensitive/sentiment'
# open() does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

j = 0
n = len(data)
for j, (_, d) in enumerate(data.iterrows(), start=1):
    # format() stringifies every field, so a non-string `uni` cannot break
    # the file-name construction.
    fname = '{}_{}_{}_.txt'.format(d.uni, d.Participant, d.Start)
    # Explicit encoding: keeps the files independent of the machine's locale.
    with open(os.path.join(out_dir, fname), 'w', encoding='utf-8') as out:
        out.write(d.Answer)

    if j % 1000 == 0:
        print(j, 'of', n)
print('Done!')

In [None]:
print('Creating cluster client and view...')
c = ipyparallel.Client()
# Run os.chdir on every engine with this process's working directory as the
# argument, then take a load-balanced view for dispatching the work.
every_engine = c[:]
every_engine.apply_sync(os.chdir, os.getcwd())
view = c.load_balanced_view()

In [None]:
def ippRunSentiment(fname, dataDir='data/sensitive/sentiment',
                    corenlpDir='/home/jwlock/research/workspace/stanford-corenlp-full-2015-12-09'):
    """Run the Stanford sentiment pipeline on one file and summarise the scores.

    Launches ``java ... edu.stanford.nlp.sentiment.SentimentPipeline -file
    <dataDir>/<fname>``, reads its standard output, maps every output line
    that is a known sentiment label to a number and returns summary
    statistics over those numbers.

    Parameters
    ----------
    fname : str
        Name of the document inside ``dataDir``.
    dataDir : str, optional
        Directory holding the documents. Defaults to the directory the
        notebook writes the answer files to and lists them from.
    corenlpDir : str, optional
        Directory containing the CoreNLP jars; ``<corenlpDir>/*`` is passed
        to ``java -cp``.

    Returns
    -------
    dict
        ``name`` (``fname``), ``mean`` and ``sd`` of the numeric scores (NaN
        when no label was found), ``n`` (number of scored lines) and ``raw``
        (the pipeline's full standard output as text).
    """
    # Imports are kept inside the function on purpose: it is dispatched with
    # view.map_async and executed on ipyparallel engines, which do not share
    # the notebook's namespace. For the same reason no helper functions are
    # split out of it.
    import os
    import subprocess
    from numpy import mean
    from numpy import std

    # Stanford sentiment gives text ratings, we want numeric ratings
    points = {'very negative': -3, 'negative': -1, 'neutral': 0,
              'positive': 1, 'very positive': 3}

    # Argument list without a shell: file names containing spaces or shell
    # metacharacters are passed through verbatim, and the '*' in the class
    # path reaches java unexpanded (java resolves class-path wildcards itself).
    command = ['java',
               '-cp', os.path.join(corenlpDir, '*'),
               '-mx5g', 'edu.stanford.nlp.sentiment.SentimentPipeline',
               '-file', os.path.join(dataDir, fname)]

    # universal_newlines=True makes stdout text instead of bytes, so the
    # split on '\n' below also works under Python 3.
    child = subprocess.Popen(command, stdout=subprocess.PIPE,
                             universal_newlines=True)
    out, _ = child.communicate()

    # Divide the output by line, skip the first line, and keep only the lines
    # that are sentiment labels; everything else is ignored.
    values = []
    for line in out.split('\n')[1:]:
        label = line.strip().lower()
        if label in points:
            values.append(points[label])

    # numpy's mean/std of an empty list is NaN plus a RuntimeWarning; return
    # the NaN directly so empty documents do not flood the engine logs.
    nan = float('nan')
    return {'name': fname,
            'mean': mean(values) if values else nan,
            'sd': std(values) if values else nan,
            'n': len(values),
            'raw': out
           }

            
dataDir = 'data/sensitive/sentiment'
print('Listing answers...')
# Match on the extension rather than on a substring, so names that merely
# contain '.txt' (for example 'x.txt.bak') are not sent to the pipeline.
documents = [fname for fname in os.listdir(dataDir) if fname.endswith('.txt')]

print('Starting parallel sentiment analysis...')
# One ippRunSentiment call per document, spread over the load-balanced view;
# wait_interactive() blocks until all of them have finished.
results = view.map_async(ippRunSentiment, documents)
results.wait_interactive()

In [None]:
print('Analysis complete! Packaging as DataFrame...')
# `results` comes from view.map_async(ippRunSentiment, ...), so each record is
# the dict that ippRunSentiment returned for one document.
r = pd.DataFrame.from_records(results)

In [None]:
# Recover the identifying fields from the '<uni>_<Participant>_<Start>_.txt'
# file names: split once on '_' and take the first three pieces.
name_parts = r['name'].str.split('_')
for position, column in enumerate(['uni', 'Participant', 'Start']):
    r[column] = name_parts.str.get(position)
r = r[['uni', 'Participant', 'Start', 'mean', 'sd', 'n', 'raw']]

r.to_csv('data/public/sentiment_scores.tsv', sep='\t', index=False)
r.head()