In [1]:
# Connection defaults for local development.  Ideally these are set outside
# code, so values already present in the environment take precedence and the
# defaults below are only used as a fallback.
import os
os.environ.setdefault('PGHOST', "localhost")
os.environ.setdefault('PGDATABASE', "crsp")

In [2]:
import os
import re
import json
import pandas as pd

from sqlalchemy import create_engine
from pandas.io.json import json_normalize

# Build the connection URL from the PGHOST / PGDATABASE environment variables.
conn_string = 'postgresql://{host}/{database}'.format(
    host=os.environ['PGHOST'],
    database=os.environ['PGDATABASE'],
)
engine = create_engine(conn_string)

# Schema that holds the LIWC dictionary tables.
target_schema = "se_features"

engine.execute("SET search_path TO {}, public".format(target_schema))
rv = engine.execute("SELECT category FROM {}.liwc_2015".format(target_schema))

# One entry per row returned from liwc_2015, in the order the database returns them.
categories = [row['category'] for row in rv]

# Query that fetches the word list of a single category.  The schema name is
# interpolated here; the trailing %s is left as a placeholder that is bound to
# the category name when the statement is executed.
plan = """
    SELECT word_list
    FROM %s.liwc_2015 """ % target_schema + "WHERE category = %s"

# Map each category to a list of regex fragments, one per dictionary word.
mod_word_list = {}
for cat in categories:
    rows = list(engine.execute(plan, [cat]))
    word_list = rows[0]['word_list']
    # Every '*' wildcard (together with trailing whitespace up to the end of
    # the word, if any) becomes "any run of letters/digits, optionally
    # containing one apostrophe".  Raw strings keep the regex escapes intact
    # without relying on Python's handling of unknown string escapes.
    mod_word_list[cat] = [re.sub(r'\*(?:\s*$)?', "[a-z0-9]*(')?[a-z0-9]*", word.lower())
                            for word in word_list]

# Include numeric values in Number category (modified by Yvonne)
mod_word_list['Number'].append(r'[0-9]+(\.[0-9]+)?')
# NOTE: the compiled pattern is anchored with a leading \b, so this
# alternative can only start right after a word character.
mod_word_list['Number'].append(r'\.[0-9]+')

# Include a word that is not shown in dict but used by LIWC
mod_word_list['Affect'].append('(would) like')
mod_word_list['Posemo'].append('(would) like')


# Pre-compile regular expressions: one alternation per category, anchored at a
# word boundary and required to be followed by a character that is not a
# letter, digit, underscore or apostrophe (or by the end of the text).
# NOTE: Deleted hyphen for testing
regex_list = {}
for key, fragments in mod_word_list.items():
    regex = "".join([
        r"\b(?:",
        "|".join(fragments),
        r")(?=(?:[^a-zA-Z0-9_']|$))",
    ])
    regex_list[key] = re.compile(regex)

In [3]:
# Ids of the entries in the negative (exclusion) dictionary.
rv = engine.execute("SELECT id FROM %s.negative_liwc_2015" % target_schema)

neg_ids = [r['id'] for r in rv]

# Query that fetches the word list of a single negative entry; the trailing
# %s placeholder is bound to the entry id at execution time.
plan = """
    SELECT word_list
    FROM %s.negative_liwc_2015 """ % target_schema + "WHERE id = %s"

# Map each negative-entry id to a list of regex fragments.
neg_mod_word_list = {}
for neg_id in neg_ids:
    rows = list(engine.execute(plan, [neg_id]))
    word_list = rows[0]['word_list']
    # Each '*' wildcard becomes "any run of lowercase letters".  A raw string
    # keeps the regex escapes intact without relying on Python's handling of
    # unknown string escapes.
    # NOTE(review): the main dictionary expands '*' to letters, digits and an
    # optional apostrophe, whereas this expands to letters only -- confirm the
    # difference is intentional.
    neg_mod_word_list[neg_id] = [re.sub(r'\*(?:\s*$)?', '[a-z]*', word.lower())
                            for word in word_list]
    

# Query that fetches the category of a single negative entry; the %s
# placeholder is bound to the entry id at execution time.
plan = """
    SELECT category
    FROM {schema}.negative_liwc_2015 WHERE id = %s""".format(schema=target_schema)

# Pre-compile regular expressions.
neg_cat_list = {}      # negative-entry id -> category the entry belongs to
neg_regex_list = {}    # negative-entry id -> compiled pattern for its words

for key, fragments in neg_mod_word_list.items():
    rows = list(engine.execute(plan, [key]))
    neg_cat = rows[0]['category']
    neg_cat_list[key] = neg_cat
    # NOTE: Deleted hyphen for testing
    neg_regex = "".join([
        r"\b(?:",
        "|".join(fragments),
        r")(?=(?:[^a-zA-Z0-9_']|$))",
    ])
    neg_regex_list[key] = re.compile(neg_regex)

In [4]:
from collections import Counter 
def pos_liwc_counter(the_text):
    """Count regex matches in ``the_text`` for every category.

    The text is normalised first: U+2019 (right single quotation mark) is
    replaced by an ASCII apostrophe and the result is lower-cased.

    Returns a ``Counter`` keyed by category.  Every name in the module-level
    ``categories`` list gets an entry, even when its count is zero.
    """
    normalized = re.sub(u'\u2019', "'", the_text).lower()
    tallies = Counter()
    for category in categories:
        matches = regex_list[category].findall(normalized)
        tallies[category] += len(matches)
    return tallies

def neg_liwc_counter(the_text):
    """Count matches of the negative-dictionary patterns in ``the_text``.

    The text is normalised the same way as in ``pos_liwc_counter``: U+2019 is
    replaced by an ASCII apostrophe and the result is lower-cased.

    Returns a ``Counter`` keyed by category.  Matches of all negative entries
    that map to the same category (via ``neg_cat_list``) are summed together.
    """
    normalized = re.sub(u'\u2019', "'", the_text).lower()
    tallies = Counter()
    for entry_id in neg_ids:
        category = neg_cat_list[entry_id]
        matches = neg_regex_list[entry_id].findall(normalized)
        tallies[category] += len(matches)
    return tallies

def add_liwc_counters(the_text):
    """Return the per-category counts for ``the_text`` net of negative matches.

    The counts from ``neg_liwc_counter`` are subtracted from those of
    ``pos_liwc_counter``.  Only negative counts greater than zero are applied,
    so a category that appears solely in the negative result with a zero count
    is not added to the returned ``Counter``.
    """
    net_counts = pos_liwc_counter(the_text)
    for category, hits in neg_liwc_counter(the_text).items():
        if hits > 0:
            net_counts[category] -= hits
    return net_counts

In [16]:
# Draw 50 rows at random from the speaker data with context 'pres'.
# ORDER BY RANDOM() is evaluated by the database, so each run of this cell
# returns a different sample.
sample_query = """
    SELECT file_name, last_update, section, context, speaker_number
    FROM streetevents.speaker_data 
    WHERE context = 'pres'
    ORDER BY RANDOM()
    LIMIT 50"""
sample_calls = pd.read_sql(sample_query, con=engine)

sample_calls

Unnamed: 0,file_name,last_update,section,context,speaker_number
0,5744038_T,2015-09-01 23:13:04+10:00,2,pres,4
1,12284508_T,2019-02-19 19:37:00+11:00,1,pres,5
2,2768591_T,2010-03-02 08:42:27+11:00,1,pres,75
3,13073558_T,2020-04-21 09:05:27+10:00,1,pres,22
4,2499104_T,2009-10-28 08:08:31+11:00,2,pres,16
5,12846991_T,2019-11-14 02:18:30+11:00,1,pres,6
6,3742342_T,2011-02-11 03:54:43+11:00,1,pres,54
7,1044079_T,2005-04-01 07:59:21+10:00,1,pres,1
8,12852438_T,2019-11-06 03:47:28+11:00,1,pres,1
9,5944500_T,2016-03-16 16:02:30+11:00,1,pres,20


In [18]:
# Write the speaker text of every sampled row to sample_50/<file_name>.txt.
output_dir = "sample_50"
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# Values are passed as bound parameters (psycopg2 "pyformat" style) rather
# than being formatted into the SQL string.
speaker_text_query = """
    SELECT speaker_text
    FROM streetevents.speaker_data
    WHERE file_name = %(file_name)s
      AND section = %(section)s
      AND context = 'pres'
      AND speaker_number = %(speaker_number)s"""

# Iterate over the rows actually sampled instead of assuming there are 50.
for file_name, section, speaker_number in zip(sample_calls['file_name'],
                                              sample_calls['section'],
                                              sample_calls['speaker_number']):
    # Use the sampled row's own section: the sample is not restricted to
    # section 1, so a hard-coded section could fetch a different speaker turn
    # or nothing at all.  int() converts numpy integers to plain Python ints
    # for the database driver.
    call_text = pd.read_sql(
        speaker_text_query,
        con=engine,
        params={
            'file_name': file_name,
            'section': int(section),
            'speaker_number': int(speaker_number),
        })
    if call_text.empty:
        print("No speaker_text for %s (section %s, speaker %s); skipped"
              % (file_name, section, speaker_number))
        continue
    # NOTE(review): the lookup ignores last_update and the output name uses
    # only file_name, so two sampled rows from the same call overwrite each
    # other -- confirm whether that is acceptable.
    output = output_dir + "/" + file_name + ".txt"
    # The with-block closes the file; no explicit close() is needed.
    with open(output, 'w') as f:
        f.write(call_text.iloc[0, 0])