In [293]:
# Ideally these should be set outside code.
# setdefault() only supplies the fallbacks below when the variable is not
# already present, so a PGHOST / PGDATABASE configured outside the notebook
# is no longer overwritten.
import os
os.environ.setdefault('PGHOST', "localhost")
os.environ.setdefault('PGDATABASE', "crsp")

In [294]:
import os
import re
import json
import pandas as pd

from sqlalchemy import create_engine
from pandas.io.json import json_normalize

# Connection details come from the PGHOST / PGDATABASE environment variables;
# a KeyError here means they have not been set.
conn_string = 'postgresql://' + os.environ['PGHOST'] + '/' + os.environ['PGDATABASE']
engine = create_engine(conn_string)

target_schema = "se_features"

engine.execute("SET search_path TO %s, public" % target_schema)

# Fetch every category together with its word list in a single round trip
# (previously one additional query was issued per category).
rv = engine.execute(
    "SELECT category, word_list FROM %s.liwc_2015" % target_schema)
liwc_rows = rv.fetchall()

categories = [r['category'] for r in liwc_rows]

# A '*' (optionally followed by trailing whitespace) in a dictionary entry is
# a wildcard: it is expanded to "any run of letters/digits, optionally
# containing one apostrophe".  Raw string: '\*' and '\s' are not valid Python
# string escapes.  Compiled once instead of once per word.
wildcard = re.compile(r'\*(?:\s*$)?')
wildcard_expansion = "[a-z0-9]*(')?[a-z0-9]*"

mod_word_list = {}
for row in liwc_rows:
    cat = row['category']
    # Keep the first row seen for a category, matching the old rows[0] lookup.
    if cat not in mod_word_list:
        mod_word_list[cat] = [wildcard.sub(wildcard_expansion, word.lower())
                              for word in row['word_list']]

# Include numeric values in Number category (modified by Yvonne)
mod_word_list['Number'].append(r'[0-9]+(\.[0-9]+)?')
mod_word_list['Number'].append(r'\.[0-9]+')

# Include a word that is not shown in dict but used by LIWC
mod_word_list['Affect'].append('(would) like')
mod_word_list['Posemo'].append('(would) like')


# Pre-compile regular expressions: one alternation per category, anchored at a
# word boundary and required to be followed by a character that is not a
# letter, digit, underscore or apostrophe (or by end of text).
# NOTE: Deleted hyphen for testing
regex_list = {
    key: re.compile(r"\b(?:" + "|".join(words) + r")(?=(?:[^a-zA-Z0-9_']|$))")
    for key, words in mod_word_list.items()
}

In [295]:
# Fetch id, category and word list of every negative-LIWC entry in a single
# round trip (previously two additional queries were issued per id).
rv = engine.execute(
    "SELECT id, category, word_list FROM %s.negative_liwc_2015" % target_schema)
neg_rows = rv.fetchall()

neg_ids = [r['id'] for r in neg_rows]

# A '*' (optionally followed by trailing whitespace) is a wildcard and is
# expanded to "any run of lower-case letters".  Raw string: '\*' and '\s' are
# not valid Python string escapes.  Compiled once instead of once per word.
neg_wildcard = re.compile(r'\*(?:\s*$)?')

neg_mod_word_list = {}
neg_cat_list = {}
for row in neg_rows:
    neg_id = row['id']
    # Keep the first row seen for an id, matching the old rows[0] lookup.
    if neg_id not in neg_mod_word_list:
        neg_mod_word_list[neg_id] = [neg_wildcard.sub('[a-z]*', word.lower())
                                     for word in row['word_list']]
        neg_cat_list[neg_id] = row['category']

# Pre-compile regular expressions: one alternation per id, anchored at a word
# boundary and required to be followed by a character that is not a letter,
# digit, underscore or apostrophe (or by end of text).
# NOTE: Deleted hyphen for testing
neg_regex_list = {
    key: re.compile(r"\b(?:" + "|".join(words) + r")(?=(?:[^a-zA-Z0-9_']|$))")
    for key, words in neg_mod_word_list.items()
}

In [296]:
from collections import Counter 
def pos_liwc_counter(the_text):
    """Count pattern matches per category in ``the_text``.

    The text is first normalised by replacing U+2019 (right single quotation
    mark) with an ASCII apostrophe and lower-casing it.  For every entry of
    the module-level ``categories`` list, the number of non-overlapping
    matches of ``regex_list[category]`` is added to the result.

    Returns a ``collections.Counter`` keyed by category; categories with no
    match are present with a count of 0.
    """
    normalised = re.sub(u'\u2019', "'", the_text).lower()
    tallies = Counter()
    for cat in categories:
        hits = sum(1 for _ in re.finditer(regex_list[cat], normalised))
        tallies[cat] += hits
    return tallies

def neg_liwc_counter(the_text):
    """Count negative-pattern matches in ``the_text``, grouped by category.

    The text is first normalised by replacing U+2019 (right single quotation
    mark) with an ASCII apostrophe and lower-casing it.  For every id in the
    module-level ``neg_ids`` list, the number of non-overlapping matches of
    ``neg_regex_list[id]`` is added to the count of the category that
    ``neg_cat_list`` maps the id to, so several ids can feed one category.

    Returns a ``collections.Counter`` keyed by category.
    """
    normalised = re.sub(u'\u2019', "'", the_text).lower()
    tallies = Counter()
    for neg_id in neg_ids:
        hits = sum(1 for _ in re.finditer(neg_regex_list[neg_id], normalised))
        tallies[neg_cat_list[neg_id]] += hits
    return tallies

def add_liwc_counters(the_text):
    """Return the positive counts for ``the_text`` minus the negative counts.

    Calls ``pos_liwc_counter`` and ``neg_liwc_counter`` on the same text and
    subtracts every strictly positive negative-count from the matching
    category of the positive counter (creating the category if needed).
    Categories whose negative count is zero are left untouched.

    Returns the (mutated) positive ``Counter``; values may be negative.
    """
    totals = pos_liwc_counter(the_text)
    for category, hits in neg_liwc_counter(the_text).items():
        # Only positive counts are subtracted, as with Counter.elements().
        if hits > 0:
            totals[category] -= hits
    return totals

In [297]:
# Draw 50 rows with context = 'pres' in random order.  ORDER BY RANDOM() is
# unseeded here, so a different sample is returned on each execution.
sample_query = """
    SELECT file_name, last_update, section, context, speaker_number
    FROM streetevents.speaker_data 
    WHERE context = 'pres'
    ORDER BY RANDOM()
    LIMIT 50"""
sample_calls = pd.read_sql(sample_query, con=engine)

sample_calls

Unnamed: 0,file_name,last_update,section,context,speaker_number
0,6101200_T,2016-09-16 06:32:16+10:00,1,pres,7
1,2128458_T,2009-03-20 06:16:11+11:00,1,pres,1
2,8003288_T,2017-10-06 05:44:35+11:00,1,pres,5
3,5308017_T,2014-03-12 05:18:47+11:00,1,pres,4
4,6806605_T,2016-12-02 07:42:40+11:00,1,pres,5
5,11503963_T,2018-06-09 08:07:38+10:00,1,pres,1
6,1369280_T,2006-08-15 02:19:08+10:00,1,pres,1
7,12710989_T,2019-09-06 21:23:55+10:00,1,pres,2
8,2460927_T,2009-09-26 01:50:36+10:00,1,pres,94
9,10686463_T,2017-08-13 07:32:29+10:00,1,pres,4


In [299]:
# Write the speaker text of every sampled row to sample_50/<file_name>.txt.
# Create the output directory up front so open() cannot fail on a fresh run.
os.makedirs("sample_50", exist_ok=True)

# Values are passed as bound parameters rather than interpolated into the SQL
# text; %s is the same positional placeholder style used with engine.execute
# elsewhere in this notebook.
speaker_text_sql = """
    SELECT speaker_text
    FROM streetevents.speaker_data
    WHERE file_name = %s AND section = %s AND context = 'pres' AND speaker_number = %s"""

# Iterate over the rows actually returned instead of a hard-coded range(0, 50),
# which raised KeyError whenever the sample held fewer than 50 rows.
for call in sample_calls.itertuples(index=False):
    file_name = call.file_name
    # Plain ints: database drivers may reject numpy integer scalars as parameters.
    section = int(call.section)
    speaker_number = int(call.speaker_number)

    call_text = pd.read_sql(speaker_text_sql, con = engine,
                            params = [file_name, section, speaker_number])
    output = "sample_50/" + file_name + ".txt"
    # Only the first returned row's text is written; the with-block closes the file.
    with open(output, 'w') as f:
        f.write(call_text.iloc[0,0])

In [304]:
# Load the LIWC2015 results spreadsheet; its first column becomes the index.
liwc_orig = pd.read_excel("sample_50/liwc2015_results_sample_50.xlsx", index_col = 0)

# Every column other than "Filename" / "WC" is rescaled to value * WC / 100
# (presumably converting a percentage of the word count into a word count --
# confirm against the LIWC output format).  The added 0.5 makes the integer
# cast below round to nearest instead of truncating for non-negative values.
scaled_columns = [name for name in liwc_orig if name not in ("Filename", "WC")]
for column in scaled_columns:
    liwc_orig[column] = liwc_orig[column] * liwc_orig["WC"] / 100 + 0.5

# errors='ignore' leaves the data unchanged where the cast to int fails.
liwc_orig = liwc_orig.astype(int, errors='ignore')

# Keep the word counts aside, then remove them from the comparison frame.
liwc_wc = liwc_orig.loc[:, 'WC']
liwc_orig = liwc_orig.drop(columns="WC")

In [305]:
import glob

# Run the regex-based counter over every exported text file and collect one
# single-row frame per file (index = file name, columns = categories).
call_files = glob.glob("sample_50/*.txt")
liwc_alt_results_list = []

for call_file in call_files:
    # basename() works with whichever path separator glob returns, whereas
    # partition("/") produced an empty name for backslash-separated paths.
    call_file_name = os.path.basename(call_file)

    # Context manager so the file handle is closed deterministically.
    with open(call_file, "r") as call_fh:
        text = call_fh.read()
    liwc_alt_counter = add_liwc_counters(text)

    liwc_alt_results = pd.DataFrame(liwc_alt_counter, index=[call_file_name,])
    liwc_alt_results_list.append(liwc_alt_results)

# Raises ValueError if no .txt file was found (nothing to concatenate).
liwc_alt = pd.concat(liwc_alt_results_list)

liwc_alt.index.name = 'Filename'
# Lower-case the category names; a concrete list rather than a one-shot map iterator.
liwc_alt.columns = [name.lower() for name in liwc_alt.columns]

In [306]:
# Order both frames by 'Filename' and put their columns in sorted order so the
# two tables line up when displayed and compared.
liwc_orig = liwc_orig.sort_values(by=['Filename'])
liwc_orig = liwc_orig.sort_index(axis=1)
liwc_alt = liwc_alt.sort_values(by=['Filename'])
liwc_alt = liwc_alt.sort_index(axis=1)

In [312]:
# Element-wise difference (LIWC2015 reference minus regex-based counts),
# aligned on row and column labels; saved to CSV and displayed.
liwc_compare = liwc_orig.sub(liwc_alt)
liwc_compare.to_csv("sample_50/liwc_compare.csv")
liwc_compare

Unnamed: 0_level_0,achieve,adj,adverb,affect,affiliation,anger,anx,article,assent,auxverb,...,social,space,swear,tentat,they,time,verb,we,work,you
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10336618_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1055701_T.txt,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
10686463_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1124961_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11503963_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12035995_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1212940_T.txt,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
12258064_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12296890_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12473365_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [313]:
# Show only the files (rows) where at least one category differs from zero.
liwc_compare[(liwc_compare != 0).any(axis=1)]

Unnamed: 0_level_0,achieve,adj,adverb,affect,affiliation,anger,anx,article,assent,auxverb,...,social,space,swear,tentat,they,time,verb,we,work,you
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1055701_T.txt,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1212940_T.txt,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
12548168_T.txt,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
12710989_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
1271602_T.txt,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
12835683_T.txt,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
1301567_T.txt,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
2963562_T.txt,0,0,0,0,0,0,0,0,0,0,...,-2,0,0,0,0,0,0,0,0,0
3230228_T.txt,0,0,0,0,0,0,0,0,0,0,...,-1,0,0,0,0,0,0,0,0,0
4210986_T.txt,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [314]:
# Mask the zero cells, then stack into a (Filename, category) -> difference
# Series; stack() drops the masked (NaN) cells.
liwc_compare_non_zero = liwc_compare.where(liwc_compare != 0).stack()
liwc_compare_non_zero

Filename                
1055701_T.txt   drives     -1.0
                male       -1.0
                power      -1.0
                social     -1.0
1212940_T.txt   drives     -1.0
                male       -1.0
                power      -1.0
                social     -1.0
12548168_T.txt  drives     -1.0
                male       -1.0
                power      -1.0
                social     -1.0
12710989_T.txt  function   -1.0
                informal   -1.0
                netspeak   -1.0
                ppron      -1.0
                pronoun    -1.0
                you        -1.0
1271602_T.txt   drives     -1.0
                male       -1.0
                power      -1.0
                social     -1.0
12835683_T.txt  drives     -1.0
                male       -1.0
                power      -1.0
                social     -1.0
1301567_T.txt   function   -1.0
                informal   -1.0
                netspeak   -1.0
                ppron      -1.0
               

In [315]:
# Put the word count in front of the difference columns.  insert() raises if
# a "WC" column already exists, so this cell cannot be executed twice against
# the same liwc_compare frame.
liwc_compare.insert(0, column = "WC", value = liwc_wc)

# Express every difference relative to the file's word count.  The result is
# a fraction of WC (it is not multiplied by 100).
for col in liwc_compare:
    if col not in ("Filename", "WC"):
        liwc_compare[col] = liwc_compare[col] / liwc_compare["WC"]

liwc_compare.to_csv("sample_50/liwc_compare_percent.csv")
liwc_compare

Unnamed: 0_level_0,WC,achieve,adj,adverb,affect,affiliation,anger,anx,article,assent,...,social,space,swear,tentat,they,time,verb,we,work,you
Filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10336618_T.txt,389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1055701_T.txt,51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.019608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10686463_T.txt,684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1124961_T.txt,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11503963_T.txt,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12035995_T.txt,220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1212940_T.txt,56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.017857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12258064_T.txt,64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12296890_T.txt,2237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12473365_T.txt,1432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
