In [3]:
# Connection defaults for this notebook.
# Ideally these are set outside the code, so values that already exist in the
# environment are respected; the literals below are only fallbacks.
import os
os.environ.setdefault('PGHOST', "localhost")
os.environ.setdefault('PGDATABASE', "crsp")

In [4]:
import os
import re
import json
import pandas as pd

from sqlalchemy import create_engine
from pandas.io.json import json_normalize

# Build the connection URL from the PGHOST / PGDATABASE environment variables.
conn_string = 'postgresql://' + os.environ['PGHOST'] + '/' + os.environ['PGDATABASE']
engine = create_engine(conn_string)

target_schema = "se_features"

# NOTE(review): this SET is issued through the engine, so it presumably only
# affects whichever pooled connection served it. The query below is
# schema-qualified and does not depend on it.
engine.execute("SET search_path TO %s, public" % target_schema)

# Fetch every category together with its word list in a single round trip
# instead of issuing one query per category.
rv = engine.execute(
    "SELECT category, word_list FROM %s.liwc_2015" % target_schema)

# categories   : category names, in the order the rows were returned.
# mod_word_list: category -> list of regex fragments, one per dictionary word.
categories = []
mod_word_list = {}
for row in rv:
    cat = row['category']
    categories.append(cat)
    if cat in mod_word_list:
        # Only the first row seen for a category supplies its word list.
        continue
    # A LIWC wildcard "*" (optionally followed by trailing whitespace) becomes
    # "any run of lowercase letters"; words are matched in lowercase.
    mod_word_list[cat] = [re.sub(r'\*(?:\s*$)?', '[a-z]*', word.lower())
                          for word in row['word_list']]

# Include numeric values in Number category (modified by Yvonne).
# The group is non-capturing so that re.findall() returns whole matches.
mod_word_list['Number'].append(r'[0-9]+(?:\.[0-9]+)?')
# NOTE(review): combined with the leading \b added below, this alternative can
# only match when the dot directly follows a word character - confirm intent.
mod_word_list['Number'].append(r'\.[0-9]+')


# Pre-compile regular expressions: one alternation per category that starts at
# a word boundary and must be followed by end-of-text or a character other
# than a letter, digit, underscore or apostrophe.
# NOTE: Deleted hyphen for testing
regex_list = {
    cat: re.compile(
        r"\b(?:" + "|".join(words) + r")(?=(?:[^a-zA-Z0-9_']|$))")
    for cat, words in mod_word_list.items()
}


In [5]:
import os
import re
import json
import pandas as pd

from sqlalchemy import create_engine
from pandas.io.json import json_normalize

# Build the connection URL from the PGHOST / PGDATABASE environment variables.
conn_string = 'postgresql://' + os.environ['PGHOST'] + '/' + os.environ['PGDATABASE']
engine = create_engine(conn_string)

target_schema = "se_features"

# NOTE(review): this SET is issued through the engine, so it presumably only
# affects whichever pooled connection served it. The query below is
# schema-qualified and does not depend on it.
engine.execute("SET search_path TO %s, public" % target_schema)

# Fetch id, category and word list for every negative entry in a single round
# trip instead of issuing two queries per id.
rv = engine.execute(
    "SELECT id, category, word_list FROM %s.negative_liwc_2015" % target_schema)

# neg_ids          : entry ids, in the order the rows were returned.
# neg_mod_word_list: id -> list of regex fragments, one per listed word.
# neg_cat_list     : id -> category that the entry belongs to.
neg_ids = []
neg_mod_word_list = {}
neg_cat_list = {}
for row in rv:
    neg_id = row['id']
    neg_ids.append(neg_id)
    if neg_id in neg_mod_word_list:
        # Only the first row seen for an id is used.
        continue
    neg_cat_list[neg_id] = row['category']
    # A wildcard "*" (optionally followed by trailing whitespace) becomes
    # "any run of lowercase letters"; words are matched in lowercase.
    neg_mod_word_list[neg_id] = [re.sub(r'\*(?:\s*$)?', '[a-z]*', word.lower())
                                 for word in row['word_list']]

# Pre-compile regular expressions: one alternation per id that starts at a
# word boundary and must be followed by end-of-text or a character other than
# a letter, digit, underscore or apostrophe.
# NOTE: Deleted hyphen for testing
neg_regex_list = {
    neg_id: re.compile(
        r"\b(?:" + "|".join(words) + r")(?=(?:[^a-zA-Z0-9_']|$))")
    for neg_id, words in neg_mod_word_list.items()
}

In [6]:
from collections import Counter 
def pos_liwc_counter(the_text):
    """Count LIWC category matches in ``the_text``.

    The text is lower-cased and, for every name in the module-level
    ``categories`` list, the matching pre-compiled pattern from ``regex_list``
    is applied to it.

    Returns a ``Counter`` mapping category name to number of matches;
    categories with no match are present with a count of 0.
    """
    lowered = the_text.lower()
    tallies = Counter()
    for category in categories:
        hits = regex_list[category].findall(lowered)
        tallies[category] = tallies[category] + len(hits)
    return tallies

In [7]:
from collections import Counter 
def neg_liwc_counter(the_text):
    """Count matches of the negative LIWC entries in ``the_text``.

    The text is lower-cased and, for every id in the module-level ``neg_ids``
    list, the pattern ``neg_regex_list[id]`` is applied to it. Match counts
    are accumulated under the entry's category (``neg_cat_list[id]``), so
    several ids sharing a category add up.

    Returns a ``Counter`` mapping category name to number of matches;
    categories with no match are present with a count of 0.
    """
    lowered = the_text.lower()
    tallies = Counter()
    for entry_id in neg_ids:
        category = neg_cat_list[entry_id]
        hits = neg_regex_list[entry_id].findall(lowered)
        tallies[category] = tallies[category] + len(hits)
    return tallies

In [8]:
def add_liwc_counters(the_text):
    """Return the net LIWC counts for ``the_text``.

    Computes the counts from ``pos_liwc_counter`` and then subtracts, category
    by category, the counts from ``neg_liwc_counter``. Negative-side
    categories with a zero count are left untouched, so they do not introduce
    new keys into the result.

    Returns the (mutated) ``Counter`` produced by ``pos_liwc_counter``.
    """
    liwc_result = pos_liwc_counter(the_text)
    negatives = neg_liwc_counter(the_text)
    for category, count in negatives.items():
        if count >= 1:
            liwc_result[category] = liwc_result.get(category, 0) - count
    return liwc_result

In [12]:
# Test sentence for which liwc_orig and liwc_alt return different values
# for the `Bio` and `Body` categories.

bio_text = """But as we all know, particularly in the memory side, things dramatically weakened."""

In [13]:
# Counts reported for bio_text by each implementation:
#   liwc_orig:             Bio: 1, Body: 0
#   liwc_alt (see below):  Bio: 0, Body: 1
# NOTE(review): the output saved with this cell shows Bio: 0, Body: 1.

add_liwc_counters(bio_text)

Counter({'Achieve': 0,
         'Adj': 2,
         'Adverb': 1,
         'Affect': 1,
         'Affiliation': 1,
         'Anger': 0,
         'Anx': 0,
         'Article': 1,
         'Assent': 0,
         'Auxverb': 0,
         'Bio': 0,
         'Body': 1,
         'Cause': 0,
         'Certain': 2,
         'CogProc': 5,
         'Compare': 1,
         'Conj': 2,
         'Death': 0,
         'Differ': 1,
         'Discrep': 0,
         'Drives': 2,
         'Family': 0,
         'Feel': 0,
         'Female': 0,
         'Filler': 0,
         'FocusFuture': 0,
         'FocusPast': 1,
         'FocusPresent': 1,
         'Friend': 0,
         'Function': 7,
         'Health': 1,
         'Hear': 0,
         'Home': 0,
         'I': 0,
         'Informal': 0,
         'Ingest': 0,
         'Insight': 2,
         'Interrog': 0,
         'Ipron': 1,
         'Leisure': 1,
         'Male': 0,
         'Money': 0,
         'Motion': 0,
         'Negate': 0,
         'Negemo': 1,
       

In [14]:
# The Bio word list is incomplete: its last word is "pepsi",
# whereas it should be "zoloft".
regex_list["Bio"]

re.compile(r"\b(?:abdomen[a-z]*|abortion[a-z]*|abs|ache[a-z]*|aching[a-z]*|acne|addict[a-z]*|advil|aerobic[a-z]*|aids|alcohol[a-z]*|alive|allerg[a-z]*|amput[a-z]*|anal|ankle[a-z]*|anorexi[a-z]*|antacid[a-z]*|antidepressant[a-z]*|anus[a-z]*|appeti[a-z]*|arch|arm|armpit[a-z]*|arms[a-z]*|arous[a-z]*|arse|arses|arter[a-z]*|arthr[a-z]*|asexual[a-z]*|asleep|aspirin[a-z]*|ass|asses|asshole[a-z]*|asthma[a-z]*|ate|bacon[a-z]*|bake[a-z]*|baking|bald|banana|bandage[a-z]*|bandaid|bar|bars|bbq|bdsm|bean[a-z]*|beard|beef|beer[a-z]*|bellies|belly|bi|bi-sexual[a-z]*|bicep[a-z]*|binge[a-z]*|binging|biolog[a-z]*|bipolar|bj|bladder[a-z]*|bleed[a-z]*|blind[a-z]*|blood|bloody|blowjob[a-z]*|bodi[a-z]*|body[a-z]*|boil[a-z]*|bollock[a-z]*|bone|boner[a-z]*|bones|bony|boob[a-z]*|booty|booz[a-z]*|bowel[a-z]*|brain[a-z]*|bread|breakfast[a-z]*|breast[a-z]*|breath[a-z]*|bronchi[a-z]*|brownies|brunch[a-z]*|bulimi[a-z]*|burger|burp[a-z]*|butt|butter[a-z]*|buttfuck[a-z]*|butts|cafe[a-z]*|caffeine|cake[a-z]*|call-girl[

In [15]:
# The first word of the "Body" category should start with the letter "a",
# but the compiled pattern shown below begins with "perspir".
regex_list['Body']

re.compile(r"\b(?:perspir[a-z]*|perv|perver[a-z]*|pervy|pharmac[a-z]*|phobi[a-z]*|physical|physicality|physically|physicals|physician[a-z]*|pie|pies|pill|pills|pimple[a-z]*|pimply|piss[a-z]*|pizza[a-z]*|pms|podiatr[a-z]*|poison[a-z]*|poop[a-z]*|popcorn|pork|porn[a-z]*|potato[a-z]*|pregnan[a-z]*|prescri[a-z]*|prick[a-z]*|prognos[a-z]*|promiscu[a-z]*|prostat[a-z]*|prostitu[a-z]*|prozac|prude|prudish|pubic|puk[a-z]*|pulse|puss|pussies|pussy[a-z]*|queas[a-z]*|queer[a-z]*|rape[a-z]*|raping|rapist[a-z]*|rash[a-z]*|rehab[a-z]*|restau[a-z]*|retina[a-z]*|rib|ribs|rice|ritalin|rum|rx|salad[a-z]*|saliv[a-z]*|salsa|salt|salty|sandwich[a-z]*|sauce|sauces|sausage[a-z]*|scab[a-z]*|scalp|schizophren[a-z]*|scrape[a-z]*|screw[a-z]*|seduc[a-z]*|seizure[a-z]*|sensation|sensations|servings|sex|sexier|sexiest|sexily|sexiness|sexing|sexless[a-z]*|sexploit[a-z]*|sexpot[a-z]*|sext[a-z]*|sexual[a-z]*|sexy|shirt[a-z]*|shit[a-z]*|shoe[a-z]*|shoulder[a-z]*|sick|sickday[a-z]*|sicker|sickest|sickleave[a-z]*|sickly|s