In [1]:
# Connection defaults for this notebook. Ideally these are set outside the
# code: any value already present in the environment takes precedence, and
# the literals below are used only as fallbacks.
import os
os.environ.setdefault('PGHOST', "localhost")
os.environ.setdefault('PGDATABASE', "crsp")

In [2]:
import os
import re
import json
import pandas as pd

from sqlalchemy import create_engine
from pandas.io.json import json_normalize

# Build the connection URL from the host and database named in the environment.
pg_host = os.environ['PGHOST']
pg_database = os.environ['PGDATABASE']
conn_string = 'postgresql://' + pg_host + '/' + pg_database
engine = create_engine(conn_string)

# Schema containing the liwc_2015 table.
target_schema = "se_features"

# NOTE(review): the query below names the schema explicitly, so it does not
# depend on this search_path setting taking effect.
engine.execute("SET search_path TO %s, public" % target_schema)
rv = engine.execute("SELECT category FROM %s.liwc_2015" % target_schema)

# Category names in the order the query returned them (no ORDER BY is given).
categories = []
for row in rv:
    categories.append(row['category'])

# Query returning the word list for a single category. The schema name is
# interpolated here; the trailing %s is left in the statement as the
# placeholder that receives the category value at execution time.
plan = """
    SELECT word_list
    FROM %s.liwc_2015 """ % target_schema + "WHERE category = %s"

# Raw string on purpose: '\*' and '\s' are not valid string escapes, so a
# plain literal triggers an invalid-escape-sequence warning.
# Matches a '*' wildcard and, when the '*' ends the word, any whitespace
# trailing it.
wildcard = re.compile(r'\*(?:\s*$)?')

# Map each category to its words rewritten as regex fragments:
# lower-cased, with every '*' wildcard turned into '[a-z]*'.
mod_word_list = {}
for cat in categories:
    rows = list(engine.execute(plan, [cat]))
    # Only the first returned row is used for each category.
    word_list = rows[0]['word_list']
    # NOTE(review): words are otherwise kept verbatim (no re.escape), so any
    # other regex metacharacter in a word is interpreted as regex syntax.
    mod_word_list[cat] = [wildcard.sub('[a-z]*', word.lower())
                            for word in word_list]

# Compile one pattern per category up front: an alternation of the category's
# word fragments, wrapped in word-boundary anchors on both sides.
regex_list = {
    category: re.compile(r'\b(?:' + '|'.join(fragments) + r')\b')
    for category, fragments in mod_word_list.items()
}

# Show the compiled pattern for one category as a quick sanity check.
regex_list['They']

re.compile(r"\b(?:their[a-z]*|them|themself|themselves|they|they'd|they'll|they've|theyd|theyll|theyve)\b",
re.UNICODE)

In [3]:
def liwc_counts(the_text):
    """Count pattern matches for every LIWC category in a text.

    The text is normalised first: right single quotation marks (U+2019) are
    replaced by plain apostrophes and the result is lower-cased. The compiled
    pattern for each entry of the module-level ``categories`` list is then
    looked up in ``regex_list`` and applied to the normalised text.

    Returns:
        A JSON string mapping each category name to its number of matches,
        in the order given by ``categories``.
    """
    normalised = re.sub(u'\u2019', "'", the_text).lower()
    counts = {}
    for cat in categories:
        matches = re.findall(regex_list[cat], normalised)
        counts[cat] = len(matches)
    return json.dumps(counts)

def expand_json(df, col):
    """Replace a column of JSON strings with one column per decoded key.

    Args:
        df: DataFrame containing ``col``.
        col: Name of the column whose values are passed to ``json.loads``.

    Returns:
        A new DataFrame made of ``df`` without ``col``, concatenated
        column-wise with the decoded values expanded through ``pd.Series``.
    """
    remaining = df.drop([col], axis=1)
    decoded = df[col].map(json.loads)
    expanded = decoded.apply(pd.Series)
    return pd.concat([remaining, expanded], axis=1)

In [4]:
# Sample passage used in the cells below to exercise liwc_counts and the
# compiled 'They' pattern.
sample_text = """
    Okay. And then, as you look at the 132 franchise agreements signed year-to-date,
    it looks like about at least through this third quarter, slightly less 
    than half were net new agreements versus renewals or conversions. 
    The new guys coming in, what's the mix of brands they're choosing? 
    Are they -- where kind of in the scale are they kind of economy up?
            And what's the mix of brands they're coming over for?"""

In [5]:
# Count matches for every category in the sample passage; the cell displays
# the JSON string that liwc_counts returns.
liwc_counts(sample_text)

'{"Function": 40, "Pronoun": 9, "Ppron": 5, "I": 0, "We": 0, "You": 1, "SheHe": 0, "They": 4, "Ipron": 4, "Article": 5, "Prep": 18, "Auxverb": 7, "Power": 2, "Adverb": 2, "Conj": 5, "Negate": 0, "Verb": 11, "Adj": 6, "Compare": 5, "Interrog": 3, "Number": 3, "Quant": 2, "Affect": 5, "Posemo": 5, "Negemo": 0, "Anx": 0, "Anger": 0, "Sad": 0, "Social": 8, "Family": 0, "Friend": 2, "Female": 0, "Male": 1, "CogProc": 6, "Insight": 1, "Cause": 0, "Discrep": 0, "Tentat": 3, "Certain": 0, "Differ": 3, "Percept": 2, "See": 2, "Hear": 0, "Feel": 0, "Bio": 0, "Body": 0, "Health": 0, "Sexual": 0, "Ingest": 0, "Drives": 3, "Affiliation": 1, "Achieve": 0, "Reward": 0, "Risk": 0, "FocusPast": 1, "FocusPresent": 8, "FocusFuture": 3, "Relativ": 14, "Motion": 2, "Space": 7, "Time": 5, "Work": 2, "Leisure": 0, "Home": 0, "Money": 2, "Relig": 0, "Death": 0, "Informal": 1, "Swear": 0, "Netspeak": 0, "Assent": 1, "Nonflu": 0, "Filler": 0}'

In [6]:
# List the raw matches of the 'They' pattern in the sample passage.
# NOTE(review): sample_text is passed as-is here, without the lower-casing
# that liwc_counts applies before matching.
[regex_list['They'].findall(sample_text)]

[['they', 'they', 'they', 'they']]