In [1]:
# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from clpsych.store import Store
from clpsych.helpers import load_tokens

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import codecs

In [3]:
def read_liwc(path='./data/other_materials/liwc/LIWC2007.dic'):
    """Parse a LIWC ``.dic`` dictionary file.

    The parser expects the following layout:

    * a first line that is skipped unconditionally (the opening ``%``),
    * category lines of the form ``<index> <name>`` (whitespace separated),
    * a line consisting only of ``%`` that ends the category section,
    * word lines of the form ``<word>\\t<index>\\t<index>...``.

    Blank lines and any further ``%`` delimiter lines are ignored. Word
    lines whose category fields are not all integers are skipped.

    A trailing ``*`` on a word is removed, so the entry is stored under the
    bare stem.
    NOTE(review): in LIWC dictionaries the ``*`` presumably marks a prefix
    wildcard; after stripping, only an exact match of the stem will hit the
    entry in a plain dict lookup -- confirm this is intended.

    Parameters
    ----------
    path : str
        Location of the dictionary file.

    Returns
    -------
    (dict, dict)
        ``categories`` maps category index (int) to category name (str);
        ``words`` maps word/stem (str) to a list of category indices (int).

    Raises
    ------
    ValueError
        If a non-blank line in the category section does not consist of
        exactly an integer index followed by one name.
    """
    categories, words = {}, {}
    reading_words = False
    with codecs.open(path, 'r') as fp:
        # skip the very first '%'
        fp.readline()
        # read the data (first the categories and their indices, then the words)
        for raw_line in fp:
            line = raw_line.strip()
            if not line:
                # blank lines carry no information in either section
                continue
            if line == '%':
                # section delimiter: switch to the word list, but do not
                # treat the delimiter itself as a word
                reading_words = True
                continue

            if reading_words:
                fields = line.split('\t')
                try:
                    cats = [int(v) for v in fields[1:]]
                except ValueError:
                    # not a well-formed word entry; ignore it
                    continue
                word = fields[0]
                # clear out the asterisk if there is one
                if word.endswith('*'):
                    word = word[:-1]
                if not word:
                    continue
                words[word] = cats
            else:
                ix, cat = line.split()
                categories[int(ix)] = cat
    return categories, words

def process_text_with_liwc(text, words, subcats=None):
    """Collect the LIWC category ids of every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.
    words : dict
        Mapping from token to a list of category ids, as returned by
        ``read_liwc``. Tokens missing from the mapping contribute nothing.
    subcats : iterable of hashable, optional
        If given and non-empty, only category ids contained in it are kept.
        ``None`` or an empty iterable disables the filter.

    Returns
    -------
    list
        Category ids in token order; an id appears once for every token
        that maps to it (duplicates are kept).
    """
    # Materialise the filter once: gives O(1) membership tests and lets
    # callers pass any iterable (including ones without len()).
    allowed = frozenset(subcats) if subcats is not None else frozenset()

    topics = []
    for token in text:
        topics.extend(words.get(token, ()))

    if allowed:
        topics = [t for t in topics if t in allowed]
    return topics

In [4]:
# Load the LIWC category table and the word -> category-id lookup from the
# dictionary file (the path is read_liwc's default, spelled out here).
categories, words = read_liwc(path='./data/other_materials/liwc/LIWC2007.dic')

In [5]:
# Load the train / dev / sample class tables.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column as index, index parsed as dates where possible).
train_classes = pd.read_csv('data/classes/train_classes.txt', index_col=0, parse_dates=True)
dev_classes = pd.read_csv('data/classes/dev_classes.txt', index_col=0, parse_dates=True)
sample_classes = pd.read_csv('data/classes/sample_classes.txt', index_col=0, parse_dates=True)

In [6]:
# Load the token table from disk.
df = load_tokens('./data/tokens/lemmas.txt')
# Build one whitespace-joined string per row from title + doc.
# Missing values are blanked *before* the string cast: DataFrame.replace /
# fillna return a new object, so calling them without keeping the result
# leaves NaN in place and astype(str) would then inject the literal token
# 'nan' into the text.
df['text_features'] = (
    df[['title', 'doc']]
    .fillna('')
    .astype(str)
    .apply(' '.join, axis=1)
)

In [9]:
# Split each document on whitespace and collect the LIWC category ids of
# its tokens; the result is one list of ids per row.
df['topics'] = (
    df['text_features']
    .astype(str)
    .str.split()
    .apply(lambda tokens: process_text_with_liwc(tokens, words))
)

In [10]:
# Count vectorizer for pre-tokenised input: the tokenizer returns each
# document unchanged and lower-casing is switched off.
vect = CountVectorizer(tokenizer=lambda tokens: tokens, lowercase=False)

In [11]:
# Learn the vocabulary from the 'topics' column and build the sparse
# document-by-feature count matrix in one pass.
X = vect.fit_transform(raw_documents=df['topics'])

In [12]:
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

# Dense count table, one column per vocabulary entry. X has one row per row
# of df, so df's index is reused: a column-wise pd.concat aligns on index
# labels and would otherwise misalign (and pad with NaN) whenever df does
# not carry a default 0..n-1 index. toarray() yields a plain ndarray rather
# than the np.matrix returned by todense().
vectorized = pd.DataFrame(X.toarray(), index=df.index, columns=feature_names)

In [13]:
# Append the count columns to the right of the token frame (rows are
# matched on index labels).
df = pd.concat((df, vectorized), axis=1)

In [13]:
# Remove the raw-text and intermediate columns in a single call so only the
# remaining columns are exported. `axis` is passed by keyword because
# pandas 2.0 no longer accepts it positionally in DataFrame.drop.
df = df.drop(['title', 'doc', 'topics', 'text_features'], axis=1)

In [15]:
# Write the resulting feature table (index included) to disk as CSV.
df.to_csv(path_or_buf='data/features/liwc.csv')