In [17]:
#! /usr/bin/env python

# Create pandas dataframe & lists
import pandas

# Column labels assigned to the CSV on load (supplied via `names=`).
colnames = ['author', 'title', 'date', 'length', 'text']
df = pandas.read_csv('../data/talks_2.csv', names=colnames)

# Plain Python lists of the talk bodies and their titles, in row order.
talks = df['text'].tolist()
titles = df['title'].tolist()

In [16]:
import math, re


# Set up a dictionary where k = word and v = weight.
# Each line is stripped and split on tabs; field 0 becomes the key and
# field 2, parsed as a float, becomes the value.
# The file is read inside a `with` block so the handle is closed as soon as
# the dictionary has been built.
with open('../data/Concreteness_ratings_Brysbaert_et_al.txt') as concretes:
    concrete_dict = {
        fields[0]: float(fields[2])
        for fields in (line.strip().split('\t') for line in concretes)
    }

# Word splitter pattern
pattern_split = re.compile(r"\W+")

# Function to score how concrete a text is
def concreteness(text):
    """
    Returns a float for concreteness strength based on the input text.
    The higher the number, the more concrete.

    The text is lower-cased and split on runs of non-word characters. Each
    resulting token is looked up in the module-level ``concrete_dict``
    (tokens that are not in it contribute 0), and the summed ratings are
    divided by the square root of the number of tokens.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    float
        ``sum(ratings) / sqrt(number of tokens)``, or ``0.0`` when the text
        contains no word tokens at all.
    """
    # re.split yields '' at the edges whenever the text starts or ends with a
    # non-word character (e.g. a closing full stop). Drop those so that
    # punctuation does not inflate the token count used for normalisation.
    words = [word for word in pattern_split.split(text.lower()) if word]
    if not words:
        return 0.0
    total = sum(concrete_dict.get(word, 0) for word in words)
    # Should we weight the individual word concreteness?
    # I've seen N, sqrt(N) or 1.
    return total / math.sqrt(len(words))

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Plotting
# =-=-=-=-=-=-=-=-=-=-= 
def sentiplot(filename, title, use_cuml=True, method='afinn'):
    """
    Plot one of the two series returned by ``senticuml`` for a file.

    Opens a new matplotlib figure, calls
    ``senticuml(filename, method=method)`` and draws either the second
    (cumulative) or the first (per-sentence) series it returns, labelled with
    ``title`` in the legend.

    Parameters
    ----------
    filename :
        Passed through unchanged to ``senticuml``.
    title : str
        Legend label for the plotted line.
    use_cuml : bool, optional
        If truthy (the default), plot the cumulative series; otherwise plot
        the per-sentence series.
    method : str, optional
        Passed through to ``senticuml`` as ``method`` (default ``'afinn'``).
    """
    # Start a fresh figure so each call draws on its own canvas.
    plt.figure()
    sent, cuml = senticuml(filename, method=method)
    # Only the plotted series and the y-axis label differ between the modes.
    if use_cuml:
        series, ylabel = cuml, "Cumulative Emotional Valence"
    else:
        series, ylabel = sent, "Emotional Valence"
    plt.plot(series, label=title)
    plt.ylabel(ylabel)
    plt.xlabel("Sentence #")
    plt.legend()

In [None]:
# Old Plotting Stuff
import matplotlib.pyplot as plt
# Default (width, height) applied to every figure created after this cell.
plt.rcParams['figure.figsize'] = (12, 8)

In [None]:
# =-=-=-=-=-=
# Clean and Tokenize, then Drop Stopwords
# =-=-=-=-=-=

# Load tokenizer, stopwords, and stemmer
tokenizer = WhitespaceTokenizer()
# Read the stopword file inside a `with` block so the handle is closed, and
# split with a raw-string pattern ('\s' is an invalid escape sequence in an
# ordinary string literal).
with open('../data/tt_stop.txt', 'r') as stop_file:
    stopwords = re.split(r'\s+', stop_file.read().lower())
# Set copy used only for fast membership tests; `stopwords` itself stays a
# list for any later cell that relies on it.
stopword_set = set(stopwords)
p_stemmer = PorterStemmer()

# Loop to tokenize, stop, and stem (if needed) texts.
texts = []
for talk in talks:
    # clean and tokenize document string: drop every character that is not a
    # word character, an apostrophe or whitespace, then lower-case
    raw = re.sub(r"[^\w\d'\s]+", '', talk).lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in stopword_set]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
    # add tokens to list
    texts.append(stemmed_tokens)

# =-=-=-=-=-=-=-=-=-=-=
# Re-Assemble Texts as Strings from Lists of Words
# =-=-=-=-=-=-=-=-=-=-= 

# One space-joined string per tokenized text, in the same order as `texts`.
strungs = [' '.join(word_list) for word_list in texts]