In [1]:
#! /usr/bin/env python

# Create pandas dataframe & lists
import pandas

colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_2.csv', names=colnames)
talks = df.text.tolist()
titles = df.title.tolist()

In [2]:
import math, re


# Set up a dictionary where k = word and v = weight
concretes = open('../data/Concreteness_ratings_Brysbaert_et_al.txt')
concrete_dict = dict(map(lambda wns: (wns[0], float(wns[2])), 
                 [ ws.strip().split('\t') for ws in concretes ]))

# Word splitter pattern
pattern_split = re.compile(r"\W+")

# Function to 
def concreteness(text):
    """
    Returns a float for concreteness strength based on the input text.
    The higher the number, the more concrete.
    """
    words = pattern_split.split(text.lower())
    concretions = []
    for word in words:
        concretions.append(concrete_dict.get(word,0))
    if len(concretions) > 0:
        concreteness = sum(concretions)/math.sqrt(len(concretions))
        # Should we weight the individual word concreteness? 
        # I've seen N, sqrt(N) or 1.    
    else:
        concreteness = 0
    return concreteness

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Plotting
# =-=-=-=-=-=-=-=-=-=-= 
def sentiplot(filename, title, use_cuml=True, method='afinn'):
    fig = plt.figure()
    sent, cuml = senticuml(filename, method=method)
    if use_cuml == True:
        plt.plot(cuml,label=title)
        plt.ylabel("Cumulative Emotional Valence")
        plt.xlabel("Sentence #")
    else:
        plt.plot(sent,label=title)
        plt.ylabel("Emotional Valence")
        plt.xlabel("Sentence #")
    plt.legend()

In [3]:
talk_concreteness = []
for talk in talks:
    talk_concreteness.append(concreteness(talk))

In [4]:
talk_concreteness[0:10]

[100.39600088686036,
 123.44488037506318,
 122.96122519963039,
 126.45273286109526,
 57.0752608417299,
 101.87852455698962,
 99.71820055571283,
 98.7120237966078,
 93.82145323981567,
 108.42209698729881]

In [None]:
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 12, 8


# the histogram of the data
n, bins, patches = plt.hist(x, 50, normed=1, facecolor='green', alpha=0.75)

# add a 'best fit' line
y = mlab.normpdf( bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=1)

plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ IQ:}\ \mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)

plt.show()
