In [4]:
#! /usr/bin/env python

# Create pandas dataframe & lists
import pandas

colnames = ['old_index','citation','author','gender','title','date','length','text','occupation']
df = pandas.read_csv('../data/talks_6b.csv', names=colnames)
# talks = df.text.tolist()
# titles = df.title.tolist()

In [9]:
# Cleaning up pandas need to write out indices to a file but not drop them when importing
df1 = df.drop('old_index', axis=1)
df2 = df1.drop(df1.index[0])

In [11]:
# Checking to see if things are in good shape
df2.head()

Unnamed: 0,citation,author,gender,title,date,length,text,occupation
1,Al Gore 2006,Al Gore,male,Averting the climate crisis,Jun 2006,957,Thank you so much Chris. And it's truly a gre...,Climate advocate
2,David Pogue 2006,David Pogue,male,Simplicity sells,Jun 2006,1271,Hello voice mail my old friend. I've called f...,Technology columnist
3,Cameron Sinclair 2006,Cameron Sinclair,male,My wish: A call for open-source architecture,Jul 2006,1398,I'm going to take you on a journey very quickl...,"Co-founder, Architecture for Humanity"
4,Sergey Brin + Larry Page 2007,Sergey Brin + Larry Page,male,The genesis of Google,May 2007,1205,Sergey Brin I want to discuss a question I kn...,
5,Nathalie Miebach 2011,Nathalie Miebach,female,Art made of storms,Oct 2011,247,What you just heard are the interactions of ba...,Artist


In [13]:
#Create a new function:
def num_missing(x):
  return sum(x.isnull())

#Applying per column:
print("Missing values per column:")
print(df2.apply(num_missing, axis=0)) #axis=0 defines that function is to be applied on each column

Missing values per column:
citation       0
author         0
gender         0
title          0
date           0
length         0
text           0
occupation    79
dtype: int64


In [26]:
# Get all the unique values in the occupation column
# (which will come out as an array) and convert to a list
# Included here is the total number of values and the number of rows
# that have no values at all.
occupations = df2.occupation.unique().tolist()
print(len(occupations), sum(df2.occupation.isnull()), occupations)

1176 79 ['Climate advocate', 'Technology columnist', 'Co-founder, Architecture for Humanity', nan, 'Artist', 'Public health researcher', 'Writer', 'Research scientist', 'Privacy artist', 'Neuroeconomist', 'Inventor', 'Movement expert', 'Chemist', 'Digital preservationist', 'Brain scientist', 'Comedian', 'Medical image maker', 'Jetman', 'Developer, 12 year old', 'Biochemist, geneticist', 'Social critic', 'Astronomer', 'Storyteller', 'Artist, urban farmer', 'Pro snowboarder', 'Philosopher, entrepreneur', 'Science author', 'Science writer', 'Researcher', 'Wrongologist', 'Designer, educator', 'Organizer', 'Computer scientist', 'Spider silk scientist', 'Medical inventor', 'Jazz vibraphonist', 'Photographer', 'Surgeon', 'Environmental economist', 'Education innovator', 'Neuroscientist', 'Behavioral economist', 'Anti-torture activist', 'Physiotherapist', 'Author', 'Actor and activist', 'Diplomat', 'Data scientist', 'Computer designer, brain researcher', 'Biomedical animator', 'Philosopher', '

In [27]:
import math, re


# Set up a dictionary where k = word and v = weight
concretes = open('../metrics/Concreteness_Brysbaert_et_al.txt')
concrete_dict = dict(map(lambda wns: (wns[0], float(wns[2])), 
                 [ ws.strip().split('\t') for ws in concretes ]))

In [30]:
# Word splitter pattern
pattern_split = re.compile(r"\W+")

# Function to 
def concreteness(text):
    """
    Returns a float for concreteness strength based on the input text.
    The higher the number, the more concrete.
    """
    words = pattern_split.split(text.lower())
    concretions = []
    for word in words:
        concretions.append(concrete_dict.get(word,0))
    if len(concretions) > 0:
        concreteness = sum(concretions)/math.sqrt(len(concretions))
        # Should we weight the individual word concreteness? 
        # I've seen N, sqrt(N) or 1.    
    else:
        concreteness = 0
    return concreteness

In [31]:
talks = df2.text.tolist()
titles = df2.title.tolist()

In [41]:
concrete_score = [ int(concreteness(talk)) for talk in talks ]
# print(concrete_score)

In [52]:
# Some Baseline Statistics
import statistics

concrete_mean = statistics.mean(concrete_score)
concrete_median = statistics.median(concrete_score)
concrete_mode = statistics.mode(concrete_score)

print("For concreteness:")
print("the mean is {}".format(concrete_mean))
print("the median is {}".format(concrete_median))
print("the mode is {}".format(concrete_mode))
print("the standard deviation is {}".format(statistics.pstdev(concrete_score)))

For concreteness:
the mean is 95.09719535783366
the median is 98.0
the mode is 119
the standard deviation is 23.27374168661536


In [55]:
# Now let's look at that as a functon of gender

# We need all the talks that are by men:
df_male = df2.loc[df2['gender'] == 'male']
male_talks = df_male.text.tolist()
male_concrete = [ int(concreteness(talk)) for talk in male_talks ]
print(len(male_talks), statistics.mean(male_concrete))

1437 97.08559498956159


In [56]:
# We need all the talks that are by women:
female_df = df2.loc[df2['gender'] == 'female']
female_talks = female_df.text.tolist()
female_concrete = [ int(concreteness(talk)) for talk in female_talks ]
print(len(female_talks), statistics.mean(female_concrete))

607 90.68204283360791


In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Plotting
# =-=-=-=-=-=-=-=-=-=-= 
def plotter (filename, title, use_cuml=True, method='afinn'):
    fig = plt.figure()
    sent, cuml = senticuml(filename, method=method)
    if use_cuml == True:
        plt.plot(cuml,label=title)
        plt.ylabel("Cumulative Emotional Valence")
        plt.xlabel("Sentence #")
    else:
        plt.plot(sent,label=title)
        plt.ylabel("Emotional Valence")
        plt.xlabel("Sentence #")
    plt.legend()

In [None]:
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 12, 8


# the histogram of the data
n, bins, patches = plt.hist(x, 50, normed=1, facecolor='green', alpha=0.75)

# add a 'best fit' line
y = mlab.normpdf( bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=1)

plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ IQ:}\ \mu=100,\ \sigma=15$')
plt.axis([40, 160, 0, 0.03])
plt.grid(True)

plt.show()


In [None]:
def afinn_sentiment(filename):
    from afinn import Afinn
    afinn = Afinn()
    with open (my_file, "r") as myfile:
        text = myfile.read().replace('\n', ' ')   
        sentences = tokenize.sent_tokenize(text)
        sentiments = []
        for sentence in sentences:
            sentsent = afinn.score(sentence)
            sentiments.append(sentsent)
        return sentiments

def plot_sentiments(filename):
    fig = plt.figure()
    plt.title("Comparison of Sentiment Libraries")
    plt.plot(afinn_sentiment(filename), label = "Afinn")
    plt.plot(textblob_sentiment(filename), label = "TextBlob")
    plt.plot(indico_sentiment(filename), label = "Indico")
    plt.ylabel("Emotional Valence")
    plt.xlabel("Sentence #")
    plt.legend(loc='lower right')
    plt.annotate("Oral Legend LAU-14 Used", xy=(30, 2))