In [1]:
# =-=-=-=-=-=-=-=-=-=-=
# Data Load & Tokenize
# =-=-=-=-=-=-=-=-=-=-= 

import pandas
import re
from nltk.tokenize import WhitespaceTokenizer

# LOAD
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_3.csv', names=colnames)
talks = df.text.tolist()
authors = df.author.tolist()
dates = df.date.tolist()
years = [re.sub('[A-Za-z ]', '', item) for item in dates]
authordate = [author+" "+year for author, year in zip(authors, years)]

# TOKENIZE
tokenizer = WhitespaceTokenizer()
texts = []
for talk in talks:   
    raw = re.sub(r"[^\w\d'\s]+",'', talk).lower()
    tokens = tokenizer.tokenize(raw)
    texts.append(tokens)

In [2]:
# =-=-=-=-=-=-=-=-=-=-=
# Small Test Corpus
# =-=-=-=-=-=-=-=-=-=-= 

test = texts[0:5]

In [3]:
# =-=-=-=-=-=-=-=-=-=-=
# Function to collect word positions within a text (as a word list)
# =-=-=-=-=-=-=-=-=-=-= 

def word_positions(listname):
    from collections import defaultdict
    words_with_positions = defaultdict(list)
    for position, word in enumerate(listname):
        words_with_positions[word].append(float(position)/len(listname))
    return(words_with_positions)

# word_positions(texts[0]) # check output

In [4]:
# =-=-=-=-=-=-=-=-=-=-=
# Develop "super dictionary" of all the texts involved
# =-=-=-=-=-=-=-=-=-=-= 

super_dict = {}
for text in test:
    temp_dict = word_positions(text)
    for k, v in temp_dict.items():
        if super_dict.get(k) is None:
            super_dict[k] = []
        if v not in super_dict.get(k):
            # Possibly problematic for larger data sets
            super_dict[k] = super_dict[k] + v
            
# print(super_dict) # check output

In [5]:
super_dict["thank"]

[0.0,
 0.017037387600567912,
 0.9966871746332229,
 0.044716692189892805,
 0.4214395099540582,
 0.5565084226646249,
 0.7650842266462481,
 0.9191424196018376,
 0.9978560490045941,
 0.9994391475042064,
 0.373067191453472,
 0.5780151813325837,
 0.7880236154062412,
 0.8538093899353387,
 0.9994377284228282,
 0.9969465648854962]

In [6]:
import statistics

word = "global"

print("*" + word + "* occurs " + str(len(super_dict[word])) + 
      " times in the corpus, with an average position of " + 
      str(statistics.mean(super_dict[word])) + ".")

*global* occurs 7 times in the corpus, with an average position of 0.6342322621960647.


In [7]:
df = pandas.DataFrame()
df['word'] = super_dict.keys()
df['positions'] = super_dict.values()

In [8]:
df.head(10)

Unnamed: 0,word,positions
0,minds,"[0.9081874112636062, 0.004779308405960078]"
1,but,"[0.07572172266919072, 0.2513014671083767, 0.32..."
2,adaptable,[0.9150308468872687]
3,people's,"[0.023836231071228266, 0.6573191250701066]"
4,warning,[0.004900459418070444]
5,nervous,[0.888208269525268]
6,disasters,"[0.01682557487380819, 0.0973079080201907, 0.13..."
7,survival,[0.41475042063937184]
8,not,"[0.448651206814955, 0.4538570752484619, 0.5423..."
9,public,"[0.2392035894559731, 0.24425126191811553, 0.98..."


In [9]:
len(df['positions'])

2329

In [17]:
# I need two additional columns: 
# one for mean, one for counting the number of items in a list

import numpy as np

df['total'] = np.sum(df['positions'].tolist())

ValueError: Length of values does not match length of index

In [15]:
df.head()

Unnamed: 0,word,positions,total
0,minds,"[0.9081874112636062, 0.004779308405960078]","[0.9081874112636062, 0.004779308405960078]"
1,but,"[0.07572172266919072, 0.2513014671083767, 0.32...","[0.9081874112636062, 0.004779308405960078, 0.0..."
2,adaptable,[0.9150308468872687],"[0.9081874112636062, 0.004779308405960078, 0.0..."
3,people's,"[0.023836231071228266, 0.6573191250701066]","[0.9081874112636062, 0.004779308405960078, 0.0..."
4,warning,[0.004900459418070444],"[0.9081874112636062, 0.004779308405960078, 0.0..."
