In [1]:
# =-=-=-=-=-=-=-=-=-=-=
# Data Load & Tokenize
# =-=-=-=-=-=-=-=-=-=-= 

import pandas
import re
from nltk.tokenize import WhitespaceTokenizer

# LOAD
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks_3.csv', names=colnames)
talks = df.text.tolist()
authors = df.author.tolist()
dates = df.date.tolist()
years = [re.sub('[A-Za-z ]', '', item) for item in dates]
authordate = [author+" "+year for author, year in zip(authors, years)]

# TOKENIZE
tokenizer = WhitespaceTokenizer()
texts = []
for talk in talks:   
    raw = re.sub(r"[^\w\d'\s]+",'', talk).lower()
    tokens = tokenizer.tokenize(raw)
    texts.append(tokens)

# =-=-=-=-=-=-=-=-=-=-=
# Small Test Corpus
# =-=-=-=-=-=-=-=-=-=-= 
#
# test = texts[0:5]

In [2]:
# =-=-=-=-=-=-=-=-=-=-=
# Function to collect word positions within a text (as a word list)
# =-=-=-=-=-=-=-=-=-=-= 

def word_positions(listname):
    from collections import defaultdict
    words_with_positions = defaultdict(list)
    for position, word in enumerate(listname):
        words_with_positions[word].append(float(position)/len(listname))
    return(words_with_positions)

# word_positions(texts[0]) # check output

In [3]:
# =-=-=-=-=-=-=-=-=-=-=
# Develop "super dictionary" of all the texts involved
# =-=-=-=-=-=-=-=-=-=-= 

super_dict = {}
for text in texts:
    temp_dict = word_positions(text)
    for k, v in temp_dict.items():
        if super_dict.get(k) is None:
            super_dict[k] = []
        if v not in super_dict.get(k):
            # Possibly problematic for larger data sets
            super_dict[k] = super_dict[k] + v
            
# print(super_dict) # check output

In [1]:
super_dict["cholera"]

NameError: name 'super_dict' is not defined

In [None]:
# =-=-=-=-=-=-=-=-=-=-=
# Quick way to get occurrence and average position for a term in the dictionary
# =-=-=-=-=-=-=-=-=-=-= 

import statistics

word = "adaptable"

# the format method would work better here
print("*" + word + "* occurs " + str(len(super_dict[word])) + 
      " times in the corpus, with an average position of " + 
      str(statistics.mean(super_dict[word])) + ".")

In [None]:
words_df = pandas.DataFrame()
words_df['word'] = super_dict.keys()
words_df['positions'] = super_dict.values()

# Check -----> returns 56588
# print(len(words_df))

In [None]:
words_df.head()

In [None]:
# I need two additional columns: 
# one for mean, one for counting the number of items in a list

positions = words_df['positions'].tolist()

average_positions = []
for the_list in positions:
    average_positions.append(sum(the_list)/len(the_list))
    
occurrences = []
for the_list in positions:
    occurrences.append(len(the_list))

# Quick Check
# print(average_positions[0:5])
# print(occurrences[0:5])

words_df['average'] = average_positions
words_df['occurs'] = occurrences
words_df.head(10)

In [None]:
words_df.to_csv('../data/word_places.csv', index=False)

In [2]:
import pandas
colnames = ['word', 'positions', 'average' , 'occurs']
words = pandas.read_csv('../data/word_places.csv', names=colnames)

In [5]:
words.head()

Unnamed: 0,word,positions,average,occurs
0,word,positions,average,occurs
1,casseroles,[0.7642857142857142],0.7642857142857142,1
2,procedurally,"[0.41314425961276247, 0.42568857376602126]",0.4194164166893919,2
3,gaelic,"[0.07612635939927499, 0.08130502330398758, 0.1...",0.09822199205938202,3
4,shambles,"[0.0924962852897474, 0.1502231036192365]",0.12135969445449195,2


Okay, the first thing I need to do is to make sure I understand how to access/locate a cell within a pandas dataframe. 

The following work:

In [27]:
words.iloc[3, 1]

'[0.07612635939927499, 0.08130502330398758, 0.13723459347488348]'

In [25]:
words.at[3, 'positions']

'[0.07612635939927499, 0.08130502330398758, 0.13723459347488348]'

If you simply drop one of these into a `qcut` as in `pandas.qcut(words.iloc[3,1], 10)`, you get back: `ValueError: Bin edges must be unique`. The workaround seems to be embedding qcut into a function:

In [30]:
def pct_rank_qcut(series, n):
    import pandas as pd
    edges = pd.Series([float(i) / n for i in range(n + 1)])
    f = lambda x: (edges >= x).argmax()
    return series.rank(pct=1).apply(f)

q = pct_rank_qcut(words.iloc[3, 1], 10)
print(q)

AttributeError: 'str' object has no attribute 'rank'