In [22]:
import pandas

# Read the corpus into a dataframe, supplying the column names explicitly
colnames = [
    'Title',
    'Date',
    'Author',
    'Origin',
    'URL',
    'Text',
]
df = pandas.read_csv('./clowns.csv', names=colnames)

# Pull the Text and Date columns out as plain Python lists
texts = df['Text'].tolist()
dates = df['Date'].tolist()

df.head()

In [26]:
# =-=-=-=-=-=-=-=-=-=-=
# Clean and Tokenize
# =-=-=-=-=-=-=-=-=-=-= 

import re
from nltk.tokenize import WhitespaceTokenizer

def string_test(s):
    """Return ``s`` as a string, mapping missing values to ``''``.

    Both ``None`` and float NaN count as missing. pandas represents empty
    CSV cells as NaN, and ``str(nan)`` is the literal text ``'nan'``, which
    would otherwise be tokenized and counted as a real word.

    Parameters:
        s: any value; typically a text string or a missing-value marker.

    Returns:
        ``''`` for ``None``/NaN, otherwise ``str(s)``.
    """
    # NaN is the only float that compares unequal to itself, so `s != s`
    # detects it without needing an extra import.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    return str(s)

# Coerce every text to a string via string_test
strings = list(map(string_test, texts))

# Swap non-breaking spaces (U+00A0) for ordinary spaces
legends = [document.replace(u'\xa0', u' ') for document in strings]

# TOKENIZE: drop every character that is not a word character, apostrophe
# or whitespace, lowercase the result, then split on whitespace
tokenizer = WhitespaceTokenizer()
words = [
    tokenizer.tokenize(re.sub(r"[^\w\d'\s]+", '', document).lower())
    for document in legends
]

In [34]:
# =-=-=-=-=-=-=-=-=-=-=
# Create dictionary with word positions as list
# =-=-=-=-=-=-=-=-=-=-= 

def word_positions(listname):
    """Map each distinct item of ``listname`` to its relative positions.

    A relative position is the item's index divided by the length of
    ``listname``, so every value lies in the range [0, 1).

    Parameters:
        listname: a sequence of hashable items (e.g. a list of tokens).

    Returns:
        A ``defaultdict(list)`` keyed by item, each value being the list of
        relative positions at which that item occurs, in order of
        appearance. An empty sequence yields an empty mapping.
    """
    from collections import defaultdict

    total = len(listname)
    relative_positions = defaultdict(list)
    for index, token in enumerate(listname):
        # float() keeps this a true division regardless of Python version
        relative_positions[token].append(float(index) / total)
    return relative_positions

# Merge the per-document position maps into one corpus-wide map:
# word -> every relative position at which it occurs, across all documents.
super_dict = {}
for doc_tokens in words:
    for token, token_positions in word_positions(doc_tokens).items():
        # Every document's positions are always merged. The earlier
        # `v not in super_dict.get(k)` guard compared a whole list against
        # individual floats, so it was always true and never skipped anything.
        # extend() appends in place rather than rebuilding the accumulated
        # list with `+` on every merge.
        super_dict.setdefault(token, []).extend(token_positions)

# One row per distinct word. list() turns the dict views into real lists so
# the column assignment does not depend on how pandas treats view objects;
# keys() and values() iterate in matching order for an unmodified dict.
places = pandas.DataFrame()
places['word'] = list(super_dict.keys())
places['positions'] = list(super_dict.values())

places.head()

Unnamed: 0,word,positions
0,definitely,"[0.1000629326620516, 0.6154158215010141, 0.919..."
1,confirmation,[0.09612277867528271]
2,commissioner,"[0.2098360655737705, 0.17454545454545456, 0.47..."
3,triad,[0.03414634146341464]
4,spring,"[0.021688159437280186, 0.13043478260869565]"


In [36]:
# Pull the per-word position lists out of the dataframe
positions = places['positions'].tolist()

# Mean relative position of each word
average_positions = [sum(spots) / len(spots) for spots in positions]

# Number of recorded positions for each word
occurrences = [len(spots) for spots in positions]

# Attach both measures to the dataframe as new columns
places['average'] = average_positions
places['occurs'] = occurrences

# Check work
places.head(10)

Unnamed: 0,word,positions,average,occurs
0,definitely,"[0.1000629326620516, 0.6154158215010141, 0.919...",0.528016,8
1,confirmation,[0.09612277867528271],0.096123,1
2,commissioner,"[0.2098360655737705, 0.17454545454545456, 0.47...",0.434423,5
3,triad,[0.03414634146341464],0.034146,1
4,spring,"[0.021688159437280186, 0.13043478260869565]",0.076061,2
5,cxr1037eenglish,[0.553030303030303],0.55303,1
6,ooooh,[0.20932539682539683],0.209325,1
7,bombarded,"[0.986404833836858, 0.6469248291571754]",0.816665,2
8,tad,"[0.6147011308562197, 0.7447495961227787, 0.899...",0.752827,3
9,nbc,"[0.17672413793103448, 0.660122699386503, 0.170...",0.42813,5


In [41]:
# Words recorded more than 10 times whose mean relative position is below 0.35
first_words = places.loc[(places['average'] < 0.35) & (places['occurs'] > 10)]
first_words.head(first_words.shape[0])

Unnamed: 0,word,positions,average,occurs
248,14yearold,"[0.5222672064777328, 0.06896551724137931, 0.62...",0.259435,12
387,fleetwood,"[0.3806646525679758, 0.022222222222222223, 0.9...",0.337122,57
1006,nation,"[0.10010649627263046, 0.041353383458646614, 0....",0.346601,33
1182,surfaced,"[0.8924302788844621, 0.11605124340617935, 0.11...",0.316133,15
1346,chapter,"[0.04697624190064795, 0.08585313174946005, 0.0...",0.096376,16
1417,persons,"[0.15264423076923078, 0.4625766871165644, 0.33...",0.328956,11
1683,hundreds,"[0.01267605633802817, 0.17516339869281045, 0.1...",0.319728,17
1717,caller,"[0.42231075697211157, 0.27403846153846156, 0.3...",0.328872,11
2518,sic,"[0.8237410071942446, 0.357487922705314, 0.3742...",0.349828,25
3519,sc,"[0.003418803418803419, 0.002403846153846154, 0...",0.209849,17


In [45]:
# Words recorded more than 10 times whose mean relative position is above 0.67,
# shown with the highest averages first
last_words = places.loc[(places['average'] > 0.67) & (places['occurs'] > 10)]
last_words.sort_values(by='average', ascending=False)

Unnamed: 0,word,positions,average,occurs
1823,add,"[0.8218623481781376, 0.15768463073852296, 0.54...",0.81838,17
3119,line,"[0.6117788461538461, 0.9608138658628486, 0.861...",0.742678,14
9058,shemwood,"[0.8798076923076923, 0.8966346153846154, 0.900...",0.742272,14
5435,likes,"[0.9041666666666667, 0.5988372093023255, 0.276...",0.738828,14
6840,we've,"[0.9200710479573713, 0.9603315571343991, 0.468...",0.736274,12
5921,2017,"[0.08650519031141868, 0.8673349056603774, 0.54...",0.722595,12
7496,free,"[0.6299559471365639, 0.7160212604403948, 0.173...",0.719936,14
6706,zombie,"[0.4898785425101215, 0.7922077922077922, 0.863...",0.711455,12
8648,radford,"[0.7115384615384616, 0.8769230769230769, 0.132...",0.707317,20
8043,heightened,"[0.9143730886850153, 0.8707403055229143, 0.538...",0.695467,12
