In [1]:
import pandas

# Load the CSV as a dataframe. The file has no header row, so the column names
# are supplied here. 'utf-8-sig' decodes UTF-8 and drops a leading byte-order
# mark; with the default encoding the mark ends up glued to the first Title.
colnames = ['Title' , 'Date', 'Author', 'Origin', 'URL', 'Text']
df = pandas.read_csv('./clowns.csv', names=colnames, encoding='utf-8-sig')

# Keep the text and date columns as plain Python lists for the cells below
texts = df.Text.tolist()
dates = df.Date.tolist()

df.head()

Unnamed: 0,Title,Date,Author,Origin,URL,Text
0,﻿Clown Attack on Woman Forces Cincinnati Subur...,September 30 2016,,News Report,http://insider.foxnews.com/2016/09/30/clown-at...,An Ohio school district closed schools today a...
1,Another Clown Was Spotted In The Woods And Pol...,September 6 2016,Michelle Broder Van Dyke,News Report,https://www.buzzfeed.com/mbvd/stop-clowning-ar...,The latest clown to be spotted was chased back...
2,Everyone in poor moiuntain please stay inside....,September 13 2016,Melissa Dooley,Facebook,https://www.facebook.com/melissa.dooley.397/po...,I don't know if this is real or fake. I didn't...
3,He's the hero this country deserves,October 13 2016,The LAD Bible,Facebook,https://www.facebook.com/LADbible/videos/29391...,"Batman, ""As for you clowns, if you want to sca..."
4,After-dark clown sightings trouble California ...,October 13 2014,"The Associated Press , WBIR",News Report,http://www.wbir.com/news/after-dark-clown-sigh...,"BAKERSFIELD, California (AP) — People dressed ..."


In [2]:
# =-=-=-=-=-=-=-=-=-=-=
# Clean and Tokenize
# =-=-=-=-=-=-=-=-=-=-= 

import re
from nltk.tokenize import WhitespaceTokenizer

def string_test(s):
    """Coerce a cell value to text, mapping missing values to ''.

    s: any value taken from a dataframe column (str, number, None, NaN, ...).

    Returns '' when s is None or a float NaN, otherwise str(s).
    """
    # pandas reports empty CSV cells as float NaN rather than None, and
    # str(nan) would leak the literal token 'nan' into the corpus.
    # NaN is the only float that is not equal to itself.
    if s is None or (isinstance(s, float) and s != s):
        return ''
    return str(s)

# Coerce every text to a string via string_test
strings = list(map(string_test, texts))

# Swap non-breaking spaces (U+00A0) for ordinary spaces
legends = [entry.replace(u'\xa0', u' ') for entry in strings]

# TOKENIZE
# Drop every character that is not a word character, an apostrophe or
# whitespace, lowercase what is left, then split on whitespace.
tokenizer = WhitespaceTokenizer()
unwanted_chars = re.compile(r"[^\w\d'\s]+")
words = [
    tokenizer.tokenize(unwanted_chars.sub('', legend).lower())
    for legend in legends
]

In [3]:
# =-=-=-=-=-=-=-=-=-=-=
# Create dictionary with word positions as list
# =-=-=-=-=-=-=-=-=-=-= 

def word_positions(listname):
    """Map every token to the relative positions at which it occurs.

    listname: a sequence of hashable tokens (one tokenized text).

    Returns a defaultdict(list) keyed by token. Each value lists
    index / len(listname) for every occurrence, in order of appearance,
    so a position is 0.0 for the first token and approaches 1.0 at the end.
    An empty sequence gives an empty mapping.
    """
    from collections import defaultdict

    total = len(listname)
    relative_positions = defaultdict(list)
    for index, token in enumerate(listname):
        # float() keeps this a true division under Python 2 as well
        fraction = float(index) / total
        relative_positions[token].append(fraction)
    return relative_positions

# Merge the per-text position maps into one corpus-wide map:
# word -> every relative position at which it occurs in any text.
super_dict = {}
for tokens in words:
    for token, token_positions in word_positions(tokens).items():
        # extend() appends in place. The previous `super_dict[k] + v` copied
        # the whole accumulated list on every merge, and its `v not in ...`
        # guard compared a list against floats, so it never skipped anything.
        super_dict.setdefault(token, []).extend(token_positions)

# Build the frame in one go; keys() and values() iterate in matching order,
# and list() turns the dict views into concrete columns.
places = pandas.DataFrame(
    {
        'word': list(super_dict.keys()),
        'positions': list(super_dict.values()),
    },
    columns=['word', 'positions'],
)

places.head()

Unnamed: 0,word,positions
0,necessary,"[0.9399538106235565, 0.5759656652360515, 0.404..."
1,lagrange,"[0.34642438452520513, 0.3733880422039859, 0.41..."
2,encounter,"[0.7460606060606061, 0.19148936170212766, 0.43..."
3,filmshorror,[0.6706484641638225]
4,titans,[0.22095671981776766]


In [4]:
# Pull the per-word position lists out of the dataframe, in row order
positions = places['positions'].tolist()

# Mean relative position of each word
average_positions = [sum(spots) / len(spots) for spots in positions]

# Number of times each word occurs
occurrences = [len(spots) for spots in positions]

# Attach both results to the dataframe as new columns
places['average'] = average_positions
places['occurs'] = occurrences

# Check work
places.head(10)

Unnamed: 0,word,positions,average,occurs
0,necessary,"[0.9399538106235565, 0.5759656652360515, 0.404...",0.478014,6
1,lagrange,"[0.34642438452520513, 0.3733880422039859, 0.41...",0.403176,10
2,encounter,"[0.7460606060606061, 0.19148936170212766, 0.43...",0.347966,4
3,filmshorror,[0.6706484641638225],0.670648,1
4,titans,[0.22095671981776766],0.220957,1
5,complete,"[0.22110849056603774, 0.4427710843373494]",0.33194,2
6,stunts,"[0.3518659558263519, 0.8265895953757225, 0.314...",0.497519,3
7,toes,[0.07715582450832073],0.077156,1
8,larry,[0.9193664506839453],0.919366,1
9,puppy,[0.10810810810810811],0.108108,1


In [5]:
# Words seen more than 10 times whose mean position lies in the first 35% of a text
appears_early = places.average < 0.35
appears_often = places.occurs > 10
first_words = places[appears_early & appears_often]

# Show every matching row
first_words

Unnamed: 0,word,positions,average,occurs
290,confirmed,"[0.03391107761868877, 0.0693293142426526, 0.48...",0.261894,21
304,sc,"[0.003418803418803419, 0.002403846153846154, 0...",0.209849,17
597,multiple,"[0.05466237942122187, 0.5896414342629482, 0.08...",0.32244,36
730,nationwide,"[0.14087759815242495, 0.2138364779874214, 0.04...",0.325256,17
1164,chapter,"[0.04697624190064795, 0.08585313174946005, 0.0...",0.096376,16
1446,caller,"[0.42231075697211157, 0.27403846153846156, 0.3...",0.328872,11
1808,scheduled,"[0.8595166163141994, 0.9667673716012085, 0.635...",0.340288,13
1881,persons,"[0.15264423076923078, 0.4625766871165644, 0.33...",0.328956,11
2201,dozens,"[0.25625, 0.05770690964312832, 0.2372528616024...",0.223051,13
2236,chloe,"[0.002028397565922921, 0.038539553752535496, 0...",0.236415,21


In [6]:
# Words seen more than 10 times whose mean position lies past 67% of a text
appears_late = places.average > 0.67
appears_often = places.occurs > 10
last_words = places[appears_late & appears_often]

# Latest-occurring words first
last_words.sort_values(by='average', ascending=False)

Unnamed: 0,word,positions,average,occurs
2483,add,"[0.8218623481781376, 0.15768463073852296, 0.54...",0.81838,17
1826,line,"[0.6117788461538461, 0.9608138658628486, 0.861...",0.742678,14
8102,shemwood,"[0.8798076923076923, 0.8966346153846154, 0.900...",0.742272,14
6206,likes,"[0.9041666666666667, 0.5988372093023255, 0.276...",0.738828,14
8027,we've,"[0.9200710479573713, 0.9603315571343991, 0.468...",0.736274,12
6494,2017,"[0.08650519031141868, 0.8673349056603774, 0.54...",0.722595,12
7785,free,"[0.6299559471365639, 0.7160212604403948, 0.173...",0.719936,14
4829,zombie,"[0.4898785425101215, 0.7922077922077922, 0.863...",0.711455,12
1490,radford,"[0.7115384615384616, 0.8769230769230769, 0.132...",0.707317,20
9633,heightened,"[0.9143730886850153, 0.8707403055229143, 0.538...",0.695467,12
