In [7]:
import pandas, nltk
from nltk.collocations import BigramCollocationFinder 
from nltk.metrics import BigramAssocMeasures 
from nltk.corpus import stopwords 

In [None]:
# Load the CSV as a dataframe
colnames = ['Title' , 'Date', 'Author', 'Origin', 'URL', 'Text']
df = pandas.read_csv('./clowns_2a.csv', names=colnames)

# Create two lists (because I'm not so good on working with the dataframe)
texts = df.Text.tolist()
dates = df.Date.tolist()

def string_test(s):
    if s is None:
        return ''
    else:
        return str(s)

# With any luck, this list comprehension will work:
strings = [ string_test(text) for text in texts ]

# Eliminate carriage returns
legends = []
for string in strings:
    string = string.replace(u'\xa0', u' ')
    legends.append(string)
    

In [9]:
# Textify for NLTK
texts = []
for legend in legends:
    tokens = nltk.word_tokenize(legend)
    text = nltk.Text(tokens)
    texts.append(text)

In [24]:
all_tokens = nltk.word_tokenize(' '.join(legends))
all_texts = nltk.Text(all_tokens)

In [26]:
biagram_collocation = BigramCollocationFinder.from_words(all_texts)
biagram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 20) 

[('’', 's'),
 ('’', 't'),
 ('5', 'months'),
 ('months', 'ago'),
 ('–', ']'),
 ('[', '–'),
 ('goldreply', '['),
 ('points', '5'),
 ('.', 'The'),
 ('.', '“'),
 ('social', 'media'),
 ('permalinkembedsaveparentreportgive', 'goldreply'),
 ('a', 'clown'),
 ('South', 'Carolina'),
 ('have', 'been'),
 ('the', 'woods'),
 ('in', 'the'),
 (',', 'but'),
 ('.', 'He'),
 ('clown', 'sightings')]

In [27]:
stopset = set(stopwords.words('english')) 
filter_stops = lambda w: len(w) < 3 or w in stopset 
  
biagram_collocation.apply_word_filter(filter_stops) 
biagram_collocation.nbest(BigramAssocMeasures.likelihood_ratio, 20) 

[('months', 'ago'),
 ('social', 'media'),
 ('permalinkembedsaveparentreportgive', 'goldreply'),
 ('South', 'Carolina'),
 ('clown', 'sightings'),
 ('Fleetwood', 'Manor'),
 ('apartment', 'complex'),
 ('creepy', 'clown'),
 ('permalinkembedsavereportgive', 'goldreply'),
 ('Stephen', 'King'),
 ('lure', 'children'),
 ('law', 'enforcement'),
 ('Police', 'Department'),
 ('Penn', 'State'),
 ('County', 'Sheriff'),
 ('Greenville', 'County'),
 ('North', 'Carolina'),
 ('permalinksaveparentreportgive', 'goldreply'),
 ('New', 'York'),
 ('clown', 'mask')]

In [28]:
from nltk.collocations import TrigramCollocationFinder 
from nltk.metrics import TrigramAssocMeasures 

trigram_collocation = TrigramCollocationFinder.from_words(all_texts) 
trigram_collocation.apply_word_filter(filter_stops) 
trigram_collocation.apply_freq_filter(3) 
  
trigram_collocation.nbest(TrigramAssocMeasures.likelihood_ratio, 20)

[('months', 'ago', 'You'),
 ('months', 'ago', 'Not'),
 ('months', 'ago', 'That'),
 ('months', 'ago', 'http'),
 ('months', 'ago', 'This'),
 ('months', 'ago', 'For'),
 ('months', 'ago', 'What'),
 ('months', 'ago', 'And'),
 ('months', 'ago', 'Just'),
 ('months', 'ago', 'Yeah'),
 ('months', 'ago', 'There'),
 ('months', 'ago', 'They'),
 ('months', 'ago', 'The'),
 ('months', 'ago', 'But'),
 ('creepy', 'clown', 'sightings'),
 ('permalinkembedsaveparentreportgive', 'goldreply', 'load'),
 ('social', 'media', 'posts'),
 ('social', 'media', 'threat'),
 ('social', 'media', 'threats'),
 ('scary', 'clown', 'sightings')]

In [13]:
texts[0].concordance("clown")

Displaying 4 of 4 matches:
being attacked by a man dressed as a clown . Schools in the Cincinnati suburb o
a cigarette on her porch when the `` clown '' approached her and told her , `` 
a striped outfit , red wig and white clown mask . The incident comes after poli
chool students there . The `` creepy clown '' incidents have been popping up ar
