In [1]:
#! /usr/bin/env python

import pandas
import re

# Get all talks in a list & then into one string
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks-v1b.csv', names=colnames)
talks = df.text.tolist()
alltalks = " ".join(str(item) for item in talks) # Solves pbm of floats in talks
all_words = re.sub(r"[^\w\d'\s]+",'',alltalks).lower() # Remove all punctuation save apostrophes

In [2]:
all_words[0:500]

"thank you so much  chris and it's truly a great honor to have the opportunity to come to this stage twice  i'm extremely grateful i have been blown away by this conference  and i want to thank all of you for the many nice comments about what i had to say the other night and i say that sincerely  partly because  mock sob  i need that put yourselves in my position i flew on air force two for eight years now i have to take off my shoes or boots to get on an airplane  i'll tell you one quick story t"

In [3]:
# The NLTK Way

import nltk

# Tokenize string
tt_tokens = nltk.word_tokenize(all_words)

# Build a dictionary of words and their frequency in the corpus
tt_freq = {}
for word in tt_tokens:
    try:
        tt_freq[word] += 1
    except: 
        tt_freq[word] = 1

In [4]:
# And now to sort into a list 
tt_freq_list = [(val, key) for key, val in tt_freq.items()]

In [5]:
# tt_freq_list.sort() 
tt_freq_list.sort(reverse=True)

tt_freq_list[0:20]

[(210294, 'the'),
 (151163, 'and'),
 (126887, 'to'),
 (116155, 'of'),
 (106547, 'a'),
 (96375, 'that'),
 (83740, 'i'),
 (78986, 'in'),
 (75643, 'it'),
 (71766, 'you'),
 (68573, 'we'),
 (65295, 'is'),
 (56535, "'s"),
 (49889, 'this'),
 (37525, 'so'),
 (33424, 'they'),
 (32231, 'was'),
 (30067, 'for'),
 (28869, 'are'),
 (28245, 'have')]

In [11]:
# The Pure Python Way

#word_list = re.split('\s+', all_words)
word_list = all_words.split()
freq_dic = {}
for word in word_list:
    try: 
        freq_dic[word] += 1
    except: 
        freq_dic[word] = 1

# Convert dictionary to list, sort it, show top 20 items
freq_list = [(val, key) for key, val in tt_freq.items()]
freq_list.sort(reverse=True)
freq_list[0:20]

[(210294, 'the'),
 (151163, 'and'),
 (126887, 'to'),
 (116155, 'of'),
 (106547, 'a'),
 (96375, 'that'),
 (83740, 'i'),
 (78986, 'in'),
 (75643, 'it'),
 (71766, 'you'),
 (68573, 'we'),
 (65295, 'is'),
 (56535, "'s"),
 (49889, 'this'),
 (37525, 'so'),
 (33424, 'they'),
 (32231, 'was'),
 (30067, 'for'),
 (28869, 'are'),
 (28245, 'have')]

In [None]:
import csv

with open('../outputs/tt_freqs.csv', 'w') as f:
        wtr = csv.writer(f)
        wtr.writerows(sorted(tt_freq_list, reverse=True))
        f.close()