In [1]:
#! /usr/bin/env python

import pandas
import re
import csv

# Load the talks and flatten every transcript into one normalized string.
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks-v1b.csv', names=colnames)
# `talks` keeps one entry per row, including non-string entries, so that
# positions in this list line up with row positions in `df`.
talks = df.text.tolist()
# Some entries are not strings (the original note mentions floats, presumably
# NaN for missing transcripts). Passing them through str() would inject
# spurious words such as "nan" into the corpus and skew the frequency counts,
# so only genuine strings are joined.
alltalks = " ".join(item for item in talks if isinstance(item, str))
# Remove all punctuation save apostrophes, then lowercase.
all_words = re.sub(r"[^\w\d'\s]+",'',alltalks).lower()


# Split the normalized corpus into tokens wherever whitespace occurs (NLTK).
from nltk.tokenize import WhitespaceTokenizer
_ws_tokenizer = WhitespaceTokenizer()
tt_tokens = _ws_tokenizer.tokenize(all_words)

# Build a dictionary mapping each word to its frequency in the corpus.
tt_freq = {}
for word in tt_tokens:
    # dict.get supplies 0 for a word not seen yet. This replaces a bare
    # `except:` that would also have swallowed unrelated errors
    # (e.g. KeyboardInterrupt) and silently reset the count to 1.
    tt_freq[word] = tt_freq.get(word, 0) + 1

# Turn each (word, count) entry into a (count, word) tuple and order the
# tuples descending, so the most frequent words come first.
tt_freq_list = sorted(
    ((count, word) for word, count in tt_freq.items()),
    reverse=True,
)

# Write the (count, word) rows to CSV, most frequent first.
# newline='' is what the csv module requires of the file object; without it,
# extra blank lines can appear between rows on platforms that translate "\n".
# The encoding is explicit so that non-ASCII words do not depend on the
# platform's default locale encoding.
with open('../outputs/tt_freq.csv', 'w', newline='', encoding='utf-8') as f:
    wtr = csv.writer(f)
    # tt_freq_list has already been sorted in descending order earlier in this
    # cell, so it is written as-is. The `with` block closes the file on exit.
    wtr.writerows(tt_freq_list)

In [2]:
# Discovering the Bad Texts

# A talk is treated as unusable when its text is not a string. Collect the
# position (within `talks`) of every such entry.
no_good = [
    position
    for position, talk in enumerate(talks)
    if not isinstance(talk, str)
]

print(no_good)

[185, 398, 513, 877, 1015, 1100, 2011]


In [3]:
# Display the rows of `df` at the positions collected in `no_good`.
df.iloc[no_good]

Unnamed: 0,author,title,date,length,text
185,Quixotic Fusion,Dancing with light,Jun 2012,718,
398,Bruno Maisonnier,"Dance, tiny robots!",Feb 2013,74,
513,Kenichi Ebina,My magic moves,Oct 2007,204,
877,Aakash Odedra,"A dance in a hurricane of paper, wind and light",Dec 2014,573,
1015,Joey Alexander,An 11-year-old prodigy performs old-school jazz,Jun 2015,372,
1100,Kaki King,A musical escape into a world of light and color,Nov 2015,671,
2011,Robert Gupta + Joshua Roman,"On violin and cello, ""Passacaglia""",May 2011,526,


In [5]:
# Positions (within `talks`) of every talk whose stringified text is shorter
# than 1000 characters. str() is applied first so that non-string entries can
# be measured as well.
too_short = [
    position
    for position, talk in enumerate(talks)
    if len(str(talk)) < 1000
]

In [11]:
# Print how many talks were flagged as too short, followed by their positions.
print(len(too_short), too_short)

21 [115, 185, 331, 398, 513, 877, 982, 1015, 1100, 1299, 1342, 1427, 1641, 1846, 1852, 1937, 1947, 2011, 2028, 2080, 2102]


In [12]:
# Display the rows of `df` at the positions collected in `too_short`.
df.iloc[too_short]

Unnamed: 0,author,title,date,length,text
115,Improv Everywhere,A TED speaker's worst nightmare,Mar 2012,209,Today I'm going to talk about unexpected disco...
185,Quixotic Fusion,Dancing with light,Jun 2012,718,
331,Sleepy Man Banjo Boys,Teen wonders play bluegrass,Nov 2012,12,Tommy Mizzone Tonight we're going to play you...
398,Bruno Maisonnier,"Dance, tiny robots!",Feb 2013,74,
513,Kenichi Ebina,My magic moves,Oct 2007,204,
877,Aakash Odedra,"A dance in a hurricane of paper, wind and light",Dec 2014,573,
982,Bill T. Jones,"The dancer, the singer, the cellist ... and a ...",May 2015,379,Isadora Duncan crazy long legged woman fro...
1015,Joey Alexander,An 11-year-old prodigy performs old-school jazz,Jun 2015,372,
1100,Kaki King,A musical escape into a world of light and color,Nov 2015,671,
1299,Bruno Bowden + Rufus Cappadocia,Blindfold origami and cello,Aug 2008,159,Hello everyone. And so the two of us are here ...


In [13]:
# Save the flagged positions to a text file, one position per line.
with open('../data/drop_talks.txt', 'w') as drop_file:
    drop_file.writelines("%s\n" % position for position in too_short)