In [1]:
#! /usr/bin/env python

import csv
import re
from collections import Counter

import pandas

# Load the data into a dataframe
# Column labels are supplied explicitly through names= instead of being taken
# from the file.
# NOTE(review): no header= argument is passed -- confirm the CSV has no header
# row of its own, otherwise that row would be loaded as data.
colnames = ['author', 'title', 'date' , 'length', 'text']
df = pandas.read_csv('../data/talks-v1b.csv', names=colnames)

In [None]:
# Make a list of every talk's text. The list is kept complete (including
# missing entries) so that a position in `talks` still matches a row in `df`.
talks = df.text.tolist()

# Turn the list into one giant string, joining only genuine strings.
# Non-string entries (the "floats" seen in this column -- presumably NaN for
# empty texts) used to be passed through str(), which injected literal tokens
# such as "nan" into the corpus and inflated the word counts.
alltalks = " ".join(item for item in talks if isinstance(item, str))

# Remove all punctuation save apostrophes, then lowercase everything
all_words = re.sub(r"[^\w\d'\s]+",'',alltalks).lower()

# Split the cleaned corpus into tokens wherever whitespace occurs (NLTK)
from nltk.tokenize import WhitespaceTokenizer
whitespace_tokenizer = WhitespaceTokenizer()
tt_tokens = whitespace_tokenizer.tokenize(all_words)

# Build a dictionary of words and their frequency in the corpus.
# Counter is a dict subclass, so tt_freq still behaves like the plain dict it
# replaces, without relying on a bare `except:` to detect first occurrences
# (which would also have hidden any unrelated error raised in the loop).
tt_freq = Counter(tt_tokens)

# Convert the mapping into (count, word) tuples with the most frequent words
# at the top. Tuples compare element-wise, so words sharing a count are
# ordered by the word itself, in reverse.
tt_freq_list = sorted(
    ((count, word) for word, count in tt_freq.items()),
    reverse=True,
)

# Write the (count, word) rows to file.
# - newline='' hands line-ending control to the csv module, as its
#   documentation requires; without it extra blank rows appear on Windows.
# - encoding is pinned to UTF-8 so non-ASCII words do not depend on the
#   platform's default encoding.
# - tt_freq_list is already sorted most-frequent-first, so it is written as-is,
#   and the `with` block closes the file on exit (no explicit close needed).
with open('../outputs/tt_freq.csv', 'w', newline='', encoding='utf-8') as f:
    wtr = csv.writer(f)
    wtr.writerows(tt_freq_list)

In [None]:
# Discovering the Bad Texts

# Record the position of every talk whose value is not exactly a str;
# those are the talks treated as empty.
no_good = [
    position
    for position, talk in enumerate(talks)
    if type(talk) is not str
]

print(no_good)

In [None]:
# Display the dataframe rows sitting at the positions flagged in no_good
df.iloc[no_good]

In [None]:
# Record the position of every talk whose stringified text is shorter than
# the character threshold below.
min_chars = 1000
too_short = [
    position
    for position, talk in enumerate(talks)
    if len(str(talk)) < min_chars
]

In [None]:
# Report how many talks were flagged as too short, followed by their positions
print(len(too_short), too_short)

In [None]:
# Display the dataframe rows sitting at the positions flagged in too_short
df.iloc[too_short]

In [None]:
# Persist the flagged positions, one per line, for the filtering step
with open('../data/drop_talks.txt', 'w') as f:
    f.writelines("%s\n" % item for item in too_short)

In [2]:
# =-=-=-=-=-=
# Filter out null or short talks from dataframe & save as new CSV
# =-=-=-=-=-=

# Load list of empty talks or too short talks from file.
# The file is read line by line and blank lines are skipped: every entry is
# written with a trailing newline, so splitting the whole content on '\n'
# produced a final empty string and int('') raised ValueError. The `with`
# block also makes sure the file handle is closed.
with open("../data/drop_talks.txt", "r") as f:
    the_bad = [int(line) for line in f if line.strip()]

In [3]:
# Show the row positions loaded from the drop list
print(the_bad)

[115, 185, 331, 398, 513, 877, 982, 1015, 1100, 1299, 1342, 1427, 1641, 1846, 1852, 1937, 1947, 2011, 2028, 2080, 2102]


In [4]:
# Second, arrange the row positions from largest to smallest, working on a
# copy so the_bad keeps its original order
dab_eht = list(the_bad)
dab_eht.sort(reverse=True)

print(dab_eht)

[2102, 2080, 2028, 2011, 1947, 1937, 1852, 1846, 1641, 1427, 1342, 1299, 1100, 1015, 982, 877, 513, 398, 331, 185, 115]


In [6]:
# Translate the flagged positions into index labels, then build a new frame
# without those rows (df itself is left as it was)
rows_to_drop = df.index[dab_eht]
df_purged = df.drop(rows_to_drop)

In [13]:
# Spot-check the original frame around position 185, one of the flagged talks
df[184:187]

Unnamed: 0,author,title,date,length,text
184,Sebastian Deterding,What your designs say about you,May 2012,730,We are today talking about moral persuasion. W...
185,Quixotic Fusion,Dancing with light,Jun 2012,718,
186,Seth Shostak,ET is (probably) out there — get ready,Jun 2012,1094,Is E.T. out there Well I work at the SETI In...


In [16]:
# Same neighbourhood in the purged frame. The slice starts one position
# earlier because a flagged row before it (115) has been removed; the row
# labelled 185 should no longer appear.
df_purged[183:186]

Unnamed: 0,author,title,date,length,text
184,Sebastian Deterding,What your designs say about you,May 2012,730,We are today talking about moral persuasion. W...
186,Seth Shostak,ET is (probably) out there — get ready,Jun 2012,1094,Is E.T. out there Well I work at the SETI In...
187,David Birch,A new way to stop identity theft,Jun 2012,975,So I thought I'd talk about identity. That's s...


In [17]:
# Save the purged frame to CSV
# NOTE(review): to_csv is called with its defaults, which (per the pandas
# docs) also writes the row index as a leading column -- confirm downstream
# readers expect that extra column.
df_purged.to_csv('../data/talks_2.csv')