In [1]:
from get_data import *

titles, transcripts = import_from_mongo()

In [2]:
import nltk
import re
import pandas as pd

In [18]:
#nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ryanmurray/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ryanmurray/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
stopwords = nltk.corpus.stopwords.words('english')

In [4]:
from nltk.stem.snowball import SnowballStemmer

In [5]:
stemmer = SnowballStemmer("english")

In [6]:
def tokenize_and_stem(text):
    #First tokenizes by sentence, then by word, so that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        #Filter out tokens not containing letters
        if re.search('[a-zA-Z]' ,token):
            filtered_tokens.append(token)
    return filtered_tokens
        

In [7]:
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [8]:

totalvocab_stemmed = []
totalvocab_tokenized = []
for i in transcripts:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [9]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [10]:
print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 10223115 items in vocab_frame


In [25]:
vocab_frame.head()

Unnamed: 0,words
You,you
've,'ve
heard,heard
of,of
your,your


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(transcripts) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 1min 46s, sys: 1.12 s, total: 1min 47s
Wall time: 1min 47s
(5858, 260)


In [36]:
tfidf_matrix

<5858x260 sparse matrix of type '<class 'numpy.float64'>'
	with 477719 stored elements in Compressed Sparse Row format>

In [13]:
terms = tfidf_vectorizer.get_feature_names()

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [15]:
from sklearn.cluster import KMeans

num_clusters = 10

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 1min 20s, sys: 16 ms, total: 1min 20s
Wall time: 1min 20s


In [16]:
ted = { 'title': titles,  'transcript': transcripts, 'cluster': clusters}

frame = pd.DataFrame(ted, columns = ['title', 'cluster'])

In [17]:
frame.head()

Unnamed: 0,title,cluster
0,10 myths about psychology debunked Ben Ambridge,4
1,10 things you didn't know about orgasm Mary Roach,6
2,10 top time saving tech tips David Pogue,2
3,10 ways to have a better conversation Celeste ...,6
4,12 sustainable design ideas from nature Janine...,0


In [18]:
frame['cluster'].value_counts()

5    1767
6    1293
1    1187
8     416
2     245
4     221
7     206
0     198
3     191
9     134
Name: cluster, dtype: int64

In [19]:

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    '''print("Cluster %d titles:" % i, end='')
    for title in frame.loc[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace'''

Top terms per cluster:

Cluster 0 words: b'water', b'use', b'food', b'called', b'place', b'got',

Cluster 1 words: b'questions', b'word', b'words', b'use', b'called', b'today',

Cluster 2 words: b'light', b'space', b'matter', b'called', b'big', b'ca',

Cluster 3 words: b'music', b'applause', b'play', b'yeah', b'hear', b'love',

Cluster 4 words: b'women', b'said', b'man', b'day', b'children', b'young',

Cluster 5 words: b'technology', b'kind', b'use', b'lot', b'percent', b'human',

Cluster 6 words: b'said', b'got', b'did', b'day', b'went', b'love',

Cluster 7 words: b'body', b'called', b'human', b'inside', b'use', b'process',

Cluster 8 words: b'school', b'children', b'kids', b'said', b'family', b'young',

Cluster 9 words: b'food', b'water', b'change', b'body', b'use', b'lot',



This list of words suggests that I need to decrease the "max_df" parameter in the tfidfvectorizer to remove words that are in most of the documents.

In [21]:
import pickle

FN0 = 'vocabulary-embedding'

with open('data/%s.pkl'%FN0, 'rb') as fp:
    embedding, idx2word, word2idx, glove_idx2idx = pickle.load(fp)
vocab_size, embedding_size = embedding.shape

with open('data/%s.data.pkl'%FN0, 'rb') as fp:
    X,Y = pickle.load(fp)
    
    

In [23]:
nb_unknown_words = 30
print ('number of examples',len(X),len(Y))
print ('dimension of embedding space for words',embedding_size)
print ('vocabulary size', vocab_size, 'the last %d words can be used as place holders for unknown/oov words'%nb_unknown_words)
print ('total number of different words',len(idx2word), len(word2idx))
print ('number of words outside vocabulary which we can substitue using glove similarity', len(glove_idx2idx))
print ('number of words that will be regarded as unknonw(unk)/out-of-vocabulary(oov)',len(idx2word)-vocab_size-len(glove_idx2idx))

number of examples 5858 5858
dimension of embedding space for words 100
vocabulary size 40000 the last 30 words can be used as place holders for unknown/oov words
total number of different words 203556 203556
number of words outside vocabulary which we can substitue using glove similarity 41880
number of words that will be regarded as unknonw(unk)/out-of-vocabulary(oov) 121676


In [25]:
def prt(label, x):
    print(label+':',)
    for w in x:
        print(idx2word[w],)
    print()

In [29]:
prt('D',X[10])

D:
This
is
Lee
Sedol.
Lee
Sedol
is
one
of
the
world's
greatest
Go
players,
and
he's
having
what
my
friends
in
Silicon
Valley
call
a
“Holy
Cow”
moment
–
a
moment
where
we
realize
that
AI
is
actually
progressing
a
lot
faster
than
we
expected.
So
humans
have
lost
on
the
Go
board.
What
about
the
real
world?
Well,
the
real
world
is
much
bigger,
much
more
complicated
than
the
Go
board.
It's
a
lot
less
visible,
but
it's
still
a
decision
problem.
And
if
we
think
about
some
of
the
technologies
that
are
coming
down
the
pike
…
Noriko
[Arai]
mentioned
that
reading
is
not
yet
happening
in
machines,
at
least
with
understanding.
But
that
will
happen,
and
when
that
happens,
very
soon
afterwards,
machines
will
have
read
everything
that
the
human
race
has
ever
written.
And
that
will
enable
machines,
along
with
the
ability
to
look
further
ahead
than
humans
can,
as
we've
already
seen
in
Go,
if
they
also
have
access
to
more
information,
they'll
be
able
to
make
better
decisions
in
the
real
world
than
we
can

In [40]:
glove_transcript = [' '.join([idx2word[i] for i in h]) for h in X]

In [41]:
glove_transcript[0]

"You've heard of your I.Q., your general intelligence, but what's your Psy-Q? How much do you know about what makes you tick, and how good are you at predicting other people's behavior or even your own? And how much of what you think you know about psychology is wrong? Let's find out by counting down the top 10 myths of psychology. You've probably heard it said that when it comes to their psychology, it's almost as if men are from Mars and women are from Venus. But how different are men and women really? To find out, let's start by looking at something on which men and women really do differ and plotting some psychological gender differences on the same scale. One thing men and women do really differ on is how far they can throw a ball. So if we look at the data for men here, we see what is called a normal distribution curve. A few men can throw a ball really far, and a few men not far at all, but most a kind of average distance. And women share the same distribution as well, but actua

In [43]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=50000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time glove_tfidf_matrix = tfidf_vectorizer.fit_transform(glove_transcript) #fit the vectorizer to synopses

print(glove_tfidf_matrix.shape)

CPU times: user 1min 45s, sys: 840 ms, total: 1min 46s
Wall time: 1min 46s
(5858, 249)


In [44]:
glove_tfidf_matrix

<5858x249 sparse matrix of type '<class 'numpy.float64'>'
	with 441718 stored elements in Compressed Sparse Row format>

In [45]:
g_terms = tfidf_vectorizer.get_feature_names()

In [46]:
dist = 1 - cosine_similarity(glove_tfidf_matrix)

In [47]:
from sklearn.cluster import KMeans

num_clusters = 10

km = KMeans(n_clusters=num_clusters)

%time km.fit(glove_tfidf_matrix)

g_clusters = km.labels_.tolist()

CPU times: user 1min 10s, sys: 4 ms, total: 1min 10s
Wall time: 1min 10s


In [48]:
print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.loc[g_terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    '''print("Cluster %d titles:" % i, end='')
    for title in frame.loc[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace'''

Top terms per cluster:

Cluster 0 words: b'percent', b'country', b'change', b'money', b'social', b'community',

Cluster 1 words: b'did', b'love', b'story', b"'d", b'feel', b'went',

Cluster 2 words: b'body', b'food', b'human', b'help', b'inside', b'water',

Cluster 3 words: b'women', b'man', b'young', b'children', b'change', b'talk',

Cluster 4 words: b'light', b'space', b'matter', b'ca', b'ca', b'night',

Cluster 5 words: b'technology', b'information', b'human', b'able', b'problem', b'example',

Cluster 6 words: b'music', b'applause', b'play', b'yeah', b'hear', b'love',

Cluster 7 words: b'space', b'questions', b'word', b'words', b'power', b'form',

Cluster 8 words: b'school', b'kids', b'children', b'young', b'family', b'learn',

Cluster 9 words: b'water', b'food', b'place', b'small', b'space', b'big',

