## What are the most common topics of discussion? Have those changed over time?

Document clustering: http://brandonrose.org/clustering

In [16]:
import nltk
import re
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

## Load data

In [None]:
def loadDataFiles():
    files = [f for f in listdir('data') if isfile(join('data', f)) and f.endswith('.csv')]
    df = None
    for fname in files:
        if df is None:
            df = pd.read_csv('data/' + fname)
        else:
            df = df.append(pd.read_csv('data/' + fname), ignore_index=True)
    return df
    
df = loadDataFiles()
df.head()


In [2]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['android', 'development', 'article', 'app'])
print(stopwords[:10])

[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your']


In [3]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [4]:

# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed
def is_ascii(s):
    return all(ord(c) < 128 for c in s)


def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if not is_ascii(token):
            continue
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if not is_ascii(token):
            continue
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [5]:
titles = []
descriptions = []
for i in adf.iterrows():
    if type(i[1].description) is not str or not is_ascii(i[1].description):
        continue
    titles.append(i[1].title)
    descriptions.append(i[1].description)

NameError: name 'adf' is not defined

In [6]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in descriptions:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)
    
# totalvocab_tokenized.remove('android')
# totalvocab_tokenized.remove('\'ll')
totalvocab_tokenized = [x for x in totalvocab_tokenized if x not in stopwords]
# totalvocab_tokenized.remove('development')

In [9]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 0 items in vocab_frame


In [10]:
print(vocab_frame.head())

Empty DataFrame
Columns: [words]
Index: []


In [11]:

from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000,
                                 min_df=0, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(descriptions) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [12]:
terms = tfidf_vectorizer.get_feature_names()

NotFittedError: TfidfVectorizer - Vocabulary wasn't fitted.

In [14]:
dist = 1 - cosine_similarity(tfidf_matrix)

NameError: name 'tfidf_matrix' is not defined

## Determine the proper number of clusters

http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

In [17]:
pca = PCA(n_components=5)
pca.fit(tfidf_matrix.toarray())

NameError: name 'tfidf_matrix' is not defined

In [18]:
plt.plot(pca.components_[0], pca.components_[1])

# plotly.offline.iplot({
#     "data": [{
#         'x': pca.components_[0],
#         'y': pca.components_[1]
#         }  for col in top_authors_by_year.columns],
#     "layout": Layout(yaxis=dict(title='Frequency'), xaxis=dict(title='Year'),
#         title='Author Frequency by Year')
#     })

NameError: name 'plt' is not defined

In [19]:
tfidf_matrix

NameError: name 'tfidf_matrix' is not defined

In [20]:
from __future__ import print_function

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

print(__doc__)

# Generating the sample data from make_blobs
# This particular setting has one distinct cluster and 3 clusters placed close
# together.
# X, y = make_blobs(n_samples=500,
#                   n_features=2,
#                   centers=4,
#                   cluster_std=1,
#                   center_box=(-10.0, 10.0),
#                   shuffle=True,
#                   random_state=1)  # For reproducibility

X = tfidf_matrix

range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, X.getnnz() + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

#     y_lower = 10
#     for i in range(n_clusters):
#         # Aggregate the silhouette scores for samples belonging to
#         # cluster i, and sort them
#         ith_cluster_silhouette_values = \
#             sample_silhouette_values[cluster_labels == i]

#         ith_cluster_silhouette_values.sort()

#         size_cluster_i = ith_cluster_silhouette_values.shape[0]
#         y_upper = y_lower + size_cluster_i

#         color = cm.spectral(float(i) / n_clusters)
#         ax1.fill_betweenx(np.arange(y_lower, y_upper),
#                           0, ith_cluster_silhouette_values,
#                           facecolor=color, edgecolor=color, alpha=0.7)

#         # Label the silhouette plots with their cluster numbers at the middle
#         ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

#         # Compute the new y_lower for next plot
#         y_lower = y_upper + 10  # 10 for the 0 samples

#     ax1.set_title("The silhouette plot for the various clusters.")
#     ax1.set_xlabel("The silhouette coefficient values")
#     ax1.set_ylabel("Cluster label")

#     # The vertical line for average silhouette score of all the values
#     ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

#     ax1.set_yticks([])  # Clear the yaxis labels / ticks
#     ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

#     # 2nd Plot showing the actual clusters formed
#     colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
#     ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
#                 c=colors)

#     # Labeling the clusters
#     centers = clusterer.cluster_centers_
#     # Draw white circles at cluster centers
#     ax2.scatter(centers[:, 0], centers[:, 1],
#                 marker='o', c="white", alpha=1, s=200)

#     for i, c in enumerate(centers):
#         ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

#     ax2.set_title("The visualization of the clustered data.")
#     ax2.set_xlabel("Feature space for the 1st feature")
#     ax2.set_ylabel("Feature space for the 2nd feature")

#     plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
#                   "with n_clusters = %d" % n_clusters),
#                  fontsize=14, fontweight='bold')

#     plt.show()

Automatically created module for IPython interactive environment


NameError: name 'tfidf_matrix' is not defined

In [21]:
from sklearn.cluster import KMeans

num_clusters = 6

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

NameError: name 'tfidf_matrix' is not defined

In [22]:

from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

AttributeError: 'KMeans' object has no attribute 'labels_'

In [23]:

films = { 'descriptions': descriptions, 'cluster': clusters, 'title': titles}

frame = pd.DataFrame(films, index = [clusters] , columns = ['cluster', 'title'])

NameError: name 'clusters' is not defined

In [24]:
frame['cluster'].value_counts() #number of films per cluster (clusters from 0 to 4)

NameError: name 'frame' is not defined

In [25]:

from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    
    for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist()[:5]:
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:



AttributeError: 'KMeans' object has no attribute 'cluster_centers_'