# Document Clustering #

*Based on materials by Brandon Rose, Jacob Eisenstein, and Eun Seo Jo et al.*

In this exercise, we're going to use k-means clustering in order to identify the latent structures within the synopses of the top 100 films of all time (per an IMDB list), a corpus created by Brandon Rose. See [the original post](http://www.brandonrose.org/top100) for a more detailed discussion on the corpus.

Let's start with the imports:

In [None]:
import re # this too! 

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup # remember this one! 
from sklearn import feature_extraction

## Step 1: Corpus Pre-Processing ##

Here we go again. Let's pre-process our corpus. 

In [None]:
# Import the parallel lists that describe the corpus: titles, IMDb links,
# Wikipedia synopses and genres. Each file is read inside a `with` block so
# its handle is closed as soon as the contents are in memory.
with open('title_list.txt') as title_file:
    # the slice ensures that only the first 100 are kept
    titles = title_file.read().split('\n')[:100]

with open('link_list_imdb.txt') as link_file:
    links = link_file.read().split('\n')[:100]

with open('synopses_list_wiki.txt') as wiki_file:
    # a synopsis spans several lines, so entries are split on a marker string
    synopses_wiki = wiki_file.read().split('\n BREAKS HERE')[:100]

# strip the html formatting, keeping only the text of each synopsis
synopses_clean_wiki = [
    BeautifulSoup(text, 'html.parser').get_text() for text in synopses_wiki
]

synopses_wiki = synopses_clean_wiki

with open('genres_list.txt') as genre_file:
    genres = genre_file.read().split('\n')[:100]

# Later cells combine these lists row by row, so the four counts should match.
print(len(titles), 'titles')
print(len(links), 'links')
print(len(synopses_wiki), 'synopses')
print(len(genres), 'genres')

In [None]:
# now let's get the imdb synopses

# Read inside a `with` block so the file handle is closed right after reading.
with open('synopses_list_imdb.txt') as imdb_file:
    # entries are split on a marker string; keep only the first 100
    synopses_imdb = imdb_file.read().split('\n BREAKS HERE')[:100]

# strip the html formatting, keeping only the text of each synopsis
synopses_clean_imdb = [
    BeautifulSoup(text, 'html.parser').get_text() for text in synopses_imdb
]

synopses_imdb = synopses_clean_imdb

In [None]:
# Build one document per film: the Wikipedia synopsis immediately followed
# by the IMDb synopsis found at the same position in its list.
synopses = [
    wiki_text + synopses_imdb[position]
    for position, wiki_text in enumerate(synopses_wiki)
]

# print the first combined synopsis as a quick visual check
print(synopses[0])

### For document clustering, some people like to stem first ###

In [None]:
import sys
!{sys.executable} -m pip install textblob # an alternative to spaCy

from textblob import TextBlob

def textblob_tokenizer(str_input):
    """Return the stemmed, lower-cased word tokens of ``str_input``.

    The text is lower-cased, split into words by TextBlob, and each word is
    reduced to its stem. The stems are returned as a list, in document order.
    """
    lowered = str_input.lower()
    return [word.stem() for word in TextBlob(lowered).words]

## Step 2: TF-IDF (once again!) 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings (note the new params compared with earlier exercises):
#   tokenizer    - our stemming tokenizer instead of the built-in one
#   ngram_range  - count single terms as well as 2- and 3-term sequences
#   min_df/max_df - as floats these are document proportions: drop terms
#                   found in under 20% or over 80% of the synopses
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=textblob_tokenizer,
    stop_words='english',
    ngram_range=(1, 3),
    min_df=.2,
    max_df=0.8,
    use_idf=True,
)

# learn the vocabulary and build the document-term matrix in one step
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(synopses)

# (number of synopses, number of retained terms)
print(tfidf_vectorizer_vectors.shape)

In [None]:
# get our feature names for future reference
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name only on releases that
# predate the new method. Either way `terms` ends up as a plain list of str.
try:
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
except AttributeError:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
# Pull out the row for the first synopsis to inspect its tf-idf weights.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

# Transpose the single row into a column so each term gets its own
# data frame row, labelled by the term itself.
df = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(),
    index=terms,
    columns=["tfidf"],
)

# show the ten highest-weighted terms
df.sort_values(by="tfidf", ascending=False).head(10)

## Step 3: On to the K-means clustering

Using our tf-idf vectors, we can now run the k-means clustering algorithm. Remember that K-means initializes with a pre-determined number of clusters. Let's choose 5. 

In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

# n_init is the number of times k-means is run with different starting
# centroids; it is set explicitly here so the value is visible.
# fit() returns the estimator itself, so construction and fitting can be chained.
km = KMeans(n_clusters=num_clusters, n_init=10).fit(tfidf_vectorizer_vectors)

# km.labels_ holds the cluster assigned to each synopsis; keep it as a list
clusters = km.labels_.tolist()

In [None]:
# Gather everything we know about each film in one dict of parallel lists.
films = {
    'title': titles,
    'synopsis': synopses,
    'cluster': clusters,
    'genre': genres,
}

# Only these three columns are requested, so the synopsis text is left out
# of the data frame.
film_df = pd.DataFrame(films, columns=['title', 'cluster', 'genre'])

film_df

In [None]:
# count the films assigned to each cluster
film_df.cluster.value_counts()

In [None]:
# find the top terms per cluster

# For every centroid, rank the feature indices from the largest centroid
# weight to the smallest (argsort is ascending, so each row is reversed).
# (cluster_centers_ is an array of [n_clusters, n_features] )
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for cluster_id in range(num_clusters):
    print("Cluster " + str(cluster_id) + " top words: ")

    # the ten highest-weighted terms, each followed by ", "
    top_terms = "".join(
        terms[ind] + ", " for ind in order_centroids[cluster_id, :10]
    )

    print(top_terms)

In [None]:
# list the film titles that fall in each cluster
for cluster_id in range(num_clusters):
    print("Titles in cluster " + str(cluster_id) + ": ")

    # boolean selection: keep only the rows assigned to this cluster
    in_cluster = film_df["cluster"] == cluster_id
    cluster_df = film_df[in_cluster]

    # every title of the cluster, each followed by ", "
    cluster_titles = "".join(title + ", " for title in cluster_df['title'])

    print(cluster_titles + "\n")

## Step 4: Visualizing Document Clusters

### Part 1: Dimensionality Reduction with T-SNE

In [None]:
from sklearn.manifold import TSNE

# Reuse the labels of the model fitted earlier. Calling km.fit_predict() here
# would run k-means again from a new random initialisation, so the cluster
# numbers could stop matching `clusters`, `film_df` and the top-term listing.
predictions = km.labels_

# reduce the tf-idf vectors to two dimensions for plotting
tsne = TSNE(n_components=2)

In [None]:
# Embed the dense tf-idf matrix in two dimensions (one row per synopsis).
embed = tsne.fit_transform(tfidf_vectorizer_vectors.toarray())

# Split the two columns of the embedding into x and y coordinate tuples.
xs, ys = map(tuple, embed.T)

### Part 2: Actually visualizing the clusters

In [None]:
# Set up colors per cluster using a dict keyed by cluster number.
# The dicts are derived from num_clusters so that changing the number of
# clusters does not cause a KeyError when plotting; the palette is reused
# cyclically if there are more clusters than colors.
cluster_palette = ['#1b9e77', '#d95f02', '#7570b3', '#e7298a', '#66a61e']

cluster_colors = {
    cluster_id: cluster_palette[cluster_id % len(cluster_palette)]
    for cluster_id in range(num_clusters)
}

# Set up the legend label of each cluster using a dict ('Cluster 0', ...).
cluster_names = {
    cluster_id: 'Cluster ' + str(cluster_id)
    for cluster_id in range(num_clusters)
}

In [None]:
# One row per film: its t-SNE coordinates, its cluster number and its title.
df = pd.DataFrame({
    'x': xs,
    'y': ys,
    'label': clusters,
    'title': titles,
})

# split the rows by cluster number so each cluster can be drawn separately
groups = df.groupby('label')

In [None]:
# set up plot
%matplotlib inline

fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=cluster_names[name], color=cluster_colors[name], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['title'], size=8)  
    
plt.show() #show the plot


## Another clustering method: hierarchical document clustering

Here's an example of hierarchical document clustering using Ward's method, which relies on minimum variance. [more here](https://www.statisticshowto.datasciencecentral.com/wards-method/)

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between the synopses (1 - cosine similarity),
# computed from the tf-idf vectors.
dist = 1 - cosine_similarity(tfidf_vectorizer_vectors)

# Define the linkage matrix using Ward clustering.
# NOTE(review): scipy treats a 2-D array passed to ward() as observation
# vectors, so each row of `dist` acts as a film's feature vector here -
# confirm this is the intended input.
linkage_matrix = ward(dist)

fig, ax = plt.subplots(figsize=(15, 20)) # set size

# Draw onto the axes created above; `ax` keeps referring to those axes
# rather than being overwritten by the dict that dendrogram() returns.
dendrogram(linkage_matrix, orientation="left", labels=titles, ax=ax)

# tick_params expects booleans: a string such as 'off' is truthy and would
# leave the ticks switched on.
ax.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)  # tick labels along the bottom edge are off

plt.tight_layout() #show plot with tight layout

In [None]:
# close the current figure now that we are done with the plots
plt.close()