# Import needed packages

In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import os
import csv

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# Scraping text

First we extract all the RELEVANT words from the books. We also remove stop words so that our files contain partially cleaned data. Note that the words are already tokenized when they are collected: the OCR output has split the text into one word per span tag inside the body, and we store one word at a time.

In [None]:
def words_from_book(book_path, file_name):
    """Scrape the OCR words of one book and write them to a one-column CSV.

    Every regular file directly inside ``book_path`` is parsed as HTML, and
    the first text node of each ``<span class="ocr_cinfo">`` is taken as one
    word. Surrounding punctuation and embedded spaces are removed; empty
    strings and English stop words are dropped.

    Args:
        book_path: Directory that holds the HTML pages of the book.
        file_name: Destination CSV path; one word is written per row.

    If no word is collected, nothing is written and "No words found" is
    printed instead.
    """
    book_words = []
    stop_words = set(stopwords.words('english'))

    # Keep regular files only so sub-directories are not opened as pages.
    book_pages = sorted(
        f for f in os.listdir(book_path)
        if os.path.isfile(os.path.join(book_path, f))
    )
    for page in book_pages:
        # Close each page as soon as it is parsed instead of leaking the handle.
        with open(os.path.join(book_path, page)) as page_file:
            # Naming the parser keeps results independent of which optional
            # parsers happen to be installed.
            soup = BeautifulSoup(page_file, 'html.parser')

        for span in soup.find_all('span', 'ocr_cinfo'):
            # Skip spans that are empty or whose first child is a nested tag.
            if not span.contents or not isinstance(span.contents[0], str):
                continue
            text = span.contents[0].strip(".(),\'\";:*[]<>").replace(" ", "")
            # Compare the cleaned text (not the Tag object) with the stop
            # words; NLTK's English list is lower-case, so fold case first.
            if text and text.lower() not in stop_words:
                book_words.append(text)

    if book_words:
        # newline='' lets the csv module control the line endings itself.
        with open(file_name, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, dialect='excel')
            writer.writerows([word] for word in book_words)
    else:
        print("No words found")

# Scrape one book and store its words in gap-scrapped/all_words_book_01.csv.
words_from_book("gap-html/gap_-C0BAAAAQAAJ/", "gap-scrapped/all_words_book_01.csv")
# Disabled draft of a loop over every folder inside gap-html.
# NOTE(review): `counter` is an int, so the file-name concatenation below
# needs str(counter) before this loop can be re-enabled.
#folders = [f for f in os.listdir("gap-html")]
#counter = 0
#for paths in folders:
#    counter = counter+1
#    words_from_book("gap-html/"+paths+"/", "gap-scrapped/all_words_book_0"+counter+".csv")

This is some code to practice reading and writing to CSV files

In [None]:
#Short code practice on how to write and read csv files
fieldnames = ['first_name', 'last_name', 'all_names']
# newline='' is what the csv module expects for files it writes; without it
# some platforms emit a blank line after every row.
with open('testing.csv', 'w', newline='') as csvfile:
    #If you want to give header names use method commented below
    #writer = csv.DictWriter(csvfile, fieldnames='')
    writer = csv.writer(csvfile, dialect='excel')
    # One single-column row per name.
    writer.writerows([name] for name in fieldnames)

with open('testing.csv', newline='') as csvfile:
    # Read with the same dialect that was used for writing so that fields
    # are split on commas rather than on spaces.
    reader = csv.reader(csvfile, dialect='excel')
    for row in reader:
        print(', '.join(row))

# Stemming and Tokenizing

Now we open the file and stem and tokenize contents then add result to tokenized and stemmed array. We tokenize to separate any punctuation from words and we stem to obtain word roots.

In [None]:
# Module-level accumulators shared by the functions defined below.
all_words = []  # create_content_list appends one space-joined string per CSV file
total_stemmed = []  # stems appended by create_tokens_and_stems
total_tokens = []  # tokens appended by create_tokens_and_stems / create_tokens_only
stemmer = SnowballStemmer("english")

#Extracting text from csv files
def create_content_list(file_path):
    """Load every scraped-words CSV in ``file_path`` into ``all_words``.

    The regular files directly inside ``file_path`` are read in sorted order.
    For each file, the rows that are longer than two characters and contain
    at least one ASCII letter are kept, and one string is appended to the
    module-level ``all_words`` list: the kept words, each followed by a
    single space.

    Args:
        file_path: Directory holding the one-word-per-row CSV files.

    Returns:
        None. The result is accumulated in ``all_words``; the number of
        entries in that list is printed at the end.
    """
    file_list = sorted(
        f for f in os.listdir(file_path)
        if os.path.isfile(os.path.join(file_path, f))
    )
    for file in file_list:
        kept_words = []
        with open(os.path.join(file_path, file), newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=' ')
            for row in reader:
                # Use the parsed fields directly. Going through str(row)
                # wrapped words containing an apostrophe in double quotes
                # and doubled every backslash.
                text = " ".join(row)
                # Keep only words longer than two characters that contain
                # at least one letter.
                if len(text) > 2 and re.search('[a-zA-Z]', text):
                    kept_words.append(text)

        # Every word is followed by one space, including the last one.
        all_words.append("".join(word + " " for word in kept_words))

    print("Count token list: "+str(len(all_words)))
    
#Stemming function
def create_tokens_and_stems(word_list):
    """Tokenize ``word_list`` and rebuild the stem list from all tokens.

    Each string in ``word_list`` is split into sentences and then into word
    tokens with NLTK; the tokens are appended to the module-level
    ``total_tokens`` list. ``total_stemmed`` is then rebuilt so that it holds
    exactly one stem per entry of ``total_tokens``, in the same order.

    Both lists are also written to disk, overwriting
    ``gap-st/tokenized.csv`` and ``gap-st/tokenized_and_stemmed.csv``.

    Args:
        word_list: Iterable of text strings to tokenize.

    Returns:
        The module-level ``total_stemmed`` list.
    """
    for text in word_list:
        total_tokens.extend(
            word
            for sent in nltk.sent_tokenize(text)
            for word in nltk.word_tokenize(sent)
        )

    print("Final token count: "+str(len(total_tokens)))
    writeToFile("gap-st/tokenized.csv", total_tokens)

    # Replace the contents in place rather than appending: appending the
    # stems of *all* tokens on every call duplicated the stems of earlier
    # calls and left the two lists with different lengths.
    total_stemmed[:] = [stemmer.stem(token) for token in total_tokens]
    writeToFile("gap-st/tokenized_and_stemmed.csv", total_stemmed)
    return total_stemmed

def create_tokens_only(word_list):
    """Tokenize every string in ``word_list`` without stemming.

    The tokens of each string are appended to the module-level
    ``total_tokens`` list and the running total is printed after every
    string. When all strings are processed, the whole list is written to
    ``gap-st/tokenized.csv`` and returned.
    """
    for document in word_list:
        # Collect the document's tokens first, then add them in one go.
        document_tokens = []
        for sentence in nltk.sent_tokenize(document):
            document_tokens += nltk.word_tokenize(sentence)
        total_tokens.extend(document_tokens)
        print("Count filtered tokens: "+str(len(total_tokens)))

    print("Final token count: "+str(len(total_tokens)))
    writeToFile("gap-st/tokenized.csv", total_tokens)
    return total_tokens

def writeToFile(file_name, data):
    """Write each item of ``data`` as its own single-column CSV row.

    Args:
        file_name: Path of the CSV file; an existing file is overwritten.
        data: Iterable of values, written one per row in the excel dialect.
    """
    # newline='' hands line-ending control to the csv module; without it
    # some platforms translate "\r\n" again and produce blank rows.
    with open(file_name, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerows([item] for item in data)

# Load every CSV file found in gap-scrapped into the all_words list.
create_content_list('gap-scrapped')

Now we create a data frame to store the tokenized and stemmed words

In [None]:
#Retrieve token and stem lists
def _read_single_column(csv_path):
    """Return the text stored on each row of a one-value-per-row CSV file."""
    with open(csv_path, newline='') as csvfile:
        # Splitting on ' ' and re-joining on ' ' gives back the row's text
        # unchanged. Going through str(row).strip("[]'") turned a "["
        # token into an empty string and wrapped tokens such as 's in
        # double quotes.
        return [" ".join(row) for row in csv.reader(csvfile, delimiter=' ')]


total_tokens = _read_single_column('gap-st/tokenized.csv')
total_stemmed = _read_single_column('gap-st/tokenized_and_stemmed.csv')

print("Tokens: "+str(len(total_tokens)))
print("Stemmed: "+str(len(total_stemmed)))

#Now we create a pandas data frame to store the tokens and stems
# The stems form the index and the original tokens the 'words' column, so a
# stem can be looked up to recover a word it came from. Both lists must have
# the same length for this constructor to succeed.
word_frame = pd.DataFrame({'words': total_tokens}, index = total_stemmed)
print ('there are ' + str(word_frame.shape[0]) + ' items in word_frame')
print (word_frame.head())

# Document similarity (TF-IDF)
(Term frequency-inverse document frequency)

Now we build a TF-IDF matrix from the book texts; the later cells use it to compute document similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Now we determine document similarity
# ngram_range=(1,3) asks for one-, two- and three-word terms.
# NOTE(review): min_df/max_df are floats here, which scikit-learn documents as
# proportions of documents -- confirm 0.2/0.8 suit the number of books.
tfidf_vectorizer = TfidfVectorizer(min_df=0.2, max_df=0.8, max_features=300000, use_idf=True, ngram_range=(1,3))

#%time tfidf_matrix = tfidf_vectorizer.fit_transform(total_stemmed) #fit the vectorizer to all_words
# all_words holds one string per book, so each book is one document (one row).
%time tfidf_matrix = tfidf_vectorizer.fit_transform(all_words) #fit the vectorizer to all_words

print(tfidf_matrix.shape)


In [None]:
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); support both so the cell runs on either.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    # Convert the returned array to a list to keep the type used so far.
    terms = list(tfidf_vectorizer.get_feature_names_out())
else:
    terms = tfidf_vectorizer.get_feature_names()
#print(terms)

# K-Means Clustering

Now that we have the tf-idf matrix, we can carry out various clustering techniques. Let's start with k-means

In [None]:
from sklearn.cluster import KMeans
from sklearn.externals import joblib

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

joblib.dump(km,  'gap_cluster.pkl')

print("Model saved!")

In [None]:
#Now we create dictionary of book titles and word clusters

# Reload the fitted model so this cell also works in a fresh session.
# NOTE: joblib files are pickles -- only load files you created yourself.
km = joblib.load('gap_cluster.pkl')
clusters = km.labels_.tolist()

#First save titles to array
with open('gap-titles/titles.csv', newline='') as csvfile:
    # Re-join the fields so a title that contains commas stays one string
    # (str(row) left "', '" between the pieces of such a title).
    titles = [','.join(row) for row in csv.reader(csvfile, delimiter=',')]

#print(all_words[0][:200])

books = { 'titles': titles, 'descriptions': all_words, 'cluster': clusters }

print("Clusters: "+str(len(clusters)))
print("Titles: "+str(len(titles)))
print("Words: "+str(len(all_words)))

frame = pd.DataFrame(books, index = [clusters] , columns = ['titles', 'cluster'])

# Print the cluster sizes; a bare expression in the middle of a cell is
# silently discarded.
print(frame['cluster'].value_counts())

print("Top terms per cluster:")
print()
#sort the terms of every cluster centre by descending weight
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Map every stem to the first original word recorded for it. Built once so
# each lookup below is a dict access (DataFrame.ix no longer exists in pandas).
first_word_for_stem = word_frame.loc[
    ~word_frame.index.duplicated(keep='first'), 'words'
].to_dict()

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')

    for ind in order_centroids[i, :100]: #top 100 terms of the cluster
        key = terms[ind].split(',')[0]
        # Fall back to the term itself when it is not a known stem.
        print(' %s' % first_word_for_stem.get(key, key), end=',')
    print() #add whitespace
    print() #add whitespace

    print("Cluster %d titles:" % i, end='')
    # Boolean mask instead of an index lookup: also correct when a cluster
    # holds a single book or none at all.
    for title in frame.loc[(frame['cluster'] == i).values, 'titles'].tolist():
        print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace

# Multi-dimensional scaling

Now we create a 2D array of the data using the distance matrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
import mpld3

# Pairwise cosine distance between the documents (rows of tfidf_matrix).
dist = 1 - cosine_similarity(tfidf_matrix)

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

# One (x, y) coordinate pair per document.
xs, ys = pos[:, 0], pos[:, 1]


# Visualizing the data

We use matplotlib and mpld3 (a library that renders matplotlib figures in the browser with D3.js)

In [None]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e'}

#set up cluster names using a dict
cluster_names = {0: 'Gothic, church, Julian, Strab, Persian, Christianity', 
                 1: 'Herod, Josephus, Jerusalem, judea, David, Jewish', 
                 2: 'Athenians, Minerva, Christ, Homer, Bacchus, stadia', 
                 3: 'Livius, Fabius, Publius, Volsci, Quintus, Carthaginian', 
                 4: 'nero, Otho, Vitellius, Galba, Tacitus, Suet'}

#some ipython magic to show the matplotlib plots inline
%matplotlib inline 

#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles))

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
#ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('gap_clusters_small_noaxes.pdf', dpi=200)

Now let's make it INTERACTIVE!!!!

In [None]:
#define custom toolbar location
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure"""

    # Client-side code of the plugin. It registers a plugin named
    # "toptoolbar" whose draw() first draws the figure toolbar, then sets the
    # toolbar's x/y attributes to 150/400, and finally replaces the toolbar's
    # own draw function with a no-op so it is not drawn a second time.
    # NOTE(review): the docstring says "top" but y is set to 400 -- confirm
    # the intended position.
    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        # "toptoolbar" is the same name the JavaScript above registers.
        self.dict_ = {"type": "toptoolbar"}
        
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=titles))

#group by cluster
groups = df.groupby('label')

#define custom css to format the font and to remove the axis labeling
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }

svg.mpld3-figure {
margin-left: -100px;}
"""

# Plot
fig, ax = plt.subplots(figsize=(14,6)) #set plot size
ax.margins(0.03) # Optional, just adds 3% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18,
                     label=cluster_names[name], mec='none',
                     color=cluster_colors[name])
    labels = list(group.title)

    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    #connect tooltip to fig
    mpld3.plugins.connect(fig, tooltip)

# The toolbar plugin and the axis settings are the same for every group, so
# they are applied once rather than once per cluster.
mpld3.plugins.connect(fig, TopToolbar())

ax.set_aspect('auto')

#set tick marks as blank
ax.axes.get_xaxis().set_ticks([])
ax.axes.get_yaxis().set_ticks([])

#set axis as blank
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)


#ax.legend(numpoints=1) #show legend with only one dot

mpld3.display() #show the plot

#uncomment the below to export to html
#html = mpld3.fig_to_html(fig)
#print(html)

# Hierarchical clustering

Here we use the Ward clustering algorithm because it allows us to carry out hierarchical clustering

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform

# ward() interprets a 2-D array as raw observation vectors, not as a distance
# matrix, so the square matrix has to be condensed first for the pre-computed
# distances to be used as distances. checks=False tolerates the tiny
# floating-point noise on the diagonal of 1 - cosine_similarity.
# NOTE(review): Ward linkage is defined for Euclidean distances; treat the
# tree built from cosine distances as exploratory.
linkage_matrix = ward(squareform(dist, checks=False))

fig, ax = plt.subplots(figsize=(15, 20)) # set size
# Draw on the axes created above instead of rebinding `ax` to the dict that
# dendrogram() returns.
dendrogram(linkage_matrix, orientation="right", labels=titles, ax=ax)

# Booleans are required: the string 'off' is truthy and leaves ticks on.
plt.tick_params(
    axis='x',            # changes apply to the x-axis
    which='both',        # both major and minor ticks are affected
    bottom=False,        # ticks along the bottom edge are off
    top=False,           # ticks along the top edge are off
    labelbottom=False)   # no labels along the bottom edge

plt.tight_layout() #show plot with tight layout

#save the figure; comment out the line below to skip saving
plt.savefig('gap_dendogram_clusters.png', dpi=200) #save figure as ward_clusters

# Latent Dirichlet Allocation

As an added bonus, let us try and group the books by topic. We do this by using LDA to determine the top key words in each book and form topic groups based on this.

In [None]:
#strip any proper nouns (NNP) or plural proper nouns (NNPS) from a text
from nltk.tag import pos_tag
from gensim import corpora, models, similarities 

def strip_proppers_POS(text):
    """Return the words of ``text`` that are not tagged as proper nouns.

    ``text`` is split on whitespace and tagged with NLTK's part-of-speech
    tagger; words tagged NNP or NNPS are left out of the returned list.
    """
    kept_words = []
    for token, tag in pos_tag(text.split()):
        if tag not in ('NNP', 'NNPS'):
            kept_words.append(token)
    return kept_words

#remove proper names
# Each entry of all_words becomes a list of its words without the ones tagged
# NNP/NNPS; %time reports how long the statement takes.
%time preprocess = [strip_proppers_POS(doc) for doc in all_words]

#tokenize
#%time tokenized_text = [create_tokens_and_stems(text) for text in preprocess]

#remove stop words
# NOTE(review): `stopwords` is the NLTK corpus object imported at the top of
# the notebook, not a collection of words; the disabled line below would need
# stopwords.words('english') -- confirm before re-enabling.
#%time texts = [[word for word in text if word not in stopwords] for text in tokenized_text]

In [None]:
#create a Gensim dictionary from the texts
dictionary = corpora.Dictionary(preprocess)

#remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
# no_below is an absolute number of documents (an integer), unlike no_above,
# which is a fraction of the corpus. 1 keeps every token that occurs at all,
# which is what the former value of 0.1 amounted to; raise it to drop rare
# tokens.
dictionary.filter_extremes(no_below=1, no_above=0.8)

#convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in preprocess]

In [None]:
# Fit a 5-topic LDA model on the bag-of-words corpus; %time reports how long
# the statement takes.
%time lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5, chunksize=500, passes=25)

# Last expression of the cell, so the notebook displays the topics.
lda.show_topics()

Now let's show top 10 words in each created topic

In [None]:
# Ask for the unformatted topics, limited to 10 words each.
topics_matrix = lda.show_topics(formatted=False, num_words=10)
#topics_matrix = np.array(topics_matrix)

#topic_words = topics_matrix[:,:,1]
# NOTE(review): the layout of each entry returned with formatted=False depends
# on the installed gensim version -- confirm that the printed items are the
# words that are wanted.
for topic in topics_matrix:
    print(list(map(str, topic)))
    print()