<a href="https://colab.research.google.com/github/kleczekr/tolkenizer/blob/master/similarity_distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import glob
from pathlib import Path

import pandas as pd
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Mount Google Drive so the corpus files can be read when the notebook runs on
# Google Colab. The `google.colab` package only exists inside Colab, so on a
# local machine the mount is skipped instead of aborting "Run All" with an
# ImportError (remember to point the data paths below at your local copies).
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping the Google Drive mount.")
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
# Use the glob library to create a list of file names.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to keep the row order of the word-count matrix reproducible between runs.
filenames = sorted(glob.glob("/content/drive/My Drive/programming_historian/1666_texts/*.txt"))

In [16]:
# Parse those filenames to create a list of file keys (the IDs used later to
# label the distance tables and to look works up in the metadata).
# Path(...).stem strips the directory and the final extension using the
# platform's own path rules, so this also works with Windows separators and
# with file names that contain additional dots.
filekeys = [Path(f).stem for f in filenames]

# Configure the vectorizer: it is handed file paths (not raw text), keeps at
# most 1000 features, and is given a document-frequency cap of 0.7
vectorizer = CountVectorizer(
    input="filename",
    max_features=1000,
    max_df=0.7,
)
# Fit on the list of files and count the words in each one; the result is
# converted to a dense array with toarray() so that SciPy will accept it
wordcounts = vectorizer.fit_transform(filenames).toarray()

In [17]:
# Load the metadata from its separate .csv file, using the "TCP ID" column
# as the row index
metadata = pd.read_csv(
    "/content/drive/My Drive/programming_historian/1666_metadata.csv",
    index_col="TCP ID",
)

In [18]:
# Compute the pairwise Euclidean distances between the word-count rows and
# expand them into a square table labelled by file key on both axes
euclidean_distances = pd.DataFrame(
    data=squareform(pdist(wordcounts, metric='euclidean')),
    index=filekeys,
    columns=filekeys,
)

In [19]:
# Fetch the five works with the smallest euclidean distance from work A28989,
# i.e. Boyle's "Hydrostatical paradoxes made out by new experiments".
# The work's own entry (distance 0 to itself) is dropped explicitly before
# ranking instead of slicing off the first result: if another text tied with
# it at distance 0, tie order would decide which row a positional slice removed.
top5_euclidean = euclidean_distances['A28989'].drop('A28989').nsmallest(5)

In [20]:
# Look up author, title and keywords for the five closest works (euclidean)
metadata.loc[
    top5_euclidean.index,
    ['Author', 'Title', 'Keywords'],
]

Unnamed: 0,Author,Title,Keywords
A62436,"Thomson, George, 17th cent.","Loimotomia, or, The pest anatomized in these f...","Hodges, Nathaniel, 1629-1688. -- Vindiciae med..."
A43020,"Harvey, Gideon, 1640?-1700?","Morbus anglicus: or, The anatomy of consumptio...",Tuberculosis -- Early works to 1800.
A29017,"Boyle, Robert, 1627-1691.","The origine of formes and qualities, (accordin...",Matter -- Constitution -- Early works to 1800....
A56390,"Parker, Samuel, 1640-1688.",A free and impartial censure of the Platonick ...,Platonists. Empiricism -- Early works to 1800.
A44061,"Hodges, Nathaniel, 1629-1688.",Vindiciæ medicinæ & medicorum: or An apology f...,Medicine -- Early works to 1800. Plague -- Eng...


In [21]:
# Similarly, calculate the pairwise cosine distances between the word-count
# rows and label the resulting square table with the file keys.
cosine_distances = pd.DataFrame(squareform(pdist(wordcounts, metric='cosine')),
                                index=filekeys, columns=filekeys)
# Fetch the five works with the smallest cosine distance from work A28989.
# The work's own entry is dropped explicitly before ranking: a text with a
# proportional word-count vector also has cosine distance 0, so relying on
# the query work being ranked first could remove the wrong row.
top5_cosine = cosine_distances['A28989'].drop('A28989').nsmallest(5)

In [22]:
# Look up author, title and keywords for the five closest works (cosine)
metadata.loc[
    top5_cosine.index,
    ['Author', 'Title', 'Keywords'],
]

Unnamed: 0,Author,Title,Keywords
A29017,"Boyle, Robert, 1627-1691.","The origine of formes and qualities, (accordin...",Matter -- Constitution -- Early works to 1800....
A43020,"Harvey, Gideon, 1640?-1700?","Morbus anglicus: or, The anatomy of consumptio...",Tuberculosis -- Early works to 1800.
A62436,"Thomson, George, 17th cent.","Loimotomia, or, The pest anatomized in these f...","Hodges, Nathaniel, 1629-1688. -- Vindiciae med..."
A57484,"Rochefort, César de, b. 1605.","The history of the Caribby-islands, viz, Barba...",Natural history -- West Indies. Carib Indians.
A60482,"Smith, John, 1630-1679.",Gērochomia vasilikē King Solomons portraitur...,"Bible. -- O.T. -- Ecclesiastes XII, 1-6 -- Par..."


In [23]:
# The rich table display abbreviates long titles; rendering the same selection
# with to_string() and printing it shows every field in full
print(
    metadata.loc[
        top5_cosine.index,
        ['Author', 'Title', 'Keywords'],
    ].to_string()
)

                                Author                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Title                                                                                             Keywords
A29017       Boyle, Robert, 1627-1691.                                                                                                                                                                                                                           