In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide plotting defaults: seaborn's 'white' style and a 20x20
# default figure size for every figure created afterwards.
sns.set_style(style='white')
plt.rcParams.update({'figure.figsize': (20, 20)})

In [None]:
import nltk
import spacy
import requests
import numpy as np
import pandas as pd
import networkx as nx
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer

# Load the English spaCy pipeline used by get_plausible_entities.
# The 'en' shortcut is tried first so existing spaCy 2 setups behave exactly
# as before; spaCy 3 removed shortcut links and raises OSError for it, in
# which case the model package name is used instead.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

def remove_punctuation(input_string):
    '''
    return input_string with every character found in string.punctuation
    deleted; all other characters are kept unchanged
    '''
    # mapping a character to None in a translation table deletes it
    deletion_table = str.maketrans(dict.fromkeys(punctuation))
    return input_string.translate(deletion_table)


def window_over_sentences(sentences, size=2):
    '''
    slide a window of `size` consecutive sentences over `sentences`,
    moving one sentence at a time, and return each window joined by a
    single space.

    sentences: sequence of strings
    size: number of sentences per window

    returns a list of len(sentences) - size + 1 strings, or an empty list
    when there are fewer than `size` sentences.
    '''
    # the last valid start index is len(sentences) - size, so the range
    # must stop at len(sentences) - size + 1; stopping earlier drops the
    # trailing windows (and with them the end of the text)
    window_count = max(len(sentences) - size + 1, 0)
    return [' '.join(sentences[start : start + size])
            for start in range(window_count)]


def sequences_to_count_matrix(sequences, count_vectorizer):
    '''
    fit count_vectorizer on the sequences and return the resulting counts
    (instances of each vocabulary word in each sequence) in dense form.
    fitting happens in place, so count_vectorizer is modified by this call
    '''
    sparse_counts = count_vectorizer.fit_transform(sequences)
    dense_counts = sparse_counts.todense()
    return dense_counts


def is_plausible_entity(word):
    '''
    decide whether a token looks like a named entity worth tracking.

    word: token-like object exposing `pos_`, `text` and `len()`
          (a spaCy token when called from get_plausible_entities)

    returns True only when the token is tagged as a proper noun, its text
    is title-cased, and it is longer than two characters.
    '''
    # logical `and` rather than bitwise `&`: every operand is a plain bool,
    # so the result is unchanged, but the checks now short-circuit
    return (word.pos_ == 'PROPN'
            and word.text.istitle()
            and len(word) > 2)


def get_plausible_entities(count_vectorizer):
    '''
    pick the vocabulary words which look like named entities.

    count_vectorizer: fitted vectorizer exposing a `vocabulary_` mapping of
                      word -> column index

    the vocabulary is joined into one string, stripped of punctuation and
    tagged by `nlp`; tokens accepted by is_plausible_entity are kept.

    returns a list of unique words, in order of first appearance, each of
    which is guaranteed to be a key of count_vectorizer.vocabulary_.
    '''
    known_words = count_vectorizer.vocabulary_
    vocabulary = remove_punctuation(' '.join(known_words.keys()))

    plausible_entities = []
    already_kept = set()
    for word in nlp(vocabulary):
        text = word.text
        # stripping punctuation (underscores included) and re-tokenising
        # can yield strings which are not vocabulary keys, or the same
        # string more than once. the former raise KeyError when
        # get_adjacency_matrix looks up column indices; the latter produce
        # duplicated columns and therefore self-edges. skip both.
        if text in already_kept or text not in known_words:
            continue
        if is_plausible_entity(word):
            already_kept.add(text)
            plausible_entities.append(text)
    return plausible_entities


def get_adjacency_matrix(count_matrix, count_vectorizer, plausible_entities):
    '''
    build the entity/entity co-occurrence table.

    the columns of count_matrix belonging to plausible_entities are selected
    through count_vectorizer.vocabulary_, and the selection is multiplied by
    its own transpose, so cell (a, b) is the sum over all sequences of
    count(a) * count(b). the diagonal is set to 0.

    returns a DataFrame whose index and columns are both plausible_entities
    '''
    vocabulary = count_vectorizer.vocabulary_
    entity_columns = [vocabulary[entity] for entity in plausible_entities]

    entity_counts = count_matrix[:, entity_columns]
    co_occurrence = entity_counts.T.dot(entity_counts)
    # an entity co-occurring with itself is not an interaction
    np.fill_diagonal(co_occurrence, 0)

    return pd.DataFrame(data=co_occurrence,
                        index=plausible_entities,
                        columns=plausible_entities)


def get_edgelist(adjacency, threshold):
    '''
    turn an adjacency table into a list of edges.

    adjacency: square DataFrame of pairwise counts, labelled on both axes
    threshold: only pairs whose count is strictly greater than this are kept

    only the upper triangle (excluding the diagonal) is read, so each
    unordered pair appears at most once.

    returns a DataFrame with columns 'source', 'target' and 'value', where
    'value' keeps the numeric dtype of the adjacency table.
    '''
    values = adjacency.to_numpy()
    rows, columns = np.nonzero(np.triu(values, k=1) > threshold)
    # each column is built on its own: stacking labels and counts into one
    # array forces a single common dtype, which turns the counts into
    # generic objects (or strings) instead of numbers
    return pd.DataFrame(data={'source': adjacency.index.to_numpy()[rows],
                              'target': adjacency.columns.to_numpy()[columns],
                              'value': values[rows, columns]},
                        columns=['source', 'target', 'value'])


def bookworm(book, threshold=15):
    '''
    build a character co-occurrence graph from the raw text of a book.

    the text is split into sentences, grouped into overlapping windows, and
    the words in each window are counted. capitalisation is preserved
    (lowercase=False) because is_plausible_entity relies on title case.
    plausible entities which co-occur with a count strictly greater than
    `threshold` are joined by an edge carrying that count as 'value'.

    returns a networkx graph
    '''
    vectorizer = CountVectorizer(lowercase=False)
    windows = window_over_sentences(nltk.sent_tokenize(book))
    counts = sequences_to_count_matrix(windows, vectorizer)
    entities = get_plausible_entities(vectorizer)

    co_occurrence = get_adjacency_matrix(counts, vectorizer, entities)
    edges = get_edgelist(co_occurrence, threshold)

    return nx.from_pandas_edgelist(edges,
                                   source='source',
                                   target='target',
                                   edge_attr='value')


In [None]:
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
import netlsd

In [None]:
# Scrape the index page and collect the absolute URL of every link on it.
base_url = 'http://www.glozman.com/textpages.html'
# a timeout stops the cell from hanging forever on an unresponsive server,
# and raise_for_status stops an HTTP error page from being parsed as the index
index_response = requests.get(base_url, timeout=30)
index_response.raise_for_status()
a = index_response.text
soup = BeautifulSoup(a, 'html.parser')
# href=True skips anchors without an href attribute; for those .get('href')
# returns None and the string concatenation would raise TypeError
urls = ['http://www.glozman.com/' + url.get('href')
        for url in soup.find_all('a', href=True)]

In [None]:
# Download every Harry Potter text and build its character graph.
urls_to_parse = [url for url in urls if 'Harry Potter' in url]
graphs = {}
# url -> exception, for every book which could not be fetched or processed
failed_urls = {}

for url in tqdm(urls_to_parse):
    try:
        response = requests.get(url, timeout=60)
        # without this an HTTP error page would be analysed as if it were a book
        response.raise_for_status()
        book = response.text
        graphs[url] = bookworm(book, threshold=20)
    except Exception as error:
        # still best-effort (one bad book must not stop the run), but the
        # failure is recorded and reported instead of vanishing, and
        # KeyboardInterrupt is no longer swallowed
        failed_urls[url] = error
        print('skipping {}: {!r}'.format(url, error))

In [None]:
# One netlsd.heat signature per book graph, keyed by the book's URL.
feature_vectors = {book_url: netlsd.heat(graphs[book_url])
                   for book_url in graphs}

In [None]:
from scipy.spatial.distance import cdist

# Pairwise netlsd.compare of every signature against every other one, stored
# as a nested dict: similarity[url_a][url_b].
# NOTE(review): netlsd.compare appears to return a distance (smaller = more
# alike) rather than a similarity - confirm against the netlsd docs.
similarity = {
    row_url: {col_url: netlsd.compare(row_signature, feature_vectors[col_url])
              for col_url in feature_vectors}
    for row_url, row_signature in feature_vectors.items()
}

In [None]:
# Show the book-vs-book comparison table as a heatmap, with rows and columns
# in the same order as feature_vectors.
book_urls = feature_vectors.keys()
similarity_table = pd.DataFrame(data=similarity,
                                index=book_urls,
                                columns=book_urls)
sns.heatmap(similarity_table);

In [None]:
# Draw the character graph of the first book, with node labels shown.
first_book_url = 'http://www.glozman.com/TextPages/Harry Potter 1 - Sorcerer\'s Stone.txt'
nx.draw(graphs[first_book_url], with_labels=True)