In [None]:
!python3 -m pip install spacy
!python3 -m spacy download en
!python3 -m pip install gensim

In [1]:
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup as BS
from zipfile import ZipFile


from gensim.models import Word2Vec
import spacy
nlp = spacy.load('en')

import nltk
from nltk import word_tokenize, download, sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Kevin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Path to mounted Gutenberg DVD.
gb_path = "/Volumes/PGDVD_2010_04_RC2"

In [3]:
def parse_metadata(gutenberg_path):
    """Parses metadata files from Gutenberg DVD to create metadata dataframe."""
    try:
        metadf = pd.read_pickle("gutenberg_metadata.pkl")
        
    except:
        metadata = []

        metafiles = os.listdir(gutenberg_path+"/ETEXT")
        nr_files = len(metafiles)

        for i, metafile in enumerate(metafiles):
            book = {}

            with open(gutenberg_path+"/ETEXT/"+metafile, "r") as html:
                bs = BS(html)

            tables = bs.find_all("table")
            attrs = tables[0].find_all("tr")
            for attr in attrs:
                book[attr.find("th").text] = attr.find("td").text

            try:
                book["Path"] = bs.find("td", string=re.compile("text/plain")).findNext("a").get("href").replace("..","")
            except:
                book["Path"] = ""

            metadata.append(book)

            if not i%np.ceil(nr_files/10):
                print("{:3d}% parsed".format(i*100//nr_files))

        metadf = pd.DataFrame(metadata).fillna("")
        metadf["Release Date"] = pd.to_datetime(metadf["Release Date"]).fillna(0)
        pd.to_pickle(df,"gutenberg_metadata.pkl")
    
    return metadf


def filter_meta(metadf, author="", language="English",title="",years=""):
    """Applies conditions to existing metadata dataframe."""
    
    conditions = True
    
    if author:
        conditions &= metadf["Author"].str.contains(author)
    if language:
        conditions &= metadf["Language"].str.contains(language)
    if title:
        conditions &= metadf["Title"].str.contains(title)
    if years:
        dates = years.split("-")
        if len(dates) == 1:
            conditions &= metadf["Release Date"].isin(pd.date_range(dates[0], dates[0]+"-12-31"))
        elif len(dates) == 2:
            conditions &= metadf["Release Date"].isin(pd.date_range(dates[0] or "1700", dates[1]+"-12-31" or "2010-12-31"))
        
    return metadf[conditions]


def get_texts(metadf, path=""):
    """Obtain textfiles from all documents in the metadata dataframe. 
    If no DVD path is provided, the documents will be downloaded from www.gutenberg.org"""
    
    texts = []
    for i,book in metadf.iterrows():
        print("Loading EText {}: '{}' by {}".format(book["EText-No."], book["Title"], book["Author"]))
        try:  
            if path:
                with ZipFile(path + book["Path"]) as zfile:
                    txt = zfile.read(zfile.namelist()[0])
            else:
                txt = urlopen("http://www.gutenberg.org/files/{0}/{0}.txt".format(book["EText-No."])).read()

            texts.append(txt.decode("utf8","ignore"))
        except Exception as e:
            print("ERROR Could not load EText {}: {}".format(book["EText-No."], book["Title"]))
            print(e)
              
    return texts

In [4]:
metadf = parse_metadata(gb_path)
metadf.groupby("Author").count()["Title"].sort_values(ascending=False)

Author
Various                                           1937
                                                   761
Anonymous                                          517
Shakespeare, William, 1564-1616                    257
Lytton, Edward Bulwer Lytton, Baron, 1803-1873     212
Ebers, Georg, 1837-1898                            164
Twain, Mark, 1835-1910                             154
Parker, Gilbert, 1862-1932                         133
Balzac, Honoré de, 1799-1850                       123
Unknown                                            115
Kingston, William Henry Giles, 1814-1880           114
Jacobs, W. W. (William Wymark), 1863-1943          111
Meredith, George, 1828-1909                        109
Verne, Jules, 1828-1905                            104
Motley, John Lothrop, 1814-1877                    103
Howells, William Dean, 1837-1920                   100
Ballantyne, R. M. (Robert Michael), 1825-1894       98
Dickens, Charles, 1812-1870                         92
Haw

In [30]:
selected = filter_meta(metadf,author="Doyle")
txts = get_texts(selected, path=gb_path)

Loading EText 10446: 'The Green Flag' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 10581: 'Uncle Bernac
A Memory of the Empire' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 108: 'The Return of Sherlock Holmes' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 11247: 'The Exploits of Brigadier Gerard' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 11413: 'The Refugees' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 11656: 'The Great Shadow and Other Napoleonic Tales' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 12555: 'The Tragedy of the Korosko' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 126: 'The Poison Belt' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 13152: 'The Firm of Girdlestone' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 139: 'The Lost World' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 1638: 'The New Revelation' by Doyle, Arthur Conan, Sir, 1859-1930
Loading EText 1644: 'The Adventures of Gerard' by Doy

In [46]:
def crop_body(txt):
    txt = re.sub(r"\*\*\* ?start.*\*\*\*\r?\n", "***START***", txt, flags=re.I)
    txt = re.sub(r"\*\*\* ?end.*\*\*\*\r?\n", "***END***", txt, flags=re.I)
    txt = txt[txt.find("***START***")+11:txt.find("***END***")]
    return txt

def clean_txt(txt):
    txt = crop_body(txt)
    #txt = txt.lower()
    txt = re.sub("(\r?\n)+"," ", txt)
    txt = re.sub(" +"," ", txt)
    txt = "".join(list(filter(lambda x: x not in '"#$%&\'()*+-/:;<=>@[\\]^_`{|}~', txt)))
    return txt
    
cleaned = clean_txt(txts[5])

In [165]:
# W2V
def tokenize(txt):
    print("Preprocess: Tokenize")
    return [word_tokenize(sentence) for sentence in sent_tokenize(cleaned)]

def trainW2V(corpus, vsize=100, window=5, epochs=10):
    print("W2V: Create corpus")
    w2v = Word2Vec(corpus, size=vsize, window=window, min_count=1, workers=4)
    print("W2V: Train model")
    w2v.train(corpus, total_examples=w2v.corpus_count, epochs=epochs)
    return w2v

def tokenizeAndTrainW2V(corpus, vsize=100, window=5, epochs=10):
    tokens = tokenize(corpus)
    return trainW2V(corpus, vsize, window, epochs)

In [167]:
w2v.wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('faces', 0.9752947092056274),
 ('theyll', 0.9751505851745605),
 ('mercy', 0.9739930033683777),
 ('cheer', 0.9734542965888977),
 ('mischiefs', 0.9729212522506714),
 ('straighter', 0.9721733331680298),
 ('glorious', 0.9718793630599976),
 ('boot', 0.9712995290756226),
 ('hearts', 0.9712735414505005),
 ('Sear', 0.9708160758018494)]

In [9]:
# NER
doc = nlp(cleaned)

for entity in doc.ents:
    print(entity.text, entity.label_)

the nineteenth century DATE
fiveandfifty years of age DATE
a week DATE
a thousand CARDINAL
a dozen CARDINAL
two short years DATE
nearly a CARDINAL
night TIME
a hundred and fifty thousand CARDINAL
a third CARDINAL
night TIME
night TIME
one night TIME
two CARDINAL
morning TIME
night TIME
two CARDINAL
one CARDINAL
the night TIME
early in the morning TIME
fifteen years of age DATE
all night TIME
one CARDINAL
dozen CARDINAL
1703 CARDINAL
more than a hundred years DATE
two CARDINAL
an english half CARDINAL
half CARDINAL
one CARDINAL
first ORDINAL
first ORDINAL
second ORDINAL
nine miles QUANTITY
eleven and a half CARDINAL
one evening TIME
the next morning TIME
second ORDINAL
threeandfifty feet QUANTITY
two CARDINAL
one CARDINAL
this day DATE
three CARDINAL
one CARDINAL
two CARDINAL
two CARDINAL
some six weeks later DATE
three CARDINAL
ten CARDINAL
first ORDINAL
the day DATE
the hour TIME
first ORDINAL
third ORDINAL
a week on end DATE
two years DATE
the years DATE
five years DATE
three years D

In [11]:
# Determine semantic similarities
doc1 = nlp(u"my fries were super gross")
doc2 = nlp(u"such disgusting fries")
similarity = doc1.similarity(doc2)
print(doc1.text, doc2.text, similarity)

my fries were super gross such disgusting fries 0.713970153639


In [50]:
for token in nlp(cleaned)[:1000]:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
    token.shape_, token.is_alpha, token.is_stop)
    

    SPACE     False False
Produced produce VERB VBN ROOT Xxxxx True False
by by ADP IN agent xx True True
Lionel lionel PROPN NNP compound Xxxxx True False
G. g. PROPN NNP compound X. False False
Sear sear PROPN NNP pobj Xxxx True False
THE the DET DT det XXX True False
GREAT great ADJ JJ compound XXXX True False
SHADOW shadow PROPN NNP ROOT XXXX True False
AND and CCONJ CC cc XXX True False
OTHER other ADJ JJ amod XXXX True False
NAPOLEONIC napoleonic PROPN NNP conj XXXX True False
TALES tales PROPN NNP det XXXX True False
A. a. NOUN NN compound X. False False
CONAN conan PROPN NNP ROOT XXXX True False
DOYLE doyle NOUN NN amod XXXX True False
CONTENTS content NOUN NNS appos XXXX True False
THE the DET DT det XXX True False
GREAT great ADJ JJ amod XXXX True False
SHADOW shadow PROPN NNP compound XXXX True False
I. i. NOUN NN ROOT X. False False
THE the DET DT det XXX True False
NIGHT night NOUN NN ROOT XXXX True False
OF of ADP IN prep XX True False
THE the DET DT det XXX True False
BE

In [65]:
nlp("doyle")[0].tag_

'NN'

In [163]:
import os
import pickle as pkl

class CorpusStreamer:
    def __init__(self, path):
        self.path = path
    
    def __iter__(self):
        for fname in os.listdir(self.path):
            print("W2V: Loading: " + fname)
            with open(self.path + fname, "rb") as file:
                sents = pkl.load(file)
                for sent in sents:
                    yield sent


In [166]:
stream = CorpusStreamer("./test/")
w2v = trainW2V(stream, epochs=1)

W2V: Create corpus
W2V: Loading: 1.pkl
W2V: Loading: 3.pkl
W2V: Loading: 2.pkl
W2V: Loading: 1.pkl
W2V: Loading: 3.pkl
W2V: Loading: 2.pkl
W2V: Loading: 1.pkl
W2V: Loading: 3.pkl
W2V: Loading: 2.pkl
W2V: Loading: 1.pkl
W2V: Loading: 3.pkl
W2V: Loading: 2.pkl
W2V: Loading: 1.pkl
W2V: Loading: 3.pkl
W2V: Loading: 2.pkl
W2V: Loading: 1.pkl
W2V: Loading: 3.pkl
W2V: Loading: 2.pkl
W2V: Train model
W2V: Loading: 1.pkl
W2V: Loading: 3.pkl
W2V: Loading: 2.pkl
