In [97]:
from gutenberg.cleanup import strip_headers
import requests

def load_etext(num):
  url ='http://www.gutenberg.org/files/{0}/{0}-0.txt'.format(num)
  data= requests.get(url)
  return data.text

texts =[]
text_nums=[1342,76,11,2701,98,2591]
text_names =['Pride and Prejudice','Adventures of Huckleberry Finn',"Alice’s Adventures in Wonderland",
            "Moby Dick", "A Tale of Two Cities","Grimms’ Fairy Tales"]
for text_num in text_nums:
    text = strip_headers(load_etext(text_num)).strip()
    texts.append(text)

# Preprocess Text

- Lowercase
- Strip Tags
- Strip Punctuation
- Strip Multiple Whitespaces
- Strip Numeric
- Remove Stopwords
- Strip Shortwords
- Stem Text


In [73]:
from gensim.parsing.preprocessing import preprocess_string,preprocess_documents
from gensim import corpora,models, similarities


In [74]:
processed_docs = preprocess_documents(texts)

In [75]:
dictionary = corpora.Dictionary(processed_docs)
dictionary.save('gutenberg.dict')

In [76]:
print(dictionary)

Dictionary(17922 unique tokens: [u'\u2014\u2018the', u'\u201cbroke', u'\u2018most', u'woodi', u'beauvai']...)


In [78]:
corpus = [dictionary.doc2bow(text) for text in processed_docs]
corpora.MmCorpus.serialize('gutenberg.mm', corpus) 

# Model

In [84]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=500)
index = similarities.MatrixSimilarity(lsi[corpus])
index.save('gutenberg.index')

# Query

In [85]:
def query(doc):
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = lsi[vec_bow] 
    return vec_lsi


In [86]:
query_texts = ["""  In fact, the artist's design seemed this: a final theory of my own,
partly based upon the aggregated opinions of many aged persons with whom
I conversed upon the subject. The picture represents a Cape-Horner in a
great hurricane; the half-foundered ship weltering there with its three
dismantled masts alone visible; and an exasperated whale, purposing to
spring clean over the craft, is in the enormous act of impaling himself
upon the three mast-heads.

The opposite wall of this entry was hung all over with a heathenish
array of monstrous clubs and spears. Some were thickly set with
glittering teeth resembling ivory saws; others were tufted with knots of
human hair; and one was sickle-shaped, with a vast handle sweeping round
like the segment made in the new-mown grass by a long-armed mower. You
shuddered as you gazed, and wondered what monstrous cannibal and savage
could ever have gone a death-harvesting with such a hacking, horrifying
implement. Mixed with these were rusty old whaling lances and harpoons
all broken and deformed. Some were storied weapons. With this once long
lance, now wildly elbowed, fifty years ago did Nathan Swain kill fifteen
whales between a sunrise and a sunset. And that harpoon--so like a
corkscrew now--was flung in Javan seas, and run away with by a whale,
years afterwards slain off the Cape of Blanco. The original iron entered
nigh the tail, and, like a restless needle sojourning in the body of a
man, travelled full forty feet, and at last was found imbedded in the
hump.

Crossing this dusky entry, and on through yon low-arched way--cut
through what in old times must have been a great central chimney with
fireplaces all round--you enter the public room. A still duskier place
is this, with such low ponderous beams above, and such old wrinkled
planks beneath, that you would almost fancy you trod some old craft's
cockpits, especially of such a howling night, when this corner-anchored
old ark rocked so furiously. On one side stood a long, low, shelf-like
table covered with cracked glass cases, filled with dusty rarities
gathered from this wide world's remotest nooks. Projecting from the
further angle of the room stands a dark-looking den--the bar--a rude
attempt at a right whale's head. Be that how it may, there stands the
vast arched bone of the whale's jaw, so wide, a coach might almost drive
beneath it. Within are shabby shelves, ranged round with old decanters,
bottles, flasks; and in those jaws of swift destruction, like another
cursed Jonah (by which name indeed they called him), bustles a little
withered old man, who, for their money, dearly sells the sailors
deliriums and death.

Abominable are the tumblers into which he pours his poison. Though
true cylinders without--within, the villanous green goggling glasses
deceitfully tapered downwards to a cheating bottom. Parallel meridians
rudely pecked into the glass, surround these footpads' goblets. Fill to
THIS mark, and your charge is but a penny; to THIS a penny more; and so
on to the full glass--the Cape Horn measure, which you may gulp down for
a shilling.

Upon entering the place I found a number of young seamen gathered about
a table, examining by a dim light divers specimens of SKRIMSHANDER. I
sought the landlord, and telling him I desired to be accommodated with a
room, received for answer that his house was full--not a bed unoccupied.
"But avast," he added, tapping his forehead, "you haint no objections
to sharing a harpooneer's blanket, have ye? I s'pose you are goin'
a-whalin', so you'd better get used to that sort of thing."

I told him that I never liked to sleep two in a bed; that if I should
ever do so, it would depend upon who the harpooneer might be, and
that if he (the landlord) really had no other place for me, and the
harpooneer was not decidedly objectionable, why rather than wander
further about a strange town on so bitter a night, I would put up with
the half of any decent man's blanket.

"I thought so. All right; take a seat. Supper?--you want supper?
Supper'll be ready directly."""""]
query_lsis = map(query,query_texts)

In [87]:
query_lsis[0]

[(0, 3.9305810457115062),
 (1, -1.2363802398549046),
 (2, 0.6331834880420697),
 (3, -0.085418237325354918),
 (4, 0.10500558462158441),
 (5, 0.51639858523624793)]

In [88]:
sims = index[query_lsis[0]]

In [89]:
print(list(enumerate(sims)))

[(0, 0.42142335), (1, 0.67105377), (2, 0.36168447), (3, 0.95134634), (4, 0.67198771), (5, 0.56603765)]


In [94]:
sorted(zip(text_names,sims),key=lambda x: x[1],reverse=True)

[('Moby Dick', 0.95134634),
 ('A Tale of Two Cities', 0.67198771),
 ('Adventures of Huckleberry Finn', 0.67105377),
 ('Grimms\xe2\x80\x99 Fairy Tales', 0.56603765),
 ('Pride and Prejudice', 0.42142335),
 ('Alice\xe2\x80\x99s Adventures in Wonderland', 0.36168447)]

# Moby Dick Matches Moby Dick