In [1]:
import gensim
import numpy as np
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
import os

pd.set_option('display.max_colwidth', None)

In [2]:
shake_df = pd.read_csv('../data/shake_corpus.csv')
shake_df.sample(5)

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern
16236,16237,as you like it,Within these ten days if that thou be'st found,1.3.38,DUKE FREDERICK,22.0,comedy,comedy
1541,1542,henry iv,"charge of foot, and I know his death will be a",2.4.528,PRINCE HENRY,197.0,history,tragedy
14189,14190,alls well that ends well,"Whoever charges on his forward breast,",3.2.119,HELENA,42.0,comedy,comedy
17668,17669,as you like it,SCENE I. The forest.,4.1.1,PHEBE,24.0,comedy,comedy
63734,63735,merchant of venice,"Sir, I entreat you home with me to dinner.",4.1.417,DUKE,120.0,comedy,comedy


In [3]:
shake_df['line_clean'] = shake_df['Line'].apply(lambda x: gensim.utils.simple_preprocess(x))
shake_df.sample(5)

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern,line_clean
36255,36256,hamlet,Is't possible?,5.2.27,HORATIO,6.0,tragedy,tragedy,"[is, possible]"
109133,109134,winters tale,Produce the prisoner.,3.2.8,LEONTES,1.0,comedy,romance,"[produce, the, prisoner]"
1641,1642,henry iv,"A shorter time shall send me to you, lords:",3.1.92,GLENDOWER,23.0,history,tragedy,"[shorter, time, shall, send, me, to, you, lords]"
52878,52879,loves labours lost,And give him light that it was blinded by.,1.1.83,BIRON,14.0,comedy,comedy,"[and, give, him, light, that, it, was, blinded, by]"
21478,21479,antony and cleopatra,"does stand,",4.12.2,MARK ANTONY,1.0,tragedy,tragedy,"[does, stand]"


Processing the lines above treats each document as a fragment of line. The document should be the entire line itself. Below I will join the lines, and then split into individual lines based on '.'

In [6]:
shake_lines = shake_df.Line.tolist()
len(shake_lines)

113553

In [7]:
shake_all = ' '.join(shake_lines)
shake_all[:100]

'ACT I SCENE I. London. The palace. Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAN'

In [8]:
shake_corpu = shake_all.split('.')
shake_corpus = [i + '.' for i in shake_corpu]
shake_corpus[:5]

['ACT I SCENE I.',
 ' London.',
 ' The palace.',
 ' Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others So shaken as we are, so wan with care, Find we a time for frighted peace to pant, And breathe short-winded accents of new broils To be commenced in strands afar remote.',
 " No more the thirsty entrance of this soil Shall daub her lips with her own children's blood, Nor more shall trenching war channel her fields, Nor bruise her flowerets with the armed hoofs Of hostile paces: those opposed eyes, Which, like the meteors of a troubled heaven, All of one nature, of one substance bred, Did lately meet in the intestine shock And furious close of civil butchery Shall now, in mutual well-beseeming ranks, March all one way and be no more opposed Against acquaintance, kindred and allies: The edge of war, like an ill-sheathed knife, No more shall cut his master."]

In [9]:
shake_corpus_df = pd.DataFrame(shake_corpus, columns=['line']).reset_index()

In earlier work, the pre-processing step that I used removed punctuation. Since the good Shakespearean titles do contain punctuation, I need to rerun this word-embedding

In [10]:
twd = TreebankWordTokenizer()

shake_corpus_df['line_tokenized'] = shake_corpus_df['line'].apply(lambda x: twd.tokenize(x.lower()))
shake_corpus_df.head()

Unnamed: 0,index,line,line_tokenized
0,0,ACT I SCENE I.,"[act, i, scene, i, .]"
1,1,London.,"[london, .]"
2,2,The palace.,"[the, palace, .]"
3,3,"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others So shaken as we are, so wan with care, Find we a time for frighted peace to pant, And breathe short-winded accents of new broils To be commenced in strands afar remote.","[enter, king, henry, ,, lord, john, of, lancaster, ,, the, earl, of, westmoreland, ,, sir, walter, blunt, ,, and, others, so, shaken, as, we, are, ,, so, wan, with, care, ,, find, we, a, time, for, frighted, peace, to, pant, ,, and, breathe, short-winded, accents, of, new, broils, to, be, commenced, in, strands, afar, remote, .]"
4,4,"No more the thirsty entrance of this soil Shall daub her lips with her own children's blood, Nor more shall trenching war channel her fields, Nor bruise her flowerets with the armed hoofs Of hostile paces: those opposed eyes, Which, like the meteors of a troubled heaven, All of one nature, of one substance bred, Did lately meet in the intestine shock And furious close of civil butchery Shall now, in mutual well-beseeming ranks, March all one way and be no more opposed Against acquaintance, kindred and allies: The edge of war, like an ill-sheathed knife, No more shall cut his master.","[no, more, the, thirsty, entrance, of, this, soil, shall, daub, her, lips, with, her, own, children, 's, blood, ,, nor, more, shall, trenching, war, channel, her, fields, ,, nor, bruise, her, flowerets, with, the, armed, hoofs, of, hostile, paces, :, those, opposed, eyes, ,, which, ,, like, the, meteors, of, a, troubled, heaven, ,, all, of, one, nature, ,, of, one, substance, bred, ,, did, lately, meet, in, the, intestine, shock, and, furious, close, of, civil, butchery, shall, now, ,, in, mutual, well-beseeming, ranks, ,, march, all, one, way, and, be, no, more, opposed, against, acquaintance, ,, kindred, and, allies, ...]"


In [33]:
w2v_model = gensim.models.Word2Vec(shake_corpus_df['line_tokenized'], size=200, min_count=1)

In [34]:
w2v_model.wv.most_similar('heaven')

[('death', 0.8017988204956055),
 ('fortune', 0.785474956035614),
 ('justice', 0.7817237377166748),
 ('light', 0.7672446966171265),
 ('shame', 0.7604905962944031),
 ('sleep', 0.7568939924240112),
 ('truth', 0.744155764579773),
 ('fall', 0.7391270995140076),
 ('life', 0.7375586032867432),
 ('work', 0.7349543571472168)]

In [12]:
#For size=100
w2v_model.wv.most_similar('heaven')

[('fortune', 0.7843260169029236),
 ('death', 0.7688287496566772),
 ('justice', 0.7674165964126587),
 ('life', 0.7351908683776855),
 ('fall', 0.7195068597793579),
 ('sight', 0.7181484699249268),
 ('hand', 0.7088948488235474),
 ('heart', 0.7060418128967285),
 ('truth', 0.7049264907836914),
 ('consent', 0.7034521102905273)]

In [23]:
w2v_model.wv.most_similar('world')

[('worst', 0.8217633962631226),
 ('law', 0.7874565124511719),
 ('day', 0.7549046277999878),
 ('time', 0.7488061189651489),
 ('end', 0.743834376335144),
 ('truth', 0.743759274482727),
 ('same', 0.7346682548522949),
 ('deed', 0.7189449071884155),
 ('thing', 0.6993948221206665),
 ('oven', 0.6911342144012451)]

In [13]:
#Size=100, window=5
w2v_model.wv.most_similar('world')

[('worst', 0.8164222836494446),
 ('law', 0.8012160062789917),
 ('day', 0.7675077319145203),
 ('truth', 0.7648465037345886),
 ('end', 0.7643637657165527),
 ('deed', 0.7616981267929077),
 ('same', 0.7466306090354919),
 ('play', 0.7381706833839417),
 ('time', 0.7246567010879517),
 ('wonder', 0.7157785892486572)]

In [24]:
w2v_model.wv.most_similar('night')

[('day', 0.8524144887924194),
 ('morning', 0.7778862714767456),
 ('court', 0.7676215171813965),
 ('field', 0.765507698059082),
 ('time', 0.7567277550697327),
 ('town', 0.7555402517318726),
 ('watch', 0.7429038286209106),
 ('next', 0.742796003818512),
 ('last', 0.7361685633659363),
 ('city', 0.7351863384246826)]

In [14]:
#size=100, window=5
w2v_model.wv.most_similar('night')

[('day', 0.8597347736358643),
 ('court', 0.7767656445503235),
 ('morning', 0.7702602744102478),
 ('city', 0.7587684988975525),
 ('field', 0.7552789449691772),
 ('town', 0.7385793924331665),
 ('watch', 0.735870361328125),
 ('general', 0.7355765104293823),
 ('last', 0.7352584600448608),
 ('time', 0.7326194047927856)]

In [41]:
w2v_model.wv.most_similar('mouse')

[('river', 0.9589923620223999),
 ('hapless', 0.953300416469574),
 ('double', 0.9523762464523315),
 ('tyrant', 0.9509615302085876),
 ('dragon', 0.9505335092544556),
 ('silk', 0.950354278087616),
 ('crow', 0.9498463869094849),
 ('chair', 0.9497067928314209),
 ('combustion', 0.9493871927261353),
 ('conqueror', 0.9483494162559509)]

In [35]:
shake_word_vectors = w2v_model.wv

In [36]:
shake_word_vectors.save('shake_w2v.wordsvectors')

In [37]:
words = set(w2v_model.wv.index2word)
len(words)

26304

In [19]:
w2v_model.wv.most_similar('.')

[(':', 0.739151120185852),
 ('agitation', 0.6338746547698975),
 ('peaceable', 0.5971411466598511),
 ('learns', 0.5911572575569153),
 ('isbel', 0.5676355361938477),
 ('unload', 0.5560346841812134),
 ('liable', 0.5466629266738892),
 ('recompense', 0.5443294048309326),
 ('betake', 0.5420522689819336),
 ('spend', 0.5376721024513245)]

In [43]:
w2v_model.wv.most_similar('mouse')

[('river', 0.9589923620223999),
 ('hapless', 0.953300416469574),
 ('double', 0.9523762464523315),
 ('tyrant', 0.9509615302085876),
 ('dragon', 0.9505335092544556),
 ('silk', 0.950354278087616),
 ('crow', 0.9498463869094849),
 ('chair', 0.9497067928314209),
 ('combustion', 0.9493871927261353),
 ('conqueror', 0.9483494162559509)]

In [48]:
shake_corpus_df[shake_corpus_df.line.str.contains('mouse')]

Unnamed: 0,index,line,line_tokenized
8300,8300,"Let him alone, He did inform the truth: but for our gentlemen, The common file--a plague! tribunes for them!-- The mouse ne'er shunn'd the cat as they did budge From rascals worse than they.","[let, him, alone, ,, he, did, inform, the, truth, :, but, for, our, gentlemen, ,, the, common, file, --, a, plague, !, tribunes, for, them, !, --, the, mouse, ne'er, shunn, 'd, the, cat, as, they, did, budge, from, rascals, worse, than, they, .]"
10539,10539,Have you had quiet guard? Not a mouse stirring.,"[have, you, had, quiet, guard, ?, not, a, mouse, stirring, .]"
11329,11329,"What shall I do? Not this, by no means, that I bid you do: Let the bloat king tempt you again to bed, Pinch wanton on your cheek, call you his mouse, And let him, for a pair of reechy kisses, Or paddling in your neck with his damn'd fingers, Make you to ravel all this matter out, That I essentially am not in madness, But mad in craft.","[what, shall, i, do, ?, not, this, ,, by, no, means, ,, that, i, bid, you, do, :, let, the, bloat, king, tempt, you, again, to, bed, ,, pinch, wanton, on, your, cheek, ,, call, you, his, mouse, ,, and, let, him, ,, for, a, pair, of, reechy, kisses, ,, or, paddling, in, your, neck, with, his, damn, 'd, fingers, ,, make, you, to, ravel, all, this, matter, out, ,, that, i, essentially, am, not, in, madness, ,, but, mad, in, craft, .]"
11887,11887,"But there's a saying very old and true, 'If that you will France win, Then with Scotland first begin:' For once the eagle England being in prey, To her unguarded nest the weasel Scot Comes sneaking and so sucks her princely eggs, Playing the mouse in absence of the cat, To tear and havoc more than she can eat.","[but, there, 's, a, saying, very, old, and, true, ,, 'if, that, you, will, france, win, ,, then, with, scotland, first, begin, :, ', for, once, the, eagle, england, being, in, prey, ,, to, her, unguarded, nest, the, weasel, scot, comes, sneaking, and, so, sucks, her, princely, eggs, ,, playing, the, mouse, in, absence, of, the, cat, ,, to, tear, and, havoc, more, than, she, can, eat, .]"
16380,16380,"Look, look, a mouse! Peace, peace, this piece of toasted cheese will do 't.","[look, ,, look, ,, a, mouse, !, peace, ,, peace, ,, this, piece, of, toasted, cheese, will, do, 't, .]"
17389,17389,"What's your dark meaning, mouse, of this light word? A light condition in a beauty dark.","[what, 's, your, dark, meaning, ,, mouse, ,, of, this, light, word, ?, a, light, condition, in, a, beauty, dark, .]"
22318,22318,"Enter Lion and Moonshine You, ladies, you, whose gentle hearts do fear The smallest monstrous mouse that creeps on floor, May now perchance both quake and tremble here, When lion rough in wildest rage doth roar.","[enter, lion, and, moonshine, you, ,, ladies, ,, you, ,, whose, gentle, hearts, do, fear, the, smallest, monstrous, mouse, that, creeps, on, floor, ,, may, now, perchance, both, quake, and, tremble, here, ,, when, lion, rough, in, wildest, rage, doth, roar, .]"
22342,22342,"The Lion shakes Thisbe's mantle, and exit Well moused, Lion.","[the, lion, shakes, thisbe, 's, mantle, ,, and, exit, well, moused, ,, lion, .]"
22378,22378,"Now it is the time of night That the graves all gaping wide, Every one lets forth his sprite, In the church-way paths to glide: And we fairies, that do run By the triple Hecate's team, From the presence of the sun, Following darkness like a dream, Now are frolic: not a mouse Shall disturb this hallow'd house: I am sent with broom before, To sweep the dust behind the door.","[now, it, is, the, time, of, night, that, the, graves, all, gaping, wide, ,, every, one, lets, forth, his, sprite, ,, in, the, church-way, paths, to, glide, :, and, we, fairies, ,, that, do, run, by, the, triple, hecate, 's, team, ,, from, the, presence, of, the, sun, ,, following, darkness, like, a, dream, ,, now, are, frolic, :, not, a, mouse, shall, disturb, this, hallow, 'd, house, :, i, am, sent, with, broom, before, ,, to, sweep, the, dust, behind, the, door, .]"
25000,25000,"The cat, with eyne of burning coal, Now crouches fore the mouse's hole, And crickets sing at the oven's mouth, E'er the blither for their drouth.","[the, cat, ,, with, eyne, of, burning, coal, ,, now, crouches, fore, the, mouse, 's, hole, ,, and, crickets, sing, at, the, oven, 's, mouth, ,, e'er, the, blither, for, their, drouth, .]"


Earlier work below this line

In [15]:
shake_corpus_df['line_clean'] = shake_corpus_df['line'].apply(lambda x: gensim.utils.simple_preprocess(x))
shake_corpus_df.head()

Unnamed: 0,line,line_clean
0,ACT I SCENE I,"[act, scene]"
1,London,[london]
2,The palace,"[the, palace]"
3,"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others So shaken as we are, so wan with care, Find we a time for frighted peace to pant, And breathe short-winded accents of new broils To be commenced in strands afar remote","[enter, king, henry, lord, john, of, lancaster, the, earl, of, westmoreland, sir, walter, blunt, and, others, so, shaken, as, we, are, so, wan, with, care, find, we, time, for, frighted, peace, to, pant, and, breathe, short, winded, accents, of, new, broils, to, be, commenced, in, strands, afar, remote]"
4,"No more the thirsty entrance of this soil Shall daub her lips with her own children's blood, Nor more shall trenching war channel her fields, Nor bruise her flowerets with the armed hoofs Of hostile paces: those opposed eyes, Which, like the meteors of a troubled heaven, All of one nature, of one substance bred, Did lately meet in the intestine shock And furious close of civil butchery Shall now, in mutual well-beseeming ranks, March all one way and be no more opposed Against acquaintance, kindred and allies: The edge of war, like an ill-sheathed knife, No more shall cut his master","[no, more, the, thirsty, entrance, of, this, soil, shall, daub, her, lips, with, her, own, children, blood, nor, more, shall, trenching, war, channel, her, fields, nor, bruise, her, flowerets, with, the, armed, hoofs, of, hostile, paces, those, opposed, eyes, which, like, the, meteors, of, troubled, heaven, all, of, one, nature, of, one, substance, bred, did, lately, meet, in, the, intestine, shock, and, furious, close, of, civil, butchery, shall, now, in, mutual, well, beseeming, ranks, march, all, one, way, and, be, no, more, opposed, against, acquaintance, kindred, and, allies, the, edge, of, war, like, an, ill, sheathed, knife, no, more, shall, ...]"


In [29]:
w2v_model = gensim.models.Word2Vec(shake_corpus_df['line_clean'], size=100, window=5, min_count=2)

Find the most similar words to the most frequently used words in our set of actual titles

In [26]:
w2v_model.wv.most_similar('heaven')

[('merchandise', 0.8550989031791687),
 ('light', 0.8457979559898376),
 ('death', 0.845095157623291),
 ('fortune', 0.8397153615951538),
 ('fall', 0.8270392417907715),
 ('work', 0.819554328918457),
 ('myself', 0.8163576722145081),
 ('consent', 0.8109981417655945),
 ('revenge', 0.8102153539657593),
 ('part', 0.8084492087364197)]

In [27]:
w2v_model.wv.most_similar('blood')

[('age', 0.916276216506958),
 ('beauty', 0.9095497727394104),
 ('virtue', 0.9070041179656982),
 ('flesh', 0.8985511064529419),
 ('breath', 0.893226683139801),
 ('birth', 0.8926401734352112),
 ('eye', 0.8921049237251282),
 ('grief', 0.8842465877532959),
 ('loss', 0.8809669017791748),
 ('sorrow', 0.8796870708465576)]

In [28]:
w2v_model.wv.most_similar('world')

[('worst', 0.8616856336593628),
 ('end', 0.804405927658081),
 ('truth', 0.8005660772323608),
 ('time', 0.7947704792022705),
 ('thing', 0.7887304425239563),
 ('deed', 0.7839971780776978),
 ('law', 0.77757728099823),
 ('earth', 0.7765325903892517),
 ('nature', 0.7726906538009644),
 ('depth', 0.7712389826774597)]

In [24]:
w2v_model.wv.most_similar('night')

[('day', 0.9143925905227661),
 ('time', 0.8499373197555542),
 ('morning', 0.8498775959014893),
 ('court', 0.837300717830658),
 ('business', 0.8262588977813721),
 ('watch', 0.8227887153625488),
 ('morrow', 0.8107404112815857),
 ('rest', 0.8095710277557373),
 ('last', 0.8034456968307495),
 ('place', 0.7928426861763)]

Refer to https://radimrehurek.com/gensim/models/word2vec.html

In [30]:
shake_word_vectors = w2v_model.wv

In [31]:
shake_word_vectors.save('shake_w2v.wordsvectors')

I am not sure that I can use the vector dictionary generated from the shakespeare corpus. My concern about using a pre-trained model was that the language would be too modern to be useful. Word Embeddings for Historical Text (https://github.com/williamleif/histwords) offers pre-trained historical embeddings, and I am going to try the English fiction (1800s-1990s) w2v embedding.

In [8]:
import gensim.downloader as api

wiki_embd = api.load('glove-wiki-gigaword-100')

In [15]:
wiki_embd.most_similar('night')

[('evening', 0.8429070711135864),
 ('morning', 0.8382436037063599),
 ('day', 0.8261710405349731),
 ('sunday', 0.8161605596542358),
 ('saturday', 0.8128535747528076),
 ('afternoon', 0.8067009449005127),
 ('weekend', 0.8065499067306519),
 ('days', 0.8000710606575012),
 ('hours', 0.7879374027252197),
 ('week', 0.7607038021087646)]

In [16]:
wiki_embd.most_similar('heaven')

[('hell', 0.7834520936012268),
 ('god', 0.7678099870681763),
 ('heavens', 0.6618111729621887),
 ('eternity', 0.6466289758682251),
 ('christ', 0.6430902481079102),
 ('heavenly', 0.6338659524917603),
 ('gods', 0.6257569193840027),
 ('love', 0.6231491565704346),
 ('divine', 0.6199791431427002),
 ('devil', 0.6167252063751221)]

In [46]:
title_bigram = pd.read_csv('../data/good_titles_bigrams.csv')

In [47]:
words = set(wiki_embd.index2word)
len(words)

400000

In [50]:
title_bigram.drop(columns=['Play', 'Occurences', 'title_tokenized', 'title_pos', 'actual_title_len', 'tokenized_len'], inplace=True)
title_bigram.head()

Unnamed: 0,uniq_id,Title,pos_tag
0,14,mingled yarn,"['VBN', 'NN']"
1,17,stiff news,"['JJ', 'NN']"
2,18,salad days,"['JJ', 'NNS']"
3,23,gaudy night,"['NN', 'NN']"
4,25,immortal longings,"['JJ', 'NNS']"


In [51]:
title_bigram['line_clean'] = title_bigram['Title'].apply(lambda x: gensim.utils.simple_preprocess(x))
title_bigram.head()

Unnamed: 0,uniq_id,Title,pos_tag,line_clean
0,14,mingled yarn,"['VBN', 'NN']","[mingled, yarn]"
1,17,stiff news,"['JJ', 'NN']","[stiff, news]"
2,18,salad days,"['JJ', 'NNS']","[salad, days]"
3,23,gaudy night,"['NN', 'NN']","[gaudy, night]"
4,25,immortal longings,"['JJ', 'NNS']","[immortal, longings]"


In [53]:
title_bigram_vect = [np.array([wiki_embd[i] for i in ls if i in words])
                                for ls in title_bigram['line_clean']]

In [61]:
for i,v in enumerate(title_bigram_vect):
    if len(v) != 2:
        print(title_bigram.iloc[i])
        print('Vector len:', len(v))

uniq_id                          645
Title            unbreathed memories
pos_tag                ['JJ', 'NNS']
line_clean    [unbreathed, memories]
Name: 60, dtype: object
Vector len: 1
uniq_id                848
Title              o romeo
pos_tag       ['NN', 'NN']
line_clean         [romeo]
Name: 74, dtype: object
Vector len: 1
uniq_id                 870
Title            jocund day
pos_tag        ['NN', 'NN']
line_clean    [jocund, day]
Name: 76, dtype: object
Vector len: 1
uniq_id                       157
Title            exeunt murderers
pos_tag             ['NN', 'NNS']
line_clean    [exeunt, murderers]
Name: 117, dtype: object
Vector len: 1
