In [1]:
import gensim
import numpy as np
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
import os

# Show full cell contents in DataFrame output instead of truncating long text columns.
pd.options.display.max_colwidth = None

In [2]:
# Load the corpus CSV (path is relative to the notebook's working directory)
# and display five randomly chosen rows as a quick sanity check.
shake_df = pd.read_csv(filepath_or_buffer='../data/shake_corpus.csv')
shake_df.sample(n=5)

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern
19807,19808,antony and cleopatra,"We'll speak with thee at sea: at land, thou know'st",2.6.28,MARK ANTONY,5.0,tragedy,tragedy
49819,49820,king lear,But other of your insolent retinue,1.4.193,GONERIL,75.0,tragedy,tragedy
111930,111931,sonnet 39,What can mine own praise to mine own self bring?,0.0.2,Poet,2.0,sonnet,romance
43425,43426,henry viii,Holy and heavenly thoughts still counsel her:,5.5.34,CRANMER,8.0,history,tragedy
104950,104951,twelfth night,"become the function well, nor lean enough to be",4.2.7,Clown,2.0,comedy,comedy


In [6]:
# Tokenize each row's 'Line' text with gensim's simple_preprocess and store the
# resulting token list in a new 'line_clean' column, then preview five random rows.
shake_df['line_clean'] = shake_df['Line'].apply(gensim.utils.simple_preprocess)
shake_df.sample(n=5)

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern,line_clean
12850,12851,alls well that ends well,Exit,1.1.183,Page,48.0,comedy,comedy,[exit]
61570,61571,merchant of venice,"They would be better, if well followed.",1.1.200,NERISSA,36.0,comedy,comedy,"[they, would, be, better, if, well, followed]"
68929,68930,midsummer nights dream,Exit PHILOSTRATE,5.1.90,THESEUS,16.0,comedy,comedy,"[exit, philostrate]"
50902,50903,king lear,My wits begin to turn.,3.2.70,KING LEAR,11.0,tragedy,tragedy,"[my, wits, begin, to, turn]"
108266,108267,winters tale,"I play'd the fool, it was my negligence,",1.2.305,CAMILLO,68.0,comedy,romance,"[play, the, fool, it, was, my, negligence]"


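The processing above treats each row of the CSV (a single line of verse, which is often only a fragment of a sentence) as its own document. Each document should instead be a complete sentence. Below I join all the lines into one string and then split that string into sentences on '.' (note that this simple split does not break on '?' or '!').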

In [8]:
# Extract the raw 'Line' column as a plain Python list and show how many entries it has.
shake_lines = shake_df['Line'].tolist()
len(shake_lines)

113553

In [10]:
# Concatenate every line into one long string, separated by single spaces,
# and peek at the first 100 characters.
shake_all = str.join(' ', shake_lines)
shake_all[0:100]

'ACT I SCENE I. London. The palace. Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAN'

In [11]:
# Split the joined text into sentence-like fragments on '.'.
# A bare split leaves the space that followed each period at the start of the
# next fragment, and yields empty strings after the final period or between
# consecutive periods; trim every fragment and drop the ones that end up empty
# so that no blank "documents" reach the tokenizer / Word2Vec.
shake_corpus = [
    fragment
    for fragment in map(str.strip, shake_all.split('.'))
    if fragment
]
shake_corpus[:5]

['ACT I SCENE I',
 ' London',
 ' The palace',
 ' Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others So shaken as we are, so wan with care, Find we a time for frighted peace to pant, And breathe short-winded accents of new broils To be commenced in strands afar remote',
 " No more the thirsty entrance of this soil Shall daub her lips with her own children's blood, Nor more shall trenching war channel her fields, Nor bruise her flowerets with the armed hoofs Of hostile paces: those opposed eyes, Which, like the meteors of a troubled heaven, All of one nature, of one substance bred, Did lately meet in the intestine shock And furious close of civil butchery Shall now, in mutual well-beseeming ranks, March all one way and be no more opposed Against acquaintance, kindred and allies: The edge of war, like an ill-sheathed knife, No more shall cut his master"]

In [14]:
# Wrap the list of sentence fragments in a one-column DataFrame named 'line'.
shake_corpus_df = pd.DataFrame(data=shake_corpus, columns=['line'])

In [15]:
# Tokenize each sentence fragment with gensim's simple_preprocess; the token
# lists in 'line_clean' are what the Word2Vec model is trained on.
shake_corpus_df['line_clean'] = shake_corpus_df['line'].apply(gensim.utils.simple_preprocess)
shake_corpus_df.head(5)

Unnamed: 0,line,line_clean
0,ACT I SCENE I,"[act, scene]"
1,London,[london]
2,The palace,"[the, palace]"
3,"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others So shaken as we are, so wan with care, Find we a time for frighted peace to pant, And breathe short-winded accents of new broils To be commenced in strands afar remote","[enter, king, henry, lord, john, of, lancaster, the, earl, of, westmoreland, sir, walter, blunt, and, others, so, shaken, as, we, are, so, wan, with, care, find, we, time, for, frighted, peace, to, pant, and, breathe, short, winded, accents, of, new, broils, to, be, commenced, in, strands, afar, remote]"
4,"No more the thirsty entrance of this soil Shall daub her lips with her own children's blood, Nor more shall trenching war channel her fields, Nor bruise her flowerets with the armed hoofs Of hostile paces: those opposed eyes, Which, like the meteors of a troubled heaven, All of one nature, of one substance bred, Did lately meet in the intestine shock And furious close of civil butchery Shall now, in mutual well-beseeming ranks, March all one way and be no more opposed Against acquaintance, kindred and allies: The edge of war, like an ill-sheathed knife, No more shall cut his master","[no, more, the, thirsty, entrance, of, this, soil, shall, daub, her, lips, with, her, own, children, blood, nor, more, shall, trenching, war, channel, her, fields, nor, bruise, her, flowerets, with, the, armed, hoofs, of, hostile, paces, those, opposed, eyes, which, like, the, meteors, of, troubled, heaven, all, of, one, nature, of, one, substance, bred, did, lately, meet, in, the, intestine, shock, and, furious, close, of, civil, butchery, shall, now, in, mutual, well, beseeming, ranks, march, all, one, way, and, be, no, more, opposed, against, acquaintance, kindred, and, allies, the, edge, of, war, like, an, ill, sheathed, knife, no, more, shall, ...]"


In [29]:
# Train a Word2Vec model on the tokenized sentence fragments.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` -- confirm against the installed gensim version.
# NOTE(review): training presumably uses gensim's default multi-worker setup,
# so the similarity scores shown below may differ slightly between runs -- confirm.
w2v_model = gensim.models.Word2Vec(
    sentences=shake_corpus_df['line_clean'],
    size=100,
    window=5,
    min_count=2,
)

Find the words most similar to some of the most frequently used words in the Shakespeare corpus.

In [26]:
# Ten words whose vectors are closest to 'heaven', with their similarity scores.
w2v_model.wv.most_similar(positive='heaven', topn=10)

[('merchandise', 0.8550989031791687),
 ('light', 0.8457979559898376),
 ('death', 0.845095157623291),
 ('fortune', 0.8397153615951538),
 ('fall', 0.8270392417907715),
 ('work', 0.819554328918457),
 ('myself', 0.8163576722145081),
 ('consent', 0.8109981417655945),
 ('revenge', 0.8102153539657593),
 ('part', 0.8084492087364197)]

In [27]:
# Ten words whose vectors are closest to 'blood', with their similarity scores.
w2v_model.wv.most_similar(positive='blood', topn=10)

[('age', 0.916276216506958),
 ('beauty', 0.9095497727394104),
 ('virtue', 0.9070041179656982),
 ('flesh', 0.8985511064529419),
 ('breath', 0.893226683139801),
 ('birth', 0.8926401734352112),
 ('eye', 0.8921049237251282),
 ('grief', 0.8842465877532959),
 ('loss', 0.8809669017791748),
 ('sorrow', 0.8796870708465576)]

In [28]:
# Ten words whose vectors are closest to 'world', with their similarity scores.
w2v_model.wv.most_similar(positive='world', topn=10)

[('worst', 0.8616856336593628),
 ('end', 0.804405927658081),
 ('truth', 0.8005660772323608),
 ('time', 0.7947704792022705),
 ('thing', 0.7887304425239563),
 ('deed', 0.7839971780776978),
 ('law', 0.77757728099823),
 ('earth', 0.7765325903892517),
 ('nature', 0.7726906538009644),
 ('depth', 0.7712389826774597)]

In [24]:
# Ten words whose vectors are closest to 'night', with their similarity scores.
w2v_model.wv.most_similar(positive='night', topn=10)

[('day', 0.9143925905227661),
 ('time', 0.8499373197555542),
 ('morning', 0.8498775959014893),
 ('court', 0.837300717830658),
 ('business', 0.8262588977813721),
 ('watch', 0.8227887153625488),
 ('morrow', 0.8107404112815857),
 ('rest', 0.8095710277557373),
 ('last', 0.8034456968307495),
 ('place', 0.7928426861763)]

Refer to the gensim Word2Vec documentation (https://radimrehurek.com/gensim/models/word2vec.html) for how to store and reload the trained word vectors.

In [30]:
# Keep a handle on just the trained word vectors (the model's `wv` attribute),
# which is the object the similarity queries above were run against.
shake_word_vectors = w2v_model.wv

In [31]:
# Persist the word vectors to disk. The path is relative, so the file is
# written to the notebook's current working directory.
shake_word_vectors.save('shake_w2v.wordsvectors')