In [1]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_colwidth', None)

In [2]:
shake_df = pd.read_csv('../data/shake_corpus.csv')

shake_df.sample(5)

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern
35423,35424,hamlet,"He is dead and gone,",4.5.33,OPHELIA,11.0,tragedy,tragedy
66654,66655,merry wives of windsor,To give our hearts united ceremony.,4.6.50,FENTON,6.0,comedy,comedy
62566,62567,merchant of venice,And of opposed natures.,2.9.61,PORTIA,8.0,comedy,comedy
56324,56325,macbeth,"But screw your courage to the sticking-place,",1.7.67,LADY MACBETH,10.0,tragedy,tragedy
27005,27006,coriolanus,"Six of his labours you'ld have done, and saved",4.1.20,CORIOLANUS,5.0,tragedy,tragedy


Subset shake_df in order to investigate tokenizing and n-gram generation with punctuation.

In [3]:
macbeth = shake_df[shake_df['Play'] == 'macbeth']
macbeth.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern
55779,55780,macbeth,ACT I,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy
55780,55781,macbeth,SCENE I. A desert place.,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy
55781,55782,macbeth,Thunder and lightning. Enter three Witches,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy
55782,55783,macbeth,When shall we three meet again,1.1.1,First Witch,1.0,tragedy,tragedy
55783,55784,macbeth,"In thunder, lightning, or in rain?",1.1.2,First Witch,1.0,tragedy,tragedy


TreebankWordTokenizer performs the following steps: 
- split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
- treat most punctuation characters as separate tokens
- split off commas and single quotes, when followed by whitespace
- separate periods that appear at the end of line

In [4]:
twd = TreebankWordTokenizer()

macbeth['line_tokenized'] = macbeth['Line'].apply(lambda x: twd.tokenize(x.lower()))
macbeth.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  macbeth['line_tokenized'] = macbeth['Line'].apply(lambda x: twd.tokenize(x.lower()))


Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern,line_tokenized
55779,55780,macbeth,ACT I,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy,"[act, i]"
55780,55781,macbeth,SCENE I. A desert place.,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy,"[scene, i., a, desert, place, .]"
55781,55782,macbeth,Thunder and lightning. Enter three Witches,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy,"[thunder, and, lightning., enter, three, witches]"


The TreebankWordTokenizer looks good. Lets try the full process using CountVectorizer. CountVectorizer will convert to lowercase by default

In [5]:
ngram_cv = CountVectorizer(ngram_range=(3,3), tokenizer=TreebankWordTokenizer().tokenize, analyzer='word')
macbeth_matrix = ngram_cv.fit_transform(macbeth['Line'])

macbeth_trigrams = ngram_cv.get_feature_names()

print('Number of trigrams from Macbeth:', len(macbeth_trigrams))

Number of trigrams from Macbeth: 15016


In [6]:
mac_tri_df = pd.DataFrame(macbeth_trigrams, columns=['title'])
mac_tri_df.head()

Unnamed: 0,title
0,! ' and
1,! ' exeunt
2,! ' the
3,! ' this
4,! ' to


POS Tagging using nltk

In [7]:
mac_tri_df['title_tokenized'] = mac_tri_df['title'].apply(lambda x: twd.tokenize(x))
mac_tri_df['title_pos'] = mac_tri_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
mac_tri_df['pos_tag'] = mac_tri_df['title_pos'].apply(lambda x: [i[1] for i in x])

mac_tri_df.sample(10)

Unnamed: 0,title,title_tokenized,title_pos,pos_tag
13967,what you were,"[what, you, were]","[(what, WP), (you, PRP), (were, VBD)]","[WP, PRP, VBD]"
101,' pardon and,"[', pardon, and]","[(', POS), (pardon, NN), (and, CC)]","[POS, NN, CC]"
5125,"foreign levy ,","[foreign, levy, ,]","[(foreign, JJ), (levy, NN), (,, ,)]","[JJ, NN, ,]"
3710,can not conceive,"[can, not, conceive]","[(can, MD), (not, RB), (conceive, VB)]","[MD, RB, VB]"
2237,and an old,"[and, an, old]","[(and, CC), (an, DT), (old, JJ)]","[CC, DT, JJ]"
13163,to this terrible,"[to, this, terrible]","[(to, TO), (this, DT), (terrible, JJ)]","[TO, DT, JJ]"
13536,"venom breed ,","[venom, breed, ,]","[(venom, NNS), (breed, NN), (,, ,)]","[NNS, NN, ,]"
6548,i do think,"[i, do, think]","[(i, NNS), (do, VBP), (think, VB)]","[NNS, VBP, VB]"
11969,the selfsame day,"[the, selfsame, day]","[(the, DT), (selfsame, NN), (day, NN)]","[DT, NN, NN]"
2636,and what i,"[and, what, i]","[(and, CC), (what, WP), (i, NN)]","[CC, WP, NN]"


Let's try on the full Shakespearean dataset

In [8]:
n3gram_cv = CountVectorizer(ngram_range=(3,3), tokenizer=TreebankWordTokenizer().tokenize, analyzer='word')
shake_matrix = n3gram_cv.fit_transform(shake_df['Line'])

shake_trigrams = n3gram_cv.get_feature_names()

print('Number of trigrams from the Shakespeare plays and sonnets DataFrame:', len(shake_trigrams))

Number of trigrams from the Shakespeare plays and sonnets DataFrame: 556864


In [54]:
shake_tri_df = pd.DataFrame(shake_trigrams, columns=['title'])
shake_tri_df.head()

Unnamed: 0,title
0,! ' 'brutus
1,! ' 'citizens
2,! ' 'coriolanus
3,! ' 'patricians
4,! ' 'stay


I am curious about the punctuation used in the real titles and the punctuation captured in the entire shakespeare ngram set.

In [10]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
punct = list(string.punctuation)
flat_list = ' '.join(shake_trigrams)
present_punct_shake = [i for i in punct if i in flat_list]
present_punct_shake

['!', '$', "'", '(', ')', ',', '-', '.', ':', ';', '?', '[', ']', '`']

In [12]:
titles_df = pd.read_csv('../data/clean_titles.csv')

In [13]:
flat_title = ' '.join(titles_df.Title.tolist())
title_present_punct = [i for i in punct if i in flat_title]
title_present_punct

['!', "'", ',', '-', '.', '?']

In order to shrink the dataset, I'd like to explore removing the ngrams that contain punctuation not found in the real titles.

In [32]:
punct_2delete = [i for i in present_punct_shake if i not in title_present_punct]
punct_2delete

['$', '(', ')', ':', ';', '[', ']', '`']

In [55]:
print('ngrams before punctuation removal:', shake_tri_df.shape[0])

# for index, row in shake_tri_df.iterrows():
#     if '$' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif '(' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif ')' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif ':' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif ';' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif '[' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif ']' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif '`' in row['title']:
#         shake_tri_df.drop(index, inplace=True)

# print('ngrams after punctuation removal:', shake_tri_df.shape)       

ngrams before punctuation removal: 556864


The method above was WAY to slow to be useful.

In [56]:
dollar = shake_tri_df.title.str.contains('$', regex=False)
shake_tri_df = shake_tri_df[~dollar]

shake_tri_df.shape[0]

556861

In [41]:
shake_tri_df.tail()

Unnamed: 0,title
556859,zo long as
556860,zodiac in his
556861,zodiacs have gone
556862,zounds ! i
556863,zwaggered out of


In [57]:
lparen = shake_tri_df.title.str.contains('(', regex=False)
shake_tri_df = shake_tri_df[~lparen]
shake_tri_df.shape[0]

556767

In [58]:
rparen = shake_tri_df.title.str.contains(')', regex=False)
shake_tri_df = shake_tri_df[~rparen]
shake_tri_df.shape[0]

556697

In [59]:
colon = shake_tri_df.title.str.contains(':', regex=False)
shake_tri_df = shake_tri_df[~colon]
shake_tri_df.shape[0]

536384

In [60]:
semi = shake_tri_df.title.str.contains(';', regex=False)
shake_tri_df = shake_tri_df[~semi]
shake_tri_df.shape[0]

536199

In [61]:
lbrack = shake_tri_df.title.str.contains('[', regex=False)
shake_tri_df = shake_tri_df[~lbrack]
shake_tri_df.shape[0]

536090

In [62]:
rbrack = shake_tri_df.title.str.contains(']', regex=False)
shake_tri_df = shake_tri_df[~rbrack]
shake_tri_df.shape[0]

535194

In [64]:
whatisthisthing = shake_tri_df.title.str.contains('`', regex=False)
shake_tri_df = shake_tri_df[~whatisthisthing]
shake_tri_df.shape[0]

535152

In [66]:
shake_tri_df['title_tokenized'] = shake_tri_df['title'].apply(lambda x: twd.tokenize(x))
shake_tri_df['title_pos'] = shake_tri_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
shake_tri_df['pos_tag'] = shake_tri_df['title_pos'].apply(lambda x: [i[1] for i in x])

shake_tri_df.sample(5)

Unnamed: 0,title,title_tokenized,title_pos,pos_tag
128184,captain-general of the,"[captain-general, of, the]","[(captain-general, JJ), (of, IN), (the, DT)]","[JJ, IN, DT]"
478714,till destruction sicken,"[till, destruction, sicken]","[(till, NN), (destruction, NN), (sicken, NN)]","[NN, NN, NN]"
182668,for there young,"[for, there, young]","[(for, IN), (there, EX), (young, JJ)]","[IN, EX, JJ]"
55617,a rude tomb,"[a, rude, tomb]","[(a, DT), (rude, NN), (tomb, NN)]","[DT, NN, NN]"
381103,rest shall bear,"[rest, shall, bear]","[(rest, VB), (shall, MD), (bear, VB)]","[VB, MD, VB]"


Repeat the process for fourgrams and bigrams.

In [79]:
n2gram_cv = CountVectorizer(ngram_range=(2,2), tokenizer=TreebankWordTokenizer().tokenize, analyzer='word')
shake_2matrix = n2gram_cv.fit_transform(shake_df['Line'])

shake_bigrams = n2gram_cv.get_feature_names()

print('Number of bigrams from the Shakespeare plays and sonnets DataFrame:', len(shake_bigrams))

Number of bigrams from the Shakespeare plays and sonnets DataFrame: 265248


In [80]:
shake_bi_df = pd.DataFrame(shake_bigrams, columns=['title'])
shake_bi_df.sample(5)

Unnamed: 0,title
137102,marseilles. a
165237,our hot
43205,bloody parliament
125155,kings in
63383,dear madam.


In [81]:
dollar = shake_bi_df.title.str.contains('$', regex=False)
shake_bi_df = shake_bi_df[~dollar]

lparen = shake_bi_df.title.str.contains('(', regex=False)
shake_bi_df = shake_bi_df[~lparen]

rparen = shake_bi_df.title.str.contains(')', regex=False)
shake_bi_df = shake_bi_df[~rparen]

colon = shake_bi_df.title.str.contains(':', regex=False)
shake_bi_df = shake_bi_df[~colon]

semi = shake_bi_df.title.str.contains(';', regex=False)
shake_bi_df = shake_bi_df[~semi]

lbrack = shake_bi_df.title.str.contains('[', regex=False)
shake_bi_df = shake_bi_df[~lbrack]

rbrack = shake_bi_df.title.str.contains(']', regex=False)
shake_bi_df = shake_bi_df[~rbrack]

whatisthisthing = shake_bi_df.title.str.contains('`', regex=False)
shake_bi_df = shake_bi_df[~whatisthisthing]

shake_bi_df.shape[0]

259594

In [83]:
shake_bi_df['title_tokenized'] = shake_bi_df['title'].apply(lambda x: twd.tokenize(x))
shake_bi_df['title_pos'] = shake_bi_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
shake_bi_df['pos_tag'] = shake_bi_df['title_pos'].apply(lambda x: [i[1] for i in x])

shake_bi_df.sample(5)

Unnamed: 0,title,title_tokenized,title_pos,pos_tag
158400,of date-broke,"[of, date-broke]","[(of, IN), (date-broke, NN)]","[IN, NN]"
244380,walls they,"[walls, they]","[(walls, NNS), (they, PRP)]","[NNS, PRP]"
202534,still 'they,"[still, 'they]","[(still, RB), ('they, VB)]","[RB, VB]"
190487,shall lay,"[shall, lay]","[(shall, MD), (lay, VB)]","[MD, VB]"
4083,'s castle,"['s, castle]","[('s, POS), (castle, NN)]","[POS, NN]"


In [84]:
n4gram_cv = CountVectorizer(ngram_range=(4,4), tokenizer=TreebankWordTokenizer().tokenize, analyzer='word')
shake_4matrix = n4gram_cv.fit_transform(shake_df['Line'])

shake_fourgrams = n4gram_cv.get_feature_names()

print('Number of fourgrams from the Shakespeare plays and sonnets DataFrame:', len(shake_fourgrams))

Number of fourgrams from the Shakespeare plays and sonnets DataFrame: 625445


In [85]:
shake_tetra_df = pd.DataFrame(shake_fourgrams, columns=['title'])
shake_tetra_df.sample(5)

Unnamed: 0,title
133278,"be ransomed , and"
13278,"'s master , how"
545401,to last resolve you
391722,"of breath , prompt"
513652,them this discomfort ?


In [86]:
dollar = shake_tetra_df.title.str.contains('$', regex=False)
shake_tetra_df = shake_tetra_df[~dollar]

lparen = shake_tetra_df.title.str.contains('(', regex=False)
shake_tetra_df = shake_tetra_df[~lparen]

rparen = shake_tetra_df.title.str.contains(')', regex=False)
shake_tetra_df = shake_tetra_df[~rparen]

colon = shake_tetra_df.title.str.contains(':', regex=False)
shake_tetra_df = shake_tetra_df[~colon]

semi = shake_tetra_df.title.str.contains(';', regex=False)
shake_tetra_df = shake_tetra_df[~semi]

lbrack = shake_tetra_df.title.str.contains('[', regex=False)
shake_tetra_df = shake_tetra_df[~lbrack]

rbrack = shake_tetra_df.title.str.contains(']', regex=False)
shake_tetra_df = shake_tetra_df[~rbrack]

whatisthisthing = shake_tetra_df.title.str.contains('`', regex=False)
shake_tetra_df = shake_tetra_df[~whatisthisthing]

shake_tetra_df.shape[0]

591514

In [87]:
shake_tetra_df['title_tokenized'] = shake_tetra_df['title'].apply(lambda x: twd.tokenize(x))
shake_tetra_df['title_pos'] = shake_tetra_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
shake_tetra_df['pos_tag'] = shake_tetra_df['title_pos'].apply(lambda x: [i[1] for i in x])

shake_tetra_df.sample(5)

Unnamed: 0,title,title_tokenized,title_pos,pos_tag
465152,"sounding and discovery ,","[sounding, and, discovery, ,]","[(sounding, NN), (and, CC), (discovery, NN), (,, ,)]","[NN, CC, NN, ,]"
19112,", all sorts of","[,, all, sorts, of]","[(,, ,), (all, DT), (sorts, NNS), (of, IN)]","[,, DT, NNS, IN]"
505592,the sleepers. let us,"[the, sleepers., let, us]","[(the, DT), (sleepers., NN), (let, VBD), (us, PRP)]","[DT, NN, VBD, PRP]"
498075,the heart to do,"[the, heart, to, do]","[(the, DT), (heart, NN), (to, TO), (do, VB)]","[DT, NN, TO, VB]"
276329,i counterfeit him .,"[i, counterfeit, him, .]","[(i, JJ), (counterfeit, VBD), (him, PRP), (., .)]","[JJ, VBD, PRP, .]"


In [88]:
datapath = '../data'

In [89]:
datapath_shake_tri_df = os.path.join(datapath, 'shakespearean_trigrams.csv')
shake_tri_df.to_csv(datapath_shake_tri_df, index=False)

In [90]:
datapath_shake_tetra_df = os.path.join(datapath, 'shakespearean_fourgrams.csv')
shake_tetra_df.to_csv(datapath_shake_tetra_df, index=False)

In [91]:
datapath_shake_bi_df = os.path.join(datapath, 'shakespearean_bigrams.csv')
shake_bi_df.to_csv(datapath_shake_bi_df, index=False)