In [1]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_colwidth', None)

In [3]:
shake_df = pd.read_csv('../data/shake_corpus.csv')

shake_df.sample(5)

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern
48346,48347,julius caesar,"Dearer than Plutus' mine, richer than gold:",4.3.111,CASSIUS,39.0,tragedy,tragedy
49875,49876,king lear,"You strike my people, and your disorder'd rabble",1.4.249,GONERIL,87.0,tragedy,tragedy
10913,10914,henry vi part 3,"Ay, here's a deer whose skin's a keeper's fee:",3.1.22,First Keeper,6.0,history,tragedy
58581,58582,measure for measure,"From too much liberty, my Lucio, liberty:",1.2.113,CLAUDIO,63.0,comedy,comedy
96362,96363,timon of athens,A parley sounded,5.4.3,ALCIBIADES,1.0,tragedy,tragedy


Subset shake_df in order to investigate tokenizing and n-gram generation with punctuation.

In [3]:
macbeth = shake_df[shake_df['Play'] == 'macbeth']
macbeth.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern
55779,55780,macbeth,ACT I,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy
55780,55781,macbeth,SCENE I. A desert place.,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy
55781,55782,macbeth,Thunder and lightning. Enter three Witches,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy
55782,55783,macbeth,When shall we three meet again,1.1.1,First Witch,1.0,tragedy,tragedy
55783,55784,macbeth,"In thunder, lightning, or in rain?",1.1.2,First Witch,1.0,tragedy,tragedy


TreebankWordTokenizer performs the following steps: 
- split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
- treat most punctuation characters as separate tokens
- split off commas and single quotes, when followed by whitespace
- separate periods that appear at the end of line

In [4]:
twd = TreebankWordTokenizer()

macbeth['line_tokenized'] = macbeth['Line'].apply(lambda x: twd.tokenize(x.lower()))
macbeth.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  macbeth['line_tokenized'] = macbeth['Line'].apply(lambda x: twd.tokenize(x.lower()))


Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern,line_tokenized
55779,55780,macbeth,ACT I,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy,"[act, i]"
55780,55781,macbeth,SCENE I. A desert place.,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy,"[scene, i., a, desert, place, .]"
55781,55782,macbeth,Thunder and lightning. Enter three Witches,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy,"[thunder, and, lightning., enter, three, witches]"


The TreebankWordTokenizer looks good. Lets try the full process using CountVectorizer. CountVectorizer will convert to lowercase by default

In [5]:
ngram_cv = CountVectorizer(ngram_range=(3,3), tokenizer=TreebankWordTokenizer().tokenize, analyzer='word')
macbeth_matrix = ngram_cv.fit_transform(macbeth['Line'])

macbeth_trigrams = ngram_cv.get_feature_names()

print('Number of trigrams from Macbeth:', len(macbeth_trigrams))

Number of trigrams from Macbeth: 15016


In [6]:
mac_tri_df = pd.DataFrame(macbeth_trigrams, columns=['title'])
mac_tri_df.head()

Unnamed: 0,title
0,! ' and
1,! ' exeunt
2,! ' the
3,! ' this
4,! ' to


POS Tagging using nltk

In [7]:
mac_tri_df['title_tokenized'] = mac_tri_df['title'].apply(lambda x: twd.tokenize(x))
mac_tri_df['title_pos'] = mac_tri_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
mac_tri_df['pos_tag'] = mac_tri_df['title_pos'].apply(lambda x: [i[1] for i in x])

mac_tri_df.sample(10)

Unnamed: 0,title,title_tokenized,title_pos,pos_tag
13967,what you were,"[what, you, were]","[(what, WP), (you, PRP), (were, VBD)]","[WP, PRP, VBD]"
101,' pardon and,"[', pardon, and]","[(', POS), (pardon, NN), (and, CC)]","[POS, NN, CC]"
5125,"foreign levy ,","[foreign, levy, ,]","[(foreign, JJ), (levy, NN), (,, ,)]","[JJ, NN, ,]"
3710,can not conceive,"[can, not, conceive]","[(can, MD), (not, RB), (conceive, VB)]","[MD, RB, VB]"
2237,and an old,"[and, an, old]","[(and, CC), (an, DT), (old, JJ)]","[CC, DT, JJ]"
13163,to this terrible,"[to, this, terrible]","[(to, TO), (this, DT), (terrible, JJ)]","[TO, DT, JJ]"
13536,"venom breed ,","[venom, breed, ,]","[(venom, NNS), (breed, NN), (,, ,)]","[NNS, NN, ,]"
6548,i do think,"[i, do, think]","[(i, NNS), (do, VBP), (think, VB)]","[NNS, VBP, VB]"
11969,the selfsame day,"[the, selfsame, day]","[(the, DT), (selfsame, NN), (day, NN)]","[DT, NN, NN]"
2636,and what i,"[and, what, i]","[(and, CC), (what, WP), (i, NN)]","[CC, WP, NN]"


Let's try on the full Shakespearean dataset

In [4]:
n3gram_cv = CountVectorizer(ngram_range=(3,3), tokenizer=TreebankWordTokenizer().tokenize, analyzer='word')
shake_matrix = n3gram_cv.fit_transform(shake_df['Line'])

shake_trigrams = n3gram_cv.get_feature_names()

print('Number of trigrams from the Shakespeare plays and sonnets DataFrame:', len(shake_trigrams))

Number of trigrams from the Shakespeare plays and sonnets DataFrame: 556864


In [44]:
shake_tri_df = pd.DataFrame(shake_trigrams, columns=['title'])
shake_tri_df.head()

Unnamed: 0,title
0,! ' 'brutus
1,! ' 'citizens
2,! ' 'coriolanus
3,! ' 'patricians
4,! ' 'stay


I am curious about the punctuation used in the real titles and the punctuation captured in the entire shakespeare ngram set.

In [45]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [46]:
punct = list(string.punctuation)
flat_list = ' '.join(shake_trigrams)
present_punct_shake = [i for i in punct if i in flat_list]
present_punct_shake

['!', '$', "'", '(', ')', ',', '-', '.', ':', ';', '?', '[', ']', '`']

In [47]:
titles_df = pd.read_csv('../data/clean_titles.csv')

In [48]:
flat_title = ' '.join(titles_df.Title.tolist())
title_present_punct = [i for i in punct if i in flat_title]
title_present_punct

['!', "'", ',', '-', '.', '?']

In order to shrink the dataset, I'd like to explore removing the ngrams that contain punctuation not found in the real titles.

In [49]:
punct_2delete = [i for i in present_punct_shake if i not in title_present_punct]
punct_2delete

['$', '(', ')', ':', ';', '[', ']', '`']

In [50]:
print('ngrams before punctuation removal:', shake_tri_df.shape[0])

# for index, row in shake_tri_df.iterrows():
#     if '$' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif '(' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif ')' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif ':' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif ';' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif '[' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif ']' in row['title']:
#         shake_tri_df.drop(index, inplace=True)
#     elif '`' in row['title']:
#         shake_tri_df.drop(index, inplace=True)

# print('ngrams after punctuation removal:', shake_tri_df.shape)       

ngrams before punctuation removal: 556864


The method above was WAY to slow to be useful.

In [51]:
dollar = shake_tri_df.title.str.contains('$', regex=False)
shake_tri_df = shake_tri_df[~dollar]

shake_tri_df.shape[0]

556861

In [52]:
shake_tri_df.tail()

Unnamed: 0,title
556859,zo long as
556860,zodiac in his
556861,zodiacs have gone
556862,zounds ! i
556863,zwaggered out of


In [53]:
lparen = shake_tri_df.title.str.contains('(', regex=False)
shake_tri_df = shake_tri_df[~lparen]
shake_tri_df.shape[0]

556767

In [54]:
rparen = shake_tri_df.title.str.contains(')', regex=False)
shake_tri_df = shake_tri_df[~rparen]
shake_tri_df.shape[0]

556697

In [55]:
colon = shake_tri_df.title.str.contains(':', regex=False)
shake_tri_df = shake_tri_df[~colon]
shake_tri_df.shape[0]

536384

In [56]:
semi = shake_tri_df.title.str.contains(';', regex=False)
shake_tri_df = shake_tri_df[~semi]
shake_tri_df.shape[0]

536199

In [57]:
lbrack = shake_tri_df.title.str.contains('[', regex=False)
shake_tri_df = shake_tri_df[~lbrack]
shake_tri_df.shape[0]

536090

In [58]:
rbrack = shake_tri_df.title.str.contains(']', regex=False)
shake_tri_df = shake_tri_df[~rbrack]
shake_tri_df.shape[0]

535194

In [59]:
whatisthisthing = shake_tri_df.title.str.contains('`', regex=False)
shake_tri_df = shake_tri_df[~whatisthisthing]
shake_tri_df.shape[0]

535152

In [60]:
shake_tri_df.head()

Unnamed: 0,title
0,! ' 'brutus
1,! ' 'citizens
2,! ' 'coriolanus
3,! ' 'patricians
4,! ' 'stay


Metagaming here- Titles containing `'` and `''` are going to play havoc with my downstream analysis. I am going to remove the double apostrophes to see if that helps. The `'` is too important to lose. They also interact strangely with other punctuation, so I am going to try and do clean-up

In [62]:
apost = shake_tri_df.title.str.contains("''", regex=False)
shake_tri_df = shake_tri_df[~apost]
shake_tri_df.shape[0]

535127

In [63]:
shake_tri_df.head()

Unnamed: 0,title
0,! ' 'brutus
1,! ' 'citizens
2,! ' 'coriolanus
3,! ' 'patricians
4,! ' 'stay


In [64]:
exclaim = shake_tri_df.title.str.startswith('!')
shake_tri_df = shake_tri_df[~exclaim]
shake_tri_df.shape[0]

532774

In [65]:
comma = shake_tri_df.title.str.startswith(',')
shake_tri_df = shake_tri_df[~comma]
shake_tri_df.shape[0]

506439

In [66]:
shake_tri_df.head()

Unnamed: 0,title
2361,' ! 't
2362,' ! an
2363,' ! have
2364,' ! here
2365,' ! how


In [67]:
stapost = shake_tri_df.title.str.startswith("'")
shake_tri_df = shake_tri_df[~stapost]
shake_tri_df.shape[0]

493870

In [68]:
shake_tri_df.head()

Unnamed: 0,title
42161,-- 'ay '
42162,-- 'banished '
42163,"-- 'farewell ,"
42164,"-- 'god-a-mercy ,"
42165,"-- 'ladies ,"


In [69]:
dash = shake_tri_df.title.str.startswith('-')
shake_tri_df = shake_tri_df[~dash]
shake_tri_df.shape[0]

493061

In [70]:
shake_tri_df.head()

Unnamed: 0,title
42974,. . .
42975,. . 2s.
42976,. . 4d
42977,. 2s. 2d
42978,. 4d .


In [71]:
period = shake_tri_df.title.str.startswith('.')
shake_tri_df = shake_tri_df[~period]
shake_tri_df.shape[0]

493053

In [73]:
shake_tri_df.head()

Unnamed: 0,title
42982,1. a lively
42983,10. certain ladies
42984,"2. then ,"
42985,2s. 2d .
42986,2s. 6d .


In [76]:
twd = TreebankWordTokenizer()

shake_tri_df['title_tokenized'] = shake_tri_df['title'].apply(lambda x: twd.tokenize(x))
shake_tri_df['title_pos'] = shake_tri_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
shake_tri_df['pos_tag'] = shake_tri_df['title_pos'].apply(lambda x: [i[1] for i in x])

shake_tri_df.sample(5)

Unnamed: 0,title,title_tokenized,title_pos,pos_tag
251264,in his retirement,"[in, his, retirement]","[(in, IN), (his, PRP$), (retirement, NN)]","[IN, PRP$, NN]"
240752,i gone .,"[i, gone, .]","[(i, NN), (gone, VBN), (., .)]","[NN, VBN, .]"
363390,"paulina , lords","[paulina, ,, lords]","[(paulina, NN), (,, ,), (lords, NNS)]","[NN, ,, NNS]"
242635,i myself could,"[i, myself, could]","[(i, NN), (myself, PRP), (could, MD)]","[NN, PRP, MD]"
402647,since his rose,"[since, his, rose]","[(since, IN), (his, PRP$), (rose, VBD)]","[IN, PRP$, VBD]"


Repeat the process for fourgrams and bigrams.

In [77]:
n2gram_cv = CountVectorizer(ngram_range=(2,2), tokenizer=TreebankWordTokenizer().tokenize, analyzer='word')
shake_2matrix = n2gram_cv.fit_transform(shake_df['Line'])

shake_bigrams = n2gram_cv.get_feature_names()

print('Number of bigrams from the Shakespeare plays and sonnets DataFrame:', len(shake_bigrams))

Number of bigrams from the Shakespeare plays and sonnets DataFrame: 265248


In [78]:
shake_bi_df = pd.DataFrame(shake_bigrams, columns=['title'])
shake_bi_df.sample(5)

Unnamed: 0,title
223862,this ague
184178,rotten fens
157137,o'er-dusted .
27717,and strike
255697,with drooping


In [80]:
print('start bigrams:', shake_bi_df.shape[0])

dollar = shake_bi_df.title.str.contains('$', regex=False)
shake_bi_df = shake_bi_df[~dollar]

lparen = shake_bi_df.title.str.contains('(', regex=False)
shake_bi_df = shake_bi_df[~lparen]

rparen = shake_bi_df.title.str.contains(')', regex=False)
shake_bi_df = shake_bi_df[~rparen]

colon = shake_bi_df.title.str.contains(':', regex=False)
shake_bi_df = shake_bi_df[~colon]

semi = shake_bi_df.title.str.contains(';', regex=False)
shake_bi_df = shake_bi_df[~semi]

lbrack = shake_bi_df.title.str.contains('[', regex=False)
shake_bi_df = shake_bi_df[~lbrack]

rbrack = shake_bi_df.title.str.contains(']', regex=False)
shake_bi_df = shake_bi_df[~rbrack]

whatisthisthing = shake_bi_df.title.str.contains('`', regex=False)
shake_bi_df = shake_bi_df[~whatisthisthing]

exclaim = shake_bi_df.title.str.startswith('!')
shake_bi_df = shake_bi_df[~exclaim]

comma = shake_bi_df.title.str.startswith(',')
shake_bi_df = shake_bi_df[~comma]

stapost = shake_bi_df.title.str.startswith("'")
shake_bi_df = shake_bi_df[~stapost]


dash = shake_bi_df.title.str.startswith('-')
shake_bi_df = shake_bi_df[~dash]

period = shake_bi_df.title.str.startswith('.')
shake_bi_df = shake_bi_df[~period]

shake_bi_df.shape[0]

start bigrams: 265248


247365

In [81]:
shake_bi_df.head()

Unnamed: 0,title
12313,1. a
12314,10. certain
12315,2. then
12316,2d .
12317,2s. 2d


In [82]:
shake_bi_df['title_tokenized'] = shake_bi_df['title'].apply(lambda x: twd.tokenize(x))
shake_bi_df['title_pos'] = shake_bi_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
shake_bi_df['pos_tag'] = shake_bi_df['title_pos'].apply(lambda x: [i[1] for i in x])

shake_bi_df.sample(5)

Unnamed: 0,title,title_tokenized,title_pos,pos_tag
78467,experience of,"[experience, of]","[(experience, NN), (of, IN)]","[NN, IN]"
133347,love rosalind,"[love, rosalind]","[(love, NN), (rosalind, NN)]","[NN, NN]"
38703,beauty by,"[beauty, by]","[(beauty, NN), (by, IN)]","[NN, IN]"
32472,as flowers,"[as, flowers]","[(as, IN), (flowers, NNS)]","[IN, NNS]"
263908,your important,"[your, important]","[(your, PRP$), (important, JJ)]","[PRP$, JJ]"


In [83]:
n4gram_cv = CountVectorizer(ngram_range=(4,4), tokenizer=TreebankWordTokenizer().tokenize, analyzer='word')
shake_4matrix = n4gram_cv.fit_transform(shake_df['Line'])

shake_fourgrams = n4gram_cv.get_feature_names()

print('Number of fourgrams from the Shakespeare plays and sonnets DataFrame:', len(shake_fourgrams))

Number of fourgrams from the Shakespeare plays and sonnets DataFrame: 625445


In [84]:
shake_tetra_df = pd.DataFrame(shake_fourgrams, columns=['title'])
shake_tetra_df.sample(5)

Unnamed: 0,title
403302,on one and other
155383,"by proof , we"
226564,go out then .
355647,moment : i do
510654,thee i will .


In [85]:
print('start fourgrams:', shake_tetra_df.shape[0])

dollar = shake_tetra_df.title.str.contains('$', regex=False)
shake_tetra_df = shake_tetra_df[~dollar]

lparen = shake_tetra_df.title.str.contains('(', regex=False)
shake_tetra_df = shake_tetra_df[~lparen]

rparen = shake_tetra_df.title.str.contains(')', regex=False)
shake_tetra_df = shake_tetra_df[~rparen]

colon = shake_tetra_df.title.str.contains(':', regex=False)
shake_tetra_df = shake_tetra_df[~colon]

semi = shake_tetra_df.title.str.contains(';', regex=False)
shake_tetra_df = shake_tetra_df[~semi]

lbrack = shake_tetra_df.title.str.contains('[', regex=False)
shake_tetra_df = shake_tetra_df[~lbrack]

rbrack = shake_tetra_df.title.str.contains(']', regex=False)
shake_tetra_df = shake_tetra_df[~rbrack]

whatisthisthing = shake_tetra_df.title.str.contains('`', regex=False)
shake_tetra_df = shake_tetra_df[~whatisthisthing]

apost = shake_tetra_df.title.str.contains("''", regex=False)
shake_tetra_df = shake_tetra_df[~apost]

exclaim = shake_tetra_df.title.str.startswith('!')
shake_tetra_df = shake_tetra_df[~exclaim]

comma = shake_tetra_df.title.str.startswith(',')
shake_tetra_df = shake_tetra_df[~comma]

stapost = shake_tetra_df.title.str.startswith("'")
shake_tetra_df = shake_tetra_df[~stapost]

dash = shake_tetra_df.title.str.startswith('-')
shake_tetra_df = shake_tetra_df[~dash]

period = shake_tetra_df.title.str.startswith('.')
shake_tetra_df = shake_tetra_df[~period]

shake_tetra_df.shape[0]

start fourgrams: 625445


527705

In [86]:
shake_tetra_df.head()

Unnamed: 0,title
65584,1. a lively flourish
65585,10. certain ladies or
65586,"2. then , two"
65587,"3. lord chancellor ,"
65588,"4. choristers , singing"


In [87]:
shake_tetra_df['title_tokenized'] = shake_tetra_df['title'].apply(lambda x: twd.tokenize(x))
shake_tetra_df['title_pos'] = shake_tetra_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
shake_tetra_df['pos_tag'] = shake_tetra_df['title_pos'].apply(lambda x: [i[1] for i in x])

shake_tetra_df.sample(5)

Unnamed: 0,title,title_tokenized,title_pos,pos_tag
305396,is pass 'd to,"[is, pass, 'd, to]","[(is, VBZ), (pass, JJ), ('d, MD), (to, TO)]","[VBZ, JJ, MD, TO]"
240443,have been a time,"[have, been, a, time]","[(have, VBP), (been, VBN), (a, DT), (time, NN)]","[VBP, VBN, DT, NN]"
394138,"of human skill ,","[of, human, skill, ,]","[(of, IN), (human, JJ), (skill, NN), (,, ,)]","[IN, JJ, NN, ,]"
585670,whereto the rather shall,"[whereto, the, rather, shall]","[(whereto, VB), (the, DT), (rather, RB), (shall, MD)]","[VB, DT, RB, MD]"
549671,to tell my tale,"[to, tell, my, tale]","[(to, TO), (tell, VB), (my, PRP$), (tale, NN)]","[TO, VB, PRP$, NN]"


In [88]:
datapath = '../data'

In [89]:
datapath_shake_tri_df = os.path.join(datapath, 'shakespearean_trigrams.csv')
shake_tri_df.to_csv(datapath_shake_tri_df, index=False)

In [90]:
datapath_shake_tetra_df = os.path.join(datapath, 'shakespearean_fourgrams.csv')
shake_tetra_df.to_csv(datapath_shake_tetra_df, index=False)

In [91]:
datapath_shake_bi_df = os.path.join(datapath, 'shakespearean_bigrams.csv')
shake_bi_df.to_csv(datapath_shake_bi_df, index=False)