In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from random import sample 
import os

pd.set_option('display.max_colwidth', None)

In [2]:
plays_sonnets = pd.read_csv('../data/lines_cleaned.csv')

In [3]:
plays_sonnets.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine
0,1,henry iv,ACT I,1.1.1
1,2,henry iv,SCENE I. London. The palace.,1.1.1
2,3,henry iv,"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others",1.1.1
3,4,henry iv,"So shaken as we are, so wan with care,",1.1.1
4,5,henry iv,"Find we a time for frighted peace to pant,",1.1.2


Testing N-Gram generation on a small subset first

In [4]:
macbeth_df = plays_sonnets[plays_sonnets['Play'] == 'macbeth']

macbeth_df.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine
55779,55780,macbeth,ACT I,1.1.1
55780,55781,macbeth,SCENE I. A desert place.,1.1.1
55781,55782,macbeth,Thunder and lightning. Enter three Witches,1.1.1
55782,55783,macbeth,When shall we three meet again,1.1.1
55783,55784,macbeth,"In thunder, lightning, or in rain?",1.1.2


In [5]:
macbeth_df['Lower'] = macbeth_df['Line'].apply(lambda x: x.lower())

macbeth_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  macbeth_df['Lower'] = macbeth_df['Line'].apply(lambda x: x.lower())


Unnamed: 0,Dataline,Play,Line,ActSceneLine,Lower
55779,55780,macbeth,ACT I,1.1.1,act i
55780,55781,macbeth,SCENE I. A desert place.,1.1.1,scene i. a desert place.
55781,55782,macbeth,Thunder and lightning. Enter three Witches,1.1.1,thunder and lightning. enter three witches
55782,55783,macbeth,When shall we three meet again,1.1.1,when shall we three meet again
55783,55784,macbeth,"In thunder, lightning, or in rain?",1.1.2,"in thunder, lightning, or in rain?"


In [6]:
ngram_cv = CountVectorizer(ngram_range=(3,3))
mac_trigram_matrix = ngram_cv.fit_transform(macbeth_df['Lower'])

print(mac_trigram_matrix.shape)

(2586, 11378)


In [7]:
mac_trigrams = ngram_cv.get_feature_names()

print('There are {} unique trigrams from the {} lines of Macbeth'.format(len(mac_trigrams), mac_trigram_matrix.shape[0]))

There are 11378 unique trigrams from the 2586 lines of Macbeth


In [8]:
mac_trigrams[::700]

['abhorred tyrant with',
 'another yet seventh',
 'brow the round',
 'did line the',
 'for all your',
 'have you left',
 'in sow blood',
 'like lead upon',
 'must seem their',
 'of the sun',
 'revenges burn in',
 'spent swimmers that',
 'the gentle weal',
 'think you can',
 'torches enter sewer',
 'where did find',
 'you lack the']

In [9]:
mac_tri_df = pd.DataFrame(mac_trigram_matrix.toarray())
mac_tri_df.columns = mac_trigrams
mac_tri_df.shape

(2586, 11378)

In [10]:
mac_tri_df.head()

Unnamed: 0,abhorred tyrant with,abide no longer,about him like,about his throne,about the cauldron,about their necks,about you here,abroad unnatural deeds,absence is no,absolute fear of,...,your wife and,your wife withal,your wives your,your wrath if,yours for ever,yours you may,yourself but for,yourself hath it,youth pray you,youths that even
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Trigramming all of Shakespeare's corpus

In [11]:
plays_sonnets['Lower'] = plays_sonnets['Line'].apply(lambda x: x.lower())

plays_sonnets.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Lower
0,1,henry iv,ACT I,1.1.1,act i
1,2,henry iv,SCENE I. London. The palace.,1.1.1,scene i. london. the palace.
2,3,henry iv,"Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others",1.1.1,"enter king henry, lord john of lancaster, the earl of westmoreland, sir walter blunt, and others"
3,4,henry iv,"So shaken as we are, so wan with care,",1.1.1,"so shaken as we are, so wan with care,"
4,5,henry iv,"Find we a time for frighted peace to pant,",1.1.2,"find we a time for frighted peace to pant,"


In [12]:
ngram_cv = CountVectorizer(ngram_range=(3,3))
all_trigram_matrix = ngram_cv.fit_transform(plays_sonnets['Lower'])

In [13]:
all_trigrams = ngram_cv.get_feature_names()

print('There are {} unique trigrams from the {} lines of Shakespearean text'.format(len(all_trigrams), all_trigram_matrix.shape[0]))

There are 480628 unique trigrams from the 113553 lines of Shakespearean text


In [14]:
all_trigrams[34567:34577]

['as tis received',
 'as tis reported',
 'as tis said',
 'as tis the',
 'as tis thought',
 'as tis to',
 'as tis valued',
 'as tis with',
 'as titan face',
 'as titan rays']

In [15]:
sample(all_trigrams, 10)

['th inviting time',
 'saved the treacherous',
 'part us northumberland',
 'working days your',
 'make therefore ladies',
 'your royal preparation',
 'and learnedly for',
 'thou dost suspect',
 'befriend us as',
 'shows but little']

And now four-grams!

In [124]:
ngram_cv2 = CountVectorizer(ngram_range=(4,4))
all_fourgram_matrix = ngram_cv2.fit_transform(plays_sonnets['Lower'])

In [125]:
all_fourgrams = ngram_cv2.get_feature_names()

print('There are {} unique fourgrams from the {} lines of Shakespearean text'.format(len(all_fourgrams), all_fourgram_matrix.shape[0]))

There are 457661 unique fourgrams from the 113553 lines of Shakespearean text


In [127]:
all_fourgrams[400000:400010]

['unbolted villain into mortar',
 'unbonneted to as proud',
 'unbookish jealousy must construe',
 'unborn and accents yet',
 'unborn could never be',
 'unborn sorrow ripe in',
 'unbound the rest and',
 'unbounded stomach ever ranking',
 'unbow alas poor milan',
 'unbraced and suck up']

In [152]:
sample(all_fourgrams, 10)

['myself and so my',
 'by the joiner squirrel',
 'drowned we will inherit',
 'as nature was in',
 'and fate of him',
 'divinity that shapes our',
 'the house on purpose',
 'which straight she gave',
 'to make men glorious',
 'exeunt lucius quintus martius']

In [133]:
all_fourgrams.index('otter sir john why')

267400

In [149]:
all_fourgrams[267395:267405]

['otherwise that could beat',
 'otherwise tis labour well',
 'otherwise tis light and',
 'otherwise will henry ne',
 'otherwise would grow into',
 'otter sir john why',
 'ottomites reverend and gracious',
 'oublie ce que je',
 'oublie de elbow comment',
 'oublie dere is some']

In [137]:
plays_sonnets[plays_sonnets['Lower'].str.contains('an otter')]

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Lower
2144,2145,henry iv,"What beast! why, an otter.",3.2.308,"what beast! why, an otter."
2145,2146,henry iv,"An otter, Sir John! Why an otter?",3.2.309,"an otter, sir john! why an otter?"


There seems to be a lot of non-English words in the ngrams, such as 'enseignez il faut', 'oublie ce que je', or 'he shall to partha'. This might be a good opportunity to whittle down these huge datasets with a non-english prefilter.

In [145]:
import nltk

eng_words = set(nltk.corpus.words.words())

In [169]:
not_eng = []

for title in all_trigrams:
    for word in title.split():
        if word not in eng_words:
            not_eng.append(word)
            
len(not_eng)

146248

In [170]:
not_eng = set(not_eng)

len(not_eng)

9686

In [172]:
sample(not_eng, 50)

['judases',
 'halberds',
 'calls',
 'loathed',
 'couronne',
 'pleaders',
 'mediators',
 'witches',
 'entertainest',
 'guichard',
 'practised',
 'wilful',
 'envenoms',
 'furies',
 'newest',
 'followest',
 'bounced',
 'crannies',
 'depths',
 'hopes',
 'damon',
 'intitle',
 'shores',
 'aldermen',
 'assence',
 'synods',
 'appelez',
 'pioner',
 'spieth',
 'dowle',
 'reliances',
 'lifteth',
 'auditors',
 'awakens',
 'cinna',
 'erboard',
 'alencon',
 'breese',
 'maidens',
 'centuries',
 'enkindled',
 'aeson',
 'impaled',
 'dissembling',
 'stubbornest',
 'inforced',
 'trowest',
 'gascony',
 'tangled',
 'parthian']

There are a lot of English words in the 'not english' list that would be removed. I wonder if these would be ignored anyway in POS tagging. I will create a new list anyway and see how it goes.

In [185]:
print('Starting trigram number:', len(all_trigrams))

eng_trigrams = []

for title in all_trigrams:
    count = sum([1 for word in title.split() if word in eng_words])
    if count == 3:
        eng_trigrams.append(title)

print('Filtered trigram number:', len(eng_trigrams))

Starting trigram number: 480628
Filtered trigram number: 348051


In [186]:
print('Starting fourgram number:', len(all_fourgrams))

eng_fourgrams = []

for title in all_fourgrams:
    count = sum([1 for word in title.split() if word in eng_words])
    if count == 4:
        eng_fourgrams.append(title)

print('Filtered fourgram number:', len(eng_fourgrams))

Starting fourgram number: 457661
Filtered fourgram number: 311903


In [187]:
all_trigrams_df = pd.DataFrame(all_trigrams, columns=['title'])
eng_trigrams_df = pd.DataFrame(eng_trigrams, columns=['title'])

all_fourgrams_df = pd.DataFrame(all_fourgrams, columns=['title'])
eng_fourgrams_df = pd.DataFrame(eng_fourgrams, columns=['title'])

In [190]:
all_trigrams_df['title_tokenized'] = all_trigrams_df['title'].apply(lambda x: nltk.word_tokenize(x))
all_trigrams_df['title_tokenized_pos'] = all_trigrams_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
all_trigrams_df['title_pos'] = all_trigrams_df['title_tokenized_pos'].apply(lambda x: [i[1] for i in x])

all_trigrams_df.head()

Unnamed: 0,title,title_tokenized,title_tokenized_pos,title_pos
0,10 certain ladies,"[10, certain, ladies]","[(10, CD), (certain, JJ), (ladies, NNS)]","[CD, JJ, NNS]"
1,aaron and other,"[aaron, and, other]","[(aaron, NN), (and, CC), (other, JJ)]","[NN, CC, JJ]"
2,aaron and thou,"[aaron, and, thou]","[(aaron, NN), (and, CC), (thou, NN)]","[NN, CC, NN]"
3,aaron arm thy,"[aaron, arm, thy]","[(aaron, NN), (arm, NN), (thy, NN)]","[NN, NN, NN]"
4,aaron bid us,"[aaron, bid, us]","[(aaron, NN), (bid, NN), (us, PRP)]","[NN, NN, PRP]"


In [191]:
all_fourgrams_df['title_tokenized'] = all_fourgrams_df['title'].apply(lambda x: nltk.word_tokenize(x))
all_fourgrams_df['title_tokenized_pos'] = all_fourgrams_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
all_fourgrams_df['title_pos'] = all_fourgrams_df['title_tokenized_pos'].apply(lambda x: [i[1] for i in x])

all_fourgrams_df.head()

Unnamed: 0,title,title_tokenized,title_tokenized_pos,title_pos
0,10 certain ladies or,"[10, certain, ladies, or]","[(10, CD), (certain, JJ), (ladies, NNS), (or, CC)]","[CD, JJ, NNS, CC]"
1,aaron and other goths,"[aaron, and, other, goths]","[(aaron, NN), (and, CC), (other, JJ), (goths, NNS)]","[NN, CC, JJ, NNS]"
2,aaron and thou look,"[aaron, and, thou, look]","[(aaron, NN), (and, CC), (thou, NN), (look, NN)]","[NN, CC, NN, NN]"
3,aaron arm thy heart,"[aaron, arm, thy, heart]","[(aaron, NN), (arm, NN), (thy, JJ), (heart, NN)]","[NN, NN, JJ, NN]"
4,aaron bid us hide,"[aaron, bid, us, hide]","[(aaron, NN), (bid, NN), (us, PRP), (hide, VB)]","[NN, NN, PRP, VB]"
