In [1]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
import pandas as pd
import numpy as np
import os

pd.set_option('display.max_colwidth', None)

In [2]:
shake_df = pd.read_csv('../data/shake_corpus.csv')

shake_df.sample(5)

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern
47103,47104,julius caesar,"What you have said, and show yourselves true Romans.",2.1.233,CASSIUS,59.0,tragedy,tragedy
30863,30864,cymbeline,"Pray, be not sick,",4.2.55,BELARIUS,18.0,tragedy,romance
40019,40020,henry v,"Angliae, et Haeres Franciae.",5.2.339,EXETER,74.0,history,tragedy
17639,17640,as you like it,"But, sure, he's proud, and yet his pride becomes him:",3.5.117,PHEBE,22.0,comedy,comedy
39008,39009,henry v,"Once more I come to know of thee, King Harry,",4.3.81,MONTJOY,17.0,history,tragedy


Subset shake_df in order to investigate tokenizing and n-gram generation with punctuation.

In [3]:
macbeth = shake_df[shake_df['Play'] == 'macbeth']
macbeth.head()

Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern
55779,55780,macbeth,ACT I,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy
55780,55781,macbeth,SCENE I. A desert place.,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy
55781,55782,macbeth,Thunder and lightning. Enter three Witches,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy
55782,55783,macbeth,When shall we three meet again,1.1.1,First Witch,1.0,tragedy,tragedy
55783,55784,macbeth,"In thunder, lightning, or in rain?",1.1.2,First Witch,1.0,tragedy,tragedy


TreebankWordTokenizer performs the following steps: 
- split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
- treat most punctuation characters as separate tokens
- split off commas and single quotes, when followed by whitespace
- separate periods that appear at the end of line

In [6]:
twd = TreebankWordTokenizer()

macbeth['line_tokenized'] = macbeth['Line'].apply(lambda x: twd.tokenize(x.lower()))
macbeth.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  macbeth['line_tokenized'] = macbeth['Line'].apply(lambda x: twd.tokenize(x.lower()))


Unnamed: 0,Dataline,Play,Line,ActSceneLine,Player,PlayerLinenumber,Traditional,Modern,line_tokenized
55779,55780,macbeth,ACT I,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy,"[act, i]"
55780,55781,macbeth,SCENE I. A desert place.,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy,"[scene, i., a, desert, place, .]"
55781,55782,macbeth,Thunder and lightning. Enter three Witches,1.1.1,ADRIANO DE ARMADO,405.0,tragedy,tragedy,"[thunder, and, lightning., enter, three, witches]"


In [12]:
ngram_cv = CountVectorizer(ngram_range=(3,3), tokenizer=TreebankWordTokenizer().tokenize, analyzer='word')
macbeth_matrix = ngram_cv.fit_transform(macbeth['Line'])

macbeth_trigrams = ngram_cv.get_feature_names()

print('Number of trigrams from Macbeth:', len(macbeth_trigrams))

Number of trigrams from Macbeth: 15016


In [6]:
# mac_tri_df = pd.DataFrame(macbeth_trigrams, columns=['title'])
# mac_tri_df.head()

Unnamed: 0,title
0,! ' and
1,! ' exeunt
2,! ' the
3,! ' this
4,! ' to


In [15]:
# import string

# string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [27]:
# punct = list(string.punctuation)
# flat_list = ' '.join(macbeth_trigrams)

# present_punct = [i for i in punct if i in flat_list]

# present_punct

['!', "'", ',', '-', '.', ':', '?', '[', ']']

In [28]:
# titles_df = pd.read_csv('../data/clean_titles.csv')

In [34]:
# flat_title = ' '.join(titles_df.Title.tolist())

# title_present_punct = [i for i in punct if i in flat_title]
# title_present_punct

['!', "'", ',', '-', '.', '?']

In [18]:
mac_tri_df.shape

(15016, 4)

In [16]:
mac_tri_df['title_tokenized'] = mac_tri_df['title'].apply(lambda x: twd.tokenize(x))
mac_tri_df['title_pos'] = mac_tri_df['title_tokenized'].apply(lambda x: nltk.pos_tag(x))
mac_tri_df['pos_tag'] = mac_tri_df['title_pos'].apply(lambda x: [i[1] for i in x])

mac_tri_df.sample(10)

Unnamed: 0,title,title_tokenized,title_pos,pos_tag
132,' the world,"[', the, world]","[(', ''), (the, DT), (world, NN)]","['', DT, NN]"
11598,"the dead ,","[the, dead, ,]","[(the, DT), (dead, JJ), (,, ,)]","[DT, JJ, ,]"
9145,of death and,"[of, death, and]","[(of, IN), (death, NN), (and, CC)]","[IN, NN, CC]"
11583,the crown to,"[the, crown, to]","[(the, DT), (crown, NN), (to, TO)]","[DT, NN, TO]"
7030,into the seeds,"[into, the, seeds]","[(into, IN), (the, DT), (seeds, NNS)]","[IN, DT, NNS]"
3200,bear welcome in,"[bear, welcome, in]","[(bear, JJ), (welcome, NN), (in, IN)]","[JJ, NN, IN]"
7564,laugh in 's,"[laugh, in, 's]","[(laugh, NN), (in, IN), ('s, POS)]","[NN, IN, POS]"
4909,fellow to it,"[fellow, to, it]","[(fellow, NN), (to, TO), (it, PRP)]","[NN, TO, PRP]"
4319,dismay 'd not,"[dismay, 'd, not]","[(dismay, NN), ('d, MD), (not, RB)]","[NN, MD, RB]"
3800,chance will have,"[chance, will, have]","[(chance, NN), (will, MD), (have, VB)]","[NN, MD, VB]"


In [23]:
for index, row in mac_tri_df.iterrows():
    if ':' in row['title']:
        mac_tri_df.drop(index, inplace=True)
    elif '[' in row['title']:
        mac_tri_df.drop(index, inplace=True)
    elif ']' in row['title']:
        mac_tri_df.drop(index, inplace=True)

mac_tri_df.shape

(14296, 4)