#### LANGUAGE PARSING

<br>

# Discover Insights into Classic Texts

<hr>


In [10]:
import nltk
# download the averaged perceptron tagger model, which pos_tag relies on to
# assign part-of-speech tags later in this notebook
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\xemyc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [11]:
from nltk import pos_tag, RegexpParser
from tokenize_words import word_sentence_tokenize
from chunk_counters import np_chunk_counter, vp_chunk_counter

### Import and Preprocess Text Data

In [2]:
# read the whole novel and lowercase it; the `with` block guarantees the file
# handle is closed as soon as the contents have been read
with open("dorian_gray.txt", encoding='utf-8') as text_file:
    text = text_file.read().lower()

In [6]:
# split the text into sentences, each represented as a list of word tokens
# (word_sentence_tokenize is defined in tokenize_words.py), then print the
# first sentence as a quick sanity check
word_tokenized_text = word_sentence_tokenize(text)
print(word_tokenized_text[0])

['the', 'picture', 'of', 'dorian', 'gray', 'by', 'oscar', 'wilde', 'the', 'preface', 'the', 'artist', 'is', 'the', 'creator', 'of', 'beautiful', 'things', '.']


In [5]:
# store and print the word-tokenized sentence at index 100
# (the 101st sentence, since list indexing starts at 0)
single_word_tokenized_sentence = word_tokenized_text[100]
print(single_word_tokenized_sentence)

['it', 'seems', 'to', 'be', 'the', 'one', 'thing', 'that', 'can', 'make', 'modern', 'life', 'mysterious', 'or', 'marvellous', 'to', 'us', '.']


### Part-of-Speech Tag Text

In [13]:
# part-of-speech tag every sentence: each entry of word_tokenized_text is one
# sentence (a list of word tokens), and pos_tag turns it into a list of
# (word, tag) pairs, so the result keeps one tagged list per sentence
pos_tagged_text = [pos_tag(sentence) for sentence in word_tokenized_text]

In [14]:
# store and print the part-of-speech tagged sentence at index 100
# (the 101st sentence, since list indexing starts at 0)
single_pos_sentence = pos_tagged_text[100]
print(single_pos_sentence)

[('it', 'PRP'), ('seems', 'VBZ'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('one', 'CD'), ('thing', 'NN'), ('that', 'WDT'), ('can', 'MD'), ('make', 'VB'), ('modern', 'JJ'), ('life', 'NN'), ('mysterious', 'JJ'), ('or', 'CC'), ('marvellous', 'JJ'), ('to', 'TO'), ('us', 'PRP'), ('.', '.')]


### Chunk Sentences

In [16]:
# define noun phrase chunk grammar:
# an optional determiner (DT), any number of adjectives (JJ), then one noun
# tagged exactly NN (other noun tags such as NNS or NNP are not matched)
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# create noun phrase RegexpParser object
np_chunk_parser = RegexpParser(np_chunk_grammar)

In [17]:
# define verb phrase chunk grammar:
# the same noun phrase pattern (optional DT, any number of JJ, one NN),
# followed by a verb of any form (any tag starting with VB), followed by an
# optional adverb (RB, or RB plus one more character such as RBR / RBS)
vp_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"

# create verb phrase RegexpParser object
vp_chunk_parser = RegexpParser(vp_chunk_grammar)

In [18]:
# chunk every part-of-speech tagged sentence with both parsers; each list
# holds one parse result per sentence, in the same order as pos_tagged_text
np_chunked_text = [np_chunk_parser.parse(sentence) for sentence in pos_tagged_text]
vp_chunked_text = [vp_chunk_parser.parse(sentence) for sentence in pos_tagged_text]

### Analyze Chunks

In [19]:
# store and print the most common NP-chunks
# NOTE(review): np_chunk_counter lives in chunk_counters.py (not shown here);
# judging by the printed output it yields (chunk, count) pairs ordered from
# most to least frequent — confirm against the helper
most_common_np_chunks = np_chunk_counter(np_chunked_text)
print(most_common_np_chunks)

[((('i', 'NN'),), 963), ((('henry', 'NN'),), 200), ((('lord', 'NN'),), 197), ((('life', 'NN'),), 170), ((('harry', 'NN'),), 136), ((('dorian', 'JJ'), ('gray', 'NN')), 127), ((('something', 'NN'),), 126), ((('nothing', 'NN'),), 93), ((('basil', 'NN'),), 85), ((('the', 'DT'), ('world', 'NN')), 70), ((('everything', 'NN'),), 69), ((('anything', 'NN'),), 68), ((('hallward', 'NN'),), 68), ((('the', 'DT'), ('man', 'NN')), 61), ((('the', 'DT'), ('room', 'NN')), 60), ((('face', 'NN'),), 57), ((('the', 'DT'), ('door', 'NN')), 56), ((('love', 'NN'),), 55), ((('art', 'NN'),), 52), ((('course', 'NN'),), 51), ((('the', 'DT'), ('picture', 'NN')), 46), ((('the', 'DT'), ('lad', 'NN')), 45), ((('head', 'NN'),), 44), ((('round', 'NN'),), 44), ((('hand', 'NN'),), 44), ((('sibyl', 'NN'),), 41), ((('the', 'DT'), ('table', 'NN')), 40), ((('the', 'DT'), ('painter', 'NN')), 38), ((('sir', 'NN'),), 38), ((('a', 'DT'), ('moment', 'NN')), 38)]


In [20]:
# store and print the most common VP-chunks
# NOTE(review): vp_chunk_counter lives in chunk_counters.py (not shown here);
# judging by the printed output it yields (chunk, count) pairs ordered from
# most to least frequent — confirm against the helper
most_common_vp_chunks = vp_chunk_counter(vp_chunked_text)
print(most_common_vp_chunks)

[((('i', 'NN'), ('am', 'VBP')), 101), ((('i', 'NN'), ('was', 'VBD')), 40), ((('i', 'NN'), ('want', 'VBP')), 37), ((('i', 'NN'), ('know', 'VBP')), 33), ((('i', 'NN'), ('do', 'VBP'), ("n't", 'RB')), 32), ((('i', 'NN'), ('have', 'VBP')), 32), ((('i', 'NN'), ('had', 'VBD')), 31), ((('i', 'NN'), ('suppose', 'VBP')), 17), ((('i', 'NN'), ('think', 'VBP')), 16), ((('i', 'NN'), ('am', 'VBP'), ('not', 'RB')), 14), ((('i', 'NN'), ('thought', 'VBD')), 13), ((('i', 'NN'), ('believe', 'VBP')), 12), ((('dorian', 'JJ'), ('gray', 'NN'), ('was', 'VBD')), 11), ((('i', 'NN'), ('am', 'VBP'), ('so', 'RB')), 11), ((('henry', 'NN'), ('had', 'VBD')), 11), ((('i', 'NN'), ('did', 'VBD'), ("n't", 'RB')), 9), ((('i', 'NN'), ('met', 'VBD')), 9), ((('i', 'NN'), ('said', 'VBD')), 9), ((('i', 'NN'), ('am', 'VBP'), ('quite', 'RB')), 8), ((('i', 'NN'), ('see', 'VBP')), 8), ((('i', 'NN'), ('did', 'VBD'), ('not', 'RB')), 7), ((('i', 'NN'), ('have', 'VBP'), ('ever', 'RB')), 7), ((('life', 'NN'), ('has', 'VBZ')), 7), ((('i'

<hr>

*Note: the `chunk_counters` and `tokenize_words` modules are imported from `chunk_counters.py` and `tokenize_words.py`, which are files created by Codecademy.*