In [2]:
import spacy

# Load spaCy's small English pipeline; `nlp` is called later to process the text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read sample.txt as UTF-8 and keep one whitespace-stripped string per line.
with open("sample.txt", "r", encoding="utf-8") as file:
    lines = list(map(str.strip, file))

In [6]:
# Preview only the first entries; echoing the entire list floods the output.
lines[:20]

['[The Adventures of Buster Bear by Thornton W. Burgess 1920]',
 '',
 'I',
 '',
 'BUSTER BEAR GOES FISHING',
 '',
 '',
 'Buster Bear yawned as he lay on his comfortable bed of leaves and',
 'watched the first early morning sunbeams creeping through the Green',
 'Forest to chase out the Black Shadows. Once more he yawned, and slowly',
 'got to his feet and shook himself. Then he walked over to a big',
 'pine-tree, stood up on his hind legs, reached as high up on the trunk of',
 'the tree as he could, and scratched the bark with his great claws. After',
 'that he yawned until it seemed as if his jaws would crack, and then sat',
 'down to think what he wanted for breakfast.',
 '',
 'While he sat there, trying to make up his mind what would taste best, he',
 'was listening to the sounds that told of the waking of all the little',
 'people who live in the Green Forest. He heard Sammy Jay way off in the',
 'distance screaming, "Thief! Thief!" and grinned. "I wonder," thought',
 'Buster, "if 

In [14]:
# Terminate the first line with a period.
# The guard keeps this cell idempotent: re-running it no longer appends an
# extra '.' each time, and an empty `lines` list no longer raises IndexError.
if lines and not lines[0].endswith('.'):
    lines[0] += '.'

In [4]:
# Drop blank lines, then join the remaining lines into one space-separated string.
line_text = [item for item in lines if item]
all_text = " ".join(line_text)
# Show a short preview only; the full string is tens of thousands of characters.
all_text[:500]

'[The Adventures of Buster Bear by Thornton W. Burgess 1920] I BUSTER BEAR GOES FISHING Buster Bear yawned as he lay on his comfortable bed of leaves and watched the first early morning sunbeams creeping through the Green Forest to chase out the Black Shadows. Once more he yawned, and slowly got to his feet and shook himself. Then he walked over to a big pine-tree, stood up on his hind legs, reached as high up on the trunk of the tree as he could, and scratched the bark with his great claws. After that he yawned until it seemed as if his jaws would crack, and then sat down to think what he wanted for breakfast. While he sat there, trying to make up his mind what would taste best, he was listening to the sounds that told of the waking of all the little people who live in the Green Forest. He heard Sammy Jay way off in the distance screaming, "Thief! Thief!" and grinned. "I wonder," thought Buster, "if some one has stolen Sammy\'s breakfast, or if he has stolen the breakfast of some one 

In [5]:
# Total number of characters in the joined text.
len(all_text)

82557

In [6]:
# Number of whitespace-separated words.
# split() without an argument collapses runs of whitespace, so consecutive
# spaces are not counted as empty "words" the way split(' ') counts them.
len(all_text.split())

15870

In [7]:
# Lowercase the whole text before it is handed to the spaCy pipeline.
# NOTE(review): lowercasing before tagging may degrade POS/NER quality, since
# the models can rely on capitalisation - confirm this is intended.
preprocess = all_text.lower()

In [8]:
# Run the spaCy pipeline over the lowercased text; `doc` is the source of the
# tokens, POS tags and entities used in the cells below.
doc = nlp(preprocess)
# Sentence texts, if needed, are available via `doc.sents`.

In [None]:
# Keep the text of every token that is neither a stop word nor punctuation.
# (Alternative explored earlier: drop punctuation only and keep stop words.)
filtered_tokens = [
    tok.text
    for tok in doc
    if not (tok.is_stop or tok.is_punct)
]

In [13]:
# Number of tokens left after removing stop words and punctuation.
len(filtered_tokens)

16346

In [42]:
# Coarse POS tag for each kept token; the filter is the same one used for
# `filtered_tokens`, so the two lists line up element by element.
tag_list = [tok.pos_ for tok in doc if not (tok.is_stop or tok.is_punct)]
# Human-readable description of each tag, in the same order.
tag_desc = list(map(spacy.explain, tag_list))

In [23]:
# Sanity check: there must be exactly one POS tag per kept token.
len(tag_list) == len(filtered_tokens)

True

In [43]:
import pandas as pd

# One row per kept token: its text, its coarse POS tag and the tag description.
# Building the frame from a dict names the columns in one step. The previous
# form, pd.DataFrame(list) followed by `.columns = ['word']`, raises a length
# mismatch when the token list is empty. The dict form also fails immediately
# if the three lists ever differ in length.
df = pd.DataFrame({
    'word': filtered_tokens,
    'tag': tag_list,
    'tag_desc': tag_desc,
})


In [44]:
# Peek at the first rows of the token table.
df.head()

Unnamed: 0,word,tag,tag_desc
0,adventures,NOUN,noun
1,buster,NOUN,noun
2,bear,NOUN,noun
3,thornton,PROPN,proper noun
4,w.,PROPN,proper noun


In [45]:
# Number of distinct words observed for each POS tag.
(
    df
    .groupby(['tag', 'tag_desc'])[['word']]
    .nunique()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,word
tag,tag_desc,Unnamed: 2_level_1
ADJ,adjective,239
ADP,adposition,5
ADV,adverb,105
AUX,auxiliary,10
DET,determiner,1
INTJ,interjection,11
NOUN,noun,435
NUM,numeral,7
PRON,pronoun,5
PROPN,proper noun,88


In [57]:
# 20 most frequent words tagged as NOUN or PROPN.
# After count(), the 'tag' column holds the number of occurrences per word.
(
    df.loc[df['tag'].isin(['NOUN', 'PROPN']), ['tag', 'word']]
    .groupby('word')
    .count()
    .sort_values(by='tag', ascending=False)
    .head(20)
)

Unnamed: 0_level_0,tag
word,Unnamed: 1_level_1
buster,223
bear,148
joe,112
brown,101
farmer,100
boy,95
forest,66
otter,51
time,48
pool,46


Based on the noun exploration, the writer appears to tell the story from a third-person POV. Judging by the most frequent nouns, Buster, Bear, and Joe may be the main characters of this story, and it seems to take place mostly in the forest and at the pool.

In [49]:
# 20 most frequent words tagged as VERB.
# After count(), the 'tag' column holds the number of occurrences per word.
agg_verb = (
    df.loc[df['tag'] == 'VERB', ['tag', 'word']]
    .groupby('word')
    .count()
    .sort_values(by='tag', ascending=False)
    .head(20)
    .reset_index()
)
agg_verb

Unnamed: 0,word,tag
0,said,50
1,know,48
2,looked,39
3,laughing,34
4,come,27
5,think,25
6,saw,24
7,went,23
8,eat,23
9,smiling,23


In [53]:
import numpy as np

# Hand-picked past-tense ("second form") verbs to look for among the top verbs.
second_form = [
    'said', 'looked', 'saw', 'went', 'heard', 'caught',
    'thought', 'knew', 'came', 'started', 'began',
]

# Percentage of the top-verb occurrences that belong to a past-tense form.
# In agg_verb, the 'tag' column holds the per-word occurrence count.
past_counts = agg_verb.loc[agg_verb['word'].isin(second_form), ['tag']].sum()
all_counts = agg_verb[['tag']].sum()
past_counts * 100.00 / all_counts

tag    53.578337
dtype: float64

Based on the top 20 verbs, about 54% of the verb occurrences are in the past-tense form. This suggests that the narrative describes events in the past.

In [54]:
# 20 most frequent words tagged as ADV.
# After count(), the 'tag' column holds the number of occurrences per word.
agg_adv = (
    df.loc[df['tag'] == 'ADV', ['tag', 'word']]
    .groupby('word')
    .count()
    .sort_values(by='tag', ascending=False)
    .head(20)
    .reset_index()
)
agg_adv

Unnamed: 0,word,tag
0,away,38
1,right,26
2,billy,21
3,far,11
4,fast,9
5,instead,8
6,straight,8
7,long,8
8,pretty,7
9,jolly,7


In [16]:
import pandas as pd

# Collect the text and the label of every named entity spaCy found in `doc`.
context = [ent.text for ent in doc.ents]
ner = [ent.label_ for ent in doc.ents]

# Build the frame from a dict so both columns are named up front. The previous
# form, pd.DataFrame(list) followed by `.columns = ['context']`, raises a
# length mismatch when no entities are found.
df_ner = pd.DataFrame({'context': context, 'ner': ner})

In [17]:
# Display the entity table (one row per entity: its text and its label).
df_ner

Unnamed: 0,context,ner
0,thornton w. burgess,PERSON
1,1920,DATE
2,first,ORDINAL
3,sammy jay way,PERSON
4,this morning,TIME
...,...,...
343,two,CARDINAL
344,two,CARDINAL
345,two or three,CARDINAL
346,toad,PERSON


In [18]:
# Number of distinct entity texts per entity label.
(
    df_ner
    .groupby('ner')[['context']]
    .nunique()
)

Unnamed: 0_level_0,context
ner,Unnamed: 1_level_1
CARDINAL,13
DATE,14
GPE,2
ORDINAL,2
ORG,6
PERSON,52
PRODUCT,1
TIME,15


In [19]:
# 20 most frequent entity texts labelled PERSON.
# After count(), the 'ner' column holds the number of occurrences.
(
    df_ner.loc[df_ner['ner'] == 'PERSON']
    .groupby('context')
    .count()
    .sort_values(by='ner', ascending=False)
    .head(20)
)

Unnamed: 0_level_0,ner
context,Unnamed: 1_level_1
joe,63
joe otter,39
sammy jay,23
peter,11
sun,6
farmer brown's,6
jerry muskrat,5
bobby coon,4
buster,4
gartersnake,3


In [63]:
# Entity texts labelled DATE, most frequent first.
# After count(), the 'ner' column holds the number of occurrences.
(
    df_ner.loc[df_ner['ner'] == 'DATE']
    .groupby('context')
    .count()
    .sort_values(by='ner', ascending=False)
)

Unnamed: 0_level_0,ner
context,Unnamed: 1_level_1
morrow,5
the day,3
the spring,2
years,2
that day,2
all day,1
one day,1
all summer,1
1920,1
the livelong day,1


The story may have occurred in 1920, and its events may span years.

In [None]:
# All entities labelled CARDINAL, in document order.
df_ner.loc[df_ner['ner'] == 'CARDINAL']

Unnamed: 0,context,ner
13,5,CARDINAL
30,two,CARDINAL
32,one,CARDINAL
41,two,CARDINAL
42,three,CARDINAL
43,one,CARDINAL
44,one,CARDINAL
47,one,CARDINAL
48,more than three,CARDINAL
69,one,CARDINAL


When the writer states numbers less than 10, he prefers to use words rather than digits.

In [64]:
# All entities labelled TIME, in document order.
df_ner.loc[df_ner['ner'] == 'TIME']

Unnamed: 0,context,ner
4,this morning,TIME
64,this morning,TIME
111,late that morning,TIME
113,just a minute,TIME
125,this very morning,TIME
138,only a few minutes before,TIME
159,a minute,TIME
177,morning,TIME
198,next morning,TIME
210,another minute,TIME


NER without punctuation