<a href="https://colab.research.google.com/github/kleczekr/tolkenizer/blob/master/wordy_trials_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
from tabulate import tabulate
from collections import Counter
import pandas as pd

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
# This cell is meant to accommodate the Google Colab way of dealing with reading 
# files from Google Drive; feel free to ignore it if you are running the notebook
# on your local machine
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
with  open('/content/drive/My Drive/book_txt/hp/hp1.txt', 'r') as f:
  hp1 = f.read()

In [5]:
hp_chps = []

# retrieve chapters 1-16
for i in range(1,17):
  hp_chps.append(hp1.split('\nChapter '+str(i)+'\n')[1].split('\nChapter'+str(i+1)+'\n')[0])

# retrieve the text of the chapter 17
hp_chps.append(hp1.split('\nChapter 17\n')[1])

In [6]:
hp_spc = list()
for i in range(17):
  hp_spc.append(nlp(hp_chps[i]))
  print(f'finished processing chapter {i+1}')

finished processing chapter 1
finished processing chapter 2
finished processing chapter 3
finished processing chapter 4
finished processing chapter 5
finished processing chapter 6
finished processing chapter 7
finished processing chapter 8
finished processing chapter 9
finished processing chapter 10
finished processing chapter 11
finished processing chapter 12
finished processing chapter 13
finished processing chapter 14
finished processing chapter 15
finished processing chapter 16
finished processing chapter 17


In [7]:
list_pos = list()

# retrieve POS counts, sentence counts and token counts for all the chapters
for item in hp_spc:
  # Counter from Collections is a better way than building your own dict
  # or list
  poscount = Counter()
  for sentence in item.sents:
    poscount['sentence_count'] += 1
    for token in sentence:
      poscount[token.pos_] += 1
      poscount['token_count'] += 1
  # you store the Counter in a list of counters
  list_pos.append(poscount)

# It's difficult to work with list of counters. Convert it to DataFrame
# to perform easy and quick analyses. Remember to fill NaN values
pos_df = pd.DataFrame(list_pos).fillna(0)
# Index starts at 1, indicating chapter
pos_df.index += 1
# change name of the index
pos_df.index.names = ['chapter']

# Perform some calculations---raw numbers don't give you much
pos_df['adj_per_sent'] = pos_df.ADJ/pos_df.sentence_count
pos_df['adj_proportion'] = pos_df.ADJ/pos_df.token_count
pos_df['verb_per_sent'] = pos_df.VERB/pos_df.sentence_count
pos_df['verb_proportion'] = pos_df.VERB/pos_df.token_count
pos_df['noun_per_sent'] = pos_df.NOUN/pos_df.sentence_count
pos_df['noun_proportion'] = pos_df.NOUN/pos_df.token_count
pos_df['adv_per_sent'] = pos_df.ADV/pos_df.sentence_count
pos_df['adv_proportion'] = pos_df.ADV/pos_df.token_count
pos_df['propn_per_sent'] = pos_df.PROPN/pos_df.sentence_count
pos_df['propn_proportion'] = pos_df.PROPN/pos_df.token_count
pos_df['avg_sentence_len'] = pos_df.token_count/pos_df.sentence_count

In [8]:
print(tabulate(pos_df, headers='keys'))

  chapter    sentence_count    SPACE    token_count    DET    PROPN    PRON    VERB    CCONJ    PUNCT    ADP    NOUN    NUM    AUX    ADJ    PART    SCONJ    ADV    INTJ    X    SYM    adj_per_sent    adj_proportion    verb_per_sent    verb_proportion    noun_per_sent    noun_proportion    adv_per_sent    adv_proportion    propn_per_sent    propn_proportion    avg_sentence_len
---------  ----------------  -------  -------------  -----  -------  ------  ------  -------  -------  -----  ------  -----  -----  -----  ------  -------  -----  ------  ---  -----  --------------  ----------------  ---------------  -----------------  ---------------  -----------------  --------------  ----------------  ----------------  ------------------  ------------------
        1              7155     3101         102820   8786     7287    8685   14437     2545    19185   7541   11341    686   4465   4280    2727     1697   5543     500   12      2        0.598183         0.0416261          2.01775        

In [9]:
pos_df_slim = pos_df[['sentence_count', 'token_count', 'adj_per_sent',\
                     'adj_proportion', 'verb_per_sent', 'verb_proportion',\
                     'noun_per_sent', 'noun_proportion', 'adv_per_sent',\
                     'adv_proportion', 'propn_per_sent', 'propn_proportion',\
                     'avg_sentence_len']].copy()

In [10]:
print(tabulate(pos_df_slim, headers='keys'))

  chapter    sentence_count    token_count    adj_per_sent    adj_proportion    verb_per_sent    verb_proportion    noun_per_sent    noun_proportion    adv_per_sent    adv_proportion    propn_per_sent    propn_proportion    avg_sentence_len
---------  ----------------  -------------  --------------  ----------------  ---------------  -----------------  ---------------  -----------------  --------------  ----------------  ----------------  ------------------  ------------------
        1              7155         102820        0.598183         0.0416261          2.01775           0.14041           1.58505          0.1103           0.774703         0.0539097          1.01845            0.0708714             14.3704
        2              6766          96943        0.590304         0.0411995          2.01448           0.140598          1.57567          0.109972         0.769731         0.0537223          1.02439            0.0714956             14.328
        3              6510          

In [11]:
# retrieve a list of paragraphs for each chapter!
hp_chps_paras = list()
# create empty lists for each chapter---later you will populate em with paragraphs
for i in range(17):
  hp_chps_paras.append([])

for i in range(17):
  for element in hp_chps[i].splitlines():
    if len(element) > 50:
      hp_chps_paras[i].append(element)
      print(f'added paragraph of length {len(element)}')
    else:
      print(f'rejected paragraph of length {len(element)}')
  print(f'\nfinished processing chapter {i+1}\n')
# for i in range(17):
#   hp_chps.append(nlp(hp_chps[i]))
#   print(f'finished processing chapter {i+1}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
rejected paragraph of length 0
rejected paragraph of length 0
rejected paragraph of length 0
added paragraph of length 224
rejected paragraph of length 0
added paragraph of length 178
rejected paragraph of length 0
rejected paragraph of length 38
rejected paragraph of length 0
added paragraph of length 159
rejected paragraph of length 0
rejected paragraph of length 22
rejected paragraph of length 0
added paragraph of length 295
rejected paragraph of length 0
added paragraph of length 367
rejected paragraph of length 0
added paragraph of length 77
rejected paragraph of length 0
added paragraph of length 370
rejected paragraph of length 0
rejected paragraph of length 8
rejected paragraph of length 0
added paragraph of length 113
rejected paragraph of length 0
added paragraph of length 275
rejected paragraph of length 0
added paragraph of length 170
rejected paragraph of length 0
added paragraph of length 65
rejected paragra

In [12]:
hp_para_spc = list()

# create empty lists for each chapter---later you will populate em with paragraphs
for i in range(17):
  hp_para_spc.append([])

for i in range(17):
  for another_i in range(len(hp_chps_paras[i])):
    hp_para_spc[i].append(nlp(hp_chps_paras[i][another_i]))
    print(f'finished processing paragraph {another_i+1} in chapter {i+1}')
  print(f'\nfinished processing chapter {i+1}\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
finished processing paragraph 481 in chapter 10
finished processing paragraph 482 in chapter 10
finished processing paragraph 483 in chapter 10
finished processing paragraph 484 in chapter 10
finished processing paragraph 485 in chapter 10
finished processing paragraph 486 in chapter 10
finished processing paragraph 487 in chapter 10
finished processing paragraph 488 in chapter 10
finished processing paragraph 489 in chapter 10
finished processing paragraph 490 in chapter 10
finished processing paragraph 491 in chapter 10
finished processing paragraph 492 in chapter 10
finished processing paragraph 493 in chapter 10
finished processing paragraph 494 in chapter 10
finished processing paragraph 495 in chapter 10
finished processing paragraph 496 in chapter 10
finished processing paragraph 497 in chapter 10
finished processing paragraph 498 in chapter 10
finished processing paragraph 499 in chapter 10
finished processing par

In [13]:
list_pos = list()
chapter_count = 1
# retrieve POS counts, sentence counts and token counts for all the chapters
for chapter in hp_para_spc:
  for paragraph in chapter:
    poscount = Counter()
    poscount['chapter'] = chapter_count
    for sent in paragraph.sents:
      poscount['sentence_count'] += 1
      for tok in sent:
        poscount[tok.pos_] += 1
        poscount['token_count'] += 1
  # you store the Counter in a list of counters
    list_pos.append(poscount)
  chapter_count += 1

# It's difficult to work with list of counters. Convert it to DataFrame
# to perform easy and quick analyses. Remember to fill NaN values
pos_df = pd.DataFrame(list_pos).fillna(0)
# Index starts at 1, indicating paragraph number
pos_df.index += 1
# change name of the index
pos_df.index.names = ['paragraph']

# Perform some calculations---raw numbers don't give you much
pos_df['adj_per_sent'] = pos_df.ADJ/pos_df.sentence_count
pos_df['adj_proportion'] = pos_df.ADJ/pos_df.token_count
pos_df['verb_per_sent'] = pos_df.VERB/pos_df.sentence_count
pos_df['verb_proportion'] = pos_df.VERB/pos_df.token_count
pos_df['noun_per_sent'] = pos_df.NOUN/pos_df.sentence_count
pos_df['noun_proportion'] = pos_df.NOUN/pos_df.token_count
pos_df['adv_per_sent'] = pos_df.ADV/pos_df.sentence_count
pos_df['adv_proportion'] = pos_df.ADV/pos_df.token_count
pos_df['propn_per_sent'] = pos_df.PROPN/pos_df.sentence_count
pos_df['propn_proportion'] = pos_df.PROPN/pos_df.token_count
pos_df['avg_sentence_len'] = pos_df.token_count/pos_df.sentence_count

In [14]:
pos_df.tail(30)

Unnamed: 0_level_0,chapter,sentence_count,PROPN,token_count,CCONJ,PUNCT,ADP,NOUN,NUM,AUX,ADJ,PART,VERB,SCONJ,PRON,ADV,DET,INTJ,X,SYM,adj_per_sent,adj_proportion,verb_per_sent,verb_proportion,noun_per_sent,noun_proportion,adv_per_sent,adv_proportion,propn_per_sent,propn_proportion,avg_sentence_len
paragraph,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
21471,17,2,1.0,14,0.0,2.0,0.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.5,0.071429,1.0,0.142857,1.0,0.142857,1.0,0.142857,0.5,0.071429,7.0
21472,17,4,2.0,26,0.0,9.0,1.0,2.0,0.0,1.0,2.0,1.0,4.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.5,0.076923,1.0,0.153846,0.5,0.076923,0.0,0.0,0.5,0.076923,6.5
21473,17,1,1.0,17,0.0,2.0,2.0,3.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,2.0,0.117647,2.0,0.117647,3.0,0.176471,0.0,0.0,1.0,0.058824,17.0
21474,17,1,3.0,25,0.0,6.0,3.0,4.0,1.0,1.0,1.0,0.0,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.04,3.0,0.12,4.0,0.16,1.0,0.04,3.0,0.12,25.0
21475,17,4,3.0,44,0.0,9.0,2.0,9.0,0.0,1.0,5.0,1.0,6.0,0.0,1.0,2.0,5.0,0.0,0.0,0.0,1.25,0.113636,1.5,0.136364,2.25,0.204545,0.5,0.045455,0.75,0.068182,11.0
21476,17,1,5.0,28,0.0,6.0,5.0,5.0,1.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,2.0,0.071429,1.0,0.035714,5.0,0.178571,0.0,0.0,5.0,0.178571,28.0
21477,17,2,2.0,34,1.0,4.0,5.0,6.0,1.0,3.0,0.0,0.0,3.0,0.0,3.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,1.5,0.088235,3.0,0.176471,1.0,0.058824,1.0,0.058824,17.0
21478,17,3,6.0,35,1.0,11.0,2.0,4.0,1.0,0.0,4.0,0.0,3.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.333333,0.114286,1.0,0.085714,1.333333,0.114286,0.333333,0.028571,2.0,0.171429,11.666667
21479,17,3,6.0,52,1.0,6.0,2.0,3.0,5.0,4.0,3.0,0.0,7.0,4.0,3.0,4.0,4.0,0.0,0.0,0.0,1.0,0.057692,2.333333,0.134615,1.0,0.057692,1.333333,0.076923,2.0,0.115385,17.333333
21480,17,2,1.0,11,0.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.5,0.090909,1.0,0.181818,1.0,0.181818,0.5,0.090909,0.5,0.090909,5.5


In [15]:
print(tabulate(pos_df[:150], headers='keys'))

  paragraph    chapter    sentence_count    PROPN    token_count    CCONJ    PUNCT    ADP    NOUN    NUM    AUX    ADJ    PART    VERB    SCONJ    PRON    ADV    DET    INTJ    X    SYM    adj_per_sent    adj_proportion    verb_per_sent    verb_proportion    noun_per_sent    noun_proportion    adv_per_sent    adv_proportion    propn_per_sent    propn_proportion    avg_sentence_len
-----------  ---------  ----------------  -------  -------------  -------  -------  -----  ------  -----  -----  -----  ------  ------  -------  ------  -----  -----  ------  ---  -----  --------------  ----------------  ---------------  -----------------  ---------------  -----------------  --------------  ----------------  ----------------  ------------------  ------------------
          1          1                 2        5             54        2        7      3       3      1      5      6       3       6        2       6      4      1       0    0      0        3                0.111111          3   