<a href="https://colab.research.google.com/github/kleczekr/tolkenizer/blob/master/fantastic_quirks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import spacy
from tabulate import tabulate
from collections import Counter
import pandas as pd

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
# This cell is meant to accommodate the Google Colab way of dealing with reading 
# files from Google Drive; feel free to ignore it if you are running the notebook
# on your local machine
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
with  open('/content/drive/My Drive/book_txt/hp/hp1.txt', 'r') as f:
  hp1 = f.read()

In [6]:
hp_chps = []

# retrieve chapters 1-16
for i in range(1,17):
  hp_chps.append(hp1.split('\nChapter '+str(i)+'\n')[1].split('\nChapter '+str(i+1)+'\n')[0])

# retrieve the text of the chapter 17
hp_chps.append(hp1.split('\nChapter 17\n')[1])

In [7]:
# retrieve a list of paragraphs for each chapter!
hp_chps_paras = list()
# create empty lists for each chapter---later you will populate em with paragraphs
for i in range(17):
  hp_chps_paras.append([])

for i in range(17):
  for element in hp_chps[i].splitlines():
    if len(element) > 50:
      hp_chps_paras[i].append(element)
      print(f'added paragraph of length {len(element)}')
    else:
      print(f'rejected paragraph of length {len(element)}')
  print(f'\nfinished processing chapter {i+1}\n')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
added paragraph of length 189
rejected paragraph of length 0
added paragraph of length 518
rejected paragraph of length 0
added paragraph of length 163
rejected paragraph of length 0
added paragraph of length 106
rejected paragraph of length 0
rejected paragraph of length 30
rejected paragraph of length 0
added paragraph of length 289
rejected paragraph of length 0
added paragraph of length 104
rejected paragraph of length 0
added paragraph of length 231
rejected paragraph of length 0
added paragraph of length 131
rejected paragraph of length 0
rejected paragraph of length 20
rejected paragraph of length 0
added paragraph of length 88
rejected paragraph of length 0
added paragraph of length 225
rejected paragraph of length 0
added paragraph of length 62
rejected paragraph of length 0
added paragraph of length 99
rejected paragraph of length 0
rejected paragraph of length 32
rejected paragraph of length 0
added paragraph o

In [8]:
hp_para_spc = list()

# create empty lists for each chapter---later you will populate em with paragraphs
for i in range(17):
  hp_para_spc.append([])

for i in range(17):
  for another_i in range(len(hp_chps_paras[i])):
    hp_para_spc[i].append(nlp(hp_chps_paras[i][another_i]))
    print(f'finished processing paragraph {another_i+1} in chapter {i+1}')
  print(f'\nfinished processing chapter {i+1}\n')

finished processing paragraph 1 in chapter 1
finished processing paragraph 2 in chapter 1
finished processing paragraph 3 in chapter 1
finished processing paragraph 4 in chapter 1
finished processing paragraph 5 in chapter 1
finished processing paragraph 6 in chapter 1
finished processing paragraph 7 in chapter 1
finished processing paragraph 8 in chapter 1
finished processing paragraph 9 in chapter 1
finished processing paragraph 10 in chapter 1
finished processing paragraph 11 in chapter 1
finished processing paragraph 12 in chapter 1
finished processing paragraph 13 in chapter 1
finished processing paragraph 14 in chapter 1
finished processing paragraph 15 in chapter 1
finished processing paragraph 16 in chapter 1
finished processing paragraph 17 in chapter 1
finished processing paragraph 18 in chapter 1
finished processing paragraph 19 in chapter 1
finished processing paragraph 20 in chapter 1
finished processing paragraph 21 in chapter 1
finished processing paragraph 22 in chapter

In [10]:
list_pos = list()
chapter_count = 1
# retrieve POS counts, sentence counts and token counts for all the chapters
for chapter in hp_para_spc:
  for paragraph in chapter:
    poscount = Counter()
    poscount['chapter'] = chapter_count
    poscount['text'] = paragraph.text
    for sent in paragraph.sents:
      poscount['sentence_count'] += 1
      for tok in sent:
        poscount[tok.pos_] += 1
        poscount['token_count'] += 1
  # you store the Counter in a list of counters
    list_pos.append(poscount)
  chapter_count += 1

In [11]:
# It's difficult to work with list of counters. Convert it to DataFrame
# to perform easy and quick analyses. Remember to fill NaN values
pos_df = pd.DataFrame(list_pos).fillna(0)
# change name of the index
pos_df.index.names = ['paragraph']

In [12]:
# Perform some calculations---raw numbers don't give you much
pos_df['adj_per_sent'] = pos_df.ADJ/pos_df.sentence_count
pos_df['adj_proportion'] = pos_df.ADJ/pos_df.token_count
pos_df['verb_per_sent'] = pos_df.VERB/pos_df.sentence_count
pos_df['verb_proportion'] = pos_df.VERB/pos_df.token_count
pos_df['noun_per_sent'] = pos_df.NOUN/pos_df.sentence_count
pos_df['noun_proportion'] = pos_df.NOUN/pos_df.token_count
pos_df['adv_per_sent'] = pos_df.ADV/pos_df.sentence_count
pos_df['adv_proportion'] = pos_df.ADV/pos_df.token_count
pos_df['propn_per_sent'] = pos_df.PROPN/pos_df.sentence_count
pos_df['propn_proportion'] = pos_df.PROPN/pos_df.token_count
pos_df['avg_sentence_len'] = pos_df.token_count/pos_df.sentence_count

In [13]:
# reduce the number of columns in the dataframe to most essential ones
pos_df_slim = pos_df[['chapter', 'text', 'sentence_count', 'token_count', 'adj_per_sent',\
                     'adj_proportion', 'verb_per_sent', 'verb_proportion',\
                     'noun_per_sent', 'noun_proportion', 'adv_per_sent',\
                     'adv_proportion', 'propn_per_sent', 'propn_proportion',\
                     'avg_sentence_len']].copy()

In [14]:
# add several more columns to the dataframe---counts of standard deviations above/below
# the mean for: proportion of adjectives (std_adj), proportion of verbs (std_v), proportion of nouns (std_n),
# proportion of adverbs (std_adv) and proportion of proper nouns (std_pn)
pos_df_slim['std_adj'] = (pos_df_slim.adj_proportion - pos_df_slim.adj_proportion.mean(axis=0)) / pos_df_slim.adj_proportion.std(axis=0)
pos_df_slim['std_v'] = (pos_df_slim.verb_proportion - pos_df_slim.verb_proportion.mean(axis=0)) / pos_df_slim.verb_proportion.std(axis=0)
pos_df_slim['std_n'] = (pos_df_slim.noun_proportion - pos_df_slim.noun_proportion.mean(axis=0)) / pos_df_slim.noun_proportion.std(axis=0)
pos_df_slim['std_adv'] = (pos_df_slim.adv_proportion - pos_df_slim.adv_proportion.mean(axis=0)) / pos_df_slim.adv_proportion.std(axis=0)
pos_df_slim['std_pn'] = (pos_df_slim.propn_proportion - pos_df_slim.propn_proportion.mean(axis=0)) / pos_df_slim.propn_proportion.std(axis=0)

In [15]:
# calculate weirdness of a sentence---the sum of absolute values of the calculated number
# of standard deviations from the mean that you've calculated before.
pos_df_slim['weirdness'] = abs(pos_df_slim.std_adj) + abs(pos_df_slim.std_v) + \
abs(pos_df_slim.std_n) + abs(pos_df_slim.std_adv) + abs(pos_df_slim.std_pn)

In [16]:
pos_df_slim.head()

Unnamed: 0_level_0,chapter,text,sentence_count,token_count,adj_per_sent,adj_proportion,verb_per_sent,verb_proportion,noun_per_sent,noun_proportion,adv_per_sent,adv_proportion,propn_per_sent,propn_proportion,avg_sentence_len,std_adj,std_v,std_n,std_adv,std_pn,weirdness
paragraph,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,"Mr. and Mrs. Dursley, of number four, Privet D...",2,54,3.0,0.111111,3.0,0.111111,1.5,0.055556,2.0,0.074074,2.5,0.092593,27.0,1.815252,-0.718454,-0.878098,0.448246,0.272761,4.132811
1,1,Mr. Dursley was the director of a firm called ...,4,94,2.5,0.106383,1.75,0.074468,3.75,0.159574,1.5,0.06383,1.75,0.074468,23.5,1.693521,-1.425326,0.786088,0.217925,-0.03578,4.15864
2,1,"The Dursleys had everything they wanted, but t...",6,155,1.166667,0.045161,3.166667,0.122581,2.333333,0.090323,1.0,0.03871,2.333333,0.090323,25.833333,0.117314,-0.497198,-0.321864,-0.346846,0.234118,1.517341
3,1,"When Mr. and Mrs. Dursley woke up on the dull,...",3,72,2.333333,0.097222,3.333333,0.138889,2.333333,0.097222,2.333333,0.097222,3.0,0.125,24.0,1.45767,-0.1826,-0.211478,0.968682,0.824446,3.644875
4,1,"None of them noticed a large, tawny owl flutte...",1,14,2.0,0.142857,1.0,0.071429,4.0,0.285714,0.0,0.0,0.0,0.0,14.0,2.632582,-1.48396,2.804184,-1.21715,-1.30348,9.441356


In [20]:
print(tabulate(pos_df_slim[pos_df_slim.weirdness>10], headers='keys'))

  paragraph    chapter  text                                                                                  sentence_count    token_count    adj_per_sent    adj_proportion    verb_per_sent    verb_proportion    noun_per_sent    noun_proportion    adv_per_sent    adv_proportion    propn_per_sent    propn_proportion    avg_sentence_len    std_adj      std_v      std_n    std_adv     std_pn    weirdness
-----------  ---------  ----------------------------------------------------------------------------------  ----------------  -------------  --------------  ----------------  ---------------  -----------------  ---------------  -----------------  --------------  ----------------  ----------------  ------------------  ------------------  ---------  ---------  ---------  ---------  ---------  -----------
        319          4  (Order of Merlin, First Class, Grand Sorc., Chf. Warlock,                                          2             16               0         0                      0