In [1]:
# Distribution for the book
from collections import Counter
import re
from collections import defaultdict


with open("J. K. Rowling - Harry Potter 1 - Sorcerer's Stone.txt", 'r', encoding='utf-8') as file:
    text = file.read()

# Convert every word to lower case and also remove puncuation
text = re.sub(r'[^\w\s]-', '', text.lower())

# Get a list of the words
words = text.split()

# Count of words in the book
len_words = len(words)

# Count the frequency of each word
word_freq = Counter(words)

# 20 most common words
most_common_words = word_freq.most_common(20)

# Find the count of the top 20 words used
total_top_word_freq = 0
for tup in most_common_words:
    total_top_word_freq += tup[1]

# Get the percent of time each word is used in the top 20
word_percent_dict = defaultdict(lambda: float)
for word, freq in most_common_words:
    word_percent_dict[word] = (freq/total_top_word_freq)

print(f"The length of Harry Potter and the Sorcer's Stone: {len_words}")
print(f'The number of times the top 20 most common words are used: {total_top_word_freq}')

print("Top 20 most common words in Harry Potter and the Sorcerer's Stone and the percent of time used:")
print(word_percent_dict)

The length of Harry Potter and the Sorcer's Stone: 77762
The number of times the top 20 most common words are used: 21620
Top 20 most common words in Harry Potter and the Sorcerer's Stone and the percent of time used:
defaultdict(<function <lambda> at 0x0000019117C06020>, {'the': 0.16577243293246993, 'and': 0.08589269195189639, 'to': 0.08487511563367253, 'a': 0.07715078630897318, 'he': 0.06877890841813136, 'of': 0.057585568917668827, 'was': 0.053422756706753005, 'his': 0.04310823311748381, 'in': 0.04278445883441258, 'harry': 0.041766882516188715, 'it': 0.03432007400555041, 'had': 0.03214616096207216, 'said': 0.030527289546716005, 'you': 0.028769657724329326, 'at': 0.028769657724329326, 'they': 0.02673450508788159, 'on': 0.025948196114708603, 'that': 0.024421831637372802, 'as': 0.024144310823311747, 'i': 0.023080481036077707})


In [2]:
# Get the Zipf Distribution
denom = 0
for i in range(1, 21):
    denom += 1/(i+2.7)


zipf_list = list()
for k in range(1, 21):
    zipf_val = (1/(k+2.7))/(denom)
    zipf_list.append(zipf_val)

print(f'The Zipf Distrubtion: {zipf_list}')

The Zipf Distrubtion: [0.1367020048361731, 0.1076164718923065, 0.08873638910418255, 0.07549215192445381, 0.06568797634984942, 0.05813763424067133, 0.05214406370039594, 0.04727078671905052, 0.04323054853793509, 0.03982656833809768, 0.036919519554294934, 0.03440798761182589, 0.032216396044193664, 0.030287270532565302, 0.028576125304736754, 0.027047990261702702, 0.025674995832174648, 0.02443465786926766, 0.023308636769301408, 0.02228182457682117]


In [3]:
from scipy import stats

observed = [x*total_top_word_freq for x in list(word_percent_dict.values())]
expected = [x*total_top_word_freq for x in zipf_list]

test_result = stats.chisquare(observed, expected)

print(f'The test statistic is {test_result.statistic} with a p-value of {test_result.pvalue}')

The test statistic is 259.05401881632434 with a p-value of 4.516633532582998e-44
