In [3]:
# Convert the Shakespeare text into a lowercase word-frequency list using a shell pipeline.
# The stages below are listed in the order in which the data flows through them:
# 1. `< file.txt`             → feed the input text file to the first `tr` on standard input.
# 2. `tr 'A-Z' 'a-z'`         → map all uppercase letters to lowercase.
# 3. `tr -sc 'A-Za-z' '\n'`   → replace every non-alphabetic character with a newline (`-c` complements
#                               the set) and squeeze repeated newlines into one (`-s`), effectively
#                               splitting the text into one word per line.
# 4. `sort`                   → sort all words alphabetically so that duplicates become adjacent.
# 5. `uniq -c`                → collapse adjacent duplicates, prefixing each word with its count.
# 6. `sort -n -r`             → sort the result numerically in descending order (most frequent words first).
# Note: apostrophes are non-alphabetic, which is presumably why fragments such as `s`, `d` and `ll`
# show up as separate "words" in the output (split contractions).
!tr 'A-Z' 'a-z' < '../data/shakespeare/Shakespeare_clean_full.txt' | tr -sc 'A-Za-z' '\n' | sort | uniq -c | sort -n -r

6219 the
5559 and
4877 i
4252 to
3449 of
3063 a
3031 you
2559 my
2439 that
2370 in
2210 is
1963 not
1896 it
1770 me
1634 with
1616 s
1568 for
1470 this
1442 be
1439 he
1431 but
1368 his
1273 have
1241 your
1180 as
1173 what
1170 d
1146 thou
1112 so
1088 him
1082 will
1029 do
 880 o
 828 all
 822 shall
 811 her
 798 we
 796 no
 775 by
 742 if
 742 are
 729 on
 674 thy
 662 our
 647 lord
 632 come
 627 thee
 608 good
 605 now
 575 from
 573 caesar
 571 she
 569 love
 563 enter
 560 let
 532 here
 527 there
 527 or
 524 at
 521 they
 517 antony
 507 which
 505 well
 505 more
 491 ll
 483 would
 476 then
 476 am
 475 hamlet
 457 when
 454 was
 451 how
 447 did
 436 man
 428 go
 426 their
 411 know
 408 hath
 403 upon
 403 them
 402 an
 390 brutus
 388 say
 386 should
 384 night
 371 us
 365 yet
 365 than
 365 one
 363 sir
 363 must
 361 iago
 359 make
 353 like
 352 may
 350 mark
 342 tis
 338 

In [4]:
# Read the whole content of Shakespeare_clean_full.txt into the string variable `shake_text`.
# The context manager guarantees the file handle is closed as soon as the read completes
# (a bare `open(...).read()` leaves closing to the garbage collector).
# NOTE(review): no encoding is passed, so the platform default applies — consider adding
# encoding='utf-8' once the file's actual encoding has been confirmed.
with open('../data/shakespeare/Shakespeare_clean_full.txt', 'r') as shake_file:
    shake_text = shake_file.read()

In [5]:
from collections import Counter

from nltk import regexp_tokenize

# Define a regex pattern for tokenization:
# - \b[A-Za-z]+\b : match whole words consisting of alphabetic characters
# - [.,'?]        : also match the punctuation marks . , ' ?
# (?x) flag       : verbose mode, so the whitespace and newlines used to lay out the pattern are ignored
regex_pattern = r'''(?x)
    \b[A-Za-z]+\b
    | [.,'?]
'''

# Tokenize the Shakespeare text (converted to lowercase) according to the regex pattern.
shake_tokenized = regexp_tokenize(shake_text.lower(), regex_pattern, discard_empty=True)

# Map token → frequency, ordered by descending frequency.
# Counter does the counting; most_common() sorts by count with a stable sort, so tokens with
# equal counts keep their first-occurrence order. Converting back to dict keeps `token_dict`
# a plain dictionary for any later cell that uses it.
token_dict = dict(Counter(shake_tokenized).most_common())

# Print the frequencies and corresponding tokens, one "count token" pair per line
for token, count in token_dict.items():
    print(f"{count} {token}")

# Print the number of unique tokens
print(len(token_dict))

18692 ,
8379 .
6219 the
5559 and
5514 '
4877 i
4252 to
3449 of
3063 a
3031 you
2559 my
2439 that
2370 in
2364 ?
2210 is
1963 not
1896 it
1770 me
1634 with
1616 s
1568 for
1470 this
1442 be
1439 he
1431 but
1368 his
1273 have
1241 your
1180 as
1173 what
1170 d
1146 thou
1112 so
1088 him
1082 will
1029 do
880 o
828 all
822 shall
811 her
798 we
796 no
775 by
742 if
742 are
729 on
674 thy
662 our
647 lord
632 come
627 thee
608 good
605 now
575 from
573 caesar
571 she
569 love
563 enter
560 let
532 here
527 there
527 or
524 at
521 they
517 antony
507 which
505 more
505 well
491 ll
483 would
476 then
476 am
475 hamlet
457 when
454 was
451 how
447 did
436 man
428 go
426 their
411 know
408 hath
403 upon
403 them
402 an
390 brutus
388 say
386 should
384 night
371 us
365 one
365 yet
365 than
363 must
363 sir
361 iago
359 make
353 like
352 may
350 mark
342 tis
338 some
336 othello
335 where
333 see
333 such
331 give
327 t
324 why
315 romeo
313 lady
311 speak
310 out
307 most
295 had
291 macbeth
2