# Exercise 1: The Books

In [53]:
# Import the libs

import numpy as np
import requests
import tiktoken
from urllib.parse import urlparse

In [2]:
# Load tiktoken's "cl100k_base" encoding; every later cell tokenizes with this object.
# NOTE(review): the variable name suggests this is the GPT-4 tokenizer — confirm against the tiktoken docs.
gbt4_tokenizer = tiktoken.get_encoding("cl100k_base")
# Bare expression so the notebook displays the vocabulary size as the cell output
gbt4_tokenizer.n_vocab

100277

In [3]:
# Common URL prefix shared by every book; the numeric code appended to it
# is what distinguishes one book from another.
baseurl = "https://www.gutenberg.org/cache/epub/"

# One [code, title] pair per book: the code is kept as a string because it is
# concatenated into the URL, and the title is a short label for the book.
bookurls = [
    ["84", "Frankenstein"],
    ["64317", "GreatGatsby"],
    ["11", "AliceWonderland"],
    ["1513", "RomeoJuliet"],
    ["76", "HuckFinn"],
    ["219", "HeartDarkness"],
    ["2591", "GrimmsTales"],
    ["2148", "EdgarAllenPoe"],
    ["36", "WarOfTheWorlds"],
    ["829", "GulliversTravels"],
]

In [51]:
# One row per book; columns are
#   0: number of characters, 1: number of tokens, 2: tokens per character (in %)
infos = np.zeros( (len(bookurls),3) )

for idx,(c,b) in enumerate(bookurls):

    # The text URL is built from the numeric code: <baseurl><code>/pg<code>.txt
    # - timeout: without it, a stalled connection would hang the cell indefinitely
    # - raise_for_status: otherwise an HTTP error page would be tokenized and
    #   reported as if it were the book
    response = requests.get(baseurl+c+"/pg"+c+".txt", timeout=30)
    response.raise_for_status()
    text = response.text

    tokens = gbt4_tokenizer.encode(text)

    # Tokens per character; NaN instead of a ZeroDivisionError if the body is empty
    compression = len(tokens)/len(text) if text else float("nan")

    infos[idx,0] = len(text)
    infos[idx,1] = len(tokens)
    infos[idx,2] = compression*100

In [52]:
# Header row followed by a 60-character rule (the width of the header line)
print("| Book Title       | Characters  | Tokens    | Compression |\n"+"-"*60)

# Walk the titles and their result rows together rather than a hard-coded
# range(10), so the table keeps working when books are added or removed.
# Each row of infos is (characters, tokens, compression in %).
# The trailing "\n" in the f-string leaves a blank line after every row.
for (_code, title), (n_chars, n_tokens, pct) in zip(bookurls, infos):
    print(f"| {title:16} | {int(n_chars):<11,} | {int(n_tokens):<8,}  | {pct:>10.2f}% |\n")

| Book Title       | Characters  | Tokens    | Compression |
------------------------------------------------------------
| Frankenstein     | 446,544     | 102,419   |      22.94% |

| GreatGatsby      | 296,858     | 70,343    |      23.70% |

| AliceWonderland  | 167,674     | 41,457    |      24.72% |

| RomeoJuliet      | 167,426     | 43,761    |      26.14% |

| HuckFinn         | 602,714     | 159,125   |      26.40% |

| HeartDarkness    | 232,885     | 56,483    |      24.25% |

| GrimmsTales      | 549,736     | 137,252   |      24.97% |

| EdgarAllenPoe    | 632,131     | 144,315   |      22.83% |

| WarOfTheWorlds   | 363,420     | 84,580    |      23.27% |

| GulliversTravels | 611,742     | 143,560   |      23.47% |



# Exercise 2: Repeat With Websites

In [54]:
# Full URLs of the websites to process in this exercise, one string per site
weburls = [
    "http://python.org/",
    "https://pytorch.org/",
    "https://en.wikipedia.org/wiki/List_of_English_words_containing_Q_not_followed_by_U",
    "https://sudoku.com/",
    "https://reddit.com/",
    "https://visiteurope.com/en/",
    "https://sincxpress.com/",
    "https://openai.com/",
    "https://theuselessweb.com/",
    "https://maps.google.com/",
    "https://pigeonsarentreal.co.uk/",
]

In [None]:
# One row per website; columns are
#   0: number of characters, 1: number of tokens, 2: tokens per character (in %)
web_infos = np.zeros( (len(weburls),3) )

for idx,web in enumerate(weburls):

    # Request the website URL itself. The timeout keeps a stalled connection
    # from hanging the cell. The status code is not checked, so whatever body
    # the server returns is what gets counted.
    text = requests.get(web, timeout=30).text

    # Page content is arbitrary text: disallowed_special=() makes any
    # special-token string (e.g. "<|endoftext|>") get encoded as ordinary
    # text instead of making encode() raise.
    tokens = gbt4_tokenizer.encode(text, disallowed_special=())

    # Tokens per character; NaN instead of a ZeroDivisionError if the body is empty
    compression = len(tokens)/len(text) if text else float("nan")

    # Store into web_infos (sized for weburls), not the books' results array
    web_infos[idx,0] = len(text)
    web_infos[idx,1] = len(tokens)
    web_infos[idx,2] = compression*100