In [None]:
# Information

# Comparisons
# Referencing lecture notes & web pages from the Internet:
# https://github.com/ianmcloughlin/2223-S1-fund-data-analysis/blob/main/notebooks/01-information.ipynb
# https://tdan.com/what-alice-in-wonderland-can-teach-us-about-data-analytics/22498
# https://richardbrath.wordpress.com/2021/10/31/58-ways-to-visualize-alice-in-wonderland/
# https://www.itsoc.org/about/shannon
# https://www.educative.io/answers/how-to-generate-a-random-string-in-python
# https://www.geeksforgeeks.org/python-generate-random-string-of-given-length/
# http://www.unit-conversion.info/texttools/random-string-generator/
# https://towardsdatascience.com/fundamentals-of-statistics-for-data-scientists-and-data-analysts-69d93a05aae7
# https://miro.com/templates/random-words/
# https://www.pewresearch.org/methods/2018/01/26/how-different-weighting-methods-work/

# Making HTTP requests for Internet resources
import urllib.request

# The URL for the book Alice in Wonderland
alice_url = 'https://www.gutenberg.org/files/11/11-0.txt'

# Retrieving the book. The context manager closes the HTTP response as soon as
# every line has been read, instead of leaving the connection open.
with urllib.request.urlopen(alice_url) as response:
    # Decoding all lines ('utf-8-sig' also drops a leading byte-order mark)
    # and stripping line endings
    wonderland = [line.decode('utf-8-sig').strip() for line in response]

# Retrieving & printing a specified paragraph.
# The line endings were stripped above, so the lines are joined with a space;
# joining with an empty string glues the last word of each line to the first
# word of the next one.
# NOTE: the slice 58:63 depends on the line layout of the downloaded file.
para = ' '.join(line for line in wonderland[58:63] if line)
print(para)

In [None]:
# Tidying the printed paragraph so that only plain letters and spaces remain

# chars holds every character that is allowed to survive:
# the 26 lower-case letters plus the space character
chars = 'abcdefghijklmnopqrstuvwxyz '

# Lower-casing the paragraph, then keeping only the characters found in chars
# (anything else, such as punctuation, is dropped)
madhatter = ''.join(ch for ch in para.lower() if ch in chars)
print(madhatter)

In [None]:
# The paragraph now has no punctuation, because punctuation marks are not part of the chars variable.

# Generating a random sequence of letters from the chars variable
# by using the random module in Python
import random

# Seeding the generator so that restarting the kernel and running every cell
# again reproduces exactly the same "random" text
random.seed(42)

# A single random character
print(random.choice(chars))

# Retrieving the length of the above paragraph
N = len(madhatter)
print(N)

# Generating N random characters (every character in chars equally likely)
# and joining them together to form one string
gen = ''.join(random.choices(chars, k=N))
print(gen)

In [None]:
# Retrieving the entire book Alice in Wonderland as one large lower-case string.
# The lines had their line endings stripped, so they are joined with a space
# (blank lines are skipped). Joining with an empty string fuses the last word of
# every line with the first word of the next, which under-counts the space
# character and invents letter pairs that never occur in the text.
entirebook = ' '.join(line for line in wonderland[26:] if line).lower()

# Creating weights:
# A weight counts the occurrences of each character throughout
# the entire book; the list is in the same order as chars
weights = [entirebook.count(c) for c in chars]

# Generating a string of N characters using the weights and joining together
wgen = ''.join(random.choices(chars, weights=weights, k=N))
print(wgen)

In [None]:
# Creating weights which include the previous character:
# twofer[c][d] is the number of times character d directly follows character c
# in the book, for every c and d in chars (pairs that never occur get 0).
from collections import Counter

# Counting every adjacent pair of characters in a single pass over the book.
# str.count only counts non-overlapping matches (it finds 'ee' once in 'eee'),
# whereas stepping through the text one character at a time counts both pairs.
pair_counts = Counter(zip(entirebook, entirebook[1:]))
twofer = {c: {d: pair_counts[(c, d)] for d in chars} for c in chars}
twofer

In [None]:
# Walking through chars by position so each character can be matched to its weight
for i, letter in enumerate(chars):
    # Printing the character followed by the number of times
    # it appears in the book Alice in Wonderland
    print(f'{letter}:{weights[i]}')

In [None]:
# The generated text begins as a single space character
pairs = ' '

# Adding one character per pass until the text is N characters long (N-1 passes)
while len(pairs) < N:
    # Counts of the characters that followed the most recent character in the book
    followers = twofer[pairs[-1]]
    # Drawing one character, using those counts (arranged in chars order) as weights,
    # and appending it to the text
    pairs += random.choices(chars, weights=[followers[c] for c in chars], k=1)[0]
pairs

In [None]:
# Exercise 1: Adapt the code above to generate a 1000 character long string with weights based on the previous two characters

from collections import Counter

# Initialising the length of the string to 1,000 characters
N = 1000

# Counting every run of three consecutive characters in the book in one pass:
# triple_counts[(a, b, c)] is how often c directly follows the two characters a, b
triple_counts = Counter(zip(entirebook, entirebook[1:], entirebook[2:]))


def next_char_weights(context):
    """Return the weights, in chars order, for the character that follows context.

    The weights come from the last two characters of context. When context is
    shorter than two characters, or that pair is never followed by a character
    from chars, the single-character table twofer is used instead; if that is
    all zeros as well, every character gets the same weight. The fallbacks are
    needed because random.choices fails when every weight is zero.
    """
    wt = []
    if len(context) >= 2:
        wt = [triple_counts[(context[-2], context[-1], c)] for c in chars]
    if not any(wt):
        wt = [twofer[context[-1]][c] for c in chars]
    if not any(wt):
        wt = [1] * len(chars)
    return wt


# Starting with a space
pairs = ' '

# Conduct the following for loop N-1 times
for i in range(1, N):
    # Randomly choosing the next character, weighted by what followed
    # the previous two characters in the book, and appending it to pairs
    pairs += random.choices(chars, weights=next_char_weights(pairs), k=1)[0]
pairs

In [None]:
# Entropy

# Referencing lecture notes & web pages from the Internet:
# https://github.com/ianmcloughlin/2223-S1-fund-data-analysis/blob/main/notebooks/01-information.ipynb
# https://www.analyticsvidhya.com/blog/2020/11/entropy-a-key-concept-for-all-data-science-beginners/
# https://machinelearningmastery.com/what-is-information-entropy/

# Entropy describes uncertainty: scenarios with less organisation, less control
# and fewer constraints/restrictions have more of it.
# Example: I am offered a coffee sachet that is either a mocha or a cappuccino.
#
# If the vendor only has mocha, the entropy (uncertainty) is 0:
#   P(Coffee Sachet == Mocha) = 1
#   P(Coffee Sachet == Cappuccino) = 1 - 1 = 0
#   Guaranteed mocha
#
# If both mocha & cappuccino are available to me:
#   P(Coffee Sachet == Mocha) = 0.5
#   P(Coffee Sachet == Cappuccino) = 1 - 0.5 = 0.5
#   50/50 likelihood of mocha or cappuccino

# Importing math functions
import math

# Printing the information content, -log2(p), for p = 0.5, 0.25 and 0.75 in turn
for prob in (0.5, 0.25, 0.75):
    print(-math.log(prob, 2))

In [None]:
# Information content for the calculations above:
# 1 bit
# 2 bits
# about 0.415 bits


def shannon_entropy(probabilities):
    """Return the Shannon entropy, in bits, of a list of probabilities.

    The entropy is the average information content, -sum(p * log2(p)), taken
    over every outcome. Outcomes with probability 0 are skipped: they add
    nothing to the average, and math.log(0, 2) would raise ValueError.
    """
    return -sum(p_i * math.log(p_i, 2) for p_i in probabilities if p_i > 0)


# Two equally likely messages (a fair bit)
p = [0.5, 0.5]
print(shannon_entropy(p))

# If 1 is sent with probability 3/4 and 0 is sent with probability 1/4, what's the overall information content?
# Claude Shannon proposed calculating the average content for all messages sent
p = [0.25, 0.75]
print(shannon_entropy(p))

In [None]:
# Import plotting & numerical capabilities
import matplotlib.pyplot as plt
import numpy as np

# Acquiring the range of p values. The end points 0 and 1 are left out
# because log2(0) is not defined.
p_of_1 = np.linspace(0.01, 0.99, 1000)
# Entropy of one bit: -(p * log2(p) + (1 - p) * log2(1 - p))
entropy = -(p_of_1 * np.log2(p_of_1) + (1.0 - p_of_1) * np.log2(1.0 - p_of_1))

# Creating the plot, giving it a title and labelling the axes
fig, ax = plt.subplots(figsize = (14, 6))
ax.plot(p_of_1, entropy, color = 'red')
ax.set_title('Entropy of a single bit')
ax.set_xlabel('Probability the bit is 1 and not 0')
ax.set_ylabel('Entropy')
# Rendering the figure explicitly so the cell shows only the plot,
# not the text representation of the last label
plt.show()

In [None]:
# Fire Alarms
# Working towards the information content of a fire alarm

# Number of minutes in a 365-day year
mins_in_year = 60 * 24 * 365
print(mins_in_year)

# Probability of the fire alarm activating
# (the 10 presumably stands for 10 minutes of alarm per year - confirm against the lecture notes)
p_alarm = 10 / mins_in_year
print(p_alarm)

In [None]:
# Information content, -log2(p), of the two possible observations:
# first the alarm activating, then the alarm staying silent
for prob in (p_alarm, 1.0 - p_alarm):
    print(-math.log(prob, 2))

In [None]:
# Entropy of the fire alarm: the average information content
# over both outcomes (alarm activating, alarm silent)
p = [p_alarm, 1.0 - p_alarm]
entropy = -sum(p_i * math.log(p_i, 2) for p_i in p)
print(f'{entropy} bits')

In [None]:
# PIN Codes
# A PIN code normally has 4 digits, which gives 10,000 possible combinations.
# Information content, in bits, of one particular code out of the 10,000:
-math.log(1.0 / 10_000, 2)

In [None]:
# Entropy of a 4 digit PIN code: each of the 10,000 codes has the same probability
p = [1.0 / 10000] * 10000
entropy = -sum(p_i * math.log(p_i, 2) for p_i in p)
print(f'{entropy} bits')

In [None]:
# Entropy of a 6 digit PIN code: each of the 1,000,000 codes has the same probability
p = [1.0 / 1000000] * 1000000
entropy = -sum(p_i * math.log(p_i, 2) for p_i in p)
print(f'{entropy} bits')

In [None]:
# Exercise 2: Explain why the log of zero is undefined:

# log(0) is undefined: raising a positive base to any real power always gives a result greater than 0, so no power can ever produce 0.
# https://www.rapidtables.com/math/algebra/logarithm/Logarithm_of_0.html

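# Saying logb(0) = x means that the base b raised to the power x equals 0, i.e. b**x = 0.
# No real number x satisfies this: for b > 1, b**x only gets closer and closer to 0 as x heads towards minus infinity, without ever reaching it.
# Therefore the base-b log of zero, i.e. logb(0), is not defined.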