In [None]:
# importing numerical arrays
import numpy as np
# importing plotting capabilities
import matplotlib.pyplot as plt
# importing statistics
import scipy.stats as ss

In [None]:
# Normal Distribution - Scaling
# Ingredients of the scaling factor 1 / sqrt(2 * pi * sigma^2).
# A cell only renders its final expression, so the values are gathered into
# one tuple rather than evaluated on separate lines and silently discarded.
root_two = np.sqrt(2)
# pi, the square root of 2, and that root squared (2, up to float rounding)
np.pi, root_two, root_two**2

In [None]:
# standard deviation of the distribution, called the scale in numpy
sigma = 1.0
# echo the value as the cell's output
sigma

In [None]:
# Scaling factor of the normal pdf for the current sigma
# (a number close to 0.4 when sigma is 1)
two_pi_variance = 2 * np.pi * sigma**2
1.0 / np.sqrt(two_pi_variance)

In [None]:
# Sweep sigma from 1 to 10 to compare the resulting scaling factors
sigma_vals = np.linspace(1.0, 10.0, 1000)
# Scaling factor 1 / sqrt(2 * pi * sigma^2) evaluated at every sigma
scale_factors = 1.0 / np.sqrt(2 * np.pi * sigma_vals**2)
plt.plot(sigma_vals, scale_factors)

In [None]:
# Normal Distribution - the e part
# Euler's number e, a mathematical constant like pi
np.e

In [None]:
# x values spanning -4 to 4 for the plot
x = np.linspace(-4.0, 4.0, 1000)
# np.exp evaluates e^x element-wise over the whole array
exp_of_x = np.exp(x)
plt.plot(x, exp_of_x)

In [None]:
# range of x values for plotting
x = np.linspace(-4.0, 4.0, 1000)
# e^(-x^2 / 2): the bell-shaped exponential part of the standard normal pdf
# (the legend label now states the function that is actually drawn)
plt.plot(x, np.exp(-(x**2 / 2)), label = r'$e^{-x^2/2}$')
plt.legend()

In [None]:
# mu and sigma both appear in the exponent of the normal pdf
# These values are fixed here and reused in subsequent plots
mu, sigma = 0.0, 3.0
# x values spanning three sigma either side of mu
x = np.linspace(-9.0, 9.0, 1000)
# Exponent -(x - mu)^2 / (2 * sigma^2), then the full e part of the pdf
exponent = -((x - mu)**2 / (2 * sigma**2))
plt.plot(x, np.exp(exponent))

In [None]:
# Effect of changing mu
# starting values (mu is overwritten by the loop below)
mu, sigma = 0.0, 3.0
# x values shared by every curve
x = np.linspace(-9.0, 9.0, 1000)
# one curve per value of mu, each labelled with its mu
for mu in [-1.0, 0.0, 1.0, 2.0]:
    bell = np.exp(-((x - mu)**2 / (2 * sigma**2)))
    plt.plot(x, bell, label = rf'$\mu = {mu}$')
plt.legend()

In [None]:
# changing values of sigma
# setting initial values (sigma is overwritten by the loop below)
mu, sigma = 0.0, 3.0
# x values for plot
x = np.linspace(-9.0, 9.0, 1000)
# plotting the function for several different values of sigma
for sigma in [1.0, 2.0, 3.0]:
    # label by sigma, the quantity that actually differs between the curves
    plt.plot(x, np.exp(-((x - mu)**2 / (2 * sigma**2))), label = rf'$\sigma = {sigma}$')
plt.legend()

In [None]:
# Normal Distribution - Pi and e together
# parameters of the distribution
mu, sigma = 0.0, 1.0
# x values for the plot
x = np.linspace(-3.0, 3.0, 1000)
# scaling factor (the part with the square root)
part1 = 1.0 / np.sqrt(2 * np.pi * sigma**2)
# exponential factor (the part with e)
part2 = np.exp(-((x - mu)**2 / (2 * sigma**2)))
# the pdf is the product of the two parts
y = part1 * part2

# draw the scaled curve first, then the unscaled exponential part for comparison
for curve, curve_label in ((y, 'with'), (part2, 'without')):
    plt.plot(x, curve, label = curve_label)

plt.legend()

In [None]:
# changing values of sigma
# Sigma occurs in both part 1 and part 2
# Height of curve is also affected as a result

# setting initial values of mu & sigma (sigma is overwritten by the loop below)
mu, sigma = 0.0, 1.0
# range of x values
x = np.linspace(-9.0, 9.0, 1000)
# different values of sigma
for sigma in [0.5, 1.0, 2.0, 4.5]:
    # First part with square root
    part1 = 1.0 / np.sqrt(2 * np.pi * sigma**2)
    # Second part with e
    part2 = np.exp(-((x - mu)**2 / (2 * sigma**2)))
    # output is the product
    y = part1 * part2
    # plot x & y; the rf prefix keeps \sigma raw for mathtext and substitutes
    # the current numeric value of sigma into the label
    plt.plot(x, y, label = rf'$\sigma = {sigma}$')
plt.legend()

In [None]:
# Percentiles
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html

# draw a single random value from the normal distribution with mean 0 and scale 1
np.random.normal(loc=0.0, scale=1.0)

In [None]:
# largest value among 10,000,000 random draws (mean 0, scale 1)
np.random.normal(loc=0.0, scale=1.0, size=10_000_000).max()

In [None]:
# x value at the 99th percentile of the standard normal
# i.e. 99% of all randomly generated values are expected to fall below it
ss.norm.ppf(0.99)

In [None]:
# The 50th percentile, i.e. the median; for the symmetric normal
# distribution it coincides with the mean
ss.norm.ppf(0.50)

In [None]:
# Percentile of 0.0 on the x axis (fraction of the distribution below 0.0)
ss.norm.cdf(0.0)

In [None]:
# Probability of generating a value between -3.0 and 3.0
std_normal = ss.norm()
std_normal.cdf(3.0) - std_normal.cdf(-3.0)

In [None]:
# Six sigma where sigma = 1.0 and mu = 0.0:
# probability of a value falling between -6.0 and 6.0
std_normal = ss.norm()
std_normal.cdf(6.0) - std_normal.cdf(-6.0)

In [None]:
# Quartiles
# Note the quartile is usually the range and not the point
# Upper boundary of the 3rd quartile, i.e. the 75th percentile
ss.norm.ppf(0.75)

In [None]:
# Upper boundary of the 2nd quartile, i.e. the 50th percentile
ss.norm.ppf(0.5)

In [None]:
# Upper boundary of the 1st quartile, i.e. the 25th percentile
ss.norm.ppf(0.25)

In [None]:
# Box Plots
# Load the Morley data from CSV, skipping the header row
# NOTE(review): deletechars is documented as acting on field names, so it
# presumably does not strip quotes from the data values here - confirm
morley_raw = np.genfromtxt('data/morley.csv', skip_header=1, delimiter=',', deletechars='"')
# Keep every column except the first
data = morley_raw[:, 1:]

In [None]:
# selecting data for the first experiment: rows 0-19, column index 2
exper_1 = data[:20, 2]
# show the selected values
exper_1

In [None]:
# selecting data for the second experiment: rows 20-39, column index 2
exper_2 = data[20:40, 2]
# show the selected values
exper_2

In [None]:
# Box plot of the first two experiments side by side
# Stack the experiments as rows, then transpose so each one becomes a column
paired = np.array([exper_1, exper_2])
plt.boxplot(paired.T)

In [None]:
# a quick way to get the five experiments in different columns:
# take column index 2, split it into 5 rows of 20 values, then transpose
measurements = data[:, 2]
measurements.reshape(5, 20).T

In [None]:
# Inter-Quartile Range
# values of the first experiment, shown again for reference
exper_1

In [None]:
# New figure with a single set of axes
fig, ax = plt.subplots()
# Box plot of the first experiment
ax.boxplot(exper_1)
# Overlay every raw value as a red dot at (1, value)
x_positions = np.ones(exper_1.shape)
ax.plot(x_positions, exper_1, 'r.')

In [None]:
# first-experiment values in their original order (compare with the sorted values)
exper_1

In [None]:
# sorted copy of the values from the first experiment (exper_1 itself is not modified)
np.sort(exper_1)

In [None]:
# three ways to calculate the mean:
# the array method, the numpy function, and plain Python sum divided by count
exper_1.mean(), np.mean(exper_1), sum(exper_1) / len(exper_1)

In [None]:
# median (middle value) of exper_1
np.median(exper_1)

In [None]:
# the median is the 50th percentile, so this should match np.median above
np.percentile(exper_1, 50)

In [None]:
# upper quartile value (75th percentile)
q3 = np.percentile(exper_1, 75)
# show the value
q3

In [None]:
# median of the top half of values (sorted values from index 10 onwards)
top_half = np.sort(exper_1)[10:]
np.median(top_half)

In [None]:
# lower quartile value (25th percentile)
q1 = np.percentile(exper_1, 25)
# show the value
q1

In [None]:
# median of the bottom half of values (first 10 sorted values)
bottom_half = np.sort(exper_1)[:10]
np.median(bottom_half)

In [None]:
# inter-quartile range: distance between the 75th and 25th percentiles
pct_75 = np.percentile(exper_1, 75)
pct_25 = np.percentile(exper_1, 25)
iqr = pct_75 - pct_25
iqr

In [None]:
# max upper whisker: matplotlib's boxplot (default whis=1.5) lets the whisker
# reach at most 1.5 * IQR above the upper quartile
q3 + 1.5 * iqr

In [None]:
# min lower whisker: matplotlib's boxplot (default whis=1.5) lets the whisker
# reach at most 1.5 * IQR below the lower quartile
q1 - 1.5 * iqr

In [None]:
# creating a figure
fig, ax = plt.subplots()
# adding the box plot
ax.boxplot(exper_1)
# plotting red dots at (1, value) for each value in exper_1
ax.plot(np.ones(exper_1.shape), exper_1, 'r.')
# adding the max whisker lengths: with boxplot's default whis=1.5 the whiskers
# may extend up to 1.5 * IQR beyond the lower and upper quartiles
ax.plot([1.0, 1.0], [q1 - 1.5 * iqr, q3 + 1.5 * iqr], 'bx')

In [None]:
# creating a figure
fig, ax = plt.subplots()
# adding the box plot
ax.boxplot(exper_1)
# plotting red dots at (1, value) for each value in exper_1
ax.plot(np.ones(exper_1.shape), exper_1, 'r.')
# adding the max whisker lengths: with boxplot's default whis=1.5 the whiskers
# may extend up to 1.5 * IQR beyond the lower and upper quartiles
ax.plot([1.0, 1.0], [q1 - 1.5 * iqr, q3 + 1.5 * iqr], 'bx')
# plotting the mean as a green circle
ax.plot([1.0], [np.mean(exper_1)], 'go')

In [None]:
# values above the mean, selected with a boolean mask
above_mean = exper_1 > exper_1.mean()
exper_1[above_mean]

In [None]:
# values below the mean, selected with a boolean mask
below_mean = exper_1 < exper_1.mean()
exper_1[below_mean]

In [None]:
# Exercise 1
# Create box plots on a single set of axes for all 5 experiments in the Morley data set
# Research TBC

In [None]:
# Exercise 2
# Create box plots for all of the numerical variables in Fisher's Iris data set

In [None]:
# Data Set
# Ten x vectors of 20 evenly spaced points each; vector k spans [k, k + 2]
x_groups = []
for start in range(10):
    x_groups.append(np.linspace(start, start + 2.0, 20))
xs = np.array(x_groups)

In [None]:
# creating a corresponding series of y vectors: group i follows y = -x + 2i
ys = np.array([-1.0 * group + 2 * i for i, group in enumerate(xs)])
# adding noise (mean 0, scale 1) from a seeded generator so the data set, and
# every plot and fit derived from it, is identical on each run
rng = np.random.default_rng(42)
ys = ys + rng.normal(0.0, 1.0, ys.shape)

In [None]:
# flattening the per-group arrays into single global lists of x & y values
x, y = xs.flatten(), ys.flatten()

In [None]:
# plot every (x, y) pair as a dot
plt.plot(x, y, '.')
# points appear to go bottom left to top right

In [None]:
# Best Fit Line
# y = mx + c

# Degree-1 fit over all points; unpacked as slope m, then y intercept c
line_coeffs = np.polyfit(x, y, 1)
m, c = line_coeffs

In [None]:
# steadily increasing x range covering the data, for drawing the line
x_low, x_high = x.min(), x.max()
xbestfit = np.linspace(x_low, x_high, 1000)
# corresponding y values on the best fit line y = mx + c
ybestfit = m * xbestfit + c

In [None]:
# plot the data points as dots
plt.plot(x, y, '.')
# plot the best fit line over the same axes
plt.plot(xbestfit, ybestfit)

In [None]:
# Colours
# Plot the same points, but with one plot call per range so that each range
# gets a different colour.
for i in range(xs.shape[0]):
    group_x, group_y = xs[i], ys[i]
    plt.plot(group_x, group_y, '.')

In [None]:
# Same plot as above, with a best fit line drawn through each group
for i in range(xs.shape[0]):
    group_x, group_y = xs[i], ys[i]
    # points of this group
    plt.plot(group_x, group_y, '.')
    # straight-line fit to this group only (overwrites the global m and c)
    m, c = np.polyfit(group_x, group_y, 1)
    # line drawn across this group's own x range
    xbestfit = np.linspace(group_x.min(), group_x.max(), 50)
    ybestfit = m * xbestfit + c
    plt.plot(xbestfit, ybestfit, color = 'lightgrey')
# Refer to Simpson's Paradox

In [None]:
# Exercise 3
# Adapt the above code and plots so that the overall plot is inversely proportional and the individual groups are directly proportional
# Research TBC