In [None]:
from math import cos, sin

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Evenly spaced heights: start at 60, stop before 78, advance by 0.1 each step.
# The axis labels used later in the notebook treat these values as inches.
height_start, height_stop, height_step = 60, 78, 0.1
heights = np.arange(height_start, height_stop, height_step)

In [None]:
# Show the generated heights array as this cell's output.
heights

In [None]:
# Number of height samples produced by the arange call above.
heights.size

In [None]:
# Fix the global NumPy seed so the sampled noise is the same on every run.
np.random.seed(0)
# One zero-mean Gaussian perturbation (standard deviation 10) per height sample.
random_fluctuation = np.random.normal(loc=0.0, scale=10, size=heights.size)
# Weights follow a straight-line trend in height, with the noise added on top.
linear_trend = 4 * heights - 130
weights = linear_trend + random_fluctuation

In [None]:
# Pair every height with its weight: one row per sample, columns are (height, weight).
measurements = np.column_stack((heights, weights))

In [None]:
# Show the (height, weight) measurement table as this cell's output.
measurements

In [None]:
# Scatter weight against height, reading the two columns of the measurement table.
height_column, weight_column = measurements.T
plt.scatter(height_column, weight_column)
plt.xlabel('Height (in)')
plt.ylabel('Weight (lb)')

In [None]:
# Same scatter plot as before, but with 'equal' axis scaling requested so that
# one unit of height spans the same distance on screen as one unit of weight.
plt.scatter(*measurements.T)
plt.xlabel('Height (in)')
plt.ylabel('Weight (lb)')
plt.axis('equal')

In [None]:
# Subtract each series' own mean so both are centered on zero.
# Row 0 holds the centered heights, row 1 the centered weights.
centered_data = np.array([series - series.mean() for series in (heights, weights)])

In [None]:
# centered_data was built from two rows (heights, weights), so the first
# dimension is 2 and the second is the number of samples.
centered_data.shape

In [None]:
# Plot the mean-centered points together with both coordinate axes.
centered_heights, centered_weights = centered_data
plt.scatter(centered_heights, centered_weights)
# Horizontal line first, then vertical, both through zero.
for draw_axis_line in (plt.axhline, plt.axvline):
    draw_axis_line(0, c='black')
plt.xlabel('Centered Height (in)')
plt.ylabel('Centered Weight (lb)')
plt.axis('equal')

In [None]:
# Rotation angle of -90 degrees, converted to radians for the trigonometric
# functions used when the rotation matrix is built.
# `sin` and `cos` are imported in the notebook's top import cell rather than
# here, so every dependency is declared in one place.
angle = np.radians(-90)

In [None]:
# 2x2 rotation matrix for `angle` (in radians), laid out as
#   [[cos, -sin],
#    [sin,  cos]]
cos_angle, sin_angle = cos(angle), sin(angle)
rotation_matrix = np.array([
    [cos_angle, -sin_angle],
    [sin_angle, cos_angle],
])

In [None]:
# Apply the rotation to every centered point at once: (2 x 2) times (2 x N).
rotated_data = np.matmul(rotation_matrix, centered_data)

In [None]:
# Overlay the rotated points (yellow) on the original centered points.
plot_specs = [
    (centered_data, {'label': 'Original Data'}),
    (rotated_data, {'c': 'y', 'label': 'Rotated Data'}),
]
for points, scatter_kwargs in plot_specs:
    plt.scatter(points[0], points[1], **scatter_kwargs)
plt.axhline(0, c='black')
plt.axvline(0, c='black')
plt.legend()
plt.axis('equal')

In [None]:
# Labels and datasets are kept in parallel lists; later cells iterate over them too.
data_labels = ['unrotated', 'rotated']
data_list = [centered_data, rotated_data]
for data_label, data in zip(data_labels, data_list):
    # The penalty is the mean of the squared y-coordinates, i.e. the average
    # squared vertical distance of the points from the x-axis.
    y_values = data[1]
    sum_of_squares = y_values @ y_values
    penalty = sum_of_squares / y_values.size
    print(f"The penalty score for the {data_label} data is {penalty:.2f}")

In [None]:
# For mean-centered data the y-axis variance equals the penalty score (the mean
# of the squared y-values). Check that, then report the variance.
for data_label, data in zip(data_labels, data_list):
    y_values = data[1]
    y_var = y_values.var()
    penalty = y_values @ y_values / y_values.size
    # The two numbers come from different summation routines and may differ in
    # the last few bits, so compare with a tolerance. Rounding both to 14
    # decimals is effectively an exact comparison for values of this magnitude.
    assert np.isclose(y_var, penalty, rtol=1e-9), (
        f"variance {y_var!r} and penalty {penalty!r} disagree for the {data_label} data"
    )
    print(f"The y-axis variance for the {data_label} data is {y_var:.2f}")

In [None]:
# Report how much each dataset spreads along the x-axis (row 0).
for data_label, data in zip(data_labels, data_list):
    x_values = data[0]
    x_var = np.var(x_values)
    print(f"The x-axis variance for the {data_label} data is {x_var:.2f}")

In [None]:
# Total variance is the sum of the per-axis variances. A rotation should only
# move variance between the axes and leave the total unchanged.
total_variance = centered_data[0].var() + centered_data[1].var()
rotated_total_variance = rotated_data[0].var() + rotated_data[1].var()
# The rotation is computed in floating point, so the two totals can differ by
# round-off; compare with a tolerance instead of exact equality.
assert np.isclose(total_variance, rotated_total_variance, rtol=1e-9), (
    f"total variance changed under rotation: {total_variance!r} vs {rotated_total_variance!r}"
)

In [None]:
# Express each axis' variance as a percentage of the total variance.
for data_label, data in zip(data_labels, data_list):
    axis_variances = np.array([data[0].var(), data[1].var()])
    percent_x_axis_var, percent_y_axis_var = 100 * axis_variances / total_variance
    print(f"In the {data_label} data, {percent_x_axis_var:.2f}% of the total variance is distributed across the x-axis")
    print(f"The remaining {percent_y_axis_var:.2f}% of the total variance is distributed across the y-axis\n")