In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Simulated customer heights in inches: from 60 up to (but excluding) 78, in steps of 0.1.
heights = np.arange(start=60, stop=78, step=.1)

In [None]:
# Display the generated height values.
heights

In [None]:
# Display how many height values were generated.
heights.size

In [None]:
# Seed NumPy's global RNG so the simulated noise is reproducible.
np.random.seed(0)
# Gaussian noise with standard deviation 10, one draw per height value.
random_fluctuation = np.random.normal(scale=10, size=heights.size)
# Weight is modeled as a linear function of height plus the noise drawn above.
weights = 4 * heights - 130 + random_fluctuation

In [None]:
# One row per customer; column 0 is height, column 1 is weight.
measurements = np.array([heights, weights]).transpose()

In [None]:
# Display the (height, weight) measurement rows.
measurements

In [None]:
# Transposing yields exactly two rows (heights, weights), which unpack into scatter's x and y.
plt.scatter(*measurements.T)
plt.xlabel('Height (in)')
plt.ylabel('Weight (lb)')

In [None]:
# Same scatter as before, but with both axes drawn at the same scale.
plt.scatter(*measurements.T)
plt.xlabel('Height (in)')
plt.ylabel('Weight (lb)')
plt.axis('equal')

In [None]:
# Subtract each axis's mean so the data is centered on the origin.
# Row 0 holds the centered heights, row 1 the centered weights.
centered_data = np.array(
    [axis_values - axis_values.mean() for axis_values in (heights, weights)]
)

In [None]:
# Display the shape: one row per axis (height, weight), one column per measurement.
centered_data.shape

In [None]:
# centered_data has two rows (x, y), so it unpacks directly into scatter's arguments.
plt.scatter(*centered_data)
# Draw both coordinate axes through the origin for reference.
plt.axhline(0, color='black')
plt.axvline(0, color='black')
plt.xlabel('Centered Height (in)')
plt.ylabel('Centered Weight (lb)')
plt.axis('equal')

In [None]:
from math import sin, cos
# Trial rotation of -90 degrees, converted to radians for sin/cos.
angle = np.radians(-90)

In [None]:
# Standard 2x2 rotation matrix for `angle` (in radians).
rotation_matrix = np.array([[cos(angle), -sin(angle)], [sin(angle), cos(angle)]])

In [None]:
# Apply the rotation to every (x, y) column of the centered data in one matrix product.
rotated_data = rotation_matrix @ centered_data

In [None]:
# Overlay the centered data and its rotated copy; each array's two rows unpack into x and y.
plt.scatter(*centered_data, label='Original Data')
plt.scatter(*rotated_data, c='y', label='Rotated Data')
plt.axhline(0, color='black')
plt.axvline(0, color='black')
plt.legend()
plt.axis('equal')

In [None]:
# Labels and datasets are kept in parallel lists so later cells can loop over both.
data_labels = ['unrotated', 'rotated']
data_list = [centered_data, rotated_data]
for data_label, data in zip(data_labels, data_list):
    # Row 1 holds the y-coordinates.
    y_values = data[1]
    # Penalty = mean of the squared y-values (dot product divided by the count).
    penalty = y_values @ y_values / y_values.size
    print(f"The penalty score for the {data_label} data is {penalty:.2f}")

In [None]:
# For each dataset, check that the mean-squared-y penalty matches the y-axis variance.
for data_label, data in zip(data_labels, data_list):
    y_var = data[1].var()
    penalty = data[1] @ data[1] / data[0].size
    # Compare with a floating-point tolerance: the two quantities are computed by
    # different summation routines, so requiring identical rounded digits is brittle.
    assert np.isclose(y_var, penalty)
    print(f"The y-axis variance for the {data_label} data is {y_var:.2f}")

In [None]:
# Report the variance along the x-axis (row 0) for each dataset.
for data_label, data in zip(data_labels, data_list):
    x_var = data[0].var()
    print(f"The x-axis variance for the {data_label} data is {x_var:.2f}")

In [None]:
# Total variance is the sum of the per-axis variances of the centered data.
total_variance = centered_data[0].var() + centered_data[1].var()
# The rotated data should have the same total. Compare with a tolerance, because
# exact equality of floating-point sums depends on rounding in the rotation.
assert np.isclose(total_variance, rotated_data[0].var() + rotated_data[1].var())

In [None]:
# Express each axis's variance as a percentage of the total variance.
for data_label, data in zip(data_labels, data_list):
    # `data` has two rows (x then y), so the generator yields exactly two percentages.
    percent_x_axis_var, percent_y_axis_var = (
        100 * axis_values.var() / total_variance for axis_values in data
    )
    print(f"In the {data_label} data, {percent_x_axis_var:.2f}% of the total variance is distributed across the x-axis")
    print(f"The remaining {percent_y_axis_var:.2f}% of the total variance is distributed across the y-axis\n")

In [None]:
def rotate(angle, data=None):
    """Rotate 2D points clockwise about the origin.

    Parameters
    ----------
    angle : float
        Rotation in degrees. The sign is negated before the rotation matrix
        is built, so positive values rotate clockwise.
    data : array-like of shape (2, n), optional
        Points to rotate, with x-values in row 0 and y-values in row 1.
        When omitted, the notebook-level ``centered_data`` is used. It is
        looked up at call time, so re-running the cell that builds
        ``centered_data`` is picked up without redefining this function.

    Returns
    -------
    numpy.ndarray
        The product of the 2x2 rotation matrix and ``data``.
    """
    if data is None:
        data = centered_data
    theta = np.radians(-angle)
    rotation_matrix = np.array([[cos(theta), -sin(theta)], [sin(theta), cos(theta)]])
    return rotation_matrix @ data
# Try every rotation from 1 up to (but excluding) 180 degrees in 0.1-degree steps
# and record how much variance each one places on the x-axis.
angles = np.arange(1, 180, .1)
x_variances = [rotate(candidate)[0].var() for candidate in angles]
percent_x_variances = 100 * np.array(x_variances) / total_variance

# Select the rotation whose x-axis share of the total variance is largest.
optimal_index = percent_x_variances.argmax()
optimal_angle = angles[optimal_index]
max_coverage = percent_x_variances[optimal_index]
max_x_var = x_variances[optimal_index]

# Plot x-axis coverage against angle and mark the best angle with a vertical line.
plt.plot(angles, percent_x_variances)
plt.axvline(optimal_angle, color='k')
plt.xlabel('Angle (degrees)')
plt.ylabel('% x-axis coverage')
plt.show()

print(f"The horizontal variance is maximized to approximately {int(max_x_var)} after a {optimal_angle:.1f} degree rotation.")
print(f"That rotation distributes {max_coverage:.2f}% of the total variance onto the x-axis.")

In [None]:
# Rotate the centered data by the optimal angle and plot it against both axes.
best_rotated_data = rotate(optimal_angle)
plt.scatter(*best_rotated_data)
plt.axhline(0, color='black')
plt.axvline(0, color='black')
plt.axis('equal')

In [None]:
# Display the angle (in degrees) that maximized the x-axis variance.
optimal_angle

In [None]:
# Derive two thresholds along the rotated x-axis from the lowest and highest thirds.
x_values = best_rotated_data[0]
sorted_x_values = sorted(x_values)
cluster_size = x_values.size // 3
# Largest x-value in the lowest third, and smallest x-value in the highest third.
small_cutoff = max(sorted_x_values[:cluster_size])
large_cutoff = min(sorted_x_values[-cluster_size:])
print(f"A 1D threshold of {small_cutoff:.2f} separates the small-sized and medium-sized customers.")
print(f"A 1D threshold of {large_cutoff:.2f} separates the medium-sized and large-sized customers.")

In [None]:
def plot_customer_segments(horizontal_2d_data):
    """Scatter-plot 2D points colored by small / medium / large segment.

    A point's segment is decided by its x-value relative to the
    notebook-level ``small_cutoff`` and ``large_cutoff`` thresholds, which
    are read when the function is called. Both thresholds are drawn as
    dashed vertical lines.

    Parameters
    ----------
    horizontal_2d_data : array-like of shape (2, n)
        x-values in row 0 and y-values in row 1.
    """
    x_coords, y_coords = np.asarray(horizontal_2d_data)

    is_small = x_coords <= small_cutoff
    is_medium = (small_cutoff < x_coords) & (x_coords < large_cutoff)
    # Everything not matched above is "large", mirroring an if/elif/else chain.
    is_large = ~(is_small | is_medium)

    segments = [
        ('Small', 'g', is_small),
        ('Medium', 'b', is_medium),
        ('Large', 'y', is_large),
    ]
    for segment_label, segment_color, mask in segments:
        # Masking yields empty arrays for an unpopulated segment, so the call
        # still succeeds instead of failing to unpack an empty point list.
        plt.scatter(x_coords[mask], y_coords[mask], color=segment_color, label=segment_label)

    plt.axhline(0, c='black')
    plt.axvline(large_cutoff, c='black', linewidth=3, linestyle='--')
    plt.axvline(small_cutoff, c='black', linewidth=3, linestyle='--')
    plt.axis('equal')
    plt.legend()

In [None]:
# Color the optimally rotated customer data by segment.
plot_customer_segments(best_rotated_data)

In [None]:
# One zero per rotated x-value, used to replace the y-coordinates.
zero_y_values = np.zeros(x_values.size)

In [None]:
# Pair the rotated x-values with zero y-values, then rotate by the opposite
# angle to undo the optimal rotation.
reproduced_data = rotate(-optimal_angle, np.array([x_values, zero_y_values]))

In [None]:
# Draw the reproduced points as a line over the original centered scatter.
plt.plot(*reproduced_data, color='k', label='Reproduced Data')
plt.scatter(*centered_data, c='y', label='Original Data')
plt.axis('equal')
plt.legend()

In [None]:
# Simulate a second batch of customers with a different seed and height spacing.
np.random.seed(1)
new_heights = np.arange(60, 78, .11)
random_fluctuations = np.random.normal(scale=10, size=new_heights.size)
new_weights = 4 * new_heights - 130 + random_fluctuations

# Center the new batch with the means of the ORIGINAL heights and weights,
# so both datasets share one coordinate frame.
new_centered_data = np.array(
    [new_heights - heights.mean(), new_weights - weights.mean()]
)

# Overlay the previously reproduced line on the new points.
plt.scatter(*new_centered_data, c='y', label='New Customer Data')
plt.plot(*reproduced_data, color='k', label='First Principal Direction')
plt.xlabel('Centralized Height (in)')
plt.ylabel('Centralized Weight (lb)')
plt.axis('equal')
plt.legend()

In [None]:
# Rotate the new customers by the previously found optimal angle, then color
# them by segment (the plotting function reads the existing cutoffs).
new_horizontal_data = rotate(optimal_angle, data=new_centered_data)
plot_customer_segments(new_horizontal_data)