%<br>
import libraries, load dataset

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import sklearn
import sklearn.datasets
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

In [None]:
import sklearn.decomposition

In [None]:
import cc_plt_utils

In [None]:
# --- notebook configuration -------------------------------------------------
# Seed handed to the PCA estimator so repeated runs give the same result.
RANDOM_SEED = 4242

# Iris columns used both as the axes of the 3D scatter plot and as PCA input.
FEATURE_SUBSET = [
    'sepal length (cm)',
    'sepal width (cm)',
    'petal length (cm)',
]

# Number of principal components kept by the PCA model.
PCA_COMPONENTS = 2

In [None]:
# Switch on the project's lecture plotting theme before any figure is created.
cc_plt_utils.set_theme_lecture()

# Three-colour 'tab10' palette from the project helper; it is later passed as
# `cmap` to a matplotlib scatter call.
# NOTE(review): presumably a matplotlib colormap object - confirm in cc_plt_utils.
cmap_sns = cc_plt_utils.get_sns_palette_for_plt(palette='tab10', n_colors=3)

In [None]:
# Load the iris data and build one DataFrame holding the feature columns,
# the integer class id ('target') and the matching class name ('target_labels').
iris_dataset = sklearn.datasets.load_iris()
df = pd.DataFrame(
    iris_dataset['data'],
    columns=iris_dataset['feature_names'],
).assign(
    target=iris_dataset['target'],
    # look up the class name for every integer class id
    target_labels=[
        iris_dataset['target_names'][class_id]
        for class_id in iris_dataset['target']
    ],
)

%<br>
 3D plot

In [None]:
# Named 6x6 inch figure that hosts the 3D scatter plot.
fig = plt.figure('3D Interactive plot', figsize=(6, 6))

# Create the 3D axes through the figure's projection API instead of building
# `Axes3D` by hand with the transitional `auto_add_to_figure` keyword.
# The rectangle (0, 0, 1, 1) makes the axes span the whole figure, which is
# the rectangle `Axes3D(fig)` falls back to when none is given.
ax = fig.add_axes((0, 0, 1, 1), projection='3d')

In [None]:
# Scatter the first three selected features in 3D, one colour per class id.
x_name, y_name, z_name = FEATURE_SUBSET[:3]

sc = ax.scatter(
    df[x_name], df[y_name], df[z_name],
    # colour encoding
    c=df['target'], cmap=cmap_sns,
    # marker appearance
    marker='o', s=40, alpha=1,
    edgecolors='white', linewidth=0.5,
)

# Label every axis with the feature it shows.
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.set_zlabel(z_name)

%<br>
 apply PCA

In [None]:
# Build the PCA estimator from the notebook configuration, then fit it on the
# selected feature columns (fit() trains the estimator in place).
model_pca = sklearn.decomposition.PCA(n_components=PCA_COMPONENTS, random_state=RANDOM_SEED)
model_pca.fit(df[FEATURE_SUBSET])

In [None]:
# Project the selected features onto the fitted principal components and keep
# the first two coordinates as extra DataFrame columns.
reduced_dataset = model_pca.transform(df[FEATURE_SUBSET])
df['pca_0'], df['pca_1'] = reduced_dataset[:, 0], reduced_dataset[:, 1]

# 2D scatter of the projection, coloured by class name.
plt.figure('PCA projection')
ax = sns.scatterplot(
    x='pca_0', y='pca_1',
    hue='target_labels', palette='tab10',
    marker='o', alpha=1,
    data=df,
)

# Replace the automatic hue legend with an empty, frameless one (hides it).
ax.legend([], [], frameon=False)
plt.show()

%<br>
visualize elbow: cumulative explained variance of the PCA components

In [None]:
# Share of the total variance the retained components must explain together.
VARIANCE_THRESHOLD = 0.95

# Running total of the explained-variance ratio over the fitted components.
# Only the components kept by `model_pca` are included.
cumulative_sum = np.cumsum(model_pca.explained_variance_ratio_)

# Smallest number of components whose cumulative ratio reaches the threshold.
# np.argmax on an all-False mask returns 0, which would wrongly report a single
# component, so the "threshold never reached" case is handled explicitly.
reaches_threshold = cumulative_sum >= VARIANCE_THRESHOLD
if reaches_threshold.any():
    num_dimensions = np.argmax(reaches_threshold) + 1
else:
    # The fitted components do not reach the threshold; the count is unknown.
    num_dimensions = None

In [None]:
# Plot the cumulative explained variance against the 1-based number of
# components. Plotting the bare array would put a 0-based index on the x axis,
# one less than the actual component count.
component_counts = np.arange(1, len(cumulative_sum) + 1)

# Dedicated named figure so the curve is not drawn onto an earlier plot.
plt.figure('PCA cumulative explained variance')
elbow_ax = sns.lineplot(x=component_counts, y=cumulative_sum)
elbow_ax.set_xticks(component_counts)
elbow_ax.set_xlabel('number of principal components')
elbow_ax.set_ylabel('cumulative explained variance ratio')
plt.show()

%%