In [None]:
from plotly import express

from hcve_lib.utils import notebook_init

notebook_init()
from deps.common import get_variables_cached
from pandas import DataFrame
from hcve_lib.custom_types import Estimator

from typing import List, Tuple
from sklearn.preprocessing import StandardScaler
from deps.pipelines import get_preprocessing
from hcve_lib.wrapped_sklearn import DFPipeline
from sklearn.decomposition import PCA
import plotly.graph_objs as go

from hcve_lib.functional import pipe, starmap
from hcve_lib.data import format_identifier
from hcve_lib.wrapped_sklearn import DFStandardScaler

from scipy.stats import pearsonr
from hcve_lib.visualisation import p, h2
from hcve_lib.formatting import format_percents
from toolz.curried import sorted
from sklearn.mixture import GaussianMixture
from pandas import Series
from hcve_lib.functional import mapl

# Load the cached analysis inputs. The helper returns four values, unpacked
# here as `data`, `metadata`, `X` and `y`; later cells use `X`/`y` to fit the
# pipelines, `data['STUDY']` for colouring and `metadata` for feature labels.
data, metadata, X, y = get_variables_cached()
# IPython magic: mode 2 reloads all modules before each cell is executed.
# NOTE(review): the autoreload extension is presumably loaded by
# notebook_init() — confirm, otherwise `%load_ext autoreload` is needed first.
%autoreload  2

# PCA

In [64]:

def get_pipeline(X: DataFrame, **kwargs) -> DFPipeline:
    """Assemble the PCA pipeline: preprocessing, standardisation, projection.

    Args:
        X: Feature table passed to ``get_preprocessing_steps`` to build the
            leading preprocessing steps.
        **kwargs: Forwarded unchanged to ``PCA`` (for example
            ``n_components``).

    Returns:
        A ``DFPipeline`` whose last two steps are ``'standardize'``
        (``StandardScaler``) and ``'projection'`` (``PCA``).
    """
    steps = list(get_preprocessing_steps(X))
    steps.append(('standardize', StandardScaler()))
    steps.append(('projection', PCA(**kwargs)))
    return DFPipeline(steps)

def get_preprocessing_steps(X: DataFrame) -> List[Tuple[str, Estimator]]:
    """Return the named preprocessing steps shared by the notebook pipelines.

    Args:
        X: Feature table passed to ``get_preprocessing``.

    Returns:
        A list of ``(name, estimator)`` pairs: a ``'scaler'`` step
        (``DFStandardScaler``) followed by the steps produced by
        ``get_preprocessing``.
    """
    # get_preprocessing yields three values; only the step list is used here.
    preprocessing, _categorical_features, _continuous_features = get_preprocessing(X)
    steps = [('scaler', DFStandardScaler())]
    steps.extend(preprocessing)
    return steps

In [65]:
# Build the preprocessing + PCA pipeline restricted to three components.
pca_3d = get_pipeline(X, n_components=3)
# Fit on the full feature table and keep the projected coordinates; `y` is
# passed along to the pipeline's fit_transform call.
X_r3d = pca_3d.fit_transform(X, y)

In [66]:
# 3D scatter of the three projected components, coloured by the STUDY column.
figure = express.scatter_3d(
    x=X_r3d.T[0],
    y=X_r3d.T[1],
    z=X_r3d.T[2],
    color=data['STUDY'],
)
# Small markers keep individual points distinguishable.
figure.update_traces(marker={'size': 1.5})
# Remove the outer margins and keep legend markers at a constant size
# (independent of the small marker size used in the plot itself).
figure.update_layout(
    margin=go.layout.Margin(l=0, r=0, b=0, t=0),
    legend={'itemsizing': 'constant'},
)

In [80]:
import plotly.express as px

# Run only the preprocessing steps (no standardize/PCA stage) so that each
# preprocessed column can be compared against the principal components.
X_processed = DFPipeline(get_preprocessing_steps(X)).fit_transform(X, y)

# Pair each component's explained-variance ratio (taken from the fitted PCA
# step, the last step of the pipeline) with the matching entry of X_r3d.T.
for num, (component_explained, projected_x) in enumerate(
    zip(pca_3d[-1].explained_variance_ratio_, X_r3d.T)
):
    h2(f'PC{num}')
    p(f'Explained: {format_percents(component_explained)}')
    correlations = pipe(
        # DataFrame.iteritems() was removed in pandas 2.0; items() yields the
        # same (column name, column) pairs and exists in older versions too.
        X_processed.items(),
        # (formatted feature name, Pearson r between the feature and this PC)
        starmap(lambda name, original_x: (
            format_identifier(name, metadata),
            pearsonr(original_x, projected_x)[0],
        )),
        # Ascending by absolute correlation (curried `sorted` from toolz).
        sorted(key=lambda i: abs(i[1])),
        # Two positional columns: 0 = feature name, 1 = correlation.
        DataFrame,
    )
    fig = px.bar(correlations, x=1, y=0, orientation='h', height=600)
    fig.update_layout(
        title='Correlation with PC',
        xaxis_title="Correlation",
        yaxis_title="Feature",
        # Linear tick mode so a tick is drawn for each feature bar.
        yaxis_tickmode='linear',
    )
    fig.update_yaxes(showgrid=True)
    fig.show()

## Clustering

In [32]:
# Fit one Gaussian mixture per candidate component count (1..29) and record
# its BIC on the preprocessed features.
bics = {}
for n_components in range(1, 30):
    model = GaussianMixture(n_components=n_components, random_state=0)
    model.fit(X_processed)
    bics[n_components] = model.bic(X_processed)

# Bar chart of BIC against the number of mixture components.
fig = px.bar(Series(bics))
fig.update_layout(title='BIC', xaxis_title="n clusters", yaxis_title="BIC")
fig.show()

In [59]:
# Assign every sample to one of four Gaussian-mixture components, fitted on
# the preprocessed features.
clusters = GaussianMixture(n_components=4, random_state=0).fit_predict(X_processed)

# Re-fit the 3-component PCA projection so this cell does not depend on the
# projection computed in an earlier cell.
pca_3d = get_pipeline(X, n_components=3)
X_r3d = pca_3d.fit_transform(X, y)

figure = express.scatter_3d(
    # Labels must stay in sample order: sorting the label vector itself would
    # detach every point from its own cluster assignment. Converting to str
    # makes plotly treat the clusters as discrete categories.
    DataFrame(X_r3d).assign(color=mapl(str, clusters)),
    x=0,
    y=1,
    z=2,
    color='color',
    # Order only the legend entries, not the per-sample labels.
    category_orders={'color': mapl(str, sorted(set(clusters)))},
)
# Small markers keep individual points distinguishable.
figure.update_traces(marker={'size': 1.5})
# Remove the outer margins and keep legend markers at a constant size.
figure.update_layout(
    margin=go.layout.Margin(l=0, r=0, b=0, t=0),
    legend={'itemsizing': 'constant'},
)