In [None]:
from pathlib import Path
import pandas as pd
import random
import os
from types import NoneType
from typing import Union
from logging import Logger
from sklearn import preprocessing
import numpy as np
import plotly.express as px
import umap

In [None]:
# Read the smoking CSV (path is relative to the notebook's working directory)
# into the working frame `df` that the following cells transform.
df = pd.read_csv(filepath_or_buffer='smoking_dataset/smoking.csv')

In [None]:
# Encode the single-letter categorical flags as integers. The replacement is
# applied to EVERY column of the frame, not just one: any cell equal to
# 'M'/'N' becomes 0 and any cell equal to 'F'/'Y' becomes 1.
df = df.replace({'M': 0, 'F': 1, 'Y': 1, 'N': 0})

# 'ID' is dropped before the feature vectors are built, so it is not part of them.
# errors='ignore' keeps the cell re-runnable after 'ID' has already been removed.
df = df.drop(columns=['ID'], errors='ignore')

# One list per row holding every remaining column except the target
# ('smoking'). A 'features' column left over from a previous run of this cell
# is excluded as well, so re-running does not nest old lists inside new ones.
df['features'] = df.drop(columns=['smoking', 'features'], errors='ignore').values.tolist()
df.columns

In [None]:
def seed_everything(seed: Union[int, float] = 42, logger: Logger = None) -> None:
    """
    Seed Python's and NumPy's global random generators so results can be reproduced.

    Args:
        seed: Seed value. A whole-number float (e.g. ``42.0``) is converted to
            ``int`` because ``np.random.seed`` does not accept floats.
        logger: Optional logger. When given, a debug message with the seed is emitted.

    Raises:
        AssertionError: If ``seed`` is not an int/float, or ``logger`` is neither
            ``None`` nor a ``Logger``.
        TypeError: If ``seed`` is a float that is not a whole number. Raised
            before any generator is touched, so no generator is left half-seeded.
        ValueError: Propagated from ``np.random.seed`` when the seed is outside
            the range NumPy accepts.
    """
    assert isinstance(seed, (int, float))
    assert logger is None or isinstance(logger, Logger)

    # Normalise the seed up front: np.random.seed() rejects floats, and the
    # original order of calls would seed `random` first and only then fail.
    if isinstance(seed, float):
        if not seed.is_integer():
            raise TypeError(f"seed must be a whole number, got {seed!r}")
        seed = int(seed)

    random.seed(seed)
    # Only affects hash randomisation of interpreters started AFTER this point
    # (child processes); the running interpreter's hashing is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    if logger is not None:
        logger.debug(f"[STATUS] Random generators reset with seed: {seed}")

def umap_scatter_plot_nd(df: pd.DataFrame, umap_components:np.ndarray, export_path:Path, color_by: str = 'CB1 Normalized to Max', hover_data = 'Compound', seed:int=42, logger:Logger=None, fname_identifier:str = 'umap'):
    """
    Scatter-plot the ``UMAP_1`` / ``UMAP_2`` columns of ``df`` and save the figure as a PNG.

    Rows are sorted by ``color_by`` before plotting, points are coloured by that
    column, and the image is written to ``export_path / f'{fname_identifier}.png'``.
    The output is a static image written with ``Figure.write_image`` (not an
    interactive HTML file).

    Args:
        df: Data to plot. Must contain ``UMAP_1``, ``UMAP_2`` and ``color_by``;
            columns named in ``hover_data`` are looked up in it by plotly.
        umap_components: Not used by the function body; kept in the signature so
            existing callers keep working.
        export_path: Output directory; created (including parents) if missing.
        color_by: Column used for sorting and for point colour.
        hover_data: Column name(s) handed to ``px.scatter(hover_data=...)``.
            A single string is treated as ONE column name.
        seed: Forwarded to ``seed_everything`` before the figure is built.
        logger: Optional logger forwarded to ``seed_everything``.
        fname_identifier: File-name stem of the saved image.

    Raises:
        AssertionError: If an argument has the wrong type, or ``color_by``,
            ``UMAP_1`` or ``UMAP_2`` is not a column of ``df``.
    """
    assert isinstance(df, pd.DataFrame)
    assert isinstance(export_path, Path)
    assert isinstance(color_by, str)
    assert isinstance(fname_identifier, str)
    assert logger is None or isinstance(logger, Logger)
    assert color_by in df.columns
    # The axes are hard-wired to these two columns, so fail early and clearly
    # instead of deep inside plotly when they are absent.
    assert {'UMAP_1', 'UMAP_2'} <= set(df.columns), "df must contain 'UMAP_1' and 'UMAP_2' columns"

    # `('Compound')` is just the string 'Compound' (no trailing comma, so not a
    # tuple). Wrap a bare string in a list so it is always read as one column
    # name; some plotly versions would otherwise iterate it character by character.
    if isinstance(hover_data, str):
        hover_data = [hover_data]

    seed_everything(seed = seed, logger=logger)
    fig1 = px.scatter(
            data_frame=df.sort_values(by=color_by),
            color=color_by,
            x = 'UMAP_1',
            y = 'UMAP_2',
            width = 1400,
            height= 1024,
            hover_data=hover_data,
            opacity = 0.8
        )

    export_path.mkdir(parents=True, exist_ok=True)
    save_filepath = export_path / f'{fname_identifier}.png'
    fig1.write_image(file=save_filepath)


In [None]:
# Preview a few of the per-row feature lists; every entry is a long list, so
# showing the first rows is enough to sanity-check the column.
df['features'].head()

In [None]:
# UMAP & Seed parameters
n_components = 2
min_dist = 1
seed = 42

# Swap the 0/1 codes for readable labels in the columns used for colouring and
# hover text. The numeric values needed for the embedding are read from the
# 'features' lists below, so relabelling these columns does not affect the fit.
label_maps = {
    'gender': {1: 'Female', 0: 'Male'},
    'smoking': {1: 'Yes', 0: 'No'},
    'tartar': {1: 'Yes', 0: 'No'},
    'dental caries': {1: 'Yes', 0: 'No'},
}
for label_column, code_to_label in label_maps.items():
    df[label_column] = df[label_column].replace(code_to_label)

# (n_rows, n_features) matrix rebuilt from the per-row lists, then standardised
# per feature so that no single column dominates the distances UMAP works with.
feature_arr = np.stack(df['features'].tolist(), axis=0)
scaled_feature_arr = preprocessing.StandardScaler().fit_transform(feature_arr)

# Pass the seed to UMAP itself: without random_state the embedding differs on
# every run, because nothing has seeded the generators before this fit.
# Note: a fixed random_state makes UMAP run without parallelism, so it is slower.
umap_model = umap.UMAP(n_components=n_components, min_dist=min_dist, random_state=seed)
umap_components = umap_model.fit_transform(scaled_feature_arr)

# Reuse df's index for the component frame so the join lines rows up by
# position even if df's index is not the default 0..n-1 range.
umap_subset_df = df.join(
    pd.DataFrame(
        umap_components,
        index=df.index,
        columns=[f"UMAP_{x}" for x in range(1, n_components + 1)],
    )
)



In [None]:
# Remove the list-valued helper column so it is not used as a colouring
# variable. errors='ignore' keeps the cell re-runnable: a second run no longer
# raises KeyError because 'features' is already gone.
df = df.drop(columns='features', errors='ignore')

# One PNG per remaining column, each colouring the same 2-D embedding by that
# column. Path() is the current working directory.
for column in df.columns:
    umap_scatter_plot_nd(df = umap_subset_df,
                        color_by = column,
                        umap_components=umap_components,
                        hover_data = ("smoking", 'gender', 'age'),
                        export_path = Path(),
                        seed=seed,
                        fname_identifier=f'smoking-umap-3features-{column}')
