In [1]:
import pandas as pd

# Load the input table from a CSV file resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="all_sets_df.csv")


In [None]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output.
df

In [None]:
# Columns excluded from the feature matrix.
METADATA_COLS = [
    "type",
    "row",
    "col",
    "well",
    "set",
    "type.f",
    "slide",
]

# Every column of df that is not listed in METADATA_COLS is used as a feature.
FEATURE_COLS = [name for name in df.columns if name not in METADATA_COLS]
# Columns passed to the plotting helpers as hover information.
HOVER_COLS = ["row", "col", "well", "set"]

# Values of the "type" column counted as treatments; every other observed
# value is counted as a control.
TREATMENTS = {"gene"}
CONTROLS = set(df["type"]) - TREATMENTS
print(f"Unique types: {set(df['type'])}")
print(f"Treatments: {TREATMENTS}")
print(f"Controls: {CONTROLS}")

In [4]:
import plotly.graph_objects as go


def plot_3d_scatter(
    df, x_col, y_col, z_col, type_col="type", title="3D Scatter Plot", hover_data=None
):
    """
    Create, display, and return a 3D scatter plot with one trace per category.

    Each distinct value of ``df[type_col]`` (including a missing value, if any)
    becomes its own ``Scatter3d`` trace, so every category gets its own legend
    entry. Hover boxes are driven by ``customdata``.

    Parameters:
    -----------
    df : pandas DataFrame
        The data to plot
    x_col, y_col, z_col : str
        Column names for the x, y, and z axes
    type_col : str, default="type"
        Column name for the categorical variable to split traces by
    title : str, default="3D Scatter Plot"
        Title for the plot
    hover_data : sequence of str, default=None
        Column names to include in hover information (e.g., ["row", "col", "well"]).
        ``type_col`` is always shown and is prepended when not already listed.
        The caller's sequence is never modified.

    Returns:
    --------
    plotly.graph_objects.Figure
        The figure, after ``fig.show()`` has been called on it.

    Raises:
    -------
    KeyError
        If any of the requested columns is missing from ``df``.
    """
    # Work on a private list: accepts any sequence (list, tuple, ...) and
    # guarantees the caller's object is left untouched.
    hover_cols = [] if hover_data is None else list(hover_data)

    # Always include type_col in the hover info, listed first.
    if type_col not in hover_cols:
        hover_cols.insert(0, type_col)

    # The template depends only on the column list, so build it once instead
    # of once per trace. Entry i of customdata corresponds to hover_cols[i].
    hover_lines = [
        f"<b>{col_name}</b>: %{{customdata[{i}]}}<br>"
        for i, col_name in enumerate(hover_cols)
    ]
    # <extra></extra> hides the default trace info box like 'trace 0'.
    hovertemplate = "".join(hover_lines) + "<extra></extra>"

    fig = go.Figure()
    type_values = df[type_col]

    # One trace per category, in order of first appearance.
    for type_name in type_values.unique():
        if pd.isna(type_name):
            # NaN never compares equal to itself, so an equality mask would
            # select nothing and silently drop the rows with a missing type.
            mask = type_values.isna()
        else:
            mask = type_values == type_name
        subset = df[mask]

        fig.add_trace(
            go.Scatter3d(
                x=subset[x_col],
                y=subset[y_col],
                z=subset[z_col],
                mode="markers",
                name=str(type_name),  # Ensure name is a string
                marker=dict(size=6, opacity=0.7),
                customdata=subset[hover_cols].values,
                hovertemplate=hovertemplate,
            )
        )

    # Update layout
    fig.update_layout(
        title=title,
        scene=dict(
            xaxis_title=x_col,
            yaxis_title=y_col,
            zaxis_title=z_col,
        ),
        width=1200,
        height=1200,
        showlegend=True,
    )
    fig.show()
    return fig


def save_plot(fig, filename):
    """Export ``fig`` to ``filename`` by calling its ``write_html`` method."""
    export_html = fig.write_html
    export_html(filename)

In [5]:
import pandas as pd


def create_dimension_reduction_df(
    transformed_data,
    n_components,
    prefix="UMAP",
    original_df=None,
    type_col="type",
    hover_data=None,
):
    """Build a DataFrame from a dimensionality-reduction result plus optional metadata.

    Works for any 2-D embedding (PCA, UMAP, ...); ``prefix`` only controls the
    column names.

    Parameters
    ----------
    transformed_data : 2-D array
        Embedding with one row per sample; only the first ``n_components``
        columns are kept.
    n_components : int
        Number of leading components to keep.
    prefix : str, default="UMAP"
        Component columns are named ``f"{prefix}1"``, ``f"{prefix}2"``, ...
    original_df : pandas DataFrame, default=None
        Source of the metadata columns. Its rows must be in the same order as
        the rows of ``transformed_data``; metadata is copied by position, so
        the index labels of ``original_df`` do not matter.
    type_col : str, default="type"
        Column of ``original_df`` to copy (required when ``original_df`` is given).
    hover_data : list of str, default=None
        Additional columns of ``original_df`` to copy.

    Returns
    -------
    pandas DataFrame
        Component columns followed by ``type_col`` and the ``hover_data``
        columns, on a default integer index.

    Raises
    ------
    ValueError
        If ``type_col`` or a ``hover_data`` column is missing from
        ``original_df``, or if ``original_df`` and ``transformed_data`` have a
        different number of rows.
    """
    transformed_data = transformed_data[:, :n_components]
    columns = [f"{prefix}{i + 1}" for i in range(n_components)]
    result_df = pd.DataFrame(transformed_data, columns=columns)

    if original_df is None:
        return result_df

    if len(original_df) != len(result_df):
        raise ValueError(
            f"original_df has {len(original_df)} rows but transformed_data has "
            f"{len(result_df)} rows."
        )

    if type_col not in original_df.columns:
        raise ValueError(f"type_col '{type_col}' not found in original DataFrame.")

    # Assigning a Series aligns on index labels; result_df always has a fresh
    # 0..n-1 index, so a filtered or re-indexed original_df would yield NaN or
    # misplaced metadata. Dropping the source index makes the copy positional
    # while preserving the column dtype.
    result_df[type_col] = original_df[type_col].reset_index(drop=True)

    for col in hover_data or []:
        if col not in original_df.columns:
            raise ValueError(
                f"hover_data column '{col}' not found in original DataFrame."
            )
        result_df[col] = original_df[col].reset_index(drop=True)

    return result_df

In [6]:
from sklearn.preprocessing import StandardScaler


def preprocess_data(df, feature_cols):
    """Select the feature columns of ``df`` and standardize them.

    Returns a tuple ``(X, X_scaled)``: the selected, unscaled feature
    sub-frame and the result of ``StandardScaler().fit_transform`` on it.
    """
    features = df[feature_cols]
    standardized = StandardScaler().fit_transform(features)
    return features, standardized


# Raw and standardized feature matrices built from the FEATURE_COLS columns of df.
X, X_scaled = preprocess_data(df=df, feature_cols=FEATURE_COLS)

In [None]:
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import numpy as np


def perform_pca(X_scaled):
    """Fit a default-configured PCA on ``X_scaled``.

    Returns a tuple ``(X_pca, pca)``: the projected data and the fitted
    ``PCA`` estimator.
    """
    model = PCA()
    projected = model.fit_transform(X_scaled)
    return projected, model


def plot_explained_variance(pca):
    """Draw the cumulative explained-variance curve of a fitted PCA.

    Reads ``pca.explained_variance_ratio_``, plots its running sum against the
    1-based component count, shows the figure, and returns the running sum.
    """
    ratios = pca.explained_variance_ratio_
    cumulative = np.cumsum(ratios)
    component_counts = range(1, len(ratios) + 1)

    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(component_counts, cumulative, "bo-")
    ax.set_xlabel("Number of Components")
    ax.set_ylabel("Cumulative Explained Variance Ratio")
    ax.set_title("PCA Explained Variance Ratio")
    ax.grid(True)
    plt.show()

    return cumulative


# Fit PCA on the standardized features and show the cumulative variance curve.
X_pca, pca = perform_pca(X_scaled)
plot_explained_variance(pca)


# Keep the first three principal components together with the type and hover columns.
pca_df = create_dimension_reduction_df(
    transformed_data=X_pca,
    n_components=3,
    prefix="PC",
    original_df=df,
    type_col="type",
    hover_data=HOVER_COLS,
)

# One trace per value of "type"; HOVER_COLS are the extra columns shown on
# hover, in addition to the type column itself.
pca_fig = plot_3d_scatter(
    df=pca_df,
    x_col="PC1",
    y_col="PC2",
    z_col="PC3",
    type_col="type",
    title="3D PCA Projection",
    hover_data=HOVER_COLS,
)

save_plot(fig=pca_fig, filename="pca_3d_plot.html")


In [None]:
# Apply UMAP for dimensionality reduction

from umap import UMAP


def perform_umap(data, n_components=3, random_state=42, n_neighbors=200, min_dist=0.8):
    """Embed ``data`` with UMAP.

    The keyword arguments are forwarded unchanged to the ``UMAP`` constructor.
    Returns a tuple ``(umap_result, umap_model)``: the embedding produced by
    ``fit_transform`` and the fitted ``UMAP`` instance.
    """
    reducer = UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=random_state,
    )
    embedding = reducer.fit_transform(data)
    return embedding, reducer


# Embed the standardized features with UMAP (function defaults) and attach
# the type and hover columns to the three embedding coordinates.
X_umap, umap_model = perform_umap(data=X_scaled)
umap_df = create_dimension_reduction_df(
    transformed_data=X_umap,
    n_components=3,
    prefix="UMAP",
    original_df=df,
    hover_data=HOVER_COLS,
)

# Interactive 3D view of the embedding, one trace per value of "type".
umap_fig = plot_3d_scatter(
    df=umap_df,
    x_col="UMAP1",
    y_col="UMAP2",
    z_col="UMAP3",
    type_col="type",
    title="3D UMAP Projection",
    hover_data=HOVER_COLS,
)

save_plot(fig=umap_fig, filename="umap_3d_plot.html")
