In [None]:
import pandas as pd
from constants import FEATURE_COLS


# Load the cleaned array measurements from CSV.
df = pd.read_csv("data/cleaned_gene_array_data.csv")

# Reference subset: rows whose "type" column equals "gene", and their features.
gene_df = df.loc[df["type"] == "gene"]
gene_data = gene_df[FEATURE_COLS].values

# Feature matrix for every row, regardless of type.
df_data = df[FEATURE_COLS].values

# Bare expression so the frame is rendered as the cell output.
df

# Compute Outliers


In [None]:
from scipy.spatial import distance
import numpy as np


# Mean vector and covariance matrix of the reference (gene-only) feature data.
gene_mean = np.mean(gene_data, axis=0)
gene_cov = np.cov(gene_data, rowvar=False)

# Invert the covariance matrix once. It is the same for every row, so there is
# no reason to recompute it inside a per-row callback.
gene_cov_inv = np.linalg.inv(gene_cov)

# Mahalanobis distance from every row (all types) to the gene distribution,
# computed in a single vectorised call. cdist returns an (n_rows, 1) matrix
# because there is exactly one reference point, hence the ravel().
df["mahalanobis_distance"] = distance.cdist(
    df[FEATURE_COLS].to_numpy(dtype=float),
    np.atleast_2d(gene_mean),
    metric="mahalanobis",
    VI=gene_cov_inv,
).ravel()

# Fraction of gene rows allowed above the threshold (~2.84%). This is the same
# 100 / 3520 ratio used as `contamination` for the isolation forest.
# NOTE(review): presumably 100 expected outliers among 3520 rows - confirm.
gene_outlier_fraction = 100 / 3520

# Threshold is the ~97.16th percentile (not the 95th) of the distances of the
# gene rows only.
threshold = np.percentile(
    df.loc[df["type"] == "gene", "mahalanobis_distance"],
    100 - gene_outlier_fraction * 100,
)

# Flag every row whose distance is strictly above the gene-derived threshold.
df["mahalanobis_outlier"] = df["mahalanobis_distance"] > threshold

# Per-type summary: number of rows, number flagged, and percentage flagged.
outlier_summary = df.groupby("type")["mahalanobis_outlier"].agg(["count", "sum"])
outlier_summary["percentage"] = (
    outlier_summary["sum"] / outlier_summary["count"]
) * 100
print(outlier_summary)

In [None]:
from sklearn.ensemble import IsolationForest


# Fit the forest on the gene rows only, then label every row of df.
# Rows labelled -1 by predict() are treated as outliers below.
clf = IsolationForest(random_state=0, contamination=100 / 3520)
clf.fit(gene_data)
outlier_scores = clf.predict(df[FEATURE_COLS].values)

# Disabled alternative: fit and predict on all rows with a larger contamination.
# outlier_scores = IsolationForest(
#     random_state=0, contamination=((176) * 2 + 100) / 3520
# ).fit_predict(df[FEATURE_COLS].values)

df["isolation_forest_outlier"] = outlier_scores == -1

# Per-type summary: number of rows, number flagged, and percentage flagged.
outlier_summary = (
    df.groupby("type")["isolation_forest_outlier"]
    .agg(["count", "sum"])
    .assign(percentage=lambda counts: (counts["sum"] / counts["count"]) * 100)
)
print(outlier_summary)


In [None]:
import sklearn.neighbors


# Local Outlier Factor, fitted and evaluated on every row of df at once.
# Rows labelled -1 by fit_predict() are treated as outliers below.
outlier_scores = sklearn.neighbors.LocalOutlierFactor(
    contamination=0.15,
).fit_predict(df[FEATURE_COLS].values)

df["lof_outlier"] = outlier_scores == -1

# Per-type summary: number of rows, number flagged, and percentage flagged.
outlier_summary = (
    df.groupby("type")["lof_outlier"]
    .agg(["count", "sum"])
    .assign(percentage=lambda counts: (counts["sum"] / counts["count"]) * 100)
)
print(outlier_summary)


In [5]:
import pandas as pd


def create_dimension_reduction_df(
    transformed_data,
    n_components,
    prefix,
    df,
):
    """Build a DataFrame of reduced coordinates plus the non-feature columns.

    Args:
        transformed_data: 2-D array of projected data; only its first
            ``n_components`` columns are kept.
        n_components: Number of leading components to keep.
        prefix: Column-name prefix; components are named ``{prefix}1``,
            ``{prefix}2``, and so on.
        df: Source DataFrame. Every column not listed in ``FEATURE_COLS`` is
            copied over by position (via ``.values``), so its rows must be in
            the same order as ``transformed_data``.

    Returns:
        A new DataFrame with a fresh default index holding the component
        columns followed by the copied non-feature columns.
    """
    component_names = [f"{prefix}{idx}" for idx in range(1, n_components + 1)]
    result_df = pd.DataFrame(
        transformed_data[:, :n_components], columns=component_names
    )

    # Carry over everything that is not a feature column.
    metadata_names = [name for name in df.columns if name not in FEATURE_COLS]
    for name in metadata_names:
        result_df[name] = df[name].values

    return result_df

# Visualization


In [None]:
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import numpy as np
from plotting_utils import plot_3d_scatter


def perform_pca(X_scaled, plot_explained_variance=True):
    """Fit a PCA with all components kept and return the projected data.

    Args:
        X_scaled: Data matrix passed to ``PCA().fit_transform``.
        plot_explained_variance: When true, plot the cumulative explained
            variance ratio against the number of components.

    Returns:
        The transformed data only; the fitted PCA object is not returned.
    """
    pca = PCA()
    X_pca = pca.fit_transform(X_scaled)
    if not plot_explained_variance:
        return X_pca

    cumulative = np.cumsum(pca.explained_variance_ratio_)
    component_counts = range(1, len(cumulative) + 1)

    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(component_counts, cumulative, "bo-")
    ax.set_xlabel("Number of Components")
    ax.set_ylabel("Cumulative Explained Variance Ratio")
    ax.set_title("PCA Explained Variance Ratio")
    ax.grid(True)
    plt.show()
    return X_pca


# Metadata columns handed to the scatter plot as hover data.
HOVER_COLS = ["row", "col", "set"]

# NOTE(review): perform_pca calls its input X_scaled, but no scaling step is
# visible in this notebook section - confirm df_data is already scaled.
X_pca = perform_pca(df_data, plot_explained_variance=True)

# Keep the first three components and re-attach the non-feature columns of df.
pca_df = create_dimension_reduction_df(
    transformed_data=X_pca,
    n_components=3,
    prefix="PC",
    df=df,
)

# 3-D scatter of the leading components, using the isolation-forest flag
# column as the outlier column.
pca_fig = plot_3d_scatter(
    pca_df,
    "PC1",
    "PC2",
    "PC3",
    title="3D PCA Projection",
    hover_data=HOVER_COLS,
    type_col="type",
    outlier_col="isolation_forest_outlier",
)


In [None]:
# Apply UMAP for dimensionality reduction

from umap import UMAP


def perform_umap(data, n_components=3, random_state=42, n_neighbors=200, min_dist=0.8):
    """Fit a UMAP model on ``data`` and return the embedded coordinates.

    Args:
        data: Input matrix passed to ``UMAP.fit_transform``.
        n_components: Forwarded to ``UMAP`` unchanged.
        random_state: Forwarded to ``UMAP`` unchanged.
        n_neighbors: Forwarded to ``UMAP`` unchanged.
        min_dist: Forwarded to ``UMAP`` unchanged.

    Returns:
        The result of ``UMAP(...).fit_transform(data)``.
    """
    reducer_options = {
        "n_components": n_components,
        "random_state": random_state,
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
    }
    return UMAP(**reducer_options).fit_transform(data)


# Embed the full feature matrix with the default UMAP settings, then re-attach
# the non-feature columns of df to the three embedding coordinates.
X_umap = perform_umap(df_data)
umap_df = create_dimension_reduction_df(
    transformed_data=X_umap,
    n_components=3,
    prefix="UMAP",
    df=df,
)

# 3-D scatter of the embedding, using the isolation-forest flag column as the
# outlier column.
umap_fig = plot_3d_scatter(
    umap_df,
    "UMAP1",
    "UMAP2",
    "UMAP3",
    title="3D UMAP Projection",
    type_col="type",
    hover_data=HOVER_COLS,
    outlier_col="isolation_forest_outlier",
)
