In [112]:
# start coding here

In [113]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

import altair as alt
from altair_saver import save as alt_save

## Does the first principal component simply align with the axis of overall metastatic potential?

In [114]:
# Tissues that have their own sheet in the workbook; each name is later
# substituted into a "metp500.<tissue>" sheet name.
metmap_tissues = ['brain', 'lung', 'liver', 'bone', 'kidney']

In [115]:
# Load the combined "all5" sheet, using its first column as the row index.
all_df = pd.read_excel(
    snakemake.input['mm_potential'],
    sheet_name="metp500.all5",
    index_col=0,
)
all_df.head()

In [116]:
# Start from an empty frame that shares the row labels of the combined sheet;
# per-tissue columns are attached to it afterwards.
met_potential_df = pd.DataFrame(
    index=all_df.index.values.tolist(),
)

In [117]:
# Read every per-tissue sheet in one call instead of re-opening and re-parsing
# the workbook once per tissue. Passing a list of sheet names makes
# read_excel return a dict of DataFrames keyed by sheet name.
tissue_sheet_names = {tissue: f"metp500.{tissue}" for tissue in metmap_tissues}
tissue_sheets = pd.read_excel(
    snakemake.input['mm_potential'],
    sheet_name=list(tissue_sheet_names.values()),
    index_col=0,
)

for metmap_tissue, sheet_name in tissue_sheet_names.items():
    tissue_df = tissue_sheets[sheet_name]
    # Column assignment aligns on the index, so each tissue's "mean" values
    # are matched by row label to the existing rows of met_potential_df.
    met_potential_df[metmap_tissue] = tissue_df['mean']

In [118]:
# Preview the per-tissue table (one "mean" column per tissue).
met_potential_df.head()

In [119]:
# Project the per-tissue matrix (rows follow met_potential_df's index) onto
# its first two principal components.
pca = PCA(n_components=2)
X = met_potential_df.values
X_prime = pca.fit_transform(X)

In [120]:
# Share of total variance captured by each of the two retained components.
pca.explained_variance_ratio_

In [121]:
# Singular values corresponding to the two retained components.
pca.singular_values_

In [122]:
# Tidy table for plotting: PCA coordinates plus the combined-sheet summary
# columns, joined by row label. reset_index() turns the row labels into an
# "index" column so the charts can reference them.
pca_df = (
    pd.DataFrame(
        X_prime,
        index=all_df.index.values.tolist(),
        columns=["PC1", "PC2"],
    )
    .assign(
        all5_mean=all_df["mean"],
        all5_penetrance=all_df["penetrance"],
        all5_penetrance_is_zero=all_df["penetrance"].eq(0.0),
    )
    .reset_index()
)

In [123]:
# Preview the plotting table (PC1/PC2 plus the all5 summary columns).
pca_df.head()

In [124]:
# Number of rows whose combined-sheet penetrance is not zero.
all_df["penetrance"].ne(0.0).sum()

In [145]:
# An interval selection bound to the scales makes the chart pan- and zoomable.
scales = alt.selection_interval(bind='scales')

# PC2 on x, PC1 on y, coloured by whether overall penetrance is exactly zero.
# The colour domain is pinned so False/True always appear in that order.
penetrance_plot = (
    alt.Chart(pca_df)
    .mark_circle(size=40, opacity=1)
    .encode(
        x=alt.X("PC2:Q"),
        y=alt.Y("PC1:Q"),
        color=alt.Color(
            "all5_penetrance_is_zero:N",
            legend=alt.Legend(title="Metastasis Penetrance is Zero"),
            scale=alt.Scale(domain=[False, True]),
        ),
        tooltip=[
            alt.Tooltip("index:N", title="Cell Line"),
            alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential"),
            alt.Tooltip("all5_penetrance:Q", title="Overall Metastasis Penetrance"),
        ],
    )
    .add_selection(scales)
    .properties(width=500, height=500)
)
penetrance_plot

In [126]:
# Write the penetrance-coloured chart to the path Snakemake supplies as the
# "pca_plot" output.
alt_save(penetrance_plot, snakemake.output['pca_plot'])

In [144]:
# An interval selection bound to the scales makes the chart pan- and zoomable.
scales = alt.selection_interval(bind='scales')

# PC2 on x, PC1 on y, coloured by the continuous overall potential (all5_mean).
(
    alt.Chart(pca_df)
    .mark_circle(size=40, opacity=1)
    .encode(
        x=alt.X("PC2:Q"),
        y=alt.Y("PC1:Q"),
        color=alt.Color(
            "all5_mean:Q",
            legend=alt.Legend(title="Overall Metastasis Potential"),
        ),
        tooltip=[
            alt.Tooltip("index:N", title="Cell Line"),
            alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential"),
        ],
    )
    .add_selection(scales)
    .properties(width=500, height=500)
)

In [146]:
# An interval selection bound to the scales makes the chart pan- and zoomable.
scales = alt.selection_interval(bind='scales')

# Coloured by all5_mean, with the colour scale domain fixed to [-4, -3]
# rather than inferred from the data.
# NOTE(review): presumably chosen to spread the colour ramp over the low end
# of all5_mean - confirm, and consider naming the bounds as constants.
(
    alt.Chart(pca_df)
    .mark_circle(size=40, opacity=1)
    .encode(
        x=alt.X("PC2:Q"),
        y=alt.Y("PC1:Q"),
        color=alt.Color(
            "all5_mean:Q",
            legend=alt.Legend(title="Overall Metastasis Potential"),
            scale=alt.Scale(domain=[-4, -3]),
        ),
        tooltip=[
            alt.Tooltip("index:N", title="Cell Line"),
            alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential"),
        ],
    )
    .add_selection(scales)
    .properties(width=500, height=500)
)

# Filtering out cell lines with `penetrance == 0`

372 cell lines have non-zero overall penetrance

In [129]:
# Keep only the rows whose combined-sheet penetrance is non-zero; the boolean
# mask comes from all_df and is applied to the per-tissue table by row label.
has_nonzero_penetrance = all_df['penetrance'].ne(0)
nonzero_potential_df = met_potential_df.loc[has_nonzero_penetrance]
nonzero_potential_df.head()

In [130]:
# Same non-zero-penetrance filter, applied to the combined sheet itself.
nonzero_all_df = all_df.loc[all_df['penetrance'].ne(0)]

In [131]:
# Refit a two-component PCA on the filtered rows only. This rebinds X, pca and
# X_prime, so the fit on the full table is no longer reachable after this cell.
pca = PCA(n_components=2)
X = nonzero_potential_df.values
X_prime = pca.fit_transform(X)

In [132]:
# Share of total variance captured by each of the two retained components.
pca.explained_variance_ratio_

In [133]:
# Plotting table for the filtered rows: PCA coordinates plus the matching
# combined-sheet summary columns, joined by row label. reset_index() turns the
# row labels into an "index" column so the charts can reference them.
nonzero_pca_df = (
    pd.DataFrame(
        X_prime,
        index=nonzero_potential_df.index.values.tolist(),
        columns=["PC1", "PC2"],
    )
    .assign(
        all5_mean=nonzero_all_df["mean"],
        all5_penetrance=nonzero_all_df["penetrance"],
    )
    .reset_index()
)

In [148]:
# An interval selection bound to the scales makes the chart pan- and zoomable.
scales = alt.selection_interval(bind='scales')

# Filtered rows only: PC2 on x, PC1 on y, coloured by overall potential.
(
    alt.Chart(nonzero_pca_df)
    .mark_circle(size=60, opacity=1)
    .encode(
        x=alt.X("PC2:Q"),
        y=alt.Y("PC1:Q"),
        color=alt.Color(
            "all5_mean:Q",
            legend=alt.Legend(title="Overall Metastasis Potential"),
        ),
        tooltip=[
            alt.Tooltip("index:N", title="Cell Line"),
            alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential"),
        ],
    )
    .add_selection(scales)
    .properties(
        title="Principal Components: Overall Metastasis Potential",
        width=500,
        height=500,
    )
)

In [147]:
# An interval selection bound to the scales makes the chart pan- and zoomable.
scales = alt.selection_interval(bind='scales')

# Filtered rows only: PC2 on x, PC1 on y, coloured by overall penetrance.
# The title and tooltip now name the penetrance value this chart encodes;
# previously both were copied from the potential-coloured chart, so the two
# charts carried the same title and hovering never showed the coloured value.
alt.Chart(nonzero_pca_df).mark_circle(size=60, opacity=1).encode(
    x=alt.X("PC2:Q"),
    y=alt.Y("PC1:Q"),
    color=alt.Color("all5_penetrance:Q", legend=alt.Legend(title="Overall Metastasis Penetrance")),
    tooltip=[
        alt.Tooltip("index:N", title="Cell Line"),
        alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential"),
        alt.Tooltip("all5_penetrance:Q", title="Overall Metastasis Penetrance")
    ]
).add_selection(
    scales
).properties(
    title="Principal Components: Overall Metastasis Penetrance",
    width=500,
    height=500
)