In [33]:
# start coding here

In [34]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

import altair as alt
from altair_saver import save as alt_save

## Does the first principal component just align to the axis of overall metastatic potential?

In [35]:
# Tissue names used throughout the notebook: each one is the suffix of a
# "metp500.<tissue>" sheet name and becomes a column of the feature table.
metmap_tissues = ['brain', 'lung', 'liver', 'bone', 'kidney']

In [36]:
# Load the "metp500.all5" sheet; its first column becomes the row index.
mm_potential_path = snakemake.input['mm_potential']
all_df = pd.read_excel(mm_potential_path, sheet_name="metp500.all5", index_col=0)
all_df.head()

In [37]:
# Start an empty table that shares its row labels with all_df; the per-tissue
# columns are added in the next cell.
all_row_labels = all_df.index.values.tolist()
met_potential_df = pd.DataFrame(index=all_row_labels)

In [38]:
# Parse the workbook once for all tissue sheets instead of re-opening it for
# every tissue: given a list of sheet names, read_excel returns a dict that
# maps each sheet name to its DataFrame.
tissue_sheet_names = [f"metp500.{metmap_tissue}" for metmap_tissue in metmap_tissues]
tissue_dfs = pd.read_excel(
    snakemake.input['mm_potential'],
    sheet_name=tissue_sheet_names,
    index_col=0,
)

for metmap_tissue, tissue_sheet_name in zip(metmap_tissues, tissue_sheet_names):
    # Column assignment aligns the sheet's 'mean' values on the row index of
    # met_potential_df, so row order in the sheet does not matter.
    met_potential_df[metmap_tissue] = tissue_dfs[tissue_sheet_name]['mean']

In [39]:
# Preview the per-tissue table (one column per entry of metmap_tissues).
met_potential_df.head()

In [40]:
# PCA over every row of the per-tissue table; n_components=None keeps all components.
pca = PCA(n_components=None)
X = met_potential_df.values  # rows follow met_potential_df's index, columns its tissue columns
X_prime = pca.fit_transform(X)  # coordinates of each row in principal-component space

In [41]:
# Fraction of the total variance explained by each principal component.
pca.explained_variance_ratio_

In [42]:
# Singular values associated with each fitted component.
pca.singular_values_

In [46]:
# First two principal-component coordinates per row, annotated with the overall
# ("all5") mean and penetrance. The assigned Series are aligned on the row index.
# reset_index() then exposes the row labels as an "index" column for the charts.
pca_df = (
    pd.DataFrame(
        X_prime[:, 0:2],
        index=all_df.index.values.tolist(),
        columns=["PC1", "PC2"],
    )
    .assign(
        all5_mean=all_df["mean"],
        all5_penetrance=all_df["penetrance"],
        all5_penetrance_is_zero=(all_df["penetrance"] == 0.0),
    )
    .reset_index()
)

In [47]:
# Preview the PC1/PC2 coordinates together with the overall annotations.
pca_df.head()

In [48]:
# Number of rows (cell lines) whose overall penetrance is exactly zero.
(all_df["penetrance"] == 0.0).sum()

In [49]:
# Interval selection bound to the scales, so the chart can be panned and zoomed.
scales = alt.selection_interval(bind='scales')

# Colour by whether the overall penetrance is zero; the domain is fixed so that
# False/True always map to the same two colours.
penetrance_is_zero_color = alt.Color(
    "all5_penetrance_is_zero:N",
    legend=alt.Legend(title="Metastasis Penetrance is Zero"),
    scale=alt.Scale(domain=[False, True]),
)
penetrance_plot_tooltip = [
    alt.Tooltip("index:N", title="Cell Line"),
    alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential"),
    alt.Tooltip("all5_penetrance:Q", title="Overall Metastasis Penetrance"),
]

# PC2 on the x axis, PC1 on the y axis.
penetrance_plot = (
    alt.Chart(pca_df)
    .mark_circle(size=40, opacity=1)
    .encode(
        x=alt.X("PC2:Q"),
        y=alt.Y("PC1:Q"),
        color=penetrance_is_zero_color,
        tooltip=penetrance_plot_tooltip,
    )
    .add_selection(scales)
    .properties(width=500, height=500)
)
penetrance_plot

In [50]:
# Write the zero-penetrance scatter to the Snakemake output named 'pca_plot_1'.
alt_save(penetrance_plot, snakemake.output['pca_plot_1'])

In [52]:
# Interval selection bound to the scales, so the chart can be panned and zoomed.
scales = alt.selection_interval(bind='scales')

# Same PC2-vs-PC1 scatter as the penetrance chart, coloured by the continuous
# overall metastasis potential. The selection was previously created but never
# attached; it is now added, matching the other interactive scatters in this
# notebook.
alt.Chart(pca_df).mark_circle(size=40, opacity=1).encode(
    x=alt.X("PC2:Q"),
    y=alt.Y("PC1:Q"),
    color=alt.Color("all5_mean:Q", legend=alt.Legend(title="Overall Metastasis Potential")),
    tooltip=[
        alt.Tooltip("index:N", title="Cell Line"),
        alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential")
    ]
).add_selection(
    scales
).properties(
    width=500,
    height=500
)

## Plot the PCA loadings

In [53]:
# Principal axes of the full PCA fitted above (one row per component).
components = pca.components_

In [56]:
# Rows: principal components
# Cols: features (the columns of met_potential_df that were fed to the PCA)
components.shape

In [82]:
# Long-format table of loadings: one row per (principal component, feature) pair.
# All records are collected first and the frame is built once. Growing a
# DataFrame row by row with DataFrame.append copies the frame on every call and
# the method no longer exists in pandas >= 2.0.
feature_names = met_potential_df.columns.values.tolist()

loading_records = [
    {
        "PC": f"PC{i+1}",
        "feature": feature_names[j],
        "loading": components[i, j],
    }
    for i in range(components.shape[0])
    for j in range(components.shape[1])
]
loadings_df = pd.DataFrame(loading_records, columns=["PC", "feature", "loading"])

components[0,:] # how much each feature contributes to the first principal component

In [83]:
# Preview the long-format loadings table (columns: PC, feature, loading).
loadings_df.head()

From http://strata.uga.edu/8370/lecturenotes/principalComponents.html

> Because the sum of the squares of all loadings for an individual principal component must sum to one, we can calculate what the loadings would be if all variables contributed equally to that principal component. Any variable that has a larger loading than this value contributes more than one variable’s worth of information and would be regarded as an important contributor to that principal component.

In [117]:
# Equal-contribution benchmark: the squared loadings of a single principal
# component sum to one across the *features*, so the reference value is
# sqrt(1 / n_features). n_features is the number of columns of `components`;
# the number of rows (components) only gives the same value when every
# component is kept.
n_features = components.shape[1]
equal_contribution_loading = np.sqrt(1 / n_features)

# Flag loadings whose magnitude reaches the benchmark, and pre-format the
# numbers to three decimals for display as text marks.
loadings_df["large"] = loadings_df["loading"].apply(lambda a: abs(a) >= equal_contribution_loading)
loadings_df["loading_str"] = loadings_df["loading"].apply(lambda a: f"{a:.3f}")

# Text "heat table": features on the y axis, components on the x axis, with the
# "large" flag encoded as opacity (fixed False/True domain, no legend).
loadings_table = alt.Chart(loadings_df).mark_text(fontWeight=600, fontSize=16).encode(
    y=alt.Y("feature:O", axis=alt.Axis(title="Organ (for Metastasis Potential)")),
    x=alt.X("PC:O", axis=alt.Axis(orient="top", labelAngle=0, title="Principal Component")),
    text=alt.Text("loading_str:N"),
    opacity=alt.Opacity("large:N", scale=alt.Scale(domain=[False, True]), legend=None)
).properties(
    width=300,
    height=300
)

loadings_table

In [118]:
# Write the loadings table chart to the Snakemake output named "pca_plot_5".
alt_save(loadings_table, snakemake.output["pca_plot_5"])

## Plot the explained variance ratio

In [112]:
# One row per principal component. Note that "explained_variance_ratio" holds a
# percentage here (the ratio multiplied by 100), and "PC" is the label "PC<k>".
n_components_fitted = components.shape[0]
variance_df = pd.DataFrame(
    {
        "explained_variance_ratio": pca.explained_variance_ratio_ * 100,
        "PC": [f"PC{i+1}" for i in range(n_components_fitted)],
    },
    index=range(n_components_fitted),
)

In [113]:
# Preview the percent-variance table (one row per principal component).
variance_df.head()

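> A second criteria we should consider is which principal components explain more than one variable’s worth of information. Since we have eight variables, if each variable contributed equally, they would each contribute 12.5% to the total variance, indicated by the red line.

Note: the quoted example has eight variables. This analysis has five (one per tissue in `metmap_tissues`), so the equal-contribution level here is 100% / 5 = 20% of the total variance. The bar chart below does not draw this reference line.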

In [115]:
# Bar chart with one bar per principal component; bar height is the percent of
# variance that component explains.
variance_bars = alt.Chart(variance_df).mark_bar()
percent_variance_plot = variance_bars.encode(
    x=alt.X("PC:O"),
    y=alt.Y(
        "explained_variance_ratio:Q",
        axis=alt.Axis(title="Percent Variance Explained"),
    ),
)

percent_variance_plot

In [None]:
# Write the percent-variance bar chart to the Snakemake output named "pca_plot_6".
alt_save(percent_variance_plot, snakemake.output["pca_plot_6"])

In [18]:
# Interval selection bound to the scales, so the chart can be panned and zoomed.
scales = alt.selection_interval(bind='scales')

# Colour scale restricted to the interval [-4, -3].
# NOTE(review): the per-organ plots further down use domain=[-4, 3]; confirm
# that the narrow [-4, -3] range is intentional here and not a typo.
narrow_potential_color = alt.Color(
    "all5_mean:Q",
    legend=alt.Legend(title="Overall Metastasis Potential"),
    scale=alt.Scale(domain=[-4, -3]),
)
cell_line_tooltip = [
    alt.Tooltip("index:N", title="Cell Line"),
    alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential"),
]

(
    alt.Chart(pca_df)
    .mark_circle(size=40, opacity=1)
    .encode(
        x=alt.X("PC2:Q"),
        y=alt.Y("PC1:Q"),
        color=narrow_potential_color,
        tooltip=cell_line_tooltip,
    )
    .add_selection(scales)
    .properties(width=500, height=500)
)

# Filtering out cell lines with `penetrance == 0`

372 cell lines have non-zero overall penetrance

In [19]:
# Keep only the rows of the per-tissue table whose overall penetrance is not zero.
# The boolean mask comes from all_df and is matched to met_potential_df by index.
has_nonzero_penetrance = all_df['penetrance'] != 0
nonzero_potential_df = met_potential_df.loc[has_nonzero_penetrance]
nonzero_potential_df.head()

In [20]:
# Same non-zero-penetrance filter, applied to the overall ("all5") table.
nonzero_all_df = all_df.loc[all_df['penetrance'].ne(0)]

In [21]:
# Two-component PCA refitted on the non-zero-penetrance rows only.
# This rebinds X, pca and X_prime, replacing the full-data fit from earlier cells.
pca = PCA(n_components=2)
X = nonzero_potential_df.values
X_prime = pca.fit_transform(X)

In [22]:
# Fraction of variance explained by each of the two components of the refitted PCA.
pca.explained_variance_ratio_

In [23]:
# PC1/PC2 coordinates of the non-zero-penetrance rows, annotated with the overall
# mean and penetrance (aligned on the row index), left-joined with the per-tissue
# columns. reset_index() then moves the row labels into an "index" column.
nonzero_pca_df = (
    pd.DataFrame(
        X_prime,
        index=nonzero_potential_df.index.values.tolist(),
        columns=["PC1", "PC2"],
    )
    .assign(
        all5_mean=nonzero_all_df["mean"],
        all5_penetrance=nonzero_all_df["penetrance"],
    )
    .merge(nonzero_potential_df, how="left", left_index=True, right_index=True)
    .reset_index()
)

In [24]:
# Two PC2-vs-PC1 scatters of the non-zero-penetrance rows, shown side by side:
# the left panel is coloured by overall potential, the right by overall penetrance.

nonzero_potential_plot = alt.Chart(nonzero_pca_df).mark_circle(size=60, opacity=1).encode(
    x=alt.X("PC2:Q"),
    y=alt.Y("PC1:Q"),
    color=alt.Color("all5_mean:Q", legend=alt.Legend(title="Overall Metastasis Potential")),
    tooltip=[
        alt.Tooltip("index:N", title="Cell Line"),
        alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential")
    ]
).properties(
    title="Principal Components: Overall Metastasis Potential",
    width=500,
    height=500
)

# The penetrance panel previously reused the potential panel's title, and its
# tooltip did not show the value that drives the colour. Both are corrected.
nonzero_penetrance_plot = alt.Chart(nonzero_pca_df).mark_circle(size=60, opacity=1).encode(
    x=alt.X("PC2:Q"),
    y=alt.Y("PC1:Q"),
    color=alt.Color("all5_penetrance:Q", legend=alt.Legend(title="Overall Metastasis Penetrance")),
    tooltip=[
        alt.Tooltip("index:N", title="Cell Line"),
        alt.Tooltip("all5_mean:Q", title="Overall Metastasis Potential"),
        alt.Tooltip("all5_penetrance:Q", title="Overall Metastasis Penetrance")
    ]
).properties(
    title="Principal Components: Overall Metastasis Penetrance",
    width=500,
    height=500
)

# Each panel keeps its own colour scale, since the two fields are different quantities.
alt.hconcat(nonzero_potential_plot, nonzero_penetrance_plot).resolve_scale(
    color='independent'
)

In [25]:
# Tooltip shared by the charts in this cell and by the per-organ plotting
# functions defined in later cells.
tooltip = [
    alt.Tooltip("index:N", title="Cell Line"),
    alt.Tooltip("all5_mean:Q", title="Potential"),
    alt.Tooltip("all5_penetrance:Q", title="Penetrance")
]


def _pc_vs_metric_plot(pc, metric_field, metric_label):
    """Scatter of one principal component against one overall metric.

    pc: column of nonzero_pca_df for the y axis ("PC1" or "PC2").
    metric_field: column of nonzero_pca_df used for both the x axis and the colour.
    metric_label: human-readable metric name for the legend and chart title.
    Returns a 400x400 Altair chart.
    """
    points = alt.Chart(nonzero_pca_df).mark_circle(size=60, opacity=1)
    return points.encode(
        x=alt.X(f"{metric_field}:Q"),
        y=alt.Y(f"{pc}:Q"),
        color=alt.Color(f"{metric_field}:Q", legend=alt.Legend(title=metric_label)),
        tooltip=tooltip,
    ).properties(
        title=f"{pc} vs. {metric_label}",
        width=400,
        height=400,
    )


nonzero_potential_pc1_plot = _pc_vs_metric_plot("PC1", "all5_mean", "Potential")
nonzero_penetrance_pc1_plot = _pc_vs_metric_plot("PC1", "all5_penetrance", "Penetrance")
nonzero_potential_pc2_plot = _pc_vs_metric_plot("PC2", "all5_mean", "Potential")
nonzero_penetrance_pc2_plot = _pc_vs_metric_plot("PC2", "all5_penetrance", "Penetrance")

# 2x2 grid: the top row is PC1, the bottom row is PC2; colour scales are kept
# independent at every level of the concatenation.
pc1_row = alt.hconcat(nonzero_potential_pc1_plot, nonzero_penetrance_pc1_plot).resolve_scale(
    color='independent'
)
pc2_row = alt.hconcat(nonzero_potential_pc2_plot, nonzero_penetrance_pc2_plot).resolve_scale(
    color='independent'
)
pc1_and_pc2_plot = alt.vconcat(pc1_row, pc2_row).resolve_scale(
    color='independent'
)

pc1_and_pc2_plot

In [26]:
# Write the 2x2 PC-vs-metric grid to the Snakemake output named 'pca_plot_2'.
alt_save(pc1_and_pc2_plot, snakemake.output['pca_plot_2'])

In [27]:
def get_tissue_plot(tissue):
    """Build a scatter of PC1 against one tissue column of ``nonzero_pca_df``.

    Reads the module-level ``nonzero_pca_df`` and ``tooltip`` at call time.

    Parameters
    ----------
    tissue : str
        Name of the ``nonzero_pca_df`` column to place on the x axis; it is also
        interpolated into the chart title.

    Returns
    -------
    A 300x300 Altair chart coloured by ``all5_mean`` on a fixed [-4, 3] scale.
    """
    overall_color = alt.Color(
        "all5_mean:Q",
        legend=alt.Legend(title="Overall Potential"),
        scale=alt.Scale(domain=[-4, 3]),
    )
    points = alt.Chart(nonzero_pca_df).mark_circle(size=40, opacity=1)
    encoded = points.encode(
        x=alt.X(f"{tissue}:Q"),
        y=alt.Y("PC1:Q"),
        color=overall_color,
        tooltip=tooltip,
    )
    return encoded.properties(
        title=f"PC1 vs. {tissue} potential",
        width=300,
        height=300,
    )

# One PC1-vs-tissue scatter per organ, laid out in a single row.
brain_plot, lung_plot, liver_plot, bone_plot, kidney_plot = (
    get_tissue_plot(organ) for organ in ("brain", "lung", "liver", "bone", "kidney")
)

pc1_by_organ_plot = alt.hconcat(
    brain_plot,
    lung_plot,
    liver_plot,
    bone_plot,
    kidney_plot,
)

pc1_by_organ_plot

In [28]:
# Write the row of PC1-vs-organ scatters to the Snakemake output named 'pca_plot_3'.
alt_save(pc1_by_organ_plot, snakemake.output['pca_plot_3'])

In [29]:
def get_overall_by_tissue_plot(tissue):
    """Build a scatter of the overall potential against one tissue column.

    Reads the module-level ``nonzero_pca_df`` and ``tooltip`` at call time.

    Parameters
    ----------
    tissue : str
        Name of the ``nonzero_pca_df`` column to place on the x axis; it is also
        interpolated into the chart title.

    Returns
    -------
    A 300x300 Altair chart with ``all5_mean`` on the y axis and as the colour,
    using a fixed [-4, 3] colour scale.
    """
    overall_color = alt.Color(
        "all5_mean:Q",
        legend=alt.Legend(title="Overall Potential"),
        scale=alt.Scale(domain=[-4, 3]),
    )
    points = alt.Chart(nonzero_pca_df).mark_circle(size=40, opacity=1)
    encoded = points.encode(
        x=alt.X(f"{tissue}:Q"),
        y=alt.Y("all5_mean:Q", axis=alt.Axis(title="Overall Potential")),
        color=overall_color,
        tooltip=tooltip,
    )
    return encoded.properties(
        title=f"Overall vs. {tissue} potential",
        width=300,
        height=300,
    )

# One overall-vs-tissue scatter per organ, laid out in a single row.
# This rebinds brain_plot ... kidney_plot, which an earlier cell used for the PC1 plots.
brain_plot, lung_plot, liver_plot, bone_plot, kidney_plot = (
    get_overall_by_tissue_plot(organ) for organ in ("brain", "lung", "liver", "bone", "kidney")
)

overall_by_organ_plot = alt.hconcat(
    brain_plot,
    lung_plot,
    liver_plot,
    bone_plot,
    kidney_plot,
)

overall_by_organ_plot

In [30]:
# Write the row of overall-vs-organ scatters to the Snakemake output named 'pca_plot_4'.
alt_save(overall_by_organ_plot, snakemake.output['pca_plot_4'])