In [1]:
# If installed from pip, import lostruct as ls will work
import lostruct.lostruct as ls

# PCoA from skbio.stats is the best implementation of R's MDS algorithm
from skbio.stats.ordination import pcoa

# Much of the output from CyVCF2 and lostruct are numpy arrays
import numpy as np

import pandas as pd
import plotly.express as px
from sklearn.manifold import MDS
import umap
import hdbscan
import plotly.io as pio
# Select the default plotly renderer used by every figure in this notebook.
# NOTE(review): "notebook_connected" is understood to load plotly.js from a CDN, so figures
# need an internet connection when the notebook is viewed — confirm against the plotly docs.
pio.renderers.default = "notebook_connected"

In [2]:
# Two VCF utility functions are provided: get_samples() and get_landmarks()

# Samples are returned in the same order the resulting data will be in
samples = ls.get_samples("chr1-filtered.vcf.gz")
samples[0:5]

['HM017-I', 'HM018', 'HM022-I', 'HM029', 'HM030']

In [3]:
# Utility function: get the list of landmarks (chromosomes, scaffolds, contigs, etc.)
# present in the VCF file, then preview the first five
landmarks = ls.get_landmarks("chr1-filtered.vcf.gz")
landmarks[0:5]

['chl_Mt', 'chr1', 'chr2', 'chr3', 'chr4']

In [4]:
# Docstrings are also provided; help() prints the one for get_samples
help(ls.get_samples)

Help on function get_samples in module lostruct.lostruct:

get_samples(vcf_file)
    Get the samples from a VCF/BCF file. This is the order the data will remain in as well.



In [5]:
# Split the chr1 SNPs into windows and collect the position of each SNP within each window.
# ls.Window.SNP means window sizes are SNP counts (95 SNPs per window here);
# ls.Window.BP would mean window sizes are base pair lengths.
#
# *** ls.Window.BP is not yet implemented, however. ***
# Please see: https://github.com/jguhlin/lostruct-py/issues/8
windows, positions = ls.parse_vcf(
    "chr1-filtered.vcf.gz",
    "chr1",
    95,
    ls.Window.SNP,
)

# Run eigen_windows on every window and stack the per-window outputs into one numpy array.
# The third argument is the slot where the weighted example at the end of this notebook
# passes per-sample weights; a constant 1 is used here.
result = np.vstack([ls.eigen_windows(window, 10, 1) for window in windows])

# PCA distances comparison matrix between the windows
pc_dists = ls.get_pc_dists(result)

# PCoA of the pc_dists matrix (this is equivalent to R's MDS)
# PLEASE NOTE: See section below: Working with Large Datasets
# for recommended ways to run pcoa
mds = pcoa(pc_dists)


In [6]:
# Only y is passed; per the title, the x-axis is the window order
px.scatter(y=mds.samples["PC1"], title="MDS Coordinate 1 (y-axis) compared to Window (x-axis)")

In [7]:
# Same view for the second coordinate: only y is passed, the x-axis is the window order
px.scatter(y=mds.samples["PC2"], title="MDS Coordinate 2 (y-axis) compared to Window (x-axis)")

In [8]:
# First two coordinates against each other; each point is one window
px.scatter(x=mds.samples["PC1"], y=mds.samples["PC2"], title="MDS Coordinate 1 (x-axis) and MDS Coordinate 2 (y-axis)")

## Performing Analysis Genome-Wide

In [None]:
# A single VCF path drives both the landmark listing and the parsing, so the landmarks
# can never come from a different file than the one being parsed.
# "my.vcf.gz" is a placeholder: point it at your own VCF.
vcf_path = "my.vcf.gz"
landmarks = ls.get_landmarks(vcf_path)

results = list()
snp_positions = list()

for landmark in landmarks:
    # Window type is passed explicitly, matching the single-chromosome example
    # (windows of 95 SNPs).
    windows, positions = ls.parse_vcf(vcf_path, landmark, 95, ls.Window.SNP)
    # Pair each window with its SNP positions so both lists stay aligned:
    # len(snp_positions) == len(results).
    for window, window_positions in zip(windows, positions):
        results.append(ls.eigen_windows(window, 10, 1))
        snp_positions.append(window_positions)

While the above will not work due to a missing file, it is the appropriate way to get the results for each window for all landmarks (chromosomes, scaffolds, contigs, etc...). Here, we keep track of snp_positions as well, and len(snp_positions) == len(results) so they can be further investigated.

The code will then remain the same:

In [None]:
# Stack the per-window eigen_windows outputs into a single numpy array
# (results is rebound from a list to an array here)
results = np.vstack(results)

# Get PCA distances comparison matrix
pc_dists = ls.get_pc_dists(results)

# Get PCoA value of pc_dists matrix (this is equivalent to R's MDS)
mds = pcoa(pc_dists)

## Comparison to R Version

In [11]:
# Load the MDS coordinates stored under lostruct-results/ for the R comparison and
# correlate their first coordinate with the first PCoA axis computed in Python.
mds_coords = pd.read_csv("lostruct-results/mds_coords.csv")
np.corrcoef(mds.samples['PC1'], mds_coords['MDS1'].to_numpy())[0, 1]
# R-value is:

0.9978494016481024

In [12]:
# Python PC1 (x) against the R MDS1 coordinate (y), one point per window
px.scatter(x=mds.samples["PC1"], y=mds_coords['MDS1'])

## Working with Large Datasets

In [13]:
# PCoA for reduced memory consumption and faster clustering.
# inplace=True allows pcoa to center the matrix it is handed in place. Other cells in this
# notebook reuse pc_dists as a distance matrix, so a copy is passed here to keep pc_dists
# intact. For a genuinely large dataset, pass pc_dists itself (no copy) and do not reuse
# it afterwards.
mds = pcoa(np.array(pc_dists), method="fsvd", inplace=True, number_of_dimensions=10)
# The sign of an ordination axis is arbitrary, so only the magnitude of this correlation
# matters (it is negative here but positive for the default method).
np.corrcoef(mds.samples["PC1"], mds_coords['MDS1'].to_numpy())[0][1]

-0.9978242061901393

In [14]:
# Overlay both series against window order: the fsvd PCoA PC1 and the R MDS1 coordinate.
# NOTE(review): the title is empty — consider naming the two series so the figure stands alone.
px.scatter(y=[mds.samples["PC1"], mds_coords['MDS1']], title="")

In [15]:
# fsvd PCoA PC1 (x) against the R MDS1 coordinate (y), one point per window
px.scatter(x=mds.samples["PC1"], y=mds_coords['MDS1'])

# A look at other methods of clustering / comparing

In [16]:
# scikit-learn MDS on the precomputed window distance matrix.
# The fit starts from random configurations (n_init=32 restarts); random_state pins them
# so the embedding is reproducible across Restart & Run All.
embedding = MDS(n_components=10, dissimilarity="precomputed", n_jobs=-1, n_init=32, random_state=42)
# NOTE: mds is a plain numpy array here (not a pcoa result) until a later cell reassigns it.
mds = embedding.fit_transform(pc_dists)
px.scatter(y=[mds[:,0], mds_coords['MDS1']], title="Blue is using Python MDS, Red is PCoA method")

In [17]:
# NOTE(review): phate is imported here rather than in the first cell with the other imports,
# so it is only required when this cell is run.
import phate
# PHATE on the precomputed distance matrix: 10 components, metric MDS with the smacof solver
phater = phate.PHATE(n_components=10, knn_dist='precomputed', mds_solver='smacof', mds='metric')
comparison_phate = phater.fit_transform(pc_dists)

Calculating PHATE...
  Running PHATE on precomputed distance matrix with 124 observations.
  Calculating graph and diffusion operator...
    Calculating affinities...
  Calculated graph and diffusion operator in 0.02 seconds.
  Calculating optimal t...
    Automatically selected t = 12
  Calculated optimal t in 0.02 seconds.
  Calculating diffusion potential...
  Calculating metric MDS...
  Calculated metric MDS in 0.55 seconds.
Calculated PHATE in 0.60 seconds.


In [18]:
# Recompute the default PCoA, since mds was overwritten by the scikit-learn MDS cell
mds = pcoa(pc_dists)
# First coordinate from R (MDS1), PCoA (PC1) and PHATE, plotted against window order
px.scatter(y=[mds_coords['MDS1'], mds.samples["PC1"], comparison_phate[:,0]], title="Green is PHATE")
# https://github.com/KrishnaswamyLab/PHATE
# Moon, van Dijk, Wang, Gigante et al. Visualizing Transitions and Structure for Biological Data Exploration. 2019. Nature Biotechnology.

In [19]:
# UMAP is stochastic; fixing random_state makes the embedding (and any clustering derived
# from it) reproducible across runs.
# NOTE(review): pc_dists is handed to UMAP with its default metric, so each row of the
# distance matrix is treated as a feature vector. The MDS and PHATE cells pass the matrix
# as precomputed distances — confirm whether metric="precomputed" is intended here.
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(pc_dists)
px.scatter(x=embedding[:, 0], y=embedding[:, 1])
# UMAP: https://umap-learn.readthedocs.io/en/latest/index.html
# McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction, ArXiv e-prints 1802.03426, 2018

In [20]:
# Cluster the UMAP embedding with HDBSCAN (default parameters) and colour each point by its label
hdbscan_labels = hdbscan.HDBSCAN().fit_predict(embedding)
px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=hdbscan_labels)
# hdbscan: https://hdbscan.readthedocs.io/en/latest/index.html

In [21]:
# Three-component UMAP embedding, clustered with HDBSCAN and shown as a 3-D scatter.
# UMAP is stochastic; fixing random_state keeps the embedding and the cluster labels
# reproducible across runs.
reducer = umap.UMAP(n_components=3, random_state=42)
embedding = reducer.fit_transform(pc_dists)
hdbscan_labels = hdbscan.HDBSCAN().fit_predict(embedding)
fig = px.scatter_3d(x=embedding[:, 0], y=embedding[:, 1], z=embedding[:, 2], color=hdbscan_labels, width=800, height=600)
fig.show()

### Future code adding in weights. Please ignore this.
If weights are urgently needed, please open a GitHub issue.

In [None]:
# Code originally used to generate and save random weights (one per sample). The weights are
# saved and static now so the same values can be fed into R and compared with lostruct R.
# Every line below is commented out, so running this cell defines nothing.
#weights = np.random.random_sample(len(samples))
#np.savetxt("random_weights.txt", weights, delimiter="\t")
#weights = np.loadtxt("random_weights.txt", delimiter="\t")
#weights

In [None]:
# Load the static per-sample weights from random_weights.txt. Without this the cell fails
# on a fresh kernel, because nothing else in the notebook defines `weights`.
weights = np.loadtxt("random_weights.txt", delimiter="\t")

# Same pipeline as the unweighted single-chromosome run, but passing the weights to eigen_windows
result = np.vstack([ls.eigen_windows(x, 10, weights) for x in windows])
pc_dists = ls.get_pc_dists(result)
mds = pcoa(pc_dists)
mds_likelostructr = mds

# Compare the weighted Python result with the unweighted R coordinates
mds_coords = pd.read_csv("lostruct-results/mds_coords.csv")
print("Weights compared to unweighted Lostruct R:")
print(np.corrcoef(mds.samples['PC1'], mds_coords['MDS1'].to_numpy()))