In [12]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import ipyrad.analysis as ipa
import numpy as np
import pandas as pd
import toytree


data1 = "./vcfs/Lapa_GC_Final-CLUST85_SNP40_MIN80.vcf"
data2 = "./vcfs/Lapa_GC_Final-CLUST93_SNP5_MIN90.vcf"
metadata = "sequence_summary.csv"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Extract sample names and group from metadata

In [2]:
df = pd.read_csv(metadata)
gps = set(df["Group"])
imap = {}
for gp in gps:
    imap[gp] = df[df["Group"] == gp]["Sample"].values
imap
print(len(df))

48


In [3]:
pca = ipa.pca(
    data=data1,
    imap=imap,
#    minmap=minmap,
    mincov=0.75,
    impute_method="Sample",
    quiet=False,
)
pca.run(nreplicates=25)
pca.draw(height=600, width=800)


Converting vcf to HDF5 using default ld_block_size: 20000
Typical RADSeq data generated by ipyrad/stacks will ignore this value.
You can use the ld_block_size parameter of the PCA() constructor to change
this value.

Indexing VCF to HDF5 database file
hdf5 file exists. Use `force=True` to overwrite.
> /home/isaac/ipyrad/ipyrad/ipyrad/analysis/snps_extracter.py(84)parse_names_from_hdf5()
-> self.dbnames = [i.decode() for i in io5["snps"].attrs["names"]]
(Pdb) c
Samples: 48
Sites before filtering: 60505
Filtered (indels): 0
Filtered (bi-allel): 6869
Filtered (mincov): 690
Filtered (minmap): 11144
Filtered (combined): 16873
Sites after filtering: 43632
Sites containing missing values: 39100 (89.61%)
Missing values in SNP matrix: 220454 (10.53%)
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 3917/43632


(<toyplot.canvas.Canvas at 0x7ff9c9a31d50>,
 <toyplot.coordinates.Cartesian at 0x7ff956d84850>,
 <toyplot.mark.Point at 0x7ff9c9a24b10>)

# High clust, high min_sample

In [4]:
pca = ipa.pca(
    data=data2,
    imap=imap,
    #minmap=minmap,
    #mincov=0.75,
    impute_method="sample",
    ld_block_size=200
)
pca.run(nreplicates=25)
pca.draw(height=600, width=800)

Indexing VCF to HDF5 database file
hdf5 file exists. Use `force=True` to overwrite.
> /home/isaac/ipyrad/ipyrad/ipyrad/analysis/snps_extracter.py(84)parse_names_from_hdf5()
-> self.dbnames = [i.decode() for i in io5["snps"].attrs["names"]]
(Pdb) c
Samples: 48
Sites before filtering: 1679
Filtered (indels): 0
Filtered (bi-allel): 62
Filtered (mincov): 3
Filtered (minmap): 47
Filtered (combined): 108
Sites after filtering: 1571
Sites containing missing values: 1182 (75.24%)
Missing values in SNP matrix: 3387 (4.49%)
Imputation: 'sampled'; (0, 1, 2) = 93.0%, 5.0%, 2.0%
Subsampling SNPs: 473/1571


(<toyplot.canvas.Canvas at 0x7ff956c5cf10>,
 <toyplot.coordinates.Cartesian at 0x7ff956c5cb90>,
 <toyplot.mark.Point at 0x7ff9c95de750>)

## Reassembly

In [8]:
# hdf5 converts * to _ in sample names internally
# 'L337Laptir_C19' <- Not in the raw data
imap3 = {'G4': ['L060Lapobs_C08', 'L064Lapobs_C04', 'L066Lapobs_C08',
        'L078Lapobs_C01', 'L100Lapobs_C11', 'L102Lapobs_C05',
        'L063Lapobs_C04', 'L069Lapobs_C10', 'L070Lapobs_C10',
        'L081Laptir_C09', 'L082Laptir_C09_REX', 'L083Laptir_C13',
        'L084Laptir_C13', 'L115Lapobs_C05', 'L306Laptir_C09',
        'L307Laptir_C09', 'L327Lapobs_C11'],
 'G5': ['L074Lapmic_C06', 'L073Lapmic_C06', 'L129_Lapaft_C14',
        'L130_Lapaft_C14', 'L318Lapmic_C07', 'L319Lapmic_C07',
        'L333Laptir_C15', 'L335Laptir_C15'],
 'G3': ['L087Laptir_C03', 'L088Laptir_C03', 'L123_Lapaft_C02',
        'L309Laptir_C12_PUR', 'L310Laptir_C12'],
 'G6': ['L092Laptir_C20', 'L057Laptir_C17', 'L067Laptir_C17_REX',
        'L091Laptir_C20', 'L110_Laptir_C21', 'L167Laptir_C18_REC',
        'L321Laptir_C16', 'L322Laptir_C16', 'L332Laptir_C18_REX',
        'L336Laptir_C19'],
 'G2': ['L061Laposo_C04'],
 'G1': ['L062Laposo_C04', 'L135_Laposo_C01', 'L136_Laposo_C10',
        'L137_Laposo_C10', 'L138_Laposo_C05', 'L139_Laposo_C05'],
       }

In [16]:
data3 = "victor_outfiles/victor.snps.hdf5"
pca = ipa.pca(
    data=data3,
    imap=imap3,
    #minmap=minmap,
    #mincov=0.75,
    impute_method="sample",
    ld_block_size=200
)
pca.run(nreplicates=2)
pca.draw(height=600, width=800)

pca.run_tsne(subsample=True, perplexity=4.0, n_iter=100000, seed=123)
pca.draw(height=600, width=800)

Samples: 47
Sites before filtering: 943964
Filtered (indels): 60489
Filtered (bi-allel): 41625
Filtered (mincov): 103912
Filtered (minmap): 829481
Filtered (combined): 842387
Sites after filtering: 101577
Sites containing missing values: 95151 (93.67%)
Missing values in SNP matrix: 1691614 (35.43%)
Imputation: 'sampled'; (0, 1, 2) = 85.9%, 7.7%, 6.4%
Subsampling SNPs: 13371/101577
Subsampling SNPs: 13371/101577


(<toyplot.canvas.Canvas at 0x7ff95231d510>,
 <toyplot.coordinates.Cartesian at 0x7ff95231d590>,
 <toyplot.mark.Point at 0x7ff951fc3810>)

In [21]:
for p in range(1, 10, 2):
    pca.run_tsne(subsample=True, perplexity=p, n_iter=100000, seed=123)
    pca.draw(height=600, width=800)

Subsampling SNPs: 13371/101577
Subsampling SNPs: 13371/101577
Subsampling SNPs: 13371/101577
Subsampling SNPs: 13371/101577
Subsampling SNPs: 13371/101577


In [10]:
data3 = "victor_outfiles/victor.snps.hdf5"
for minc in [0.25, 0.5, 0.9]:
    pca = ipa.pca(
        data=data3,
        imap=imap3,
        #minmap=minmap,
        mincov=minc,
        impute_method="sample",
        ld_block_size=200
    )
    pca.run(nreplicates=2)
    pca.draw(height=600, width=800)

Samples: 47
Sites before filtering: 943964
Filtered (indels): 60489
Filtered (bi-allel): 41625
Filtered (mincov): 520645
Filtered (minmap): 829481
Filtered (combined): 845003
Sites after filtering: 98961
Sites containing missing values: 92535 (93.51%)
Missing values in SNP matrix: 1594398 (34.28%)
Imputation: 'sampled'; (0, 1, 2) = 86.3%, 7.8%, 5.9%
Subsampling SNPs: 12957/98961
Samples: 47
Sites before filtering: 943964
Filtered (indels): 60489
Filtered (bi-allel): 41625
Filtered (mincov): 799000
Filtered (minmap): 829481
Filtered (combined): 875959
Sites after filtering: 68005
Sites containing missing values: 61579 (90.55%)
Missing values in SNP matrix: 701742 (21.96%)
Imputation: 'sampled'; (0, 1, 2) = 88.7%, 7.5%, 3.8%
Subsampling SNPs: 8812/68005
Samples: 47
Sites before filtering: 943964
Filtered (indels): 60489
Filtered (bi-allel): 41625
Filtered (mincov): 919062
Filtered (minmap): 829481
Filtered (combined): 922438
Sites after filtering: 21526
Sites containing missing values: 1

In [11]:
data3 = "victor_outfiles/victor.snps.hdf5"
for minc in [0.25, 0.5, 0.9]:
    pca = ipa.pca(
        data=data3,
        imap=imap3,
        #minmap=minmap,
        mincov=minc,
        impute_method="None",
        ld_block_size=200
    )
    pca.run(nreplicates=2)
    pca.draw(height=600, width=800)

Samples: 47
Sites before filtering: 943964
Filtered (indels): 60489
Filtered (bi-allel): 41625
Filtered (mincov): 520645
Filtered (minmap): 829481
Filtered (combined): 845003
Sites after filtering: 98961
Sites containing missing values: 92535 (93.51%)
Missing values in SNP matrix: 1594398 (34.28%)
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 12957/98961
Samples: 47
Sites before filtering: 943964
Filtered (indels): 60489
Filtered (bi-allel): 41625
Filtered (mincov): 799000
Filtered (minmap): 829481
Filtered (combined): 875959
Sites after filtering: 68005
Sites containing missing values: 61579 (90.55%)
Missing values in SNP matrix: 701742 (21.96%)
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 8812/68005
Samples: 47
Sites before filtering: 943964
Filtered (indels): 60489
Filtered (bi-allel): 41625
Filtered (mincov): 919062
Filtered (minmap): 829481
Filtered (combined): 922438
Sites after filtering: 21526
Sites containing missing values: 15100 (70

# Look at trees

In [22]:
phyfile = "./victor_outfiles/victor.phy"
# init raxml object with input data and (optional) parameter options
rax = ipa.raxml(data=phyfile, T=40, N=10)

# print the raxml command string for prosperity
print(rax.command)

# run the command, (options: block until finishes; overwrite existing)
rax.run(block=True, force=True)

/home/isaac/miniconda3/envs/ipyrad_py37/bin/raxmlHPC-PTHREADS-AVX2 -f a -T 40 -m GTRGAMMA -n test -w /media/4TB/isaac/ipyrad-test/victor/analysis-raxml -s /media/4TB/isaac/ipyrad-test/victor/victor_outfiles/victor.phy -p 54321 -N 10 -x 12345
job test finished successfully


In [26]:
tre = toytree.tree(rax.trees.bipartitions)

# draw the tree
tre = tre.root(wildcard="L061Laposo_C04")
tre.draw(tip_labels_align=True, node_labels="support");