In [1]:
import ipyrad.analysis as ipa
import pandas as pd
import numpy as np


In [6]:
# Load the per-sample metadata table, indexed by sequencing ID.
# NOTE(review): the earlier note on this cell said the CSV had been edited by
# hand to remove HOOC0024, HOOC0039 and HOOC0047 (failed library prep), yet the
# same IDs are dropped again below. errors="ignore" lets the drop succeed
# whether or not those rows are still present in the file.
metadata = pd.read_csv("Hoplo_meta_data_2.csv", index_col="Seq ID")
# Add a placeholder row labelled "reference" holding 'ref' in every metadata
# column; sized from the frame so it keeps working if columns are added.
metadata.loc["reference"] = ["ref"] * metadata.shape[1]
metadata = metadata.drop(["HOOC0024", "HOOC0039", "HOOC0047"], errors="ignore")
metadata

Unnamed: 0_level_0,Sample ID,Location,Vegetation zone,longitude
Seq ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HOOC0001,HBG 1,Badagry,Mangrove Forest,BAIK
HOOC0002,HBG 2,Badagry,Mangrove Forest,BAIK
HOOC0003,HBG3,Badagry,Mangrove Forest,BAIK
HOOC0004,HBG4,Badagry,Mangrove Forest,BAIK
HOOC0005,HBG5,Badagry,Mangrove Forest,BAIK
...,...,...,...,...
HOOC0093,HIG7,Igbeti,Giunea savanna,EIBSI
HOOC0094,HIG8,Igbeti,Giunea savanna,EIBSI
HOOC0095,HIG9,Igbeti,Giunea savanna,EIBSI
HOOC0096,HIG10,Igbeti,Giunea savanna,EIBSI


# Plot the data without modification

In [4]:
# Baseline PCA straight from the SNP database: no population map, no imputation.
data = "./hoploreference_outfiles/hoploreference.snps.hdf5"
pca = ipa.pca(
    data=data,
    impute_method=None,
)
pca.run()
# draw() hands back the canvas and axes objects; bind them instead of echoing them.
canvas, axes = pca.draw()


Samples: 94
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 5412
Filtered (mincov): 33436
Filtered (minmap): 0
Filtered (subsample invariant): 0
Filtered (minor allele frequency): 0
Filtered (combined): 38415
Sites after filtering: 242095
Sites containing missing values: 242064 (99.99%)
Missing values in SNP matrix: 9615592 (42.25%)
SNPs (total): 242095
SNPs (unlinked): 51011
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 51011/242095


# Group and color by vegetation zone

In [7]:
# One PCA group per vegetation zone. `.groups` maps each zone to a pandas Index
# of sample IDs; convert those to plain lists, as the later cells of this
# notebook already do before handing an imap to ipa.pca.
imap = {
    zone: samples.tolist()
    for zone, samples in metadata.groupby("Vegetation zone").groups.items()
}
# No imputation; mincov=0.9 drives the site-coverage filter that is reported
# as "Filtered (mincov)" in the run summary.
pca = ipa.pca(data=data, imap=imap, impute_method=None, mincov=0.9)
pca.run()
pca.draw(width=1000, height=600)

Samples: 94
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 5412
Filtered (mincov): 276475
Filtered (minmap): 44301
Filtered (subsample invariant): 0
Filtered (minor allele frequency): 0
Filtered (combined): 276744
Sites after filtering: 3766
Sites containing missing values: 3735 (99.18%)
Missing values in SNP matrix: 24902 (7.03%)
SNPs (total): 3766
SNPs (unlinked): 1395
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 1395/3766


(<toyplot.canvas.Canvas at 0x7fe8f68ad430>,
 <toyplot.coordinates.Cartesian at 0x7fe8f5054bf0>)

In [None]:
# Group and color by location

In [8]:
# One PCA group per sampling location. `.groups` maps each location to a pandas
# Index of sample IDs; convert those to plain lists, as the later cells of this
# notebook already do before handing an imap to ipa.pca.
imap = {
    location: samples.tolist()
    for location, samples in metadata.groupby("Location").groups.items()
}
# All other ipa.pca options are left at their defaults here.
pca = ipa.pca(data=data, imap=imap)
pca.run()
pca.draw(width=1000, height=600)

Samples: 94
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 5412
Filtered (mincov): 33436
Filtered (minmap): 79653
Filtered (subsample invariant): 0
Filtered (minor allele frequency): 0
Filtered (combined): 83305
Sites after filtering: 197205
Sites containing missing values: 197174 (99.98%)
Missing values in SNP matrix: 6187048 (33.38%)
SNPs (total): 197205
SNPs (unlinked): 41318
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 41318/197205


(<toyplot.canvas.Canvas at 0x7fe8f566eea0>,
 <toyplot.coordinates.Cartesian at 0x7fe8f4effa70>)

# Aggregate forest and savanna types into 2 broad categories

In [9]:
# Collapse the four vegetation zones into two broad categories.
# "Giunea savanna" is spelled exactly as it appears in the metadata table.
# The 'ref' group is not assigned to either category.
imap = metadata.groupby("Vegetation zone").groups
himap = {
    category: [sample for zone in zones for sample in imap[zone].tolist()]
    for category, zones in (
        ("forest", ("Mangrove Forest", "Rainforest")),
        ("savanna", ("Derived savanna", "Giunea savanna")),
    )
}

pca = ipa.pca(
    data=data,
    imap=himap,
)
pca.run()
pca.draw(width=1000, height=600)

Samples: 93
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 5412
Filtered (mincov): 37090
Filtered (minmap): 13623
Filtered (subsample invariant): 3084
Filtered (minor allele frequency): 0
Filtered (combined): 42296
Sites after filtering: 238503
Sites containing missing values: 238472 (99.99%)
Missing values in SNP matrix: 9314029 (41.99%)
SNPs (total): 238503
SNPs (unlinked): 50377
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 50377/238503


(<toyplot.canvas.Canvas at 0x7fe8f3dab1d0>,
 <toyplot.coordinates.Cartesian at 0x7fe8f3da93a0>)

# Remove bad samples and replot by vegetation zone

In [10]:
# Remove just the most distant outlier and the reference
# clean_metadata = metadata.drop(['HOOC0085', 'reference'])
# Remove all the samples we carried over from the previous sequencing run
clean_metadata = metadata.drop(['HOOC0084', 'HOOC0085', 'HOOC0086', 'HOOC0087', 
                                'HOOC0088', 'HOOC0089', 'HOOC0090', 'HOOC0091', 'reference'])
imap = clean_metadata.groupby("Vegetation zone").groups
imap = {x:y.tolist() for x,y in imap.items()}
pca = ipa.pca(data=data, imap=imap, impute_method=None, mincov=0.9)
pca.run()
pca.draw(width=1000, height=600)

Samples: 85
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 5307
Filtered (mincov): 269208
Filtered (minmap): 47983
Filtered (subsample invariant): 8792
Filtered (minor allele frequency): 0
Filtered (combined): 270199
Sites after filtering: 10425
Sites containing missing values: 10232 (98.15%)
Missing values in SNP matrix: 63281 (7.14%)
SNPs (total): 10425
SNPs (unlinked): 3676
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 3676/10425


(<toyplot.canvas.Canvas at 0x7fe8f5055d30>,
 <toyplot.coordinates.Cartesian at 0x7fe8f3daa6f0>)

# Remove bad samples and replot by forest/savanna

In [11]:
# Same sample removal as the previous cell: previous-run samples plus the reference row.
carried_over = [
    'HOOC0084', 'HOOC0085', 'HOOC0086', 'HOOC0087',
    'HOOC0088', 'HOOC0089', 'HOOC0090', 'HOOC0091',
]
clean_metadata = metadata.drop(carried_over + ['reference'])

# Merge the four vegetation zones into forest vs. savanna.
# "Giunea savanna" is spelled exactly as it appears in the metadata table.
imap = clean_metadata.groupby("Vegetation zone").groups
himap = {
    category: [sample for zone in zones for sample in imap[zone].tolist()]
    for category, zones in (
        ("forest", ("Mangrove Forest", "Rainforest")),
        ("savanna", ("Derived savanna", "Giunea savanna")),
    )
}

# This run switches imputation on ('sample'), unlike the by-zone cell before it.
pca = ipa.pca(
    data=data,
    imap=himap,
    impute_method='sample',
    mincov=0.9,
)
pca.run()
pca.draw(width=1000, height=600)

Samples: 85
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 5307
Filtered (mincov): 269208
Filtered (minmap): 14712
Filtered (subsample invariant): 8792
Filtered (minor allele frequency): 0
Filtered (combined): 270198
Sites after filtering: 10426
Sites containing missing values: 10233 (98.15%)
Missing values in SNP matrix: 63289 (7.14%)
SNPs (total): 10426
SNPs (unlinked): 3677
Imputation: 'sampled'; (0, 1, 2) = 93.5%, 4.4%, 2.1%
Subsampling SNPs: 3677/10426


(<toyplot.canvas.Canvas at 0x7fe8f3dab4a0>,
 <toyplot.coordinates.Cartesian at 0x7fe8f3da9640>)

# Look at the samples with most missing data, and remove samples with excessive missingness

In [13]:
# Rank samples by their "missing" value (ascending) and show the 20 highest.
missing_ranked = pca.missing.sort_values(by="missing")
display(missing_ranked.tail(20))

Unnamed: 0,missing
HOOC0034,0.02
HOOC0031,0.02
HOOC0020,0.02
HOOC0096,0.03
HOOC0056,0.03
HOOC0023,0.04
HOOC0006,0.18
HOOC0062,0.18
HOOC0054,0.22
HOOC0057,0.31


In [14]:
# Collect the IDs of samples whose "missing" value exceeds the cutoff.
# NOTE(review): the recorded output of this cell lists 14 samples, while the
# missingness table recorded just above shows only four values over 0.1, so
# `pca` was presumably a different run when this cell was last executed.
# Confirm which PCA run is meant to define the cutoff.
missing_cutoff = 0.1
over_cutoff = pca.missing["missing"].gt(missing_cutoff)
miss_samples = pca.missing.loc[over_cutoff].index.tolist()
print(miss_samples)

['HOOC0005', 'HOOC0006', 'HOOC0014', 'HOOC0021', 'HOOC0022', 'HOOC0029', 'HOOC0030', 'HOOC0038', 'HOOC0046', 'HOOC0053', 'HOOC0054', 'HOOC0057', 'HOOC0062', 'HOOC0093']


In [22]:
# Start from the full metadata, remove the previous-run samples and the
# reference row, then remove the high-missingness samples found above.
# NOTE(review): HOOC0023 is dropped as well, with no reason recorded in this
# notebook; confirm why it is excluded.
carried_over = [
    'HOOC0084', 'HOOC0085', 'HOOC0086', 'HOOC0087',
    'HOOC0088', 'HOOC0089', 'HOOC0090', 'HOOC0091',
]
clean_metadata = (
    metadata
    .drop(carried_over + ['reference'])
    .drop(miss_samples + ['HOOC0023'])
)

# Merge the four vegetation zones into forest vs. savanna.
# "Giunea savanna" is spelled exactly as it appears in the metadata table.
imap = clean_metadata.groupby("Vegetation zone").groups
himap = {
    category: [sample for zone in zones for sample in imap[zone].tolist()]
    for category, zones in (
        ("forest", ("Mangrove Forest", "Rainforest")),
        ("savanna", ("Derived savanna", "Giunea savanna")),
    )
}

pca = ipa.pca(
    data=data,
    imap=himap,
    impute_method='sample',
    mincov=0.5,
    quiet=False,
)
pca.run()
pca.draw(width=800, height=400)

Samples: 70
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 4945
Filtered (mincov): 96842
Filtered (minmap): 32254
Filtered (subsample invariant): 41104
Filtered (minor allele frequency): 0
Filtered (combined): 87120
Sites after filtering: 173444
Sites containing missing values: 153590 (88.55%)
Missing values in SNP matrix: 1705039 (14.04%)
SNPs (total): 173444
SNPs (unlinked): 38148
Imputation: 'sampled'; (0, 1, 2) = 78.0%, 15.0%, 7.0%
Subsampling SNPs: 38148/173444


(<toyplot.canvas.Canvas at 0x7fe8f3e879e0>,
 <toyplot.coordinates.Cartesian at 0x7fe8f3e87a70>)

In [25]:
# Re-plot the filtered sample set with one group per vegetation zone.
# Relies on `clean_metadata` as left by the preceding cell.
zone_to_index = clean_metadata.groupby("Vegetation zone").groups
imap = {zone: index.tolist() for zone, index in zone_to_index.items()}
pca = ipa.pca(
    data=data,
    imap=imap,
    impute_method='sample',
    mincov=0.9,
)
pca.run(nreplicates=1)
pca.draw(width=800, height=400)

Samples: 70
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 4945
Filtered (mincov): 181439
Filtered (minmap): 66258
Filtered (subsample invariant): 41104
Filtered (minor allele frequency): 0
Filtered (combined): 168109
Sites after filtering: 92455
Sites containing missing values: 72601 (78.53%)
Missing values in SNP matrix: 249718 (3.86%)
SNPs (total): 92455
SNPs (unlinked): 26642
Imputation: 'sampled'; (0, 1, 2) = 84.6%, 11.2%, 4.2%
Subsampling SNPs: 26642/92455


(<toyplot.canvas.Canvas at 0x7fe8f3e877a0>,
 <toyplot.coordinates.Cartesian at 0x7fe8f3f31010>)

In [26]:
# Group the filtered sample set by the "longitude" metadata column.
# NOTE(review): in the metadata table this column holds codes such as BAIK and
# EIBSI rather than numeric coordinates; confirm what the codes stand for.
code_to_index = clean_metadata.groupby("longitude").groups
imap = {code: index.tolist() for code, index in code_to_index.items()}
pca = ipa.pca(
    data=data,
    imap=imap,
    impute_method='sample',
    mincov=0.6,
)
pca.run(nreplicates=1)
pca.draw(width=1000, height=600)

Samples: 70
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 4945
Filtered (mincov): 109280
Filtered (minmap): 42383
Filtered (subsample invariant): 41104
Filtered (minor allele frequency): 0
Filtered (combined): 98905
Sites after filtering: 161659
Sites containing missing values: 141805 (87.72%)
Missing values in SNP matrix: 1328448 (11.74%)
SNPs (total): 161659
SNPs (unlinked): 36491
Imputation: 'sampled'; (0, 1, 2) = 78.1%, 15.0%, 6.9%
Subsampling SNPs: 36491/161659


(<toyplot.canvas.Canvas at 0x7fe8f3e4d730>,
 <toyplot.coordinates.Cartesian at 0x7fe8f3f33e30>)

# Try t-SNE as an alternative embedding method

In [144]:
# t-SNE is a stochastic embedding, so pass an explicit seed to make the plot
# repeatable across reruns.
# NOTE(review): this reuses whichever `pca` object is current. The recorded
# output ("Subsampling SNPs: 26642/92455") matches the vegetation-zone run at
# mincov=0.9, not the run in the cell directly above, so the saved figure was
# presumably produced out of notebook order. Confirm the intended input.
pca.run_tsne(perplexity=10, seed=12345)
pca.draw()

Subsampling SNPs: 26642/92455




(<toyplot.canvas.Canvas at 0x193a9da50>,
 <toyplot.coordinates.Cartesian at 0x1930acd90>)

# Debugging scratch below here (not part of the analysis)

In [79]:
# Scratch check of ipa.pca against a separate test SNP database.
# Note that this rebinds `imap`, replacing the map built for the real data.
sdat = "/tmp/ipyrad-test/se_outfiles/se.snps.hdf5"
imap = {
    "pop1": ['1A_0', '1B_0', '1C_0'],
    "pop2": ['2E_0', '2F_0', '2G_0', '2H_0'],
    "pop3": ['3I_0', '3J_0', '3K_0', '3L_0'],
}
spca = ipa.pca(
    data=sdat,
    imap=imap,
)
spca.run()
# Print the sample names held by the PCA object.
print(spca.names)
spca.draw()

Samples: 11
Sites before filtering: 4310
Filtered (indels): 9
Filtered (bi-allel): 80
Filtered (mincov): 0
Filtered (minmap): 1
Filtered (subsample invariant): 344
Filtered (minor allele frequency): 0
Filtered (combined): 432
Sites after filtering: 3878
Sites containing missing values: 4 (0.10%)
Missing values in SNP matrix: 4 (0.01%)
SNPs (total): 3878
SNPs (unlinked): 980
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 980/3878
['1A_0', '1B_0', '1C_0', '2E_0', '2F_0', '2G_0', '2H_0', '3I_0', '3J_0', '3K_0', '3L_0']


(<toyplot.canvas.Canvas at 0x19316a950>,
 <toyplot.coordinates.Cartesian at 0x1932ff410>)