In [1]:
import ipyrad.analysis as ipa
import pandas as pd
import numpy as np


In [2]:
# Edited metadata file by hand to remove HOOC0024, HOOC0039, HOOC0047 (failed library prep)
metadata = pd.read_csv("Hoplo_metadata_FIXED.csv", index_col="Seq ID")
metadata= metadata.drop (['HOOC0024', 'HOOC0039', 'HOOC0047'])
metadata.index = [x+".trimmed" for x in metadata.index]
metadata

Unnamed: 0,Vegetation zone,longitude,Location,Sample ID,Lat,Long
HOOC0001.trimmed,Mangrove Forest,BAIK,Badagry,HBG 1,3.003,6.468
HOOC0002.trimmed,Mangrove Forest,BAIK,Ikorodu,HIK3,3.487,6.628
HOOC0003.trimmed,Mangrove Forest,EIBSI,Epe,HEP6,3.978,6.579
HOOC0004.trimmed,Rainforest,EIBSI,Ibadan,HIB 9,3.900,7.442
HOOC0005.trimmed,Rainforest,ADIF,Ifetedo,H IF 1,4.696,7.203
...,...,...,...,...,...,...
HOOC0092.trimmed,Rainforest,BAIK,Abeokuta,HAB10,3.326,7.157
HOOC0093.trimmed,Derived savanna,EIBSI,Iwo,HIW2,4.121,7.585
HOOC0094.trimmed,Derived savanna,ADIF,Ado,H AD 4,5.217,7.599
HOOC0095.trimmed,Derived savanna,EIBSI,Soku,HSK8,3.746,7.899


# Plot the data without modification

In [3]:
data = "/home/iovercast/hoplo_assembly/Hoplo-SE_outfiles/Hoplo-SE.snps.hdf5"
pca = ipa.pca(data, impute_method=None)
pca.run()
canvas, axes = pca.draw()
    

Samples: 93
Sites before filtering: 160046
Filtered (indels): 33425
Filtered (bi-allel): 25083
Filtered (mincov): 23268
Filtered (minmap): 0
Filtered (subsample invariant): 1332
Filtered (minor allele frequency): 0
Filtered (combined): 63185
Sites after filtering: 103723
Sites containing missing values: 103723 (100.00%)
Missing values in SNP matrix: 4885869 (50.65%)
SNPs (total): 103723
SNPs (unlinked): 24931
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 24931/103723


# Group and color by vegetation zone

In [5]:
imap = metadata.groupby("Vegetation zone").groups
pca = ipa.pca(data=data, imap=imap, impute_method='sample', mincov=0.5)
pca.run()
pca.draw(width=1000, height=600)

Samples: 93
Sites before filtering: 160046
Filtered (indels): 33425
Filtered (bi-allel): 25083
Filtered (mincov): 94801
Filtered (minmap): 32894
Filtered (subsample invariant): 1332
Filtered (minor allele frequency): 0
Filtered (combined): 112112
Sites after filtering: 54796
Sites containing missing values: 54796 (100.00%)
Missing values in SNP matrix: 1648724 (32.35%)
SNPs (total): 54796
SNPs (unlinked): 15530
Imputation: 'sampled'; (0, 1, 2) = 83.6%, 13.2%, 3.2%
Subsampling SNPs: 15530/54796


(<toyplot.canvas.Canvas at 0x7fec42563bc0>,
 <toyplot.coordinates.Cartesian at 0x7fec42580dd0>)

In [6]:
# Group and color by location

In [9]:
imap = metadata.groupby("Location").groups

pca = ipa.pca(data=data, imap=imap, impute_method='sample', mincov=0.7)
pca.run()
pca.draw(width=1000, height=600)

Samples: 93
Sites before filtering: 160046
Filtered (indels): 33425
Filtered (bi-allel): 25083
Filtered (mincov): 132121
Filtered (minmap): 159895
Filtered (subsample invariant): 1332
Filtered (minor allele frequency): 0
Filtered (combined): 166854
Sites after filtering: 54
Sites containing missing values: 54 (100.00%)
Missing values in SNP matrix: 771 (15.35%)
SNPs (total): 54
SNPs (unlinked): 6
Imputation: 'sampled'; (0, 1, 2) = 84.7%, 5.6%, 9.7%
Subsampling SNPs: 6/54


(<toyplot.canvas.Canvas at 0x7fec427e0410>,
 <toyplot.coordinates.Cartesian at 0x7fec425829c0>)

# Aggregate forest and savanna types into 2 broad categories

In [10]:
imap = metadata.groupby("Vegetation zone").groups
himap = {}
himap['forest'] = imap["Mangrove Forest"].tolist() + imap["Rainforest"].tolist()
himap['savanna'] = imap["Derived savanna"].tolist() + imap["Giunea savanna"].tolist()
pca = ipa.pca(data=data, imap=imap, impute_method='sample', mincov=0.7)
pca.run()
pca.draw(width=1000, height=600)

Samples: 93
Sites before filtering: 160046
Filtered (indels): 33425
Filtered (bi-allel): 25083
Filtered (mincov): 132121
Filtered (minmap): 32894
Filtered (subsample invariant): 1332
Filtered (minor allele frequency): 0
Filtered (combined): 142380
Sites after filtering: 24528
Sites containing missing values: 24528 (100.00%)
Missing values in SNP matrix: 544915 (23.89%)
SNPs (total): 24528
SNPs (unlinked): 8057
Imputation: 'sampled'; (0, 1, 2) = 83.4%, 13.4%, 3.2%
Subsampling SNPs: 8057/24528


(<toyplot.canvas.Canvas at 0x7fec424c36b0>,
 <toyplot.coordinates.Cartesian at 0x7fec426927b0>)

# Remove bad samples and replot by vegetation zone

In [11]:
# Remove just the most distant outlier and the reference
# clean_metadata = metadata.drop(['HOOC0085', 'reference'])
# Remove all the samples we carried over from the previous sequencing run
clean_metadata = metadata.drop(['HOOC0084', 'HOOC0085', 'HOOC0086', 'HOOC0087', 
                                'HOOC0088', 'HOOC0089', 'HOOC0090', 'HOOC0091', 'reference'])
imap = clean_metadata.groupby("Vegetation zone").groups
imap = {x:y.tolist() for x,y in imap.items()}
pca = ipa.pca(data=data, imap=imap, impute_method=None, mincov=0.9)
pca.run()
pca.draw(width=1000, height=600)

KeyError: "['HOOC0084', 'HOOC0085', 'HOOC0086', 'HOOC0087', 'HOOC0088', 'HOOC0089', 'HOOC0090', 'HOOC0091', 'reference'] not found in axis"

# Remove bad samples and replot by forest/savanna

In [12]:
clean_metadata = metadata.drop(['HOOC0084', 'HOOC0085', 'HOOC0086', 'HOOC0087', 
                                'HOOC0088', 'HOOC0089', 'HOOC0090', 'HOOC0091', 'reference'])
imap = clean_metadata.groupby("Vegetation zone").groups
himap = {}
himap['forest'] = imap["Mangrove Forest"].tolist() + imap["Rainforest"].tolist()
himap['savanna'] = imap["Derived savanna"].tolist() + imap["Giunea savanna"].tolist()

pca = ipa.pca(data=data, imap=himap, impute_method='sample', mincov=0.9)
pca.run()
pca.draw(width=1000, height=600)

KeyError: "['HOOC0084', 'HOOC0085', 'HOOC0086', 'HOOC0087', 'HOOC0088', 'HOOC0089', 'HOOC0090', 'HOOC0091', 'reference'] not found in axis"

# Look at the samples with most missing data, and remove samples with excessive missingness

In [13]:
display(pca.missing.sort_values(by="missing").tail(30))

Unnamed: 0,missing
HOOC0020.trimmed,0.08
HOOC0083.trimmed,0.09
HOOC0015.trimmed,0.09
HOOC0073.trimmed,0.1
HOOC0034.trimmed,0.12
HOOC0040.trimmed,0.15
HOOC0057.trimmed,0.2
HOOC0056.trimmed,0.2
HOOC0089.trimmed,0.36
HOOC0091.trimmed,0.38


In [14]:
missing_cutoff = 0.1
miss_samples = pca.missing[pca.missing["missing"]  > missing_cutoff].index.tolist()
print(miss_samples)

['HOOC0005.trimmed', 'HOOC0006.trimmed', 'HOOC0014.trimmed', 'HOOC0021.trimmed', 'HOOC0022.trimmed', 'HOOC0023.trimmed', 'HOOC0029.trimmed', 'HOOC0030.trimmed', 'HOOC0034.trimmed', 'HOOC0038.trimmed', 'HOOC0040.trimmed', 'HOOC0046.trimmed', 'HOOC0053.trimmed', 'HOOC0054.trimmed', 'HOOC0056.trimmed', 'HOOC0057.trimmed', 'HOOC0062.trimmed', 'HOOC0084.trimmed', 'HOOC0085.trimmed', 'HOOC0086.trimmed', 'HOOC0087.trimmed', 'HOOC0088.trimmed', 'HOOC0089.trimmed', 'HOOC0090.trimmed', 'HOOC0091.trimmed', 'HOOC0093.trimmed']


In [15]:
#cleametadata = metadata.drop(['HOOC0084', 'HOOC0085', 'HOOC0086', 'HOOC0087', 
#                                'HOOC0088', 'HOOC0089', 'HOOC0090', 'HOOC0091', 'reference'])
metadata = metadata.drop(miss_samples)

imap = metadata.groupby("Vegetation zone").groups
himap = {}
himap['forest'] = imap["Mangrove Forest"].tolist() + imap["Rainforest"].tolist()
himap['savanna'] = imap["Derived savanna"].tolist() + imap["Giunea savanna"].tolist()

pca = ipa.pca(data=data, imap=himap, impute_method='sample', mincov=0.5, quiet=False)
pca.run()
pca.draw(width=800, height=400)

Samples: 67
Sites before filtering: 160046
Filtered (indels): 31418
Filtered (bi-allel): 22981
Filtered (mincov): 77081
Filtered (minmap): 12854
Filtered (subsample invariant): 14429
Filtered (minor allele frequency): 0
Filtered (combined): 93890
Sites after filtering: 64976
Sites containing missing values: 63146 (97.18%)
Missing values in SNP matrix: 885649 (20.34%)
SNPs (total): 64976
SNPs (unlinked): 18032
Imputation: 'sampled'; (0, 1, 2) = 82.5%, 14.5%, 2.9%
Subsampling SNPs: 18032/64976


(<toyplot.canvas.Canvas at 0x7fec42934b60>,
 <toyplot.coordinates.Cartesian at 0x7fec426937d0>)

In [16]:
imap = metadata.groupby("Vegetation zone").groups
imap = {x:y.tolist() for x,y in imap.items()}
pca = ipa.pca(data=data, imap=imap, impute_method='sample', mincov=0.5)
pca.run(nreplicates=1)
pca.draw(width=800, height=400)

Samples: 67
Sites before filtering: 160046
Filtered (indels): 31418
Filtered (bi-allel): 22981
Filtered (mincov): 77081
Filtered (minmap): 35213
Filtered (subsample invariant): 14429
Filtered (minor allele frequency): 0
Filtered (combined): 94052
Sites after filtering: 64814
Sites containing missing values: 62984 (97.18%)
Missing values in SNP matrix: 880970 (20.29%)
SNPs (total): 64814
SNPs (unlinked): 17993
Imputation: 'sampled'; (0, 1, 2) = 82.7%, 13.8%, 3.5%
Subsampling SNPs: 17993/64814


(<toyplot.canvas.Canvas at 0x7fec42e27ef0>,
 <toyplot.coordinates.Cartesian at 0x7fec428afe00>)

In [17]:
imap = metadata.groupby("Location").groups
imap = {x:y.tolist() for x,y in imap.items()}
pca = ipa.pca(data=data, imap=imap, impute_method='sample', mincov=0.5)
pca.run(nreplicates=1)
pca.draw(width=800, height=400)

Samples: 67
Sites before filtering: 160046
Filtered (indels): 31418
Filtered (bi-allel): 22981
Filtered (mincov): 77081
Filtered (minmap): 64000
Filtered (subsample invariant): 14429
Filtered (minor allele frequency): 0
Filtered (combined): 95691
Sites after filtering: 63175
Sites containing missing values: 61345 (97.10%)
Missing values in SNP matrix: 836926 (19.77%)
SNPs (total): 63175
SNPs (unlinked): 17529
Imputation: 'sampled'; (0, 1, 2) = 83.3%, 11.7%, 5.1%
Subsampling SNPs: 17529/63175


(<toyplot.canvas.Canvas at 0x7fec426a1250>,
 <toyplot.coordinates.Cartesian at 0x7fec3a3ec0e0>)

In [18]:
imap = metadata.groupby("longitude").groups
imap = {x:y.tolist() for x,y in imap.items()}
pca = ipa.pca(data=data, imap=imap, impute_method='sample', mincov=0.7)
pca.run(nreplicates=1)
pca.draw(width=1000, height=600)

Samples: 67
Sites before filtering: 160046
Filtered (indels): 31418
Filtered (bi-allel): 22981
Filtered (mincov): 103167
Filtered (minmap): 19860
Filtered (subsample invariant): 14429
Filtered (minor allele frequency): 0
Filtered (combined): 112208
Sites after filtering: 46658
Sites containing missing values: 44828 (96.08%)
Missing values in SNP matrix: 395543 (12.65%)
SNPs (total): 46658
SNPs (unlinked): 13993
Imputation: 'sampled'; (0, 1, 2) = 83.2%, 13.7%, 3.0%
Subsampling SNPs: 13993/46658


(<toyplot.canvas.Canvas at 0x7fec427b38f0>,
 <toyplot.coordinates.Cartesian at 0x7fec427b0da0>)

# Try TSNE as an alternate clustering idea

In [144]:
pca.run_tsne(perplexity=10)
pca.draw()

Subsampling SNPs: 26642/92455




(<toyplot.canvas.Canvas at 0x193a9da50>,
 <toyplot.coordinates.Cartesian at 0x1930acd90>)

# Debug and not useful below here

In [79]:
imap = {"pop1":['1A_0', '1B_0', '1C_0'],
        "pop2":['2E_0', '2F_0', '2G_0', '2H_0'],
        "pop3":['3I_0', '3J_0', '3K_0', '3L_0']}
sdat = "/tmp/ipyrad-test/se_outfiles/se.snps.hdf5"
spca = ipa.pca(data=sdat, imap=imap)
spca.run()
print(spca.names)
spca.draw()

Samples: 11
Sites before filtering: 4310
Filtered (indels): 9
Filtered (bi-allel): 80
Filtered (mincov): 0
Filtered (minmap): 1
Filtered (subsample invariant): 344
Filtered (minor allele frequency): 0
Filtered (combined): 432
Sites after filtering: 3878
Sites containing missing values: 4 (0.10%)
Missing values in SNP matrix: 4 (0.01%)
SNPs (total): 3878
SNPs (unlinked): 980
Imputation (null; sets to 0): 100.0%, 0.0%, 0.0%
Subsampling SNPs: 980/3878
['1A_0', '1B_0', '1C_0', '2E_0', '2F_0', '2G_0', '2H_0', '3I_0', '3J_0', '3K_0', '3L_0']


(<toyplot.canvas.Canvas at 0x19316a950>,
 <toyplot.coordinates.Cartesian at 0x1932ff410>)