In [6]:
import itertools

import ipyrad.analysis as ipa
import pandas as pd
import toyplot

In [7]:
# Path to the SNP database (HDF5) that is passed as `data` to ipa.structure.
data = "hoploreference_outfiles/hoploreference.snps.hdf5"

In [8]:
# Load the per-sample metadata, indexed by sequence ID, and add a placeholder
# row for the reference sample (one value per metadata column).
metadata = pd.read_csv("Hoplo_meta_data_2.csv", index_col="Seq ID")
metadata.loc["reference"] = ['ref', 'ref', 'ref', 'ref']

# Every sample left out of the analysis, removed in a single drop call.
# The three groups below are the three batches that used to be dropped
# one after another; the reference row is excluded as well.
excluded_samples = [
    'HOOC0024', 'HOOC0039', 'HOOC0047',
    'HOOC0084', 'HOOC0085', 'HOOC0086', 'HOOC0087',
    'HOOC0088', 'HOOC0089', 'HOOC0090', 'HOOC0091', 'reference',
    'HOOC0005', 'HOOC0006', 'HOOC0014', 'HOOC0021', 'HOOC0022',
    'HOOC0029', 'HOOC0023', 'HOOC0030', 'HOOC0038', 'HOOC0046',
    'HOOC0053', 'HOOC0054', 'HOOC0057', 'HOOC0062', 'HOOC0093',
]
metadata = metadata.drop(excluded_samples)

# imap: vegetation zone -> list of the sample IDs that belong to it
imap = {
    zone: members.tolist()
    for zone, members in metadata.groupby("Vegetation zone").groups.items()
}

# minmap: require that 50% of samples have data in each group
minmap = dict.fromkeys(imap, 0.5)

# show the grouping so it can be checked by eye
imap


{'Derived savanna': ['HOOC0059',
  'HOOC0060',
  'HOOC0061',
  'HOOC0063',
  'HOOC0064',
  'HOOC0065',
  'HOOC0066',
  'HOOC0067',
  'HOOC0068',
  'HOOC0069',
  'HOOC0070',
  'HOOC0071',
  'HOOC0072',
  'HOOC0073',
  'HOOC0074',
  'HOOC0075',
  'HOOC0076',
  'HOOC0077',
  'HOOC0078',
  'HOOC0079',
  'HOOC0080',
  'HOOC0081',
  'HOOC0082',
  'HOOC0083'],
 'Giunea savanna': ['HOOC0092', 'HOOC0094', 'HOOC0095', 'HOOC0096'],
 'Mangrove Forest': ['HOOC0001',
  'HOOC0002',
  'HOOC0003',
  'HOOC0004',
  'HOOC0007',
  'HOOC0008',
  'HOOC0009',
  'HOOC0010',
  'HOOC0011',
  'HOOC0012',
  'HOOC0013',
  'HOOC0015',
  'HOOC0016',
  'HOOC0017',
  'HOOC0018',
  'HOOC0019',
  'HOOC0020',
  'HOOC0025',
  'HOOC0026',
  'HOOC0027',
  'HOOC0028'],
 'Rainforest': ['HOOC0031',
  'HOOC0032',
  'HOOC0033',
  'HOOC0034',
  'HOOC0035',
  'HOOC0036',
  'HOOC0037',
  'HOOC0040',
  'HOOC0041',
  'HOOC0042',
  'HOOC0043',
  'HOOC0044',
  'HOOC0045',
  'HOOC0048',
  'HOOC0049',
  'HOOC0050',
  'HOOC0051',
  'HOOC00

In [9]:
# init analysis object with input data and (optional) parameter options:
#   imap   - population name -> list of sample names (built from the metadata)
#   minmap - per-population minimum fraction of samples with data (0.5 each)
#   mincov - presumably the minimum fraction of all samples that must have
#            data at a site (0.9 here) -- TODO confirm against the ipyrad docs
struct = ipa.structure(
    name="hoplostructure",
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.9,
)

Samples: 70
Sites before filtering: 280267
Filtered (indels): 0
Filtered (bi-allel): 4945
Filtered (mincov): 181439
Filtered (minmap): 116247
Filtered (subsample invariant): 41104
Filtered (minor allele frequency): 0
Filtered (combined): 168451
Sites after filtering: 92113
Sites containing missing values: 72259 (78.45%)
Missing values in SNP matrix: 247950 (3.85%)
SNPs (total): 92113
SNPs (unlinked): 26574


In [10]:
# Override two entries of the main-parameter set on the analysis object.
# NOTE(review): presumably burnin is the number of discarded MCMC steps and
# numreps the number of steps kept after burn-in -- confirm against the
# STRUCTURE documentation.
struct.mainparams.burnin = 5000
struct.mainparams.numreps = 10000

In [12]:
# Set the 'cores' entry of the ipcluster settings to 10, then display the
# full settings dict to check the value.
struct.ipcluster['cores']=10
struct.ipcluster

{'cluster_id': '',
 'profile': 'default',
 'engines': 'Local',
 'quiet': 0,
 'timeout': 60,
 'cores': 10,
 'threads': 2,
 'pids': {}}

In [13]:
# Run 3 replicates for each K in 2..5 (12 STRUCTURE jobs in total; the
# recorded run took about 1h18m).
# NOTE(review): auto=True presumably lets ipyrad start and stop the parallel
# cluster itself -- confirm against the ipyrad docs.
struct.run(nreps=3, kpop=[2, 3, 4, 5], auto=True)

[####################] 100% 1:18:28 | running 12 structure jobs 


In [14]:
# Build the Evanno table for the same K values that were passed to
# struct.run, and display it.
etable = struct.get_evanno_table([2, 3, 4, 5])
etable

  tab.loc[kpop, "lnPK"] = tab.loc[kpop, "estLnProbMean"] \
  tab.loc[kpop, "lnPPK"] = abs(tab.loc[kpop+1, "lnPK"]
  tab.loc[kpop, "deltaK"] = (abs(


Unnamed: 0,Nreps,lnPK,lnPPK,deltaK,estLnProbMean,estLnProbStdev
2,3,0.0,0.0,0.0,-563500.0,3149.0
3,3,-13060.0,20820000.0,1781.835,-576500.0,11680.0
4,3,-20830000.0,32200000.0,4.949,-21410000.0,6507000.0
5,3,11370000.0,0.0,0.0,-10040000.0,12350000.0


In [15]:
# Evanno plot: sign-flipped estLnProbMean and deltaK drawn against K,
# each on its own y axis, sharing one x axis.
canvas = toyplot.Canvas(width=400, height=300)

# first y axis, dark red: the mean log probability multiplied by -1
axes = canvas.cartesian(ylabel="estLnProbMean")
axes.plot(-etable.estLnProbMean, color="darkred", marker="o")
axes.y.spine.style = {"stroke": "darkred"}

# second y axis, steel blue: deltaK, with 25% headroom above its largest value
deltak_peak = etable.deltaK.max()
axes = axes.share(
    "x",
    ylabel="deltaK",
    ymax=deltak_peak + deltak_peak * .25,
)
axes.plot(etable.deltaK, color="steelblue", marker="o")
axes.y.spine.style = {"stroke": "steelblue"}

# x ticks: one per row of the table, labelled with the table's index values
k_values = etable.index
axes.x.ticks.locator = toyplot.locator.Explicit(range(len(k_values)), k_values)
axes.x.label.text = "K (N ancestral populations)"

In [25]:
# Choose K = 3 (the K with the largest deltaK in the recorded Evanno table)
# and fetch the CLUMPP table for that K.
k = 3
table = struct.get_clumpp_table(k)

[K3] 3/3 results permuted across replicates (max_var=0).


  table = pd.read_csv(ofile, delim_whitespace=True, header=None)


In [26]:
# Order the rows of the CLUMPP table by population: samples appear in the
# order they are listed in `imap`, so members of the same group sit next to
# each other in the bar plot.
#
# The earlier in-place sort by the K columns was removed: this reindex
# replaced its row order straight away, so it had no effect on the final
# `table` and only mutated the frame in place.
# `itertools` is imported with the other imports at the top of the notebook.
onames = list(itertools.chain.from_iterable(imap.values()))
table = table.loc[onames]

In [28]:
# Bar plot of the values in `table`, one bar position per row.
canvas = toyplot.Canvas(width=1000, height=250)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# Label the x axis with the row labels of the table, rotated so that the
# names stay readable.
ticklabels = table.index.tolist()
x_ticks = axes.x.ticks
x_ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
x_ticks.labels.angle = -60
x_ticks.show = True
x_ticks.labels.offset = 10
x_ticks.labels.style = {"font-size": "12px"}