In [2]:
from functools import partial
import os

import numpy as np
import pandas as pd
import skbio
import seaborn as sn

from qiime2 import Artifact, Metadata, Visualization

import qiime2.plugins.alignment.actions as q2_alignment
import qiime2.plugins.dada2.actions as q2_dada2
import qiime2.plugins.demux.actions as q2_demux
import qiime2.plugins.feature_table.actions as q2_feature_table
import qiime2.plugins.phylogeny.actions as q2_phylogeny


%matplotlib inline

In [3]:
ls data/inputs/mock/

SRR11180057_1.fastq                  SRR2182220_1_batch_170000.fasta.map
SRR11180057_2.fastq                  SRR2182220_1_batch_180000.fasta
SRR2182220_1.fasta                   SRR2182220_1_batch_180000.fasta.map
SRR2182220_1.fastq                   SRR2182220_1_batch_20000.fasta
SRR2182220_1_batch_0.fasta           SRR2182220_1_batch_20000.fasta.map
SRR2182220_1_batch_0.fasta.map       SRR2182220_1_batch_30000.fasta
SRR2182220_1_batch_10000.fasta       SRR2182220_1_batch_30000.fasta.map
SRR2182220_1_batch_10000.fasta.map   SRR2182220_1_batch_40000.fasta
SRR2182220_1_batch_100000.fasta      SRR2182220_1_batch_40000.fasta.map
SRR2182220_1_batch_100000.fasta.map  SRR2182220_1_batch_50000.fasta
SRR2182220_1_batch_110000.fasta      SRR2182220_1_batch_50000.fasta.map
SRR2182220_1_batch_110000.fasta.map  SRR2182220_1_batch_60000.fasta
SRR2182220_1_batch_120000.fasta      SRR2182220_1_batch_60000.fasta.map
SRR2182220_1_batch_120000.fasta.map  SRR2182220_1_batch_70000.fasta
SRR2182220_1_batc

In [4]:
# seqs_q2 = Artifact.load('data/inputs/mock/sequences.qza')
# q2_demux.summarize(seqs_q2).visualization

In [5]:
def convert_fastq_2_fasta(fastq_filepath, fasta_filepath, phred_offset=33, 
                          chuck_size=10000):
    """
    A lightweight function to convert fastq files to fastas
    """
    seqs_with_qual = skbio.io.read(fastq_filepath, 
                                   format='fastq', 
                                   phred_offset=phred_offset,
                                   )
    for i, seq in enumerate(seqs_with_qual):
        if (i / chuck_size) == np.floor(i / chuck_size):
            new_fp = fasta_filepath.replace(".fasta", f'_batch_{i}.fasta')
            f_ = skbio.io.open(new_fp, 'w')
        seq.write(f_)

In [6]:
convert_fastq_2_fasta('data/inputs/mock/SRR2182220_1.fastq',
                      'data/inputs/mock/SRR2182220_1.fasta')

In [7]:
seqs_with_qual = skbio.io.read('data/inputs/mock/SRR2182220_1.fastq', 
                                   format='fastq', 
                                   phred_offset=33,
                                   )
seq_lengths = {
    seq.metadata['id']: len(seq) 
    for seq in seqs_with_qual
}
seq_lengths = pd.Series(seq_lengths)

In [8]:
seq_length_dist = np.floor(seq_lengths / 25).value_counts()
seq_length_dist.rename({i: i * 25 for i in seq_length_dist.index})

275.0    55347
200.0    51307
250.0    44601
175.0    13766
225.0     4230
150.0     2881
50.0      2688
25.0      2440
125.0     2395
100.0     2331
75.0      2054
0.0        670
300.0      526
325.0       57
350.0       43
450.0        1
375.0        1
dtype: int64

In [9]:
(seq_lengths < 150).mean()

0.06786519763890837

In [10]:
(seq_lengths < 200).mean()

0.1576848784383127

In [None]:
breaks = [

In [16]:
msa1 = skbio.TabularMSA.read('data/output/mock/build_alignment/gg_88_rep_seq_alignment_0.fasta', 
                      constructor=partial(skbio.DNA, lowercase=True),
                      )
msa2 = skbio.TabularMSA.read('data/output/mock/build_alignment/gg_88_rep_seq_alignment_10000.fasta', 
                      constructor=partial(skbio.DNA, lowercase=True),
                      )

In [17]:
msa_series = pd.Series({seq.metadata['id']: seq for seq in msa1}).astype(str)
aligned_seqs = msa_series.loc[[i for i in msa_series.index if ("SR" in i)]]
msa_df = aligned_seqs.apply(lambda x: pd.Series(list(x)))

In [None]:
msa_df_na = msa_df.replace({"-": np.nan})

In [None]:
align_coords = msa_df_na.apply(lambda x: np.array(x.index[x.notna()].values), axis=1)

In [None]:
# msa_df.notna().sum(axis=0).sort_values(ascending=False)

In [None]:
align_coord_first = align_coords.apply(lambda x: x.min())

In [None]:
align_coord_first.value_counts()

In [None]:
align_coord_first.loc[align_coord_first == 1842].index[:10]

In [None]:
test1 = msa_df_na.loc[align_coord_first.loc[align_coord_first == 1842].index].dropna(axis=1, how='all')
test1[test1.columns[:15]].isna().sum(axis=1).value_counts()

In [None]:
test1.loc[test1[test1.columns[:15]].isna().sum(axis=1) > 0, test1.columns[:30]]

In [None]:
!mafft \
 --add data/inputs/mock/SRR2182220_1.fasta \
 --mapout data/reference/gg_13_8_88/gg_88_otus_aligned.fasta \
 > data/output/mock/build_alignment/gg_88_rep_seq_alignment.fasta

In [140]:
seq = skbio.DNA.read('data/inputs/mock/SRR2182220_1.fastq', phred_offset=phred_offset)
seq.

True

In [3]:
!qiime tools import \
 --type 'SampleData[SequencesWithQuality]' \
 --input-path data/inputs/mock/manifest.tsv \
 --output-path data/inputs/mock/sequences.qza \
  --input-format SingleEndFastqManifestPhred33V2

[32mImported data/inputs/mock/manifest.tsv as SingleEndFastqManifestPhred33V2 to data/inputs/mock/sequences.qza[0m


In [4]:
seqs_with_quality = Artifact.load('data/inputs/mock/sequences.qza')

In [5]:
# q2_demux.summarize(seqs_with_quality).visualization

In [6]:
all_table, all_rep_seqs, all_stats = q2_dada2.denoise_pyro(
    demultiplexed_seqs=seqs_with_quality,
    trunc_len=200,
    chimera_method='none',
)

Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada_single.R /var/folders/bw/q064ds0d2795_6mxnrssf0l1gkw0rj/T/qiime2-archive-p9syluwl/aaded3cb-ad94-411a-81e5-b83aa41bf122/data /var/folders/bw/q064ds0d2795_6mxnrssf0l1gkw0rj/T/tmpod116mj5/output.tsv.biom /var/folders/bw/q064ds0d2795_6mxnrssf0l1gkw0rj/T/tmpod116mj5/track.tsv /var/folders/bw/q064ds0d2795_6mxnrssf0l1gkw0rj/T/tmpod116mj5 200 0 2.0 2 Inf independent none 1.0 1 250000 -1 32



In [7]:
all_rep_seqs

<artifact: FeatureData[Sequence] uuid: 472da9be-831c-465e-8a5b-69cdc88e0f43>

In [8]:
all_rep_seqs.view(pd.Series)
!mkdir -p data/output/mock/build_alignment/
with skbio.io.open('data/output/mock/build_alignment/rep_seqs.fasta', 'w') as f_:
    for seq in all_rep_seqs.view(pd.Series).values:
        seq.write(f_)

In [11]:
with skbio.io.open('data/output/mock/build_alignment/rep_seqs-rev.fasta', 'w') as f_:
    for seq in all_rep_seqs.view(pd.Series).values:
        seq.reverse_complement().write(f_)

In [145]:
!mafft \
 --add data/inputs/mock/SRR2182220_1.fasta \
 --mapout data/reference/gg_13_8_88/gg_88_otus_aligned.fasta \
 > data/output/mock/build_alignment/gg_88_rep_seq_alignment.fasta

nadd = 185338
nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb->38258 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..

There are 4202 ambiguous characters.
 93201 / 195882/Users/jusdeb/miniconda3/envs/qiime2-2021.8-dev/bin/mafft: line 2747: 28734 Killed: 9               "$prefix/disttbfast" -q $npickup -E $cycledisttbfast -V "-"$gopdist -s $unalignlevel $legacygapopt $mergearg -W $tuplesize $termgapopt $outnum $addarg $add2ndhalfarg -C $numthreads-$numthreadstb $memopt $weightopt $treeinopt $treeoutopt $distoutopt $seqtype $model -g $gexp -f "-"$gop -Q $spfactor -h $aof $param_fft $algopt $treealg $scoreoutarg $anchoropt -x $maxanchorseparation $oneiterationopt < infile > pre 2>> "$progressfile"


In [12]:
!mafft \
 --add data/output/mock/build_alignment/rep_seqs-rev.fasta \
 --mapout data/reference/gg_13_8_88/gg_88_otus_aligned.fasta \
 > data/output/mock/build_alignment/gg_88_rep_seq_alignment-rev.fasta

nadd = 326
nthread = 0
nthreadpair = 0
nthreadtb = 0
ppenalty_ex = 0
stacksize: 8192 kb
generating a scoring matrix for nucleotide (dist=200) ... done
Gap Penalty = -1.53, +0.00, +0.00



Making a distance matrix ..

There are 4202 ambiguous characters.
 10801 / 10870
done.

Constructing a UPGMA tree (efffree=0) ... 
 10860 / 10870
done.

Progressive alignment 1/2... 
STEP  3001 / 10869 
done.

Making a distance matrix from msa.. 
 10800 / 10870
done.

Constructing a UPGMA tree (efffree=1) ... 
 10860 / 10870
done.

Progressive alignment 2/2... 
STEP  8801 / 10869 
done.

disttbfast (nuc) Version 7.480
alg=A, model=DNA200 (2), 1.53 (4.59), -0.00 (-0.00), noshift, amax=0.0
0 thread(s)


To keep the alignment length, 303 letters were DELETED.
The deleted letters are shown in the (filename).map file.

Strategy:
 FFT-NS-2 (Fast but rough)
 Progressive method (guide trees were built 2 times.)

If unsure which option to use, try 'mafft --auto input > output'.
For more information, see 'mafft

In [16]:
fwd_gg138_88_align = skbio.TabularMSA.read(
    'data/output/mock/build_alignment/gg_88_rep_seq_alignment.fasta',
    constructor=partial(skbio.DNA, lowercase=True)
)

In [46]:
rev_gg138_88_align = skbio.TabularMSA.read(
    'data/output/mock/build_alignment/gg_88_rep_seq_alignment-rev.fasta',
    constructor=partial(skbio.DNA, lowercase=True)
)

In [47]:
fwd_seq_position = pd.Series({seq_.metadata['id']: i 
                              for i, seq_ in enumerate(fwd_gg138_88_align)})
rev_seq_position = pd.Series({seq_.metadata['id']: i 
                              for i, seq_ in enumerate(rev_gg138_88_align)})

In [48]:
asv_fwd_pos = fwd_seq_position.loc[all_rep_seqs.view(pd.Series).index]
asv_fwd_rev = rev_seq_position.loc[all_rep_seqs.view(pd.Series).index]

In [49]:
aligned_fwd = fwd_gg138_88_align.loc[asv_fwd_pos]
aligned_rev = rev_gg138_88_align.loc[asv_fwd_rev]

In [50]:
fwd_seqs = pd.Series({'{id}_fwd'.format(**seq.metadata): seq 
                      for seq in aligned_fwd.to_dict().values()})
rev_seqs = pd.Series({'{id}_rev'.format(**seq.metadata): seq 
                      for seq in aligned_fwd.to_dict().values()})

In [58]:
all_seqs_aligned = pd.concat(axis=0, objs=[fwd_seqs, rev_seqs])
all_seqs_aligned = \
    all_seqs_aligned.astype(str).apply(lambda x: pd.Series(list(x)))
all_seqs_aligned.replace({"-": np.nan}, inplace=True)

In [69]:
aligned_seq_pos = all_seqs_aligned.apply(lambda x: x.dropna().index.values, axis=1)

In [82]:
aligned_seq_pos_summary = pd.DataFrame({
    'first': aligned_seq_pos.apply(lambda x: x[0]), 
    'last': aligned_seq_pos.apply(lambda x: x[-1]), 
    'mean': aligned_seq_pos.apply(lambda x: np.mean(x)), 
})

In [88]:
first_grouped = \
    aligned_seq_pos_summary.groupby('first').apply(lambda x: x.index.values)

In [93]:
first_grouped

first
446     [e280f676bc9164faa188ce13ca0fcd23_fwd, b4c3a60...
1842    [221282155f99e7c5c1db27bb52739487_fwd, da0ed08...
1924    [3191d0e21ab539a36fd3f6ec731c656e_fwd, 529ada5...
2232    [2c2fda0f43a5ac2086b5db6cc074804e_fwd, 70529bc...
2239    [3911dcc2d8b9756a2955b45844bcdb34_fwd, 3911dcc...
2246    [5896ae641a1e09789dea1ac234c3a772_fwd, feec1d0...
2329    [63b007da33f2565a72a6d9fde54798b9_fwd, 63b007d...
4049    [bf0eb2226e664f68e62490a1e5be17c0_fwd, f97fd74...
4052    [57bfc062e18f9cc0dd6f2340a315edd4_fwd, 7da1470...
4570    [6fb16ad6b32d348d8fc7c3b81c51ff7b_fwd, a671fe2...
4641    [06e51bf729b20aa80e4269edd1602cf2_fwd, 1dc294c...
4652    [c020ca7117d5403cddf965118bafe6dd_fwd, c020ca7...
4947    [75c7a4f8dea15251e58c717bff25d147_fwd, b33a384...
5009    [a4970fdc6b01daa1369a9eddbd7f2cb8_fwd, a4970fd...
5012    [1437c319929123f1239776e9fbf9fd0c_fwd, 1854b3c...
5028    [1ce53f7d397950c427d7a581e788b2f8_fwd, 1ce53f7...
5073    [e1a369640ff6854d47d2956f4896aa32_fwd, e1a3696...
5076    