In [8]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import shlex
from Bio.Seq import Seq
import subprocess
import os


In [9]:
#helper functions
def read_gff(path):
    df = pd.read_csv(path, index_col=False, sep='\t', header=None, comment="#")
    df.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
    return df

def sys(command):
    """
    """
    print("-" * 10)
    print("Executing: %s" % command)
    process = subprocess.Popen(shlex.split(command),stdout=subprocess.PIPE,stderr=subprocess.PIPE, cwd="/home/juan/Desktop/juan/bio/gliadinas/")
    stdout, stderr = process.communicate()
    #print("Result: stdout: %s - stderr: %s" % (stdout, stderr))
    #print("-" * 10)
    if stderr:
        print(stderr)
    return stdout, stderr



def gff2FA(df_gff, path_sequence, windows, output):
    """Extract fasta files from annotations
    """
    fasta_seq = SeqIO.parse(path_genome, 'fasta')
    buffer_seqs = []
    cont = 0
    for record in fasta_seq:
        print(record.id)
        dff_extract = df_gff[df_gff.seqname == record.id]
        for key,val in dff_extract.iterrows():
            clean_seq = ''.join(str(record.seq).splitlines())
            if int(val.start) - windows < 0:
                start = 0
            else:
                start = int(val.start) - windows
            if int(val.end) + windows > len(clean_seq):
                end = len(clean_seq)
            else:
                end = int(val.end) + windows
            new_seq = clean_seq[start:end]
            att = val.attribute
            id = record.id + '_' + str(start) + '_' + str(end)
            desc = "seq_id:" + str(record.id)
            desc += " feature_start:" + str(val.start)
            desc += " feature_end:" + str(val.end)
            desc += " genome_start:" + str(start)
            desc += " genome_end:" + str(end)
            desc += " feature:" + str(val.feature)
            desc += " attributes:" + val.attribute
            seq = SeqRecord(Seq(new_seq), id=id, description=desc)
            buffer_seqs.append(seq)
            cont += 1
    SeqIO.write(buffer_seqs, output, "fasta")
    return buffer_seqs


In [10]:
adapters = "data/adapters_new_rc.fa"
path_output_dir = "data/clean/"
path_spades = "sw/SPAdes-3.13.0-Linux/bin/spades"
path_cutadapt = "sw/cutadapt-2.7/"
path_fastqc = "sw/FastQC/"
adapter_f = "AATGATACGGCGACCACCGAGATCTACAC"
adapter_r = "CAAGCAGAAGACGGCATACGAGAT"

In [11]:
sys("pwd")

----------
Executing: pwd


(b'/home/juan/Desktop/juan/bio/gliadinas\n', b'')

In [12]:
!rm -r data/clean/*

rm: cannot remove 'data/clean/*': No such file or directory


In [13]:
s = 86
for i in range(434, 536):
    sample_1 = "data/reads/sar{}-2019_S{}_L001_R1_001.fastq.gz".format(i,s)
    sample_2 = "data/reads/sar{}-2019_S{}_L001_R2_001.fastq.gz".format(i,s)
    cmd = "sw/TrimGalore-0.6.5/trim_galore --fastqc --paired {} {} --output_dir {} -a {} -a {}"
    cmd = cmd.format(sample_1, sample_2, path_output_dir, adapter_f, adapter_r)
    print(cmd)
    out, err = sys(cmd)
    s += 1
    #if err:
    #    break


sw/TrimGalore-0.6.5/trim_galore --fastqc --paired data/reads/sar434-2019_S86_L001_R1_001.fastq.gz data/reads/sar434-2019_S86_L001_R2_001.fastq.gz --output_dir data/clean/ -a AATGATACGGCGACCACCGAGATCTACAC -a CAAGCAGAAGACGGCATACGAGAT
----------
Executing: sw/TrimGalore-0.6.5/trim_galore --fastqc --paired data/reads/sar434-2019_S86_L001_R1_001.fastq.gz data/reads/sar434-2019_S86_L001_R2_001.fastq.gz --output_dir data/clean/ -a AATGATACGGCGACCACCGAGATCTACAC -a CAAGCAGAAGACGGCATACGAGAT
sw/TrimGalore-0.6.5/trim_galore --fastqc --paired data/reads/sar435-2019_S87_L001_R1_001.fastq.gz data/reads/sar435-2019_S87_L001_R2_001.fastq.gz --output_dir data/clean/ -a AATGATACGGCGACCACCGAGATCTACAC -a CAAGCAGAAGACGGCATACGAGAT
----------
Executing: sw/TrimGalore-0.6.5/trim_galore --fastqc --paired data/reads/sar435-2019_S87_L001_R1_001.fastq.gz data/reads/sar435-2019_S87_L001_R2_001.fastq.gz --output_dir data/clean/ -a AATGATACGGCGACCACCGAGATCTACAC -a CAAGCAGAAGACGGCATACGAGAT
sw/TrimGalore-0.6.5/trim_gal

In [15]:
s = 86
for i in range(434, 536):
    sample_1 = "data/clean/sar{}-2019_S{}_L001_R1_001_val_1.fq.gz".format(i,s)
    sample_2 = "data/clean/sar{}-2019_S{}_L001_R2_001_val_2.fq.gz".format(i,s)
    output = "data/assembly/assembly-{}-2019_S{}_L001_R2_001.fasta".format(i,s)
    cmd = "/home/juan/Desktop/juan/bio/gliadinas/sw/SPAdes-3.13.0-Linux/bin/spades.py --careful -1 {} -2 {} -o {}"
    cmd = cmd.format(sample_1, sample_2, output)
    out, err = sys(cmd)
    s += 1
    #if err:
    #    break


----------
Executing: /home/juan/Desktop/juan/bio/gliadinas/sw/SPAdes-3.13.0-Linux/bin/spades.py --careful -1 data/clean/sar434-2019_S86_L001_R1_001_val_1.fq.gz -2 data/clean/sar434-2019_S86_L001_R2_001_val_2.fq.gz -o data/assembly/assembly-434-2019_S86_L001_R2_001.fasta
----------
Executing: /home/juan/Desktop/juan/bio/gliadinas/sw/SPAdes-3.13.0-Linux/bin/spades.py --careful -1 data/clean/sar435-2019_S87_L001_R1_001_val_1.fq.gz -2 data/clean/sar435-2019_S87_L001_R2_001_val_2.fq.gz -o data/assembly/assembly-435-2019_S87_L001_R2_001.fasta
----------
Executing: /home/juan/Desktop/juan/bio/gliadinas/sw/SPAdes-3.13.0-Linux/bin/spades.py --careful -1 data/clean/sar436-2019_S88_L001_R1_001_val_1.fq.gz -2 data/clean/sar436-2019_S88_L001_R2_001_val_2.fq.gz -o data/assembly/assembly-436-2019_S88_L001_R2_001.fasta
----------
Executing: /home/juan/Desktop/juan/bio/gliadinas/sw/SPAdes-3.13.0-Linux/bin/spades.py --careful -1 data/clean/sar437-2019_S89_L001_R1_001_val_1.fq.gz -2 data/clean/sar437-20

In [18]:
s = 86
for i in range(434, 536):
    assembly = "data/assembly/assembly-{}-2019_S{}_L001_R2_001.fasta/contigs.fasta".format(i,s)
    output = "data/clusters/cluster-{}-2019_S{}.fasta".format(i,s)
    cmd = "/home/juan/Desktop/juan/bio/gliadinas/sw/cd-hit-v4.8.1-2019-0228/cd-hit-est -i {} -o {} -c 0.98"
    cmd = cmd.format(assembly, output)
    out, err = sys(cmd)
    s += 1
    #if err:
    #    break


----------
Executing: /home/juan/Desktop/juan/bio/gliadinas/sw/cd-hit-v4.8.1-2019-0228/cd-hit-est -i data/assembly/assembly-434-2019_S86_L001_R2_001.fasta/contigs.fasta -o data/clusters/cluster-434-2019_S86.fasta -c 0.98
----------
Executing: /home/juan/Desktop/juan/bio/gliadinas/sw/cd-hit-v4.8.1-2019-0228/cd-hit-est -i data/assembly/assembly-435-2019_S87_L001_R2_001.fasta/contigs.fasta -o data/clusters/cluster-435-2019_S87.fasta -c 0.98
----------
Executing: /home/juan/Desktop/juan/bio/gliadinas/sw/cd-hit-v4.8.1-2019-0228/cd-hit-est -i data/assembly/assembly-436-2019_S88_L001_R2_001.fasta/contigs.fasta -o data/clusters/cluster-436-2019_S88.fasta -c 0.98
----------
Executing: /home/juan/Desktop/juan/bio/gliadinas/sw/cd-hit-v4.8.1-2019-0228/cd-hit-est -i data/assembly/assembly-437-2019_S89_L001_R2_001.fasta/contigs.fasta -o data/clusters/cluster-437-2019_S89.fasta -c 0.98
----------
Executing: /home/juan/Desktop/juan/bio/gliadinas/sw/cd-hit-v4.8.1-2019-0228/cd-hit-est -i data/assembly/a

In [None]:
#!/usr/bin/env bds

string adapters = "data/adapters_new_rc.fa"


sys export PATH=$PATH:$PWD/sw/SPAdes-3.13.0-Linux/bin
sys export PATH=$PATH:$PWD/sw/.bds
sys export PATH=$PATH:$PWD/sw/cutadapt-2.7
sys export PATH=$PATH:$PWD/sw/FastQC

sys rm -r data/clean_1/*
sys rm -r data/clean_2/*
sys rm -r data/clean_3/*
sys rm -r data/assembly/*

int s = 86
for( int i=434 ; i <= 535 ; i++ ) {

    string sample_1_1 = "data/reads/sar"+i+"-2019_S"+s+"_L001_R1_001.fastq.gz"
    string sample_2_1 = "data/reads/sar"+i+"-2019_S"+s+"_L001_R2_001.fastq.gz"
    try{
        sys sw/TrimGalore-0.6.5/trim_galore --paired $sample_1_1 $sample_2_1 --output_dir data/clean_1/
    }catch (Exception e){
        print "error in $sample_1_1 $sample_2_1\n"
    }
    

    string sample_1_2 = "data/clean_1/sar"+i+"-2019_S"+s+"_L001_R1_001_val_1.fq.gz"
    string sample_2_2 = "data/clean_1/sar"+i+"-2019_S"+s+"_L001_R2_001_val_2.fq.gz"
    try{
        sys sw/TrimGalore-0.6.5/trim_galore --fastqc --paired $sample_1_2 $sample_2_2 --output_dir data/clean_2/ -a AATGATACGGCGACCACCGAGATCTACAC -a CAAGCAGAAGACGGCATACGAGAT
    }catch (Exception e){
        print "error in $sample_1_2 $sample_2_2\n"
    }

    string sample_1_3 = "data/clean_2/sar"+i+"-2019_S"+s+"_L001_R1_001_val_1_val_1.fq.gz"
    string sample_2_3 = "data/clean_2/sar"+i+"-2019_S"+s+"_L001_R2_001_val_2_val_2.fq.gz"
    try{
        sys sw/TrimGalore-0.6.5/trim_galore --fastqc --paired $sample_1_3 $sample_2_3 --output_dir data/clean_3/ -a GTGTAGATCTCGGTGGTCGCCGTATCATT -a ATCTCGTATGCCGTCTTCTGCTTG
    }catch (Exception e){
        print "error in $sample_1_3 $sample_2_3\n"
    }

    string sample_1 = "data/clean_3/sar"+i+"-2019_S"+s+"_L001_R1_001_val_1_val_1_val_1.fq.gz"
    string sample_2 = "data/clean_3/sar"+i+"-2019_S"+s+"_L001_R2_001_val_2_val_2_val_2.fq.gz"
    string sample_out = "data/assembly/sar" + i + "-2019_S" + s + "/"
    
    sys sw/SPAdes-3.13.0-Linux/bin/spades.py --careful -1 $sample_1_3 -2 $sample_2_3 -o $sample_out

    s = s + 1
}
