In [1]:
import logging
import os
import subprocess
import sys

import pandas as pd

# Prepare FASTA files for diamond

In [2]:
# Root directory holding the protein datasets. It defaults to the original
# location; set the PROTEIN_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
base_path = os.environ.get(u"PROTEIN_DATA_DIR", u"/Users/robin/xbiome/datasets/protein")

# Pickled pandas DataFrame with the training protein sequences
train_data_file = os.path.join(base_path, u"train_data.pkl")
# FASTA file generated from the training DataFrame
train_FASTA_file = os.path.join(base_path, u"train_data.fa")
# DIAMOND database file built from the training FASTA file
train_diamond_db = os.path.join(base_path, u"train_data.dmnd")

# Pickled pandas DataFrame with the test protein sequences
test_data_file = os.path.join(base_path, u"test_data.pkl")
# FASTA file generated from the test DataFrame
test_FASTA_file = os.path.join(base_path, u"test_data.fa")
# DIAMOND alignment output for the test dataset
test_diamond_file = os.path.join(base_path, u"test_diamond.res")

In [3]:
def save_seq_to_FASTA(data_file, FASTA_file):
    """Write the sequences stored in a pickled DataFrame to a FASTA file.

    Args:
        data_file: Path of the pickle file, loaded with ``pd.read_pickle``.
            Each row must expose ``proteins`` (used as the record header)
            and ``sequences`` (used as the sequence line) as strings.
        FASTA_file: Path of the FASTA file to create; an existing file is
            overwritten.

    Returns:
        True if ``FASTA_file`` exists after writing (a summary line is
        printed in that case), False otherwise.
    """
    # NOTE(review): pd.read_pickle unpickles arbitrary objects; only use it
    # on trusted files.
    records = pd.read_pickle(data_file)

    # One FASTA record per row: a '>' header line followed by the sequence.
    with open(FASTA_file, 'w') as handle:
        for entry in records.itertuples():
            handle.write('>' + entry.proteins + '\n')
            handle.write(entry.sequences + '\n')

    # Guard clause: report failure when the output file is not on disk.
    if not os.path.exists(FASTA_file):
        return False

    summary = "%d sequences are saved to FASTA file %s." % (len(records), FASTA_file)
    print(summary)
    return True

In [4]:
# Convert both datasets to FASTA. Each call's result is kept so that a
# failure on the training set is not hidden behind the result for the test set.
train_saved = save_seq_to_FASTA(train_data_file, train_FASTA_file)
test_saved = save_seq_to_FASTA(test_data_file, test_FASTA_file)

# Displayed as the cell output: True only if both FASTA files were written.
train_saved and test_saved

73592 sequences are saved to FASTA file /Users/robin/xbiome/datasets/protein/train_data.fa.
3874 sequences are saved to FASTA file /Users/robin/xbiome/datasets/protein/test_data.fa.


True

# Create diamond database from train dataset

In [6]:
# Build the DIAMOND reference database (train_diamond_db) from the training
# FASTA file.
cmd = ["diamond", "makedb", "--db", train_diamond_db, "--in", train_FASTA_file]

# check=True is not used: a non-zero exit status is handled explicitly below.
proc = subprocess.run(cmd)

if proc.returncode != 0:
    # Report the failure through the `logging` module (imported at the top of
    # the notebook) and stop with exit status 1.
    logging.error('Error running diamond!')
    sys.exit(1)

diamond v0.9.14.115 | by Benjamin Buchfink <buchfink@gmail.com>
Licensed under the GNU AGPL <https://www.gnu.org/licenses/agpl.txt>
Check http://github.com/bbuchfink/diamond for updates.

#CPU threads: 8
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database file: /Users/robin/xbiome/datasets/protein/train_data.fa
Opening the database file...  [0.000426s]
Loading sequences...  [0.20599s]
Masking sequences...  [2.40989s]
Writing sequences...  [0.080624s]
Loading sequences...  [1.5e-05s]
Writing trailer...  [0.003948s]
Closing the input file...  [2.6e-05s]
Closing the database file...  [0.000357s]
Processed 73592 sequences, 38780826 letters.
Total time = 2.70137s


# Generate diamond scores for test dataset

In [7]:
# Align every test sequence against the training database with DIAMOND blastp.
# "--outfmt 6" followed by field names restricts the tabular output to the
# query id, subject id and bit score columns; results go to test_diamond_file
# and "/tmp" is used as DIAMOND's temporary directory.
cmd = ["diamond", "blastp", "-d", train_diamond_db, "--more-sensitive", "-t", "/tmp",
       "-q", test_FASTA_file, "--outfmt", "6", "qseqid", "sseqid", "bitscore", "-o",
       test_diamond_file]

# check=True is not used: a non-zero exit status is handled explicitly below.
proc = subprocess.run(cmd)

if proc.returncode != 0:
    # Report the failure through the `logging` module (imported at the top of
    # the notebook) and stop with exit status 1.
    logging.error('Error running diamond!')
    sys.exit(1)

diamond v0.9.14.115 | by Benjamin Buchfink <buchfink@gmail.com>
Licensed under the GNU AGPL <https://www.gnu.org/licenses/agpl.txt>
Check http://github.com/bbuchfink/diamond for updates.

#CPU threads: 8
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
#Target sequences to report alignments for: 25
Temporary directory: /tmp
Opening the database...  [4.8e-05s]
Opening the input file...  [0.000233s]
Opening the output file...  [0.002833s]
Loading query sequences...  [0.008808s]
Masking queries...  [0.101426s]
Building query seed set...  [0.000375s]
Algorithm: Double-indexed
Building query histograms...  [0.044475s]
Allocating buffers...  [9.7e-05s]
Loading reference sequences...  [0.082508s]
Building reference histograms...  [1.38256s]
Allocating buffers...  [0.000127s]
Initializing temporary storage...  [0.017009s]
Processing query chunk 0, reference chunk 0, shape 0, index chunk 0.
Building reference index...  [0.234147s]
Building query index...  [0.011482s]
Bu