In [1]:
import os
import pandas as pd
import subprocess
from Bio import SeqIO

In [2]:
# Get sequences from Reg's LIB file:
infile = os.path.join("fasta_files", "MAPT_Morgan_Copy.LIB")
record_dict = SeqIO.to_dict(SeqIO.parse(infile, "fasta"))

# Add LIB data to df
ids = []
names = []
seqs = []
descs = []

for record in record_dict:
    ids.append(record_dict[record].id)
    names.append(record_dict[record].name)
    seqs.append(record_dict[record].seq)
    descs.append(record_dict[record].description)

mapt_morgan_df = pd.DataFrame({
    "ID": ids,
    "Name": names,
    "Sequence": seqs,
    "Description": descs
})
mapt_morgan_df.head()

Unnamed: 0,ID,Name,Sequence,Description
0,MAP2HSA4,MAP2HSA4,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2HSA4 C-1915 aa ORF2 452-6196 1915 aa/6325 ...
1,MAP2CJA2,MAP2CJA2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2CJA2 C-1918 aa ORF1 1-5754 1918 aa/5917 bp...
2,MAP2GGO2,MAP2GGO2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2GGO2 C-1915 aa ORF1 1-5745 1915 aa/5748 bp...
3,MAP2MFA2,MAP2MFA2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2MFA2 C-1915 aa MAP2 Macaca fascicularis (c...
4,MAP2MML2,MAP2MML2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2MML2 C-1915 aa ORF3 30-5774 1915 aa/5903 b...


In [7]:
# Searching sequences using blastp CLI:
# For each record in `mapt_morgan_df`:
# Write current sequence to temp file in fasta format
# Call blastp to query the single-sequence fasta file
# Write results to temop output file
# Read in output file for processing, removing the temp files

for record in record_dict:
    SeqIO.write(record_dict[record], "temp.fasta", "fasta")
    result = subprocess.check_output(["blastp", "-db", "nr", "-query", "temp.fasta", "-out", "temp.out", "-remote"])
    print(result)
    break

b''


In [None]:
mapt_morgan_df[mapt_morgan_df["ID"] == "MAPTHSv6"]

In [None]:
SeqIO.write(record_dict["MAPTHSv6"], "temp.fasta", "fasta")

os.system("blastp -db nr -query temp.fasta -out temp.out -remote")

In [None]:
os.getcwd()

In [4]:
print(mapt_morgan_df.loc[0, ]["Sequence"])

MADERKDEAKAPHWTSAPLTEASAHSHPPEIKDQGGAGEGLVRSANGFPYREDEEGAFGEHGSQGTYSNTKENGINGELTSADRETAEEVSARIVQVVTAEAVAVLKGEQEKEAQHKDQTAALPLAAEETANLPPSPPPSPASEQTVTVEEDLLTASKMEFHDQQELTPSTAEPSDQKEKESEKQSKPGEDLKHAALVSQPETTKTYPDKKDMQGTEEEKAPLALFGHTLVASLEDMKQKTEPSLVVPGIDLPKEPPTPKEQKDWFIEMPTEAKKDEWGLVAPISPGPLTPMREKDVFDDIPKWEGKQFDSPMPSPFQGGSFTLPLDVMKNEIVTETSPFAPAFLQPDDKKSLQQTSGPATAKDSFKIEEPHEAKPDKMAEAPPSEAMTLPKDAHIPVVEEHVMGKVLEEEKEAINQETVQQRDTFTPSGQEPILTEKETELKLEEKTTISDKEAVPKESKPPKPADEEIGIIQTSTEHTFSEQKDQEPTTDMLKQDSFPVSLEQAVTDSAMTSKTLEKAMTEPSALIEKSSIQELFEMRVDDKDKIEGVGAATSAELDMPFYEDKSGMSKYFETSALKEEATKSIEPGSDYYELSDTRESVHESIDTMSPMHKNGDKEFQTGKESQPSPPAQEAGYSTLAQSYPSDLPEEPSSPQERMFTIDPKVYGEKRDLHSKNKDDLTLSRSLGLGGRSAIEQRSMSINLPMSCLDSIALGFNFGRGHDLSPLASDILTNTSGSMDEGDDYLPATTPALEKAPCFPVESKEEEQIEKVKATGEESTQAEISCESPFLAKDFYKNGTVMAPDLPEMLDLAGTRSRLASVSADAEVARRKSVPSETVVEDSRTGLPPVTDENHVIVKTDSQLEDLGYCVFNKYTVPLPSPVQDSENLSGESGTFYEGTDDKVRRDLATDLSLIEVKLAAAGRVKDEFSVDKEASAHISGDKSGLSKEFDQEKKANDRLDTVLEKSEEHADSKEHAKKTEEAGDEIETFGLGVTYEQAL

In [5]:
test = subprocess.check_output(["ls"])

In [6]:
test

b'Blastp_CLI_Tau_Search.ipynb\nLIB_Chunks\nMAPT_Morgan\nNCBI_MSA_Phylo\nPresentations\nProtein_Sequence_Search.py\nProtein_Sequence_Search_Output\nPyPDB_Tau_Search.ipynb\nRAxML\nREADME.md\nScratch\n__pycache__\naccession_files\nblast+\ncheck_mapt_lib.py\ndna_translation.py\nfasta_files\nfasta_to_phy.py\nhelper_functions.py\nmuscle\ntemp.fasta\ntemp.out\n'

In [8]:
result = subprocess.check_output(["blastp", "-db", "nr", "-query", "temp.fasta", "-out", "temp.out", "-remote"])
result

b''