In [1]:
from os import listdir, makedirs
from os.path import isfile, join

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Groups of one-letter amino-acid codes. The variable names are German;
# the English translation of each name is given next to it.
sauer = list("DE")           # acidic
aliphatisch = list("AGILV")  # aliphatic
amidisch = list("NQ")        # amide
aromatisch = list("FWY")     # aromatic
basisch = list("RHK")        # basic
hydroxyl = list("ST")        # hydroxyl
imino = list("P")            # imino
schwefel = list("CM")        # sulfur

In [3]:
# Reduced alphabet: every one-letter amino-acid code maps to a single lowercase
# symbol, and all codes listed on the same row share that symbol.
chem_alphabet = {
    residue: symbol
    for symbol, residues in (
        ("s", "DE"),
        ("a", "AGILV"),
        ("m", "NQ"),
        ("r", "FWY"),
        ("b", "RHK"),
        ("h", "ST"),
        ("i", "P"),
        ("w", "CM"),
        ("x", "X"),  # The one-letter symbol for an undetermined amino acid is X
    )
    for residue in residues
}

In [11]:
def readPDB(fasta, fasta_dir="../data/pdb_str/fasta/"):
    """Read a FASTA file and return its sequence as a list of symbols.

    Parameters
    ----------
    fasta : str
        Name of the FASTA file, relative to ``fasta_dir``.
    fasta_dir : str, optional
        Directory that holds the FASTA files. The default is the path that
        used to be hard-coded here, so existing calls behave as before.

    Returns
    -------
    list
        One element per symbol of the sequence. When the file holds several
        records, the sequence of the last record is returned.

    Raises
    ------
    ValueError
        If the file contains no FASTA record.
    """
    path = join(fasta_dir, fasta)
    pdb = None
    for record in SeqIO.parse(path, "fasta"):
        pdb = list(record.seq)
    if pdb is None:
        # Without this guard an empty file surfaced as an UnboundLocalError.
        raise ValueError("no FASTA record found in %r" % (path,))
    return pdb

def readFasta(d):
    """Read a FASTA file and return its sequence as a plain string.

    Parameters
    ----------
    d : str
        Path of the FASTA file; passed unchanged to ``SeqIO.parse``.

    Returns
    -------
    str
        The sequence of the record. When the file holds several records,
        the sequence of the last record is returned.

    Raises
    ------
    ValueError
        If the file contains no FASTA record.
    """
    pdb = None
    for record in SeqIO.parse(d, "fasta"):
        pdb = str(record.seq)
    if pdb is None:
        # Without this guard an empty file surfaced as an UnboundLocalError.
        raise ValueError("no FASTA record found in %r" % (d,))
    return pdb

def map_alphabet(alphabet, seq):
    """Translate every symbol of ``seq`` through the mapping ``alphabet``.

    Returns a list holding ``alphabet[symbol]`` for each symbol, in input
    order. A symbol that is missing from ``alphabet`` raises ``KeyError``.
    """
    translated = []
    for symbol in seq:
        translated.append(alphabet[symbol])
    return translated

def convert_to_str(alphabet, filename):
    """Build a SeqRecord holding a FASTA sequence translated via ``alphabet``.

    Parameters
    ----------
    alphabet : dict
        Mapping from each input symbol to its replacement symbol.
    filename : str
        FASTA file name, read through ``readPDB``. The part before the
        first "." becomes the record id.

    Returns
    -------
    SeqRecord
        Record with the translated sequence, the id described above and an
        empty description.
    """
    record_id = filename.split(".")[0]
    # Use the alphabet that was passed in; the global chem_alphabet was used
    # here before, which silently ignored the argument.
    translated = "".join(map_alphabet(alphabet, readPDB(filename)))
    return SeqRecord(Seq(translated), id=record_id, description="")

In [5]:
fasta_dir = "../data/pdb_str/fasta/"
chem_dir = "../data/pdb_str/chem/"
# Keep regular files only, so that sub-directories are not treated as FASTA
# input, and sort because listdir() returns entries in arbitrary order.
fasta_files = sorted(f for f in listdir(fasta_dir) if isfile(join(fasta_dir, f)))
fasta_files[1:4]

['132lA00.fasta', '153lA00.fasta', '155cA00.fasta']

In [11]:
# Map every FASTA file name to its record translated with chem_alphabet.
chem_seq = dict(
    (fasta_name, convert_to_str(chem_alphabet, fasta_name))
    for fasta_name in fasta_files
)

TypeError: parse() missing 1 required positional argument: 'format'

In [50]:
# Write every translated record into chem_dir under its original file name.
makedirs(chem_dir, exist_ok=True)  # the output folder may not exist yet
for name, record in chem_seq.items():
    with open(join(chem_dir, name), "w") as out:
        SeqIO.write(record, out, "fasta")

In [16]:
# Files in chem_dir whose sequence has fewer than SHORT_SEQ_CUTOFF symbols.
# Sorted because listdir() returns entries in arbitrary order.
SHORT_SEQ_CUTOFF = 50
short_chem_files = sorted(
    f for f in listdir(chem_dir)
    if len(readFasta(join(chem_dir, f))) < SHORT_SEQ_CUTOFF
)
short_chem_files

['1a1iA01.fasta',
 '1a3pA00.fasta',
 '1a5tA02.fasta',
 '1a62A01.fasta',
 '1afoA00.fasta',
 '1aggA00.fasta',
 '1agrH01.fasta',
 '1ahlA00.fasta',
 '1al0B00.fasta',
 '1amlA00.fasta',
 '1aooA00.fasta',
 '1aq5A00.fasta',
 '1aroP02.fasta',
 '1autL01.fasta',
 '1b01A00.fasta',
 '1b8wA00.fasta',
 '1b8xA03.fasta',
 '1b9wA02.fasta',
 '1b9xC02.fasta',
 '1bazC00.fasta',
 '1bbgA00.fasta',
 '1bccD01.fasta',
 '1bdsA00.fasta',
 '1bg5A03.fasta',
 '1bh9A00.fasta',
 '1bhpA00.fasta',
 '1bi6H00.fasta',
 '1bjtA01.fasta',
 '1bpeA01.fasta',
 '1bvsF03.fasta',
 '1bzkA00.fasta',
 '1cf4B00.fasta',
 '1cfhA00.fasta',
 '1ci6B00.fasta',
 '1cixA00.fasta',
 '1co4A00.fasta',
 '1cp9A02.fasta',
 '1crnA00.fasta',
 '1csbA01.fasta',
 '1cukA03.fasta',
 '1cwxA00.fasta',
 '1czqA00.fasta',
 '1d4vA02.fasta',
 '1d4vA03.fasta',
 '1d5sB00.fasta',
 '1d66B02.fasta',
 '1d6gA00.fasta',
 '1dd4D01.fasta',
 '1devB00.fasta',
 '1dgwY01.fasta',
 '1dipA01.fasta',
 '1dqbA01.fasta',
 '1dv0A00.fasta',
 '1dx5I02.fasta',
 '1dx7A00.fasta',
 '1e0aB00.