In [61]:
import os
import pandas as pd 
import matplotlib.pyplot as plt

In [62]:
LENGTH = 214
MAX_TAIL = 3 # Max alanine tail length
TAG_NAMES = ["T7", "V5", "S", "HAT"]
TAG_LENS = [11, 14, 15, 19]
TAG_SEQS = ["MASMTGGQQMG", "GKPIPNPLLGLDST", "KETAAAKFERQHMDS", "KDHLIHNVHKEFHAHAHNK"]
LINK_LENS = [20, 29]
LINK_SEQS = ["GGGSGGGSGGGSGGPGS", "GSGGGSGGSGGGGSGGGGSGGGGSGGGGS"]
LEN_RANGE = (90, 335)

DDIR = "../../pdb/bylen/"
ODIR = "../../pdb/proc/"

In [63]:
if not os.path.isdir(ODIR):
    os.mkdir(ODIR)

In [64]:
# Load sequences with fixed length
data = pd.read_csv(DDIR+"pdb_seq_{}.csv".format(LENGTH), sep=',')["seq"]
data = pd.DataFrame(data.apply(list).to_list())
data.to_csv(ODIR+'test.csv', index=False)
INIT_DATA = data.shape[0]
data = None

In [65]:
# Load sequences with alanine tail
for alen in range(1, MAX_TAIL+1):
    adata = pd.read_csv(DDIR+"pdb_seq_{}.csv".format(LENGTH-alen), delimiter=',')
    adata = adata["seq"] + "A"*alen
    adata = pd.DataFrame(adata.apply(list).to_list())
    if data is None:
        data = adata
    else:
        data = pd.concat([data, adata])

In [66]:
# Load sequences with tag tail
for tlen, tseq in zip(TAG_LENS, TAG_SEQS):
    tdata = pd.read_csv(DDIR+"pdb_seq_{}.csv".format(LENGTH-tlen), delimiter=',')
    tdata = tdata["seq"] + tseq
    tdata = pd.DataFrame(tdata.apply(list).to_list())
    data = pd.concat([data, tdata])

In [67]:
# Load sequences linked with linker
augseqs = []
for llen, lseq in zip(LINK_LENS, LINK_SEQS):
    for slen in range(90, LENGTH-llen-89):
        elen = LENGTH-llen-slen
        sdata = pd.read_csv(DDIR+"pdb_seq_{}.csv".format(slen), delimiter=',')["seq"].tolist()
        edata = pd.read_csv(DDIR+"pdb_seq_{}.csv".format(elen), delimiter=',')["seq"].tolist()
        for s in sdata:
            for e in edata:
                augseqs.append(list(s+lseq+e))
                break

In [68]:
data = pd.concat([data, pd.DataFrame(augseqs)])
AUG_DATA = data.shape[0]
print(INIT_DATA, AUG_DATA, AUG_DATA//INIT_DATA) 

897 9471 10


In [69]:
data["ptn"] = 1
data.to_csv(ODIR+"data.csv", index=False)