## This notebook divides the amino acid sequences into segments according to their structural elements

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO, SeqRecord, Seq
import math

In [2]:
# Load the structural-element table, turn the "n.a." placeholder into NaN and
# rename the "Entry Name" column to prot_id.
str_elements = (
    pd.read_csv("../data/Structural_work/Structures_and_elements/Structural_element_residues_edited.csv")
    .replace("n.a.", np.nan)
    .rename(columns={"Entry Name": "prot_id"})
)
str_elements

Unnamed: 0,prot_id,Periplasmic,TM,Cytoplasmic,Pore lining,C-terminal
0,AAC73831,14.0,60,100,230,
1,AAC76042,20.0,64,102,232,244.0
2,AAG04082,384.0,431,475,609,615.0
3,ABF90492,29.0,61,103,225,240.0
4,ABF91581,22.0,68,109,230,
5,ABF92363,13.0,62,96,227,
6,ABF92421,13.0,62,99,220,250.0
7,ACX73657.1,7.0,38,79,195,
8,ACX73657,330.0,361,402,522,
9,ADD67874.1,10.0,40,85,198,


In [3]:
# Read every record of the combined FASTA file once, then collect the IDs and
# sequences into parallel lists. The ID list is named prot_ids (not `id`) so
# the builtin id() is not shadowed for the rest of the session.
fasta_records = list(
    SeqIO.parse("../data/Structural_work/Structures_and_elements/PDB_seqs/combined.fasta", "fasta")
)
prot_ids = [record.id for record in fasta_records]
li_seq = [record.seq for record in fasta_records]

# One row per FASTA record: prot_id holds the record ID, prot_seq its sequence.
seq_entries = pd.DataFrame({"prot_id": prot_ids, "prot_seq": li_seq})
seq_entries

Unnamed: 0,prot_id,prot_seq
0,AAC73831,"(M, T, D, M, N, I, L, D, L, F, L, K, A, S, L, ..."
1,AAC76042,"(M, G, N, N, L, M, Q, T, D, L, S, V, W, G, M, ..."
2,AAG04082,"(M, Y, R, L, L, L, T, S, L, I, C, L, G, L, L, ..."
3,ABF90492,"(M, T, P, F, L, S, L, A, Q, T, N, H, T, E, L, ..."
4,ABF91581,"(M, T, P, H, L, P, L, A, L, G, A, M, N, Y, V, ..."
5,ABF92363,"(M, Q, F, T, L, A, E, I, W, D, H, T, G, L, F, ..."
6,ABF92421,"(M, N, F, N, L, R, D, I, Y, N, H, M, G, V, F, ..."
7,ACX73657.1,"(E, F, F, H, N, G, G, I, L, M, Y, P, I, A, T, ..."
8,ACX73657,"(M, F, R, R, K, N, G, A, S, P, L, Q, T, K, D, ..."
9,ADD67874.1,"(F, N, M, E, L, I, Q, K, G, G, V, L, M, Y, P, ..."


In [4]:
# Inner join on prot_id: only IDs present in both tables are kept.
str_seq_df = pd.merge(str_elements, seq_entries, how="inner", on="prot_id")
str_seq_df

Unnamed: 0,prot_id,Periplasmic,TM,Cytoplasmic,Pore lining,C-terminal,prot_seq
0,AAC73831,14.0,60,100,230,,"(M, T, D, M, N, I, L, D, L, F, L, K, A, S, L, ..."
1,AAC76042,20.0,64,102,232,244.0,"(M, G, N, N, L, M, Q, T, D, L, S, V, W, G, M, ..."
2,AAG04082,384.0,431,475,609,615.0,"(M, Y, R, L, L, L, T, S, L, I, C, L, G, L, L, ..."
3,ABF90492,29.0,61,103,225,240.0,"(M, T, P, F, L, S, L, A, Q, T, N, H, T, E, L, ..."
4,ABF91581,22.0,68,109,230,,"(M, T, P, H, L, P, L, A, L, G, A, M, N, Y, V, ..."
5,ABF92363,13.0,62,96,227,,"(M, Q, F, T, L, A, E, I, W, D, H, T, G, L, F, ..."
6,ABF92421,13.0,62,99,220,250.0,"(M, N, F, N, L, R, D, I, Y, N, H, M, G, V, F, ..."
7,ACX73657.1,7.0,38,79,195,,"(E, F, F, H, N, G, G, I, L, M, Y, P, I, A, T, ..."
8,ACX73657,330.0,361,402,522,,"(M, F, R, R, K, N, G, A, S, P, L, Q, T, K, D, ..."
9,ADD67874.1,10.0,40,85,198,,"(F, N, M, E, L, I, Q, K, G, G, V, L, M, Y, P, ..."


In [7]:
def split_seq(row, prev_col=None, col="Periplasmic"):
    """Slice one structural element out of a row's sequence.

    The element is ``row["prot_seq"][start:end]`` where ``end`` is the value
    stored in ``col`` and ``start`` is the value stored in ``prev_col``.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) holding ``"prot_seq"``
            and the boundary columns.
        prev_col: Name of the column holding the start index. ``None``, or a
            missing (NaN) value in that column, means the slice starts at the
            beginning of the sequence.
        col: Name of the column holding the end index (exclusive).

    Returns:
        The sliced sequence, or ``np.nan`` when the value in ``col`` is
        missing.
    """
    end = row[col]
    if pd.isna(end):
        return np.nan

    # Without a usable previous boundary the element starts at position 0.
    start = 0
    if prev_col is not None and pd.notna(row[prev_col]):
        start = int(row[prev_col])

    # Boundary values may be floats (columns containing NaN), hence int().
    return row["prot_seq"][start:int(end)]

def _between(row, start_col, stop_col):
    """Return the part of prot_seq between the two boundary columns."""
    return row["prot_seq"][int(row[start_col]):int(row[stop_col])]


def _periplasmic_part(row):
    """Return prot_seq up to the Periplasmic boundary, or NaN if it is missing."""
    if pd.isna(row["Periplasmic"]):
        return np.nan
    return row["prot_seq"][:int(row["Periplasmic"])]


def _tm_part(row):
    """Return the TM part; it starts at position 0 when Periplasmic is missing."""
    if pd.isna(row["Periplasmic"]):
        return row["prot_seq"][:int(row["TM"])]
    return _between(row, "Periplasmic", "TM")


def _c_term_part(row):
    """Return the part after the Pore lining boundary, or NaN if C-terminal is missing."""
    if pd.isna(row["C-terminal"]):
        return np.nan
    return _between(row, "Pore lining", "C-terminal")


# Each element is sliced from the previous element's boundary to its own.
str_seq_df["Periplasmic_seq"] = str_seq_df.apply(_periplasmic_part, axis=1)
str_seq_df["TM_seq"] = str_seq_df.apply(_tm_part, axis=1)
str_seq_df["Cytoplasmic_seq"] = str_seq_df.apply(_between, axis=1, args=("TM", "Cytoplasmic"))
str_seq_df["Pore-lining_seq"] = str_seq_df.apply(_between, axis=1, args=("Cytoplasmic", "Pore lining"))
str_seq_df["C-term_seq"] = str_seq_df.apply(_c_term_part, axis=1)
str_seq_df

Unnamed: 0,prot_id,Periplasmic,TM,Cytoplasmic,Pore lining,C-terminal,prot_seq,Periplasmic_seq,TM_seq,Cytoplasmic_seq,Pore-lining_seq,C-term_seq
0,AAC73831,14.0,60,100,230,,"(M, T, D, M, N, I, L, D, L, F, L, K, A, S, L, ...","(M, T, D, M, N, I, L, D, L, F, L, K, A, S)","(L, L, V, K, L, I, M, L, I, L, I, G, F, S, I, ...","(E, L, S, R, L, Y, Q, E, S, Q, G, K, R, D, N, ...","(P, E, A, V, V, E, G, A, S, R, A, M, R, I, S, ...",
1,AAC76042,20.0,64,102,232,244.0,"(M, G, N, N, L, M, Q, T, D, L, S, V, W, G, M, ...","(M, G, N, N, L, M, Q, T, D, L, S, V, W, G, M, ...","(I, V, V, K, C, V, M, I, G, L, I, L, A, S, V, ...","(A, R, S, L, N, Q, A, N, D, I, A, A, D, F, G, ...","(D, N, E, G, I, K, E, R, T, S, F, R, L, E, R, ...","(H, P, V, R, V, A, Q, K, L, R, A, G)"
2,AAG04082,384.0,431,475,609,615.0,"(M, Y, R, L, L, L, T, S, L, I, C, L, G, L, L, ...","(M, Y, R, L, L, L, T, S, L, I, C, L, G, L, L, ...","(L, D, A, W, V, I, I, A, I, L, A, L, M, M, V, ...","(R, L, E, L, L, A, D, D, R, E, L, A, A, R, L, ...","(S, A, A, T, I, E, A, I, R, A, S, M, D, G, V, ...","(G, Q, P, L, P, A)"
3,ABF90492,29.0,61,103,225,240.0,"(M, T, P, F, L, S, L, A, Q, T, N, H, T, E, L, ...","(M, T, P, F, L, S, L, A, Q, T, N, H, T, E, L, ...","(A, E, W, V, L, W, V, L, V, I, L, S, V, L, S, ...","(D, S, E, A, L, A, V, R, L, A, R, G, E, Y, D, ...","(A, D, T, V, E, Q, V, I, A, S, T, L, S, R, E, ...","(P, A, S, N, T, A, T, A, G, R, A, A, E, A, R)"
4,ABF91581,22.0,68,109,230,,"(M, T, P, H, L, P, L, A, L, G, A, M, N, Y, V, ...","(M, T, P, H, L, P, L, A, L, G, A, M, N, Y, V, ...","(L, I, E, L, A, V, L, L, L, L, M, G, V, S, V, ...","(R, L, E, A, I, Y, Q, T, A, Q, K, L, D, G, S, ...","(A, L, A, E, R, L, G, G, I, E, N, V, E, R, A, ...",
5,ABF92363,13.0,62,96,227,,"(M, Q, F, T, L, A, E, I, W, D, H, T, G, L, F, ...","(M, Q, F, T, L, A, E, I, W, D, H, T, G)","(L, F, A, R, M, I, I, F, T, L, G, I, M, S, I, ...","(D, L, N, T, A, A, N, T, N, L, G, K, D, V, G, ...","(K, D, V, A, V, E, S, V, A, R, A, L, E, R, Q, ...",
6,ABF92421,13.0,62,99,220,250.0,"(M, N, F, N, L, R, D, I, Y, N, H, M, G, V, F, ...","(M, N, F, N, L, R, D, I, Y, N, H, M, G)","(V, F, A, L, G, I, A, W, T, L, I, L, F, A, V, ...","(Q, H, E, A, L, V, K, E, A, E, A, T, K, G, S, ...","(K, L, G, P, V, E, L, T, R, R, E, L, V, R, I, ...","(P, Q, K, P, V, A, T, N, G, A, A, V, A, T, G, ..."
7,ACX73657.1,7.0,38,79,195,,"(E, F, F, H, N, G, G, I, L, M, Y, P, I, A, T, ...","(E, F, F, H, N, G, G)","(I, L, M, Y, P, I, A, T, L, F, I, L, G, L, I, ...","(R, G, A, R, N, A, V, K, A, L, A, N, R, D, L, ...","(K, D, R, A, S, A, E, K, S, L, E, V, L, F, A, ...",
8,ACX73657,330.0,361,402,522,,"(M, F, R, R, K, N, G, A, S, P, L, Q, T, K, D, ...","(M, F, R, R, K, N, G, A, S, P, L, Q, T, K, D, ...","(I, L, M, Y, P, I, A, T, L, F, I, L, G, L, I, ...","(R, G, A, R, N, A, V, K, A, L, A, N, R, D, L, ...","(K, D, R, A, S, A, E, K, S, L, E, V, L, F, A, ...",
9,ADD67874.1,10.0,40,85,198,,"(F, N, M, E, L, I, Q, K, G, G, V, L, M, Y, P, ...","(F, N, M, E, L, I, Q, K, G, G)","(V, L, M, Y, P, I, I, F, L, S, V, L, A, L, A, ...","(P, A, L, L, K, E, K, I, K, Q, A, L, Q, N, N, ...","(L, Q, R, L, M, E, T, V, E, E, S, G, R, F, Q, ...",


In [33]:
def df_to_fasta(df, id_col_name, seq_col_name):
    """Render two columns of a DataFrame as FASTA-formatted text.

    Args:
        df: DataFrame containing the ID and sequence columns.
        id_col_name: Name of the column used for the ``>`` header lines.
        seq_col_name: Name of the column holding the sequences.

    Returns:
        A string with one ``>id`` line followed by one sequence line per row,
        each terminated by a newline. Rows whose sequence is missing (NaN)
        are skipped. An empty string is returned when no row qualifies.
    """
    # Build all records first and join once instead of growing a string with
    # repeated concatenation; formatting also accepts non-string IDs.
    records = [
        f">{seq_id}\n{seq!s}\n"
        for seq_id, seq in zip(df[id_col_name], df[seq_col_name])
        if pd.notna(seq)
    ]
    return "".join(records)

In [35]:
# Output path -> sequence column, one FASTA file per structural element.
domain_fasta_files = [
    ("../data/Structural_work/Structures_and_elements/Seq_domains/Periplasmic_domain_seqs.fasta", "Periplasmic_seq"),
    ("../data/Structural_work/Structures_and_elements/Seq_domains/TM_domain_seqs.fasta", "TM_seq"),
    ("../data/Structural_work/Structures_and_elements/Seq_domains/Cytoplasmic_domain_seqs.fasta", "Cytoplasmic_seq"),
    ("../data/Structural_work/Structures_and_elements/Seq_domains/pore_lining_domain_seqs.fasta", "Pore-lining_seq"),
    ("../data/Structural_work/Structures_and_elements/Seq_domains/C_term_domain_seqs.fasta", "C-term_seq"),
]

# Files are written in the order listed above; existing files are overwritten.
for fasta_path, seq_column in domain_fasta_files:
    with open(fasta_path, "w") as fasta_file:
        fasta_file.write(df_to_fasta(str_seq_df, "prot_id", seq_column))
