In [None]:
import json
import csv

# Input and output locations.
# A forward slash is used in the relative input path because it is accepted as
# a separator on Windows as well as Linux/macOS, whereas a backslash is only a
# separator on Windows.
protein_info_file = "data/protein_information.json"
training_file = "f_covae_training.tsv"
test_file = "f_covae_test.tsv"

In [None]:
# Load the raw per-protein records. The top level of the JSON document is used
# as a mapping keyed by protein ID. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open(protein_info_file, "r", encoding="utf-8") as file:
    proteins_raw = json.load(file)

# One empty feature record per protein; the following cells fill these in.
# A comprehension is used so that no loop variable named `id` is left behind
# shadowing the builtin.
proteins_prep = {p_id: {} for p_id in proteins_raw}

Names

In [None]:
# Copy each protein's "name" field into its prepared feature record.
for prot_id in proteins_raw:
    proteins_prep[prot_id]["name"] = proteins_raw[prot_id]["name"]

GO Terms

In [None]:
# Build the sorted vocabulary of GO term IDs for each aspect key
# ("mf", "cc", "bp" -- presumably molecular function, cellular component and
# biological process). Each entry of a protein's "go_terms" is indexed as
# (term_id, label); the first character of the label selects the aspect.
_ASPECT_BY_PREFIX = {"F": "mf", "P": "bp", "C": "cc"}

# Collect into sets first: membership is O(1), instead of scanning a growing
# list once for every term of every protein.
_go_term_sets = {"mf": set(), "cc": set(), "bp": set()}
for p_id, p_data in proteins_raw.items():
    for p_term in p_data["go_terms"]:
        prefix = p_term[1][:1]  # slice, so an empty label cannot raise IndexError
        go_type = _ASPECT_BY_PREFIX.get(prefix)
        if go_type is None:
            # An unknown prefix used to fall through to go_terms[""] and die
            # with an uninformative KeyError; report the offending record.
            raise ValueError(
                f"Unrecognised GO aspect prefix {prefix!r} for term "
                f"{p_term[0]!r} of protein {p_id!r}"
            )
        _go_term_sets[go_type].add(p_term[0])

# Keys keep the order "mf", "cc", "bp"; each vocabulary is sorted so a term's
# position (its column index in the encoding) is deterministic.
go_terms = {aspect: sorted(term_ids) for aspect, term_ids in _go_term_sets.items()}

In [None]:
# One-hot encode every protein's GO terms, separately per aspect, against the
# sorted vocabularies in `go_terms`.
for p_id in proteins_prep:
    proteins_prep[p_id]["go_terms"] = {}

for aspect, terms in go_terms.items():
    # term -> column position, built once per aspect so each lookup is O(1)
    # instead of a linear `in` test followed by a linear `.index()` scan.
    column_of = {term: position for position, term in enumerate(terms)}
    for p_id, p_data in proteins_raw.items():
        p_encoded_go = [0] * len(terms)
        for p_go in p_data["go_terms"]:
            position = column_of.get(p_go[0])
            # Terms that are not in this aspect's vocabulary are skipped.
            if position is not None:
                p_encoded_go[position] = 1
        proteins_prep[p_id]["go_terms"][aspect] = p_encoded_go


In [None]:
# Spot check for one protein: print the vocabulary position of each of its raw
# GO terms, then the positions set to 1 in its encoded vectors.
test_protein_terms = proteins_raw["A0A0X8BZN2"]["go_terms"]
for term in test_protein_terms:
    # The first marker found in the label decides which vocabulary to search.
    for marker, aspect_key in (("F:", "mf"), ("C:", "cc"), ("P:", "bp")):
        if marker in term[1]:
            print(f"{marker}{term[0]}: {go_terms[aspect_key].index(term[0])}")
            break

print()
for aspect in go_terms:
    encoded = proteins_prep["A0A0X8BZN2"]["go_terms"][aspect]
    for position, flag in enumerate(encoded):
        if flag == 1:
            print(position)
    print()

Subcellular Location Feature

In [None]:
# Location code -> the subcellular-location strings that map to that code.
# A string may be listed under more than one code ("Cytoplasmic side" appears
# under both "0" and "1"); a protein then receives every matching code.
loc_code = {
    "0": ["Cytoplasm", "Cytoplasmic side", "Cytoplasm, nucleoid"],
    "1": ["Cell membrane", "Membrane", "Cell inner membrane", "Cell outer membrane", "Cytoplasmic side"],
    "2": ["Cell wall"],
    "3": ["Secreted"],
    "4": ["Periplasm", "Periplasmic side"],
    "5": ["Cell surface"],
    "6": ["Cell envelope"],
    "7": ["Chlorosome"],
    "8": ["Cellular thylakoid membrane"],
    # NOTE(review): "chromatopore" looks like a misspelling of "chromatophore"
    # and would then never match the source annotations. The corrected
    # spelling is added and the old one kept as an alias -- confirm against
    # the location strings actually present in the data.
    "9": ["Cellular chromatophore membrane", "Cellular chromatopore membrane"],
    "10": ["Single-pass membrane protein"],
    "11": ["Multi-pass membrane protein"],
    "12": ["Peripheral membrane protein"]
}

# Invert the table once (location string -> codes, in table order) so each
# protein location is a single dict lookup rather than a scan of every group.
_codes_by_location = {}
for loc_id, loc_group in loc_code.items():
    for loc_name in loc_group:
        _codes_by_location.setdefault(loc_name, []).append(loc_id)

# Protein ID -> list of matching location codes (as strings). Locations that
# are not in the table contribute nothing.
locs = {}
for p_id, p_data in proteins_raw.items():
    p_loc_ids = []
    for p_loc in p_data["subcell_locations"]:
        p_loc_ids.extend(_codes_by_location.get(p_loc, ()))
    locs[p_id] = p_loc_ids

In [None]:
# Multi-hot encode subcellular locations: one slot per code in `loc_code`.
# The width comes from the table rather than a hard-coded 13, so the two cannot
# drift apart when codes are added (codes are the strings "0" .. "n-1").
num_loc_codes = len(loc_code)
for p_id in proteins_raw:
    p_encoded_locs = [0] * num_loc_codes
    for p_loc in locs[p_id]:
        p_encoded_locs[int(p_loc)] = 1
    proteins_prep[p_id]["subcell_locations"] = p_encoded_locs

In [None]:
# Spot check for one protein: raw location strings first, then a blank line,
# then the encoded vector, one entry per line.
for raw_location in proteins_raw["A0A0X8C1K8"]["subcell_locations"]:
    print(raw_location)

print()

for encoded_flag in proteins_prep["A0A0X8C1K8"]["subcell_locations"]:
    print(encoded_flag)

Motifs (DO NOT USE) - Encoding every collected motif produces too many features, and it is still unclear how consensus patterns work.

In [None]:
# motifs = []
# for p_id, p_data in proteins_raw.items():
#     for motif in p_data["motifs"]:
#         if motif not in motifs:
#             motifs.append(motif)

# motifs.sort()
# num_motifs = len(motifs)

In [None]:
# for p_id, p_data in proteins_raw.items():
#     p_encoded_motifs = [0] * num_motifs
#     for p_motif in p_data["motifs"]:
#         index = motifs.index(p_motif)
#         p_encoded_motifs[index] = 1
#     proteins_prep[p_id]["motifs"] = p_encoded_motifs

In [None]:
# #test
# test_protein_motifs = proteins_raw["A0A0X8BZN2"]["motifs"]
# for motif in test_protein_motifs:
#     print(f"{motif}: {motifs.index(motif)}")
# print()
# c = -1
# for num in proteins_prep["A0A0X8BZN2"]["motifs"]:
#     c+=1
#     if num == 1: print(c)

Simple Biopython Features

In [None]:
# Descriptor fields copied unchanged from each raw record, in this order.
biopyfeatures = [
    "length",
    "m_weight",
    "instab_index",
    "isoele_point",
    "gravy",
    "sec_sruct_frac",
    "ext_coeff",
]
for p_id, p_data in proteins_raw.items():
    record = proteins_prep[p_id]
    for key in biopyfeatures:
        record[key] = p_data[key]


In [None]:
# Spot check for one protein: descriptor values from the raw record, a blank
# line, then the same fields from the prepared record.
sample_id = "A0A0X8BZN2"
for key in biopyfeatures:
    print(proteins_raw[sample_id][key])

print()

for key in biopyfeatures:
    print(proteins_prep[sample_id][key])

Amino Count

In [None]:
# Store each protein's amino-acid counts as a plain list of the mapping's
# values, in the mapping's own order.
# NOTE(review): columns only line up across proteins if every "amino_count"
# mapping lists the same keys in the same order -- confirm against the data.
for p_id, p_data in proteins_raw.items():
    proteins_prep[p_id]["amino_count"] = list(p_data["amino_count"].values())

In [None]:
# Spot check for one protein: raw amino-acid counts, a blank line, then the
# prepared list of counts.
for raw_count in proteins_raw["A0A0X8BZN2"]["amino_count"].values():
    print(raw_count)

print()

for prepared_count in proteins_prep["A0A0X8BZN2"]["amino_count"]:
    print(prepared_count)


Split Training and Test (hypothetical proteins)

In [None]:
# Proteins whose name contains "hypothetical protein" form the test set; all
# others form the training set.
train_proteins = {}
test_proteins = {}

for prot_id, features in proteins_prep.items():
    is_hypothetical = "hypothetical protein" in features["name"]
    bucket = test_proteins if is_hypothetical else train_proteins
    bucket[prot_id] = features



OUTPUT

In [None]:
def output_proteins(proteins, outfile, go_aspect="mf", go_vocab=None):
    """Write prepared protein features to a tab-separated file.

    The first row is a header. Every following row holds one protein: its ID,
    then each feature in the record's own order (list-valued features are
    expanded to one column per element, named ``<feature><index>``), then the
    encoded GO vector of one aspect. The "go_terms" entry itself is skipped in
    the feature pass and only contributes that trailing vector.

    The header is derived from the first record only.
    NOTE(review): rows are not checked against the header width, so this
    assumes every record has the same features with the same list lengths.

    Args:
        proteins: Mapping of protein ID -> feature record. Each record must
            contain ``record["go_terms"][go_aspect]``.
        outfile: Path of the TSV file to create or overwrite.
        go_aspect: Key of the GO aspect whose encoded vector is written.
            Defaults to "mf", the aspect that was previously hard-coded.
        go_vocab: Column names for the GO vector. Defaults to the notebook's
            global ``go_terms[go_aspect]``.

    Raises:
        ValueError: If ``proteins`` is empty. The header cannot be derived
            without a record; this used to surface as a bare StopIteration.
    """
    if not proteins:
        raise ValueError(
            f"No proteins to write to {outfile!r}: the header cannot be derived from an empty mapping"
        )
    if go_vocab is None:
        go_vocab = go_terms[go_aspect]

    header = ["protein_id"]
    first_record = next(iter(proteins.values()))
    for feature, values in first_record.items():
        if feature == "go_terms":
            continue
        if isinstance(values, list):
            header.extend(f"{feature}{i}" for i in range(len(values)))
        else:
            header.append(feature)
    header.extend(go_vocab)

    # All rows are assembled before the file is opened, so a malformed record
    # raises without truncating an existing output file.
    rows = [header]
    for p_id, p_data in proteins.items():
        row = [p_id]
        for feature, value in p_data.items():
            if feature == "go_terms":
                continue
            if isinstance(value, list):
                row.extend(value)
            else:
                row.append(value)
        row.extend(p_data["go_terms"][go_aspect])
        rows.append(row)

    # newline="" is what the csv module expects; the explicit encoding keeps
    # non-ASCII protein names from depending on the platform default.
    with open(outfile, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file, delimiter="\t")
        writer.writerows(rows)

# Write the training split first, then the test split, each to its own TSV.
for subset, destination in ((train_proteins, training_file), (test_proteins, test_file)):
    output_proteins(subset, destination)