In [6]:
import json, csv
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

#--- Variables: Test File Locations ---
# Paths are relative to the notebook's working directory. Forward slashes are
# used because they are accepted on Windows as well as on Linux/macOS, whereas
# a literal backslash is only a separator on Windows.
test_fasta_file = "data/test/test_proteins.fasta"
test_json_file = "data/test/test_proteins.json"
# motif_file = ""
test_ifeature_file = "data/test/test_iFeature.json"
test_names_file = "data/test/test_names.tsv"
test_output_file = "data/test/test_protein_information.json"

#--- Variables: Training File Locations ---
train_protein_file = "data/train/train_proteins.fasta"
train_uniprot_file = "data/train/train_proteins.json"
train_ifeature_file = "data/train/train_iFeature.json"
train_output_file = "data/train/train_protein_information.json"

In [7]:
#--- Functions: Assign Annotations (UniProt) ---
def assign_annotations(json_file, proteins):
    """Copy UniProt annotations from a JSON file into ``proteins`` in place.

    Args:
        json_file: Path to a JSON file whose top-level "results" list holds
            one object per entry, each carrying a "primaryAccession".
        proteins: Dict keyed by accession. For every section present in an
            entry ("uniProtKBCrossReferences", "comments", "features",
            "keywords") the matching field of ``proteins[accession]`` is
            overwritten ("go_terms", "subcell_locations", "transmembrane",
            "binding_preference").

    Raises:
        KeyError: If "results" or "primaryAccession" is missing, or an entry
            with annotation sections has an accession absent from ``proteins``.
    """
    # Decode explicitly as UTF-8: the platform default (e.g. cp1252 on Windows)
    # can fail or garble non-ASCII characters in the annotation text.
    with open(json_file, 'r', encoding='utf-8') as file:
        results = json.load(file)["results"]

    for result in results:
        accession = result["primaryAccession"]  # avoids shadowing builtin id()
        if "uniProtKBCrossReferences" in result:
            proteins[accession]["go_terms"] = get_go_terms(result)
        if "comments" in result:
            proteins[accession]["subcell_locations"] = get_subcellular_locations(result)
        if "features" in result:
            proteins[accession]["transmembrane"] = get_transmembrane(result)
        if "keywords" in result:
            proteins[accession]["binding_preference"] = get_binding_preference(result)

def get_go_terms(protein_info):
    """Return ``(GO id, value of the first property)`` for each GO cross-reference.

    Cross-references whose "database" is not "GO" are ignored; the order of
    the remaining ones is kept.
    """
    return [
        (xref["id"], xref["properties"][0]["value"])
        for xref in protein_info["uniProtKBCrossReferences"]
        if xref["database"] == "GO"
    ]

def get_subcellular_locations(protein_info):
    """Collect the location names from every SUBCELLULAR LOCATION comment.

    Args:
        protein_info: Entry dict with a "comments" list; each comment has a
            "commentType".

    Returns:
        List of ``location["location"]["value"]`` strings in document order
        (duplicates are kept).

    A SUBCELLULAR LOCATION comment that carries no "subcellularLocations"
    list contributes nothing instead of raising ``KeyError``.
    """
    subcell_locations = []
    for comment in protein_info["comments"]:
        if comment["commentType"] != "SUBCELLULAR LOCATION":
            continue
        # .get(): tolerate comments of this type that have no location list.
        for location in comment.get("subcellularLocations", []):
            subcell_locations.append(location["location"]["value"])
    return subcell_locations

def get_transmembrane(protein_info):
    """Return 1 if any feature has type "Transmembrane", otherwise 0."""
    # Materialise all types first so every feature is inspected (no early exit).
    feature_types = [item["type"] for item in protein_info["features"]]
    return 1 if "Transmembrane" in feature_types else 0

def get_binding_preference(protein_info):
    """Translate binding-related keyword names into binding-preference labels.

    "DNA-binding" and "RNA-binding" both become "DNA/RNA-binding";
    "Nucleotide-binding" and "Metal-binding" are passed through unchanged;
    every other keyword is ignored. One label is emitted per matching
    keyword, so the result may contain repeats.
    """
    nucleic_acid_names = ("DNA-binding", "RNA-binding")
    passthrough_names = ("Nucleotide-binding", "Metal-binding")

    labels = []
    for entry in protein_info["keywords"]:
        name = entry["name"]
        if name in nucleic_acid_names:
            labels.append("DNA/RNA-binding")
            continue
        if name in passthrough_names:
            labels.append(name)
    return labels

# #--- Functions: Assign Motifs (Prosite) ---
# def assign_motifs():
#     global proteins
#     with open(motif_file, 'r') as file:
#         data = json.load(file)
#     for id in data:
#         proteins[id]['motifs'] = data[id]

#--- Functions: Assign iFeature Data (iFeature) ---
def assign_iFeature_data(ifeature_file, proteins):
    """Merge descriptor values from a JSON file into ``proteins`` in place.

    The file is read as ``{feature_name: {protein_id: value}}``; each value is
    stored at ``proteins[protein_id][feature_name]``. A protein id that is not
    already a key of ``proteins`` raises ``KeyError``.
    """
    with open(ifeature_file, 'r') as handle:
        per_feature = json.load(handle)

    for feature_name in per_feature:
        for accession, descriptor in per_feature[feature_name].items():
            proteins[accession][feature_name] = descriptor

#--- Functions: Assign Protein Names* ---
# *names extracted from EMBL database for test proteins
def assign_names(names_file, proteins):
    """Set ``proteins[id]['name']`` from a tab-separated file, in place.

    Args:
        names_file: Path to a TSV file. The first line is treated as a header
            and skipped; in each remaining row, column 0 is used as the
            protein id and column 2 as the name.
        proteins: Dict keyed by protein id; every id in the file must already
            be present.

    Raises:
        KeyError: If a row's id is not a key of ``proteins``.
        IndexError: If a non-empty row has fewer than three columns.

    Blank lines are ignored, and an empty file is a no-op.
    """
    # newline='' is what the csv module expects so it can handle line endings
    # (including newlines embedded in quoted fields) itself.
    with open(names_file, 'r', newline='') as file:
        tsv_reader = csv.reader(file, delimiter='\t')
        next(tsv_reader, None)  # skip header; default avoids StopIteration on an empty file
        # csv.reader yields [] for a blank line (e.g. a trailing newline).
        rows = [row for row in tsv_reader if row]

    for row in rows:
        proteins[row[0]]['name'] = row[2]

#--- Functions: Generate JSON Based on Dictionary
def dict_to_json(dict, file_path):
    """Write ``dict`` to ``file_path`` as JSON indented by four spaces.

    The target file is opened in write mode, so existing content is replaced.
    """
    with open(file_path, 'w') as out_stream:
        json.dump(obj=dict, fp=out_stream, indent=4)

Test Proteins

In [None]:
#--- Variables: Protein Dictionary ---
# Maps protein id -> dict of sequence-derived properties plus placeholder
# fields that the assign_* functions fill in afterwards.
test_proteins = {}

#--- Main: Generate Protein Dictionary ---
for record in SeqIO.parse(test_fasta_file, "fasta"):
    # The protein id is the second "|"-separated field of the FASTA record id
    # (presumably UniProt-style "db|ACCESSION|ENTRY" headers -- confirm).
    protein_id = record.id.split("|")[1].strip()
    seq = record.seq
    a_seq = ProteinAnalysis(seq)

    protein = {
        "seq": str(seq),
        "name": "",
        "length": len(record),
        # Physicochemical properties computed by Biopython's ProteinAnalysis
        "m_weight": a_seq.molecular_weight(),
        "instab_index": a_seq.instability_index(),
        "isoele_point": a_seq.isoelectric_point(),
        "gravy": a_seq.gravy(),
        "amino_count": a_seq.count_amino_acids(),
        "aromaticity": a_seq.aromaticity(),
        "flexibility": a_seq.flexibility(),
        "sec_sruct_frac": a_seq.secondary_structure_fraction(),
        "ext_coeff": a_seq.molar_extinction_coefficient(),
        # Placeholders; key order is kept identical to the train-protein
        # dictionary (ctdc, ctdd, ctdt) so both output files list the
        # descriptors in the same order.
        "go_terms": [],
        #"motifs": [],
        "dpc": [],
        # "tpc": [],
        "paac": [],
        "ctdc": [],
        "ctdd": [],
        "ctdt": [],
        "ctriad": [],
        "gaac": [],
        "moran": [],
        "subcell_locations": [],
        "transmembrane": 0,
        "binding_preference": [],
    }
    test_proteins[protein_id] = protein

In [None]:
# Fill the placeholder fields of test_proteins in place, one source at a time.
# Each call indexes test_proteins by protein id, so the dictionary must already
# contain every id that appears in the corresponding file.
assign_annotations(test_json_file, test_proteins)
#assign_motifs()
assign_iFeature_data(test_ifeature_file, test_proteins)
assign_names(test_names_file, test_proteins)

In [None]:
# Write the test-protein dictionary to test_output_file as indented JSON.
dict_to_json(test_proteins, test_output_file)

Train Proteins

In [8]:
#--- Variables: Protein Dictionary ---
# Protein id -> sequence-derived properties plus empty annotation slots.
train_proteins = {}

#--- Main: Generate Protein Dictionary ---
for fasta_record in SeqIO.parse(train_protein_file, "fasta"):
    # Second "|"-separated field of the record id is used as the dictionary key.
    protein_id = fasta_record.id.split("|")[1].strip()
    sequence = fasta_record.seq
    analysis = ProteinAnalysis(sequence)

    train_proteins[protein_id] = {
        # Identity
        "seq": str(sequence),
        "name": "",
        "length": len(fasta_record),
        # Properties computed from the sequence
        "m_weight": analysis.molecular_weight(),
        "instab_index": analysis.instability_index(),
        "isoele_point": analysis.isoelectric_point(),
        "gravy": analysis.gravy(),
        "amino_count": analysis.count_amino_acids(),
        "aromaticity": analysis.aromaticity(),
        "flexibility": analysis.flexibility(),
        "sec_sruct_frac": analysis.secondary_structure_fraction(),
        "ext_coeff": analysis.molar_extinction_coefficient(),
        # Empty slots, populated later by the assign_* functions
        "go_terms": [],
        #"motifs": [],
        "dpc": [],
        # "tpc": [],
        "paac": [],
        "ctdc": [],
        "ctdd": [],
        "ctdt": [],
        "ctriad": [],
        "gaac": [],
        "moran": [],
        "subcell_locations": [],
        "transmembrane": 0,
        "binding_preference": [],
    }

In [9]:
# Fill the placeholder fields of train_proteins in place. Each call indexes
# train_proteins by protein id, so the dictionary must already contain every
# id that appears in the corresponding file.
assign_annotations(train_uniprot_file, train_proteins)
#assign_motifs()
assign_iFeature_data(train_ifeature_file, train_proteins)
# Note: names are not assigned for train proteins, so "name" stays "".

In [10]:
# Write the train-protein dictionary to train_output_file as indented JSON.
dict_to_json(train_proteins, train_output_file)

Deprecated testing

In [None]:
# --- Test: Print Proteins ---
# for protein_id, protein_info in proteins.items():
#     print(f"Protein ID: {protein_id}, Name: {protein_info['name']}")
#     print(f"Protein Sequence: {protein_info['seq']}")
#     print(f"Protein Length: {protein_info['length']}")
#     print(f"Molecular Weight: {protein_info['m_weight']}")
#     print(f"Instability Index: {protein_info['instab_index']}")
#     print(f"Isoelectric Point: {protein_info['isoele_point']}")
#     print(f"Gravy: {protein_info['gravy']}")
#     print(f"Amino Acid Count: {protein_info['amino_count']}")
#     print(f"Aromaticity: {protein_info['aromaticity']}")
#     print(f"Flexibility: {protein_info['flexibility']}")
#     print(f"Secondary Structure Fraction: {protein_info['sec_sruct_frac']}")
#     print(f"Gene Ontology Terms: {protein_info['go_terms']}")
    # print(f"Motifs: {protein_info['motifs']}")
    # print(f"Dipeptide Composition: {protein_info['dpc']}")
    # print(f"Tripeptide Composition: {protein_info['tpc']}")
    # print(f"Pseudo Amino Acid Composition: {protein_info['paac']}")
    # print(f"Composition: {protein_info['ctdc']}")
    # print(f"Distribution: {protein_info['ctdd']}")
    # print(f"Translation: {protein_info['ctdt']}")
    # print(f"Conjoint Triad: {protein_info['ctriad']}")
    # print(f"Grouped Amino Acid Composition: {protein_info['gaac']}")
    # print(f"Moran Autocorrelation: {protein_info['moran']}")
    # print(f"Subcellular Locations: {protein_info['subcell_locations']}")
    # print(f"Transmembrane?: {protein_info['transmembrane']}")
    # print(f"Binding Preference: {protein_info['binding_preference']}")
    # print()