iFeature File Cleaning

In [None]:
import csv
import json
from Bio import SeqIO

#--- iFeature Files [CHANGE BASED ON FILE LOCATION] ---
# Maps each iFeature descriptor name to the TSV file that holds its values.
ifeature_files = dict(
    dpc="data\\train\\ifeature\\dpc.tsv",
    # tpc="data\\tpc.tsv",
    paac="data\\train\\ifeature\\paac.tsv",
    ctdc="data\\train\\ifeature\\ctdc.tsv",
    ctdt="data\\train\\ifeature\\ctdt.tsv",
    ctdd="data\\train\\ifeature\\ctdd.tsv",
    ctriad="data\\train\\ifeature\\ctriad.tsv",
    gaac="data\\train\\ifeature\\gaac.tsv",
    moran="data\\train\\ifeature\\moran.tsv",
)

#--- iFeature output file ---
# Destination of the combined descriptor data, written as JSON.
out_file = "train_iFeature.json"

def nested_dict_to_json(nested_dict, file_path):
    """Serialize ``nested_dict`` to ``file_path`` as JSON with 4-space indentation.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file_path, mode='w') as out_handle:
        json.dump(nested_dict, fp=out_handle, indent=4)

def json_to_nested_dict(file_path):
    """Read the JSON document at ``file_path`` and return the parsed object."""
    with open(file_path, mode='r') as in_handle:
        return json.load(in_handle)

#--- iFeature cleaning ---
def create_dictionary_from_iFeature_tsv(ifeature_file):
    """Parse one iFeature TSV file into ``{protein_id: {header: value}}``.

    The first row is treated as the header; its first cell labels the
    identifier column and is not used. In every following row the first cell
    is the sequence identifier and the remaining cells are paired with the
    header names. Values are kept as the strings read from the file.

    The protein id is the second ``|``-separated field of the identifier
    (stripped). An identifier without ``|`` is used whole, and blank rows
    are skipped, instead of raising ``IndexError``.

    Args:
        ifeature_file: Path to a tab-separated iFeature output file.

    Returns:
        dict mapping protein id to a dict of ``header -> value``. An empty
        file yields an empty dict.
    """
    dictionary = {}
    # The csv module asks for newline='' so that it handles line endings itself.
    with open(ifeature_file, 'r', newline='') as file:
        reader = csv.reader(file, delimiter='\t')
        headers = next(reader, None)
        if headers is None:  # empty file: nothing to parse
            return dictionary
        feature_names = headers[1:]
        for row in reader:
            if not row:  # csv.reader yields [] for a blank line
                continue
            id_fields = row[0].split("|")
            # grabs only protein id; fall back to the whole cell if there is no "|"
            key = (id_fields[1] if len(id_fields) > 1 else row[0]).strip()
            dictionary[key] = dict(zip(feature_names, row[1:]))
    return dictionary


# Parse every configured descriptor file into one nested mapping:
# {descriptor name: {protein id: {feature header: value}}}
combined_dictionary = {
    descriptor: create_dictionary_from_iFeature_tsv(tsv_path)
    for descriptor, tsv_path in ifeature_files.items()
}

# Persist the combined descriptors as a single JSON file
nested_dict_to_json(combined_dictionary, out_file)

Filter proteins based on annotation score and completeness of sequence

In [None]:
import json
from Bio import SeqIO

# Inputs: raw UniProt downloads
json_raw_file = "data\\train\\uniprotkb_taxonomy_id_237_2023_07_09.json"
fasta_raw_file = "data\\train\\uniprotkb_taxonomy_id_237_2023_07_08.fasta"

# Outputs: filtered proteins (JSON entries and matching FASTA records)
json_filtered_file = "train_proteins.json"
fasta_filtered_file = "train_proteins.fasta"

# Load the raw UniProt JSON export and keep the entries stored under "results"
with open(json_raw_file, mode='r') as file:
    data = json.load(file)
results = data["results"]

In [None]:
# Filters out proteins with an annotation score of 1 or 2 (keeps 3 and above).
# The score is compared as a number: looking for the characters "3", "4" or
# "5" in its string form would also accept values such as 1.5 or 13.
anno_results = [
    result
    for result in results
    if float(result["annotationScore"]) >= 3
]

print(len(anno_results))

In [None]:
# Filters out proteins with an incomplete sequence, i.e. a sequence that
# contains the placeholder residue X (either case).
seq_results = [
    result
    for result in anno_results
    if "X" not in str(result["sequence"]["value"]).upper()
]

print(len(seq_results))

In [None]:
# Save the surviving entries under a top-level "results" key, like the input file
with open(json_filtered_file, mode='w') as json_file:
    json.dump({"results": seq_results}, fp=json_file, indent=4)

In [None]:
json_results = seq_results

# Every record from the raw FASTA download
in_fasta_proteins = list(SeqIO.parse(fasta_raw_file, "fasta"))

# Accessions of the proteins that passed the JSON filters
protein_ids = [result["primaryAccession"] for result in json_results]
wanted_ids = set(protein_ids)  # set: constant-time membership per record

# Keep a FASTA record when its accession is one of the wanted ids.
# The accession is the second "|"-separated field of the record id (the same
# parsing used elsewhere in this notebook). Matching it exactly avoids the
# false positives and duplicate records that a substring test
# ("id in protein.id") can produce, and removes the records x ids nested loop.
out_fasta_proteins = []
for protein in in_fasta_proteins:
    id_fields = protein.id.split("|")
    accession = id_fields[1].strip() if len(id_fields) > 1 else protein.id
    if accession in wanted_ids:
        out_fasta_proteins.append(protein)

print(len(out_fasta_proteins))

In [None]:
# Write the retained records to the filtered FASTA file
with open(fasta_filtered_file, mode="w") as output_handle:
    SeqIO.write(out_fasta_proteins, output_handle, "fasta")

Compile protein features

In [None]:
import json, csv
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

#--- Variables: Training File Locations [CHANGE BASED ON FILE LOCATION] ---
# Inputs produced by the earlier steps of this notebook
train_protein_file = "train_proteins.fasta"      # filtered FASTA file
train_uniprot_file = "train_proteins.json"       # filtered UniProt entries
train_ifeature_file = "train_iFeature.json"      # combined iFeature data

# Output: proteins with their compiled features
train_output_file = "train_protein_information.json"

In [None]:
#--- Functions: Assign Annotations (UniProt) ---
def assign_annotations(json_file, proteins):
    """Copy UniProt annotations from ``json_file`` into ``proteins`` in place.

    ``json_file`` must hold a JSON object with a "results" collection. Each
    entry is matched to ``proteins`` by its "primaryAccession". A field is
    only overwritten when the corresponding source key is present:

    * "uniProtKBCrossReferences" -> ``go_terms``
    * "comments"                 -> ``subcell_locations``
    * "features"                 -> ``transmembrane``
    * "keywords"                 -> ``binding_preference``

    Raises ``KeyError`` if an entry with any of those keys has an accession
    that is missing from ``proteins``.
    """
    # Load JSON data from UniProt file
    with open(json_file, 'r') as handle:
        entries = json.load(handle)["results"]

    for entry in entries:
        accession = entry["primaryAccession"]
        if "uniProtKBCrossReferences" in entry:
            proteins[accession]["go_terms"] = get_go_terms(entry)
        if "comments" in entry:
            proteins[accession]["subcell_locations"] = get_subcellular_locations(entry)
        if "features" in entry:
            proteins[accession]["transmembrane"] = get_transmembrane(entry)
        if "keywords" in entry:
            proteins[accession]["binding_preference"] = get_binding_preference(entry)

def get_go_terms(protein_info):
    """Return ``(id, value)`` tuples for the GO cross-references of an entry.

    Scans ``protein_info["uniProtKBCrossReferences"]`` and, for every
    reference whose "database" is "GO", pairs its "id" with the "value" of
    its first property.
    """
    return [
        (xref["id"], xref["properties"][0]["value"])
        for xref in protein_info["uniProtKBCrossReferences"]
        if xref["database"] == "GO"
    ]

def get_subcellular_locations(protein_info):
    """Return the subcellular location names listed in an entry's comments.

    Only comments whose "commentType" is "SUBCELLULAR LOCATION" are used;
    for each of their "subcellularLocations" items the value under
    ``["location"]["value"]`` is collected, in order.

    A matching comment that has no "subcellularLocations" key contributes
    nothing instead of raising ``KeyError`` (presumably such comments carry
    only a free-text note -- confirm against the UniProt export).
    """
    subcell_locations = []
    for comment in protein_info["comments"]:
        if comment["commentType"] != "SUBCELLULAR LOCATION":
            continue
        for location in comment.get("subcellularLocations", []):
            subcell_locations.append(location["location"]["value"])
    return subcell_locations

def get_transmembrane(protein_info):
    """Return 1 if any entry feature has type "Transmembrane", otherwise 0."""
    feature_types = [feature["type"] for feature in protein_info["features"]]
    return 1 if "Transmembrane" in feature_types else 0

def get_binding_preference(protein_info):
    """Return binding-preference labels derived from an entry's keywords.

    Each keyword "name" is mapped as follows, in keyword order and without
    de-duplication:

    * "DNA-binding" or "RNA-binding" -> "DNA/RNA-binding"
    * "Nucleotide-binding"           -> "Nucleotide-binding"
    * "Metal-binding"                -> "Metal-binding"

    Any other keyword is ignored.
    """
    label_for_keyword = {
        "DNA-binding": "DNA/RNA-binding",
        "RNA-binding": "DNA/RNA-binding",
        "Nucleotide-binding": "Nucleotide-binding",
        "Metal-binding": "Metal-binding",
    }
    names = [keyword["name"] for keyword in protein_info["keywords"]]
    return [label_for_keyword[name] for name in names if name in label_for_keyword]


#--- Functions: Assign iFeature Data (iFeature) ---
def assign_iFeature_data(ifeature_file, proteins):
    """Copy iFeature descriptor values from ``ifeature_file`` into ``proteins``.

    The JSON file is expected to be shaped as
    ``{descriptor: {protein_id: value}}``. Each value is stored at
    ``proteins[protein_id][descriptor]``; ``proteins`` is modified in place.
    Raises ``KeyError`` for a protein id that is not in ``proteins``.
    """
    with open(ifeature_file, 'r') as handle:
        descriptors = json.load(handle)
    for descriptor, per_protein in descriptors.items():
        for accession in per_protein:
            proteins[accession][descriptor] = per_protein[accession]

#--- Functions: Assign Protein Names* ---
# *names extracted from EMBL database for test proteins
def assign_names(names_file, proteins):
    """Set ``proteins[id]['name']`` from a tab-separated names file.

    The first row of ``names_file`` is skipped as a header. In each remaining
    row, column 0 is the protein id and column 2 is the name to store.
    ``proteins`` is modified in place; an id missing from it raises
    ``KeyError``.
    """
    with open(names_file, 'r') as handle:
        tsv_reader = csv.reader(handle, delimiter='\t')
        next(tsv_reader)  # discard the header row
        rows = list(tsv_reader)
    # Assign only after the whole file has been read
    for row in rows:
        proteins[row[0]]['name'] = row[2]

#--- Functions: Generate JSON Based on Dictionary
def dict_to_json(dict, file_path):
    """Write ``dict`` to ``file_path`` as JSON with 4-space indentation.

    The target file is opened in write mode and therefore replaced.
    """
    with open(file_path, mode='w') as out_handle:
        json.dump(dict, fp=out_handle, indent=4)

In [None]:
# -- Variables: Protein Dictionary ---
# Maps protein id -> dict of sequence-derived properties plus empty
# annotation/descriptor slots.
train_proteins = {}

#--- Main: Generate Protein Dictionary ---
for record in SeqIO.parse(train_protein_file, "fasta"):
    # The protein id is the second "|"-separated field of the FASTA record id
    protein_id = record.id.split("|")[1].strip()
    seq = record.seq
    a_seq = ProteinAnalysis(seq)

    protein = dict(
        # Sequence and properties computed by ProteinAnalysis
        seq=str(seq),
        name="",
        length=len(record),
        m_weight=a_seq.molecular_weight(),
        instab_index=a_seq.instability_index(),
        isoele_point=a_seq.isoelectric_point(),
        gravy=a_seq.gravy(),
        amino_count=a_seq.count_amino_acids(),
        aromaticity=a_seq.aromaticity(),
        flexibility=a_seq.flexibility(),
        sec_sruct_frac=a_seq.secondary_structure_fraction(),
        ext_coeff=a_seq.molar_extinction_coefficient(),
        # Placeholders: GO terms and iFeature descriptors
        go_terms=[],
        dpc=[],
        # tpc=[],
        paac=[],
        ctdc=[],
        ctdd=[],
        ctdt=[],
        ctriad=[],
        gaac=[],
        moran=[],
        # Placeholders: remaining UniProt annotations
        subcell_locations=[],
        transmembrane=0,
        binding_preference=[],
    )
    train_proteins[protein_id] = protein

In [None]:
# Fill the placeholders in train_proteins: UniProt annotations first,
# then the iFeature descriptors. Both calls update the dictionary in place.
assign_annotations(json_file=train_uniprot_file, proteins=train_proteins)
assign_iFeature_data(ifeature_file=train_ifeature_file, proteins=train_proteins)

In [None]:
# Save the compiled per-protein information as JSON
dict_to_json(train_proteins, file_path=train_output_file)

Filter for proteins with desired GO terms

In [1]:
import json
from Bio import SeqIO

# Compiled protein information and the matching FASTA file
proteins_json_file = "train_protein_information.json"
proteins_fasta_file = "train_proteins.fasta"

# Protein id -> compiled information
with open(proteins_json_file, mode='r') as file:
    in_json_proteins = json.load(file)

# Every FASTA record, kept in file order
in_fasta_proteins = list(SeqIO.parse(proteins_fasta_file, "fasta"))

In [2]:
# GO identifiers of interest: a protein is kept when at least one of its
# GO terms carries one of these ids.
go_list = [
    "GO:0000049",
    "GO:0000287",
    "GO:0003677",
    "GO:0003723",
    "GO:0005506",
    "GO:0005524",
    "GO:0005525",
    "GO:0008270",
    "GO:0016887",
    "GO:0019843",
    "GO:0030170",
    "GO:0046872",
    "GO:0050661",
    "GO:0051287",
    "GO:0051539",
]

# Ids of every protein in the compiled JSON
protein_ids = list(in_json_proteins)

# Set membership instead of scanning go_list for every term
wanted_go_ids = set(go_list)

# Keep a protein when at least one of its GO terms is wanted.
# Each term is an (id, description) pair, so term[0] is the GO id.
out_json_proteins = {
    protein_id: info
    for protein_id, info in in_json_proteins.items()
    if any(term[0] in wanted_go_ids for term in info["go_terms"])
}

# One entry per selected protein. Appending once per matching term produced
# duplicates and made every FASTA lookup a scan of a long list.
selected_ids = list(out_json_proteins)

print(len(out_json_proteins))

# Keep the FASTA records of the selected proteins. The accession is the
# second "|"-separated field of the record id; dict membership is O(1).
out_fasta_proteins = [
    protein
    for protein in in_fasta_proteins
    if protein.id.split("|")[1].strip() in out_json_proteins
]

print(len(out_fasta_proteins))

32822
32822


In [3]:
# Overwrite the compiled JSON with the GO-filtered proteins only
with open(proteins_json_file, mode='w') as json_file:
    json.dump(out_json_proteins, fp=json_file, indent=4)

# Overwrite the FASTA file with the matching records
with open(proteins_fasta_file, mode="w") as output_handle:
    SeqIO.write(out_fasta_proteins, output_handle, "fasta")