inputs

In [9]:
# Identifiers for the gene/protein this notebook analyses.
input_gene_id = "ADA2" ## gene symbol (sent as the "geneSymbol" query parameter to MARRVEL), not an Entrez ID
input_entrez_id = 51816 ## Entrez ID handed to the DIOPT ortholog lookup
input_uniprot_id = "Q9NZK5" ## UniProtKB accession inserted into the UniProt REST URL
input_protein_file = "ADA2.txt" ## isoform file; its contents are written ahead of the ortholog FASTA in the combined file

optional inputs

In [10]:
# Organism identifiers passed to oracle_functions.pull_diopt_orthologs.
# NOTE(review): 7227 and 9606 look like NCBI Taxonomy IDs (fly / human) rather
# than Entrez gene IDs, despite the variable names — confirm.
drosophila_entrez_id = 7227
homo_sapiens_entrez_id = 9606

# Paths of the helper scripts that are launched as external commands.
script_folder = "oracle_scripts"
get_protein_info_script = f"{script_folder}/get_protein_info.py"
clustalw_script = f"{script_folder}/clustalw.sh"

# String flag: the ClustalW job is submitted with sbatch only when this is
# exactly "no"; any other value (including "yes") skips the submission.
dry_run = "yes"

imports

In [11]:
import importlib
import io
import os

import pandas as pd
import requests

import oracle_functions

# Re-execute the already-imported oracle_functions module so that edits made to
# its source file take effect without restarting the kernel.
importlib.reload(oracle_functions)

<module 'oracle_functions' from 'l:\\Lab-Rusan\\Jacie\\00.code\\00.git\\oracle\\oracle_functions.py'>

code

orthologs and alignments

In [12]:
## make ortholog output folder
ortholog_and_alignment_output_folder = "ortholog_and_alignments_output"
# os.makedirs(..., exist_ok=True) is portable and silent when the folder is
# already present, so this cell can be re-run without a shell "mkdir" error.
os.makedirs(ortholog_and_alignment_output_folder, exist_ok=True)

## getting and filtering DIOPT orthologs
diopt_results, diopt_file = oracle_functions.pull_diopt_orthologs(
    homo_sapiens_entrez_id,
    drosophila_entrez_id,
    input_entrez_id,
    ortholog_and_alignment_output_folder,
)
filtered_diopt_results, filtered_diopt_file = oracle_functions.filter_diopt_results(
    diopt_results,
    diopt_file,
)

## getting protein info for alignment
diopt_id_list = filtered_diopt_results["entrez_id"].to_list()
if not diopt_id_list:
    # Without any IDs the helper script would be launched with missing arguments.
    raise ValueError("No DIOPT orthologs left after filtering; nothing to fetch or align.")
diopt_id_string = " ".join(map(str, diopt_id_list))
diopt_fasta = "protein_orthologs.fasta"
command = f"python {get_protein_info_script} {diopt_id_string} {ortholog_and_alignment_output_folder}/protein_orthologs.zip {diopt_fasta}"
# Stop here if the helper fails; otherwise the next step could silently read a
# stale FASTA left over from an earlier run.
exit_status = os.system(command)
if exit_status != 0:
    raise RuntimeError(f"Protein info command failed with exit status {exit_status}: {command}")

## combine into one file
combined_file = f"{ortholog_and_alignment_output_folder}/combined_proteins.fasta"
with open(input_protein_file, 'r') as f1:
    data1 = f1.read()
with open(diopt_fasta, 'r') as f2:
    data2 = f2.read()
# Input isoform(s) first, then the orthologs, separated by a newline.
with open(combined_file, 'w') as cf:
    cf.write(data1)
    cf.write("\n")
    cf.write(data2)

# submit clustalw script (skipped unless dry_run is exactly "no")
if dry_run == "no":
    exit_status = os.system(f"sbatch {clustalw_script} -INFILE={combined_file}")
    if exit_status != 0:
        raise RuntimeError(f"sbatch submission of {clustalw_script} failed with exit status {exit_status}")



Found DIOPT orthologs


OSError: Cannot save file into a non-existent directory: 'ortholog_output'

evolution: phylogenetic tree

In [None]:
# Hand the tree file in the output folder to the plotting helper and have it
# write the figure next to it.
# NOTE(review): "msa_test.dnd" is hard-coded — confirm it matches the file the
# ClustalW job actually produces for combined_proteins.fasta.
phylo_tree_image = f"{ortholog_and_alignment_output_folder}/phylo_tree.png"
alignment_file = f"{ortholog_and_alignment_output_folder}/msa_test.dnd"
oracle_functions.visualize_phylogenetic_tree(
    alignment_file,
    output_file=phylo_tree_image,
)

mutation effects

In [None]:
# Fetch the ClinVar records MARRVEL holds for the gene symbol.
url = "http://v1.marrvel.org/data/clinvar"
# A timeout keeps the cell from hanging forever; raise_for_status surfaces HTTP
# errors here instead of as a confusing JSON parse failure further down.
req = requests.get(url, params={"geneSymbol": input_gene_id}, timeout=60)
req.raise_for_status()
# Passing a literal JSON string to read_json is deprecated; wrap it in StringIO.
df = pd.read_json(io.StringIO(req.text))

# Filter the DataFrame to include only rows where the title contains "(p."
filtered_df = df[df['title'].str.contains(r'\(p\.', na=False)].reset_index(drop=True)

# Extract the string between parentheses that contains "p."
filtered_df['protein_change'] = filtered_df['title'].str.extract(r'\(([^)]*p\.[^)]*)\)')

# Remove the "p." prefix from the extracted protein change
filtered_df['protein_change'] = filtered_df['protein_change'].str.replace('p.', '', regex=False)

# Each "significance" cell is a mapping; keep only its "description" entry.
significance_description = [
    significance["description"] for significance in filtered_df["significance"]
]
filtered_df["significance_description"] = significance_description

# Every protein change must yield exactly one residue number. Zero would
# otherwise surface as a bare IndexError, and more than one is ambiguous.
amino_acid_position = []
for protein_change in filtered_df["protein_change"].to_list():
    position = oracle_functions.extract_numbers(protein_change)
    if len(position) != 1:
        raise ValueError(
            f"Expected exactly one amino acid position in protein change "
            f"{protein_change!r}, found {len(position)}"
        )
    amino_acid_position.append(position[0])
filtered_df["amino_acid_position"] = amino_acid_position

# Generate PyMOL script
# The mapping is looked up by amino acid position; it is deliberately not named
# `dict`, which would shadow the builtin for the rest of the kernel session.
position_to_color = oracle_functions.create_color_dict(filtered_df, 'amino_acid_position', 'significance_description')
filtered_df["color"] = filtered_df["amino_acid_position"].map(position_to_color)
# NOTE(review): the callee name was garbled in the source ("generate_pymol_script_all  eles");
# reconstructed as generate_pymol_script_alleles — confirm against oracle_functions.
oracle_functions.generate_pymol_script_alleles(filtered_df, "amino_acid_position", "color", "color_alleles.pml")


In [None]:
# Download the UniProtKB entry for the protein and keep only its active-site
# and binding-site features.
url = f"https://rest.uniprot.org/uniprotkb/{input_uniprot_id}"
# A timeout keeps the cell from hanging forever; raise_for_status surfaces HTTP
# errors (e.g. an unknown accession) before the JSON body is inspected.
req = requests.get(url, timeout=60)
req.raise_for_status()
data = req.json()

site_feature_types = {"Active site", "Binding site"}
temp_list = [
    feature for feature in data["features"]
    if feature["type"] in site_feature_types
]
# NOTE(review): `df` is not used in this cell and replaces the ClinVar frame of
# the same name created in the mutation-effects cell; kept only in case other
# cells rely on it — consider removing or renaming.
df = pd.DataFrame(temp_list)
# json_normalize flattens the nested feature dicts into columns.
df_flattened = pd.json_normalize(temp_list)
oracle_functions.generate_pymol_script_domains(df_flattened)