In [94]:
import requests
import os
import pandas as pd
import re


In [95]:
gene_id = "ADA2"

In [96]:
def extract_numbers(s):
    """
    Extracts all numbers from a given string and returns them as a list of integers.

    Parameters:
    s (str): The input string from which to extract numbers.

    Returns:
    list: A list of integers extracted from the string.
    """
    # Use regular expression to find all sequences of digits in the string
    numbers = re.findall(r'\d+', s)
    
    # Convert the extracted sequences to integers
    return [int(num) for num in numbers]

def create_color_dict(df, key_col, value_col):
    """
    Creates a dictionary where the color is set to red if the string contains 'pathogenic' or 'Pathogenic'.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the key and value columns.
    key_col (str): The name of the column to be used as keys in the dictionary.
    value_col (str): The name of the column to be checked for 'pathogenic' or 'Pathogenic'.

    Returns:
    dict: A dictionary with keys from key_col and values as 'red' if the value_col contains 'pathogenic' or 'Pathogenic'.
    """
    color_dict = {
    row[key_col]: 'red' if 'pathogenic' in row[value_col].lower() else 
    'green' if 'benign' in row[value_col].lower() else None
    for _, row in df.iterrows()
    }
    return color_dict

def generate_pymol_script(df, position_col, color_col, output_file):
    """
    Generates a PyMOL script to color-code amino acid positions based on a key.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing amino acid positions and color keys.
    position_col (str): The name of the column containing amino acid positions.
    color_col (str): The name of the column containing color keys.
    output_file (str): The path to the output PyMOL script file.
    """
    with open(output_file, 'w') as f:
        f.write("from pymol import cmd\n\n")
        
        for index, row in df.iterrows():
            position = row[position_col]
            color = row[color_col]
            f.write(f"cmd.color('{color}', 'resi {position}')\n")
        
        f.write("\ncmd.show('cartoon')\n")
        f.write("cmd.bg_color('white')\n")
        f.write("cmd.zoom()\n")


In [97]:
# url = "http://v1.marrvel.org/data/clinvar"
# req = requests.get(url, params = {"geneSymbol": gene_id})
# df = pd.read_json(req.text)

# filtered_df = df['title'].str.extract(r'\(([^)]*p\.[^)]*)\)')

# significance_description = []
# for i, row in filtered_df.iterrows():
#         desc = filtered_df["significance"][i]["description"]
#         significance_description.append(desc)

# alleles_to_map = pd.DataFrame()
# alleles_to_map["uid"] = filtered_df["uid"]
# alleles_to_map["title"] = filtered_df["title"]
# alleles_to_map["significance"] = significance_description
# alleles_to_map['protein_change'] = filtered_df['title'].str.extract(r'\(([^)]*p\.[^)]*)\)')
# alleles_to_map['protein_change'] = alleles_to_map['protein_change'].str.replace('p.', '', regex=False)

# amino_acid_position = []
# for i in alleles_to_map["protein_change"].to_list():
#     position = extract_numbers(i)
#     print(position)
    
# display(alleles_to_map)

In [98]:
# import requests
# import os
# import pandas as pd

# gene_id = "ADA2"

# url = "http://v1.marrvel.org/data/clinvar"
# req = requests.get(url, params = {"geneSymbol": gene_id})
# df = pd.read_json(req.text)

# # Filter the DataFrame to include only rows where the title contains "(p."
# filtered_df = df[df['title'].str.contains(r'\(p\.', na=False)]

# # Reset the index without keeping the old index column
# filtered_df = filtered_df.reset_index(drop=True)

# print(filtered_df)

         uid                                           title  \
0    1285515         NM_001282225.2(ADA2):c.2T>A (p.Met1Lys)   
1    1285514       NM_001282225.2(ADA2):c.158del (p.Asn53fs)   
2    1184416       NM_001282225.2(ADA2):c.88A>G (p.Ile30Val)   
3    1171012    NM_001282225.2(ADA2):c.1225C>T (p.Pro409Ser)   
4    1171011  NM_001282225.2(ADA2):c.631TTC[1] (p.Phe212del)   
..       ...                                             ...   
172   120303     NM_001282225.2(ADA2):c.506G>A (p.Arg169Gln)   
173   120302     NM_001282225.2(ADA2):c.336C>G (p.His112Gln)   
174   120301      NM_001282225.2(ADA2):c.140G>C (p.Gly47Ala)   
175   120300     NM_001282225.2(ADA2):c.326C>A (p.Ala109Asp)   
176   120299    NM_001282225.2(ADA2):c.1358A>G (p.Tyr453Cys)   

                                  condition  \
0    Polyarteritis nodosa, childhoood-onset   
1    Polyarteritis nodosa, childhoood-onset   
2    Polyarteritis nodosa, childhoood-onset   
3                          Sneddon syndrome

  df = pd.read_json(req.text)


In [99]:
# # import requests
# # import os
# # import pandas as pd

# # gene_id = "ADA2"

# # url = "http://v1.marrvel.org/data/clinvar"
# # req = requests.get(url, params = {"geneSymbol": gene_id})
# # df = pd.read_json(req.text)

# # # Filter the DataFrame to include only rows where there is a set of parentheses containing "p."
# # filtered_df = df[df['title'].str.contains(r'\([^)]*p\.[^)]*\)', na=False)]

# # # Reset the index without keeping the old index column
# # filtered_df = filtered_df.reset_index(drop=True)

# # # Extract the string between parentheses that contains "p."
# # filtered_df['protein_change'] = filtered_df['title'].str.extract(r'\(([^)]*p\.[^)]*)\)')

# # # Remove the "p." prefix from the extracted protein change
# # filtered_df['protein_change'] = filtered_df['protein_change'].str.replace('p.', '', regex=False)

# # display(filtered_df)

# url = "http://v1.marrvel.org/data/clinvar"
# req = requests.get(url, params = {"geneSymbol": gene_id})
# df = pd.read_json(req.text)
# filtered_df = df[df['title'].str.contains("(p.")].reset_index(drop=True)

# # significance_description = []
# # for i, row in filtered_df.iterrows():
# #         desc = filtered_df["significance"][i]["description"]
# #         significance_description.append(desc)

# # alleles_to_map = pd.DataFrame()
# # alleles_to_map["uid"] = filtered_df["uid"]
# # alleles_to_map["title"] = filtered_df["title"]
# # alleles_to_map["significance"] = significance_description
# # # Filter the DataFrame to include only rows where there is a set of parentheses containing "p."
# # alleles_to_map = df[df['title'].str.contains(r'\([^)]*p\.[^)]*\)', na=False)]
# # alleles_to_map['protein_change'] = alleles_to_map['protein_change'].str.replace('p.', '', regex=False)

# # amino_acid_position = []
# # for i in alleles_to_map["protein_change"].to_list():
# #     position = extract_numbers(i)
# #     print(position)
    
# display(filtered_df)

In [100]:
import requests
import os
import pandas as pd

gene_id = "ADA2"

url = "http://v1.marrvel.org/data/clinvar"
req = requests.get(url, params = {"geneSymbol": gene_id})
df = pd.read_json(req.text)

# Filter the DataFrame to include only rows where the title contains "(p."
filtered_df = df[df['title'].str.contains(r'\(p\.', na=False)]

# Reset the index without keeping the old index column
filtered_df = filtered_df.reset_index(drop=True)

# Extract the string between parentheses that contains "p."
filtered_df['protein_change'] = filtered_df['title'].str.extract(r'\(([^)]*p\.[^)]*)\)')

# Remove the "p." prefix from the extracted protein change
filtered_df['protein_change'] = filtered_df['protein_change'].str.replace('p.', '', regex=False)

significance_description = []
for i, row in filtered_df.iterrows():
    desc = filtered_df["significance"][i]["description"]
    significance_description.append(desc)
filtered_df["significance_description"] = significance_description

amino_acid_position = []
for i in filtered_df["protein_change"].to_list():
    position = extract_numbers(i)
    if len(position) > 1:
        raise ValueError("Something went wrong. There hould not be more than one amino acid position")
    amino_acid_position.append(position[0])
filtered_df["amino_acid_position"] = amino_acid_position

# Generate PyMOL script
dict = create_color_dict(filtered_df, 'amino_acid_position', 'significance_description')
filtered_df["color"] = filtered_df["amino_acid_position"].map(dict)
generate_pymol_script(filtered_df, "amino_acid_position", "color", "color_code_script.pml")


  df = pd.read_json(req.text)


In [101]:
filtered_df

Unnamed: 0,uid,title,condition,significance,start,stop,protein_change,significance_description,amino_acid_position,color
0,1285515,NM_001282225.2(ADA2):c.2T>A (p.Met1Lys),"Polyarteritis nodosa, childhoood-onset","{'description': 'Uncertain significance', 'rev...",17690566,17690566,Met1Lys,Uncertain significance,1,
1,1285514,NM_001282225.2(ADA2):c.158del (p.Asn53fs),"Polyarteritis nodosa, childhoood-onset","{'description': 'Likely pathogenic', 'reviewSt...",17690409,17690409,Asn53fs,Likely pathogenic,53,green
2,1184416,NM_001282225.2(ADA2):c.88A>G (p.Ile30Val),"Polyarteritis nodosa, childhoood-onset","{'description': 'Uncertain significance', 'rev...",17690480,17690480,Ile30Val,Uncertain significance,30,
3,1171012,NM_001282225.2(ADA2):c.1225C>T (p.Pro409Ser),Sneddon syndrome,"{'description': 'Pathogenic', 'reviewStatus': ...",17663508,17663508,Pro409Ser,Pathogenic,409,red
4,1171011,NM_001282225.2(ADA2):c.631TTC[1] (p.Phe212del),Sneddon syndrome,"{'description': 'Pathogenic', 'reviewStatus': ...",17684569,17684569,Phe212del,Pathogenic,212,red
...,...,...,...,...,...,...,...,...,...,...
172,120303,NM_001282225.2(ADA2):c.506G>A (p.Arg169Gln),Sneddon syndrome,{'description': 'Pathogenic/Likely pathogenic'...,17687997,17687997,Arg169Gln,Pathogenic/Likely pathogenic,169,red
173,120302,NM_001282225.2(ADA2):c.336C>G (p.His112Gln),"Polyarteritis nodosa, childhoood-onset","{'description': 'Pathogenic', 'reviewStatus': ...",17688167,17688167,His112Gln,Pathogenic,112,red
174,120301,NM_001282225.2(ADA2):c.140G>C (p.Gly47Ala),"Polyarteritis nodosa, childhoood-onset","{'description': 'Pathogenic', 'reviewStatus': ...",17690428,17690428,Gly47Ala,Pathogenic,47,red
175,120300,NM_001282225.2(ADA2):c.326C>A (p.Ala109Asp),"Polyarteritis nodosa, childhoood-onset","{'description': 'Pathogenic', 'reviewStatus': ...",17688177,17688177,Ala109Asp,Pathogenic,109,red
