<a href="https://colab.research.google.com/github/kattens/PubChem-Data-Handler/blob/main/Visualization_and_Alignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

in this notebook we will use py3mol package to get access to the protein sequences and 3d structure, we will also do alignment of the structure of 2 proteins as well to check the similarities between them.

In [None]:
!pip install biopython
!pip install py3Dmol

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Collecting py3Dmol
  Downloading py3Dmol-2.4.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading py3Dmol-2.4.2-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: py3Dmol
Successfully installed py3Dmol-2.4.2


In [None]:
from Bio.PDB import PDBParser,PPBuilder, PDBList,Superimposer, PDBIO, Select
import py3Dmol
import requests
import json

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.3 MB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.8/3.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.3/3.3 MB[0m [31m34.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Collecting py3Dmol
  Downloading py3Dmol-2.4.2-py2.py3-none-any.whl.met

In [None]:
def get_pdb_info_from_uniprot(uniprot_id):
    """
    Retrieves PDB IDs associated with a UniProt ID from the UniProt API,
    and then retrieves specific information (resolution, length, etc.) about
    each PDB entry using the RCSB PDB API, formatting the output as a dictionary
    with the structure you specified.

    Args:
        uniprot_id (str): The UniProt ID (e.g., P0DTC2).

    Returns:
        dict: A dictionary with PDB IDs as keys and dictionaries of structured
              information as values, formatted as you requested.
              Returns an empty dictionary if no PDB IDs are found or if there's an error.
    """

    # Step 1: Get PDB IDs associated with the UniProt ID from UniProt API
    uniprot_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}?format=json"

    try:
        response = requests.get(uniprot_url)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        data = response.json()

        pdb_ids = []
        for dbReference in data.get('uniProtKBCrossReferences', []):
            if dbReference.get('database') == 'PDB':
                pdb_ids.append(dbReference['id'])

        if not pdb_ids:
            print(f"No PDB entries found for UniProt ID: {uniprot_id}")
            return {}

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from UniProt for {uniprot_id}: {e}")
        return {}
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from UniProt for {uniprot_id}: {e}")
        return {}
    except KeyError as e:
        print(f"Error parsing UniProt JSON for {uniprot_id}: Missing key: {e}")
        return {}


    # Step 2: Get information about each PDB entry from RCSB PDB API
    pdb_info = {}
    for pdb_id in pdb_ids:
        pdb_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"  # Use RCSB API for PDB details

        try:
            pdb_response = requests.get(pdb_url)
            pdb_response.raise_for_status() # Raise HTTPError for bad responses
            pdb_data = pdb_response.json()


            # Initialize the dictionary for this PDB ID
            pdb_info[pdb_id] = {
                "method": None,
                "resolution": None,
                "chains": []  # Initialize the chains list
            }

            # Extract method
            if 'exptl' in pdb_data and pdb_data['exptl']:
                pdb_info[pdb_id]["method"] = pdb_data['exptl'][0]['method']

            # Extract resolution
            if 'rcsb_entry_info' in pdb_data and 'resolution_combined' in pdb_data['rcsb_entry_info']:
                pdb_info[pdb_id]["resolution"] = pdb_data['rcsb_entry_info']['resolution_combined'][0]

            # Extract chain information and sequence range
            chain_id = None
            seq_start = None
            seq_end = None

            if 'rcsb_entry_container_identifiers' in pdb_data:
               if pdb_data['rcsb_entry_container_identifiers']['entity_ids']:
                    entity_id = pdb_data['rcsb_entry_container_identifiers']['entity_ids'][0]

                    #Extract chain ID
                    if 'rcsb_entry_container_identifiers' in pdb_data:
                       if pdb_data['rcsb_entry_container_identifiers']['assembly_ids']:
                           assembly_id = pdb_data['rcsb_entry_container_identifiers']['assembly_ids'][0]  #default the first one

                           for assembly in pdb_data.get("assemblies", []):
                                if assembly['assembly_id'] == assembly_id:
                                    for chain in assembly.get("rcsb_assembly_container_identifiers", {}).get("auth_asym_ids",[]):
                                       chain_id = chain   #default the first one

                    # Get the range of the protein
                    for struct_ref in pdb_data.get("struct_ref", []):
                        for seq_range in struct_ref.get("seq_range", []):
                            seq_start = seq_range["beg_seq_id"]
                            seq_end = seq_range["end_seq_id"]

                    # Add the chain information to the chains list
                    if chain_id is not None and seq_start is not None and seq_end is not None:
                        pdb_info[pdb_id]["chains"].append({
                            "chain": chain_id,
                            "start": seq_start,
                            "finish": seq_end
                        })


        except requests.exceptions.RequestException as e:
            print(f"Error fetching data from RCSB PDB for {pdb_id}: {e}")
            pdb_info[pdb_id] = {"error": f"Error: {e}"} # Store error information
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from RCSB PDB for {pdb_id}: {e}")
            pdb_info[pdb_id] = {"error": f"JSON decode error: {e}"}
        except Exception as e:
            print(f"Unexpected error processing PDB ID {pdb_id}: {e}")
            pdb_info[pdb_id] = {"error": f"Unexpected error: {e}"}


    return pdb_info


# Example usage:
uniprot_id = "P00533"  # Example -> this is the example that was working very fine
pdb_data = get_pdb_info_from_uniprot(uniprot_id)

if pdb_data:
    print(json.dumps(pdb_data, indent=4))  # Print the structured output
else:
    print("No PDB information found.")

{
    "1IVO": {
        "method": "X-RAY DIFFRACTION",
        "resolution": 3.3,
        "chains": []
    },
    "1M14": {
        "method": "X-RAY DIFFRACTION",
        "resolution": 2.6,
        "chains": []
    },
    "1M17": {
        "method": "X-RAY DIFFRACTION",
        "resolution": 2.6,
        "chains": []
    },
    "1MOX": {
        "method": "X-RAY DIFFRACTION",
        "resolution": 2.5,
        "chains": []
    },
    "1NQL": {
        "method": "X-RAY DIFFRACTION",
        "resolution": 2.8,
        "chains": []
    },
    "1XKK": {
        "method": "X-RAY DIFFRACTION",
        "resolution": 2.4,
        "chains": []
    },
    "1YY9": {
        "method": "X-RAY DIFFRACTION",
        "resolution": 2.605,
        "chains": []
    },
    "1Z9I": {
        "method": "SOLUTION NMR",
        "resolution": null,
        "chains": []
    },
    "2EB2": {
        "method": "X-RAY DIFFRACTION",
        "resolution": 2.5,
        "chains": []
    },
    "2EB3": {
        "metho

In [None]:
# After the get_pdb_info_from_uniprot function and its usage:

def get_top_n_resolution(pdb_data, n=3):
    """
    Finds the top N PDB entries with the best (highest) resolution.

    Args:
        pdb_data (dict): The dictionary returned by get_pdb_info_from_uniprot.
        n (int): The number of top entries to return (default: 3).

    Returns:
        list: A list of tuples, where each tuple contains (PDB ID, resolution).
              Sorted by resolution in ascending order (best resolution first).
    """

    resolution_list = []
    for pdb_id, info in pdb_data.items():
        if "error" not in info and info["resolution"] is not None:
            try:
                resolution = info["resolution"]
                if isinstance(resolution, str):
                    resolution = float(resolution.replace("Å", ""))  # Remove Å and convert to float if it's a string
                elif isinstance(resolution, (int, float)):
                   pass  #It's already a number.
                else:
                    print(f"Warning: Unexpected resolution type for PDB ID: {pdb_id}, skipping.")
                    continue

                resolution_list.append((pdb_id, resolution))
            except ValueError:
                print(f"Warning: Could not convert resolution to float for PDB ID: {pdb_id}, skipping.")

    # Sort by resolution (lower is better)
    resolution_list.sort(key=lambda x: x[1])

    return resolution_list[:n]  # Return the top N


# Get the top 3 PDBs by resolution:
if pdb_data:  # Ensure pdb_data is not empty
    top_3 = get_top_n_resolution(pdb_data, n=3)

    print("\nTop 3 PDBs by Resolution:")
    for pdb_id, resolution in top_3:
        print(f"  {pdb_id}: {resolution:.2f} Å") # Format resolution nicely
else:
    print("No PDB data available to find top resolutions.")


Top 3 PDBs by Resolution:
  8A27: 1.07 Å
  8A2D: 1.11 Å
  5UG9: 1.33 Å


In [None]:
# Example usage:
uniprot_id = "A0A0A7RC34"  # Example: Spike glycoprotein from SARS-CoV-2
pdb_data = get_pdb_info_from_uniprot(uniprot_id)

if pdb_data:
    print(json.dumps(pdb_data, indent=4))  # Print the structured output
else:
    print("No PDB information found.")

No PDB entries found for UniProt ID: A0A0A7RC34
No PDB information found.


In [None]:
!wget -O 8A27.pdb https://files.rcsb.org/download/8A27.pdb
!wget -O 5UG9.pdb https://files.rcsb.org/download/5UG9.pdb

--2025-03-04 15:15:14--  https://files.rcsb.org/download/8A27.pdb
Resolving files.rcsb.org (files.rcsb.org)... 18.238.176.45, 18.238.176.78, 18.238.176.21, ...
Connecting to files.rcsb.org (files.rcsb.org)|18.238.176.45|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: ‘8A27.pdb’

8A27.pdb                [ <=>                ] 482.92K  2.75MB/s    in 0.2s    

2025-03-04 15:15:15 (2.75 MB/s) - ‘8A27.pdb’ saved [494505]

--2025-03-04 15:15:15--  https://files.rcsb.org/download/5UG9.pdb
Resolving files.rcsb.org (files.rcsb.org)... 18.238.176.45, 18.238.176.78, 18.238.176.21, ...
Connecting to files.rcsb.org (files.rcsb.org)|18.238.176.45|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: ‘5UG9.pdb’

5UG9.pdb                [ <=>                ] 255.10K  --.-KB/s    in 0.1s    

2025-03-04 15:15:16 (2.27 MB/s) - ‘5UG9.pdb’ saved [261225]



In [None]:
import py3Dmol

pdb_id = "5UG9"  # same example as part 2
view = py3Dmol.view(query=f'pdb:{pdb_id}', options={'doAssembly': True})
view.setStyle({'cartoon': {'color': 'spectrum'}})
view.addStyle({'model': -1}, {'stick': {}})
view.zoomTo()
view.show()


In [None]:
'''
def search_pdb_for_plasmodium_falciparum():
    """
    Searches the RCSB PDB for structures related to Plasmodium falciparum using
    a keyword search. This approach avoids relying on taxonomy IDs, which may
    have annotation issues.
    """

    search_term = "Plasmodium falciparum"
    pdb_search_url = f"https://search.rcsb.org/graphql?query={{search(terms:\"{search_term}\",return_type:entry)}}"

    try:
        response = requests.get(pdb_search_url)
        response.raise_for_status()  # Raise HTTPError for bad responses
        search_data = response.json()

        if not search_data or not search_data.get("data") or not search_data["data"].get("search"):
            print("No search results found for Plasmodium falciparum.")
            return {} # Return an empty dict instead of list

        pdb_ids = search_data["data"]["search"]
        if not pdb_ids:
             print("No PDB IDs found for Plasmodium falciparum.")
             return {} # Return an empty dict instead of list

        print (f"Found those pdb {pdb_ids}")

        # Fetch details for each PDB ID
        pdb_details = {}
        for pdb_id in pdb_ids:
             pdb_details[pdb_id] = get_pdb_details(pdb_id) #call function to return pdb details
        return pdb_details


    except requests.exceptions.RequestException as e:
        print(f"Error during RCSB PDB search: {e}")
        return {} # Return an empty dict instead of list
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from RCSB PDB search: {e}")
        return {} # Return an empty dict instead of list
    except KeyError as e:
        print(f"Error parsing JSON from RCSB PDB search: {e}")
        return {} # Return an empty dict instead of list

def get_pdb_details(pdb_id):

    """
    Retrieves details for a given PDB ID from the RCSB PDB API.

    Args:
        pdb_id (str): The PDB ID (e.g., "1IVO").

    Returns:
        dict: A dictionary containing details about the PDB entry,
              or an empty dictionary if there was an error.
    """

    pdb_url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"  # Use RCSB API for PDB details

    try:
            pdb_response = requests.get(pdb_url)
            pdb_response.raise_for_status() # Raise HTTPError for bad responses
            pdb_data = pdb_response.json()


            extracted_info = {
                "pdb_id": pdb_id,
                "resolution": None, # Initialize to None in case it's missing
                "length": None,
                "method": None,
                "chain_id": None,
                "entity_id": None,
                "seq_start": None,
                "seq_end": None,
                # Add more fields as needed. Find fields by inspecting the JSON
                # response from the RCSB PDB API (printed in previous example)
            }


            # Extract resolution
            if 'rcsb_entry_info' in pdb_data and 'resolution_combined' in pdb_data['rcsb_entry_info']:
                extracted_info["resolution"] = pdb_data['rcsb_entry_info']['resolution_combined'][0] # Taking the first resolution if there are multiple


            # Extract method
            if 'exptl' in pdb_data and pdb_data['exptl']:
                extracted_info['method'] = pdb_data['exptl'][0]['method']


            # Now we need to access the correct entity and chain
            if 'rcsb_entry_container_identifiers' in pdb_data:
               if pdb_data['rcsb_entry_container_identifiers']['entity_ids']:
                entity_id = pdb_data['rcsb_entry_container_identifiers']['entity_ids'][0]  #default the first one
                extracted_info['entity_id'] = entity_id

                # Extract chain ID and length associated with entity ID
                for entity in pdb_data.get("entities", []):
                    if entity['entity_id'] == entity_id:
                        extracted_info['length'] = entity['rcsb_entity_info']['entity_length']

                # Extract chain ID
                if 'rcsb_entry_container_identifiers' in pdb_data:
                   if pdb_data['rcsb_entry_container_identifiers']['assembly_ids']:
                       assembly_id = pdb_data['rcsb_entry_container_identifiers']['assembly_ids'][0]  #default the first one

                       for assembly in pdb_data.get("assemblies", []):
                            if assembly['assembly_id'] == assembly_id:
                                for chain in assembly.get("rcsb_assembly_container_identifiers", {}).get("auth_asym_ids",[]):
                                   extracted_info['chain_id'] = chain   #default the first one

                # Get the range of the protein
                for struct_ref in pdb_data.get("struct_ref", []):
                    for seq_range in struct_ref.get("seq_range", []):
                         extracted_info['seq_start'] = seq_range["beg_seq_id"]
                         extracted_info['seq_end'] = seq_range["end_seq_id"]

            return extracted_info
    except requests.exceptions.RequestException as e:
            print(f"Error fetching data from RCSB PDB for {pdb_id}: {e}")
            return {}
    except json.JSONDecodeError as e:
            print(f"Error decoding JSON from RCSB PDB for {pdb_id}: {e}")
            return {}
    except Exception as e:
            print(f"Unexpected error processing PDB ID {pdb_id}: {e}")
            return {}


def search_alphafold_for_plasmodium_falciparum():
    """
    Attempts to search the AlphaFold database for Plasmodium falciparum structures.
    This is a simplified approach and may not be reliable as direct programmatic
    access to the AlphaFold database is limited. It relies on precomputed data
    available from RCSB PDB, which includes AlphaFold predictions.
    """

    search_term = "Plasmodium falciparum AND experimental:false"  # Focus on AlphaFold
    pdb_search_url = f"https://search.rcsb.org/graphql?query={{search(terms:\"{search_term}\",return_type:entry)}}"

    try:
        response = requests.get(pdb_search_url)
        response.raise_for_status()
        search_data = response.json()

        if not search_data or not search_data.get("data") or not search_data["data"].get("search"):
            print("No AlphaFold results found for Plasmodium falciparum.")
            return {}  #Empty dict

        pdb_ids = search_data["data"]["search"]
        if not pdb_ids:
             print("No AlphaFold IDs found for Plasmodium falciparum.")
             return {}   #Empty dict


        print (f"Found those pdb {pdb_ids}")

        # Fetch details for each PDB ID
        pdb_details = {}
        for pdb_id in pdb_ids:
             pdb_details[pdb_id] = get_pdb_details(pdb_id) #call function to return pdb details
        return pdb_details



    except requests.exceptions.RequestException as e:
        print(f"Error during RCSB PDB AlphaFold search: {e}")
        return {} # Empty dict
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from RCSB PDB AlphaFold search: {e}")
        return {}  # Empty dict
    except KeyError as e:
        print(f"Error parsing JSON from RCSB PDB AlphaFold search: {e}")
        return {}  # Empty dict


# Main Execution
print("Searching RCSB PDB for experimentally determined structures:")
pdb_results = search_pdb_for_plasmodium_falciparum()

if pdb_results:
    print("PDB Entries related to Plasmodium falciparum:")
    print(json.dumps(pdb_results, indent=4))
else:
    print("No PDB entries found for Plasmodium falciparum using RCSB PDB search.")

print("\nSearching RCSB PDB for AlphaFold predicted structures:")
alphafold_results = search_alphafold_for_plasmodium_falciparum()

if alphafold_results:
    print("AlphaFold Entries related to Plasmodium falciparum:")
    print(json.dumps(alphafold_results, indent=4))
else:
    print("No AlphaFold entries found for Plasmodium falciparum using RCSB PDB search.")

'''

Searching RCSB PDB for experimentally determined structures:
Error during RCSB PDB search: 404 Client Error:  for url: https://search.rcsb.org/graphql?query=%7Bsearch(terms:%22Plasmodium%20falciparum%22,return_type:entry)%7D
No PDB entries found for Plasmodium falciparum using RCSB PDB search.

Searching RCSB PDB for AlphaFold predicted structures:
Error during RCSB PDB AlphaFold search: 404 Client Error:  for url: https://search.rcsb.org/graphql?query=%7Bsearch(terms:%22Plasmodium%20falciparum%20AND%20experimental:false%22,return_type:entry)%7D
No AlphaFold entries found for Plasmodium falciparum using RCSB PDB search.


In [8]:
#from the result of blast with P00533

path_to_5833 = '/content/drive/MyDrive/Blast Target Proteins/Plasmodium.pdb'
#https://www.uniprot.org/uniprotkb/A0A0L1IDW2/entry#structure
path_to_target_1 = '/content/drive/MyDrive/Blast Target Proteins/TKL_protein_kinase.pdb'
#https://www.uniprot.org/uniprotkb/A0A024VJH5/entry#structure
path_to_target_2 = '/content/drive/MyDrive/Blast Target Proteins/CAMKCDPK_protein_kinase.pdb'
#https://www.uniprot.org/uniprotkb/Q8ILL6/entry#structure
path_to_target_3 = '/content/drive/MyDrive/Blast Target Proteins/Calcium-dependent protein_kinase.pdb'

In [11]:
#from the result of blast with A0A0A0MQF6

#https://www.uniprot.org/uniprotkb/W7J3S3/entry#structure
path_to_target_4 ='/content/drive/MyDrive/Blast Target Proteins/Glyceraldehyde-3-phosphate_dehydrogenase.pdb'

In [12]:
import numpy as np
import Bio.PDB
import py3Dmol
from termcolor import colored

# Specify downloaded PDB files
pdb_file1 = path_to_target_4
pdb_file2 = path_to_5833

# Load the structures
parser = Bio.PDB.PDBParser(QUIET=True)
structure1 = parser.get_structure("Protein1", pdb_file1)
structure2 = parser.get_structure("Protein2", pdb_file2)

# Select chains and get only matching residues
model1 = structure1[0]
model2 = structure2[0]

chain1 = list(model1.get_chains())[0]
chain2 = list(model2.get_chains())[0]

atoms1 = []
atoms2 = []
residue_pairs = []
distances = []

# Collect atoms and calculate distances
for res1, res2 in zip(chain1.get_residues(), chain2.get_residues()):
    if res1.has_id('CA') and res2.has_id('CA'):
        atom1 = res1['CA']
        atom2 = res2['CA']
        atoms1.append(atom1)
        atoms2.append(atom2)
        residue_pairs.append((res1, res2))
        distances.append(atom1 - atom2)  # Calculate distance between C-alpha atoms

# Check if atoms were found before aligning
if not atoms1 or not atoms2:
    raise ValueError("No matching alpha carbon atoms (CA) found in the provided chains.")

# Perform the alignment
super_imposer = Bio.PDB.Superimposer()
super_imposer.set_atoms(atoms1, atoms2)
super_imposer.apply(structure2.get_atoms())

# Prepare PDB strings for visualization
io = Bio.PDB.PDBIO()
io.set_structure(structure1)
with open("aligned1.pdb", "w") as f:
    io.save(f)

io.set_structure(structure2)
with open("aligned2.pdb", "w") as f:
    io.save(f)

# Print a clear alignment summary with aligned regions
print("\nAlignment Summary (Residue1 - Residue2 : Distance)")

min_dist = min(distances)
max_dist = max(distances)
alignment_regions = []
current_region = None

for (res1, res2), distance in zip(residue_pairs, distances):
    res1_info = f"{res1.get_resname()} {res1.get_id()[1]}"
    res2_info = f"{res2.get_resname()} {res2.get_id()[1]}"

    # Determine alignment quality
    if distance < (max_dist * 0.25):
        color = 'green'
        alignment_quality = "Well aligned"
    elif distance < (max_dist * 0.5):
        color = 'yellow'
        alignment_quality = "Moderately aligned"
    elif distance < (max_dist * 0.75):
        color = 'magenta'
        alignment_quality = "Weakly aligned"
    else:
        color = 'red'
        alignment_quality = "Poorly aligned"

    # Print detailed alignment info
    print(colored(f"{res1_info} - {res2_info} : {distance:.2f} ({alignment_quality})", color))

    # Detect and store aligned regions
    if color in ['green', 'yellow']:  # Only track good alignment regions
        if not current_region:
            current_region = [res1_info, res2_info]
        else:
            current_region[1] = res2_info
    else:
        if current_region:
            alignment_regions.append(current_region)
            current_region = None

if current_region:
    alignment_regions.append(current_region)

# Display the aligned regions
print("\nAligned Regions (Well/Moderately Aligned):")
for start, end in alignment_regions:
    print(f"From {start} to {end}")

# Visualize the alignment using py3Dmol with color based on alignment quality
view = py3Dmol.view(width=800, height=600)

# Add the first model (reference in blue)
with open("aligned1.pdb", "r") as f:
    view.addModel(f.read(), "pdb")
    view.setStyle({'model': 0}, {'cartoon': {'color': 'blue'}})

# Add the second model (aligned in gradient colors)
with open("aligned2.pdb", "r") as f:
    view.addModel(f.read(), "pdb")

# Map distances to colors for visualization
colors = [f"rgb({int(255 * (d - min_dist) / (max_dist - min_dist))}, {int(255 * (1 - (d - min_dist) / (max_dist - min_dist)))}, 0)" for d in distances]

for i, color in enumerate(colors):
    view.setStyle({'model': 1, 'resi': i + 1}, {'cartoon': {'color': color}})

view.zoomTo()
view.show()



Alignment Summary (Residue1 - Residue2 : Distance)
MET 1 - LEU 1 : 37.83 (Poorly aligned)
ALA 2 - PRO 2 : 39.48 (Poorly aligned)
VAL 3 - PHE 3 : 34.71 (Poorly aligned)
THR 4 - PRO 4 : 30.38 (Weakly aligned)
LYS 5 - LEU 5 : 28.00 (Weakly aligned)
LEU 6 - VAL 6 : 24.18 (Weakly aligned)
GLY 7 - PHE 7 : 20.59 (Moderately aligned)
ILE 8 - CYS 8 : 16.05 (Moderately aligned)
ASN 9 - ILE 9 : 11.86 (Moderately aligned)
GLY 10 - GLY 10 : 5.75 (Well aligned)
PHE 11 - GLY 11 : 8.88 (Well aligned)
GLY 12 - PHE 12 : 9.49 (Well aligned)
ARG 13 - ASP 13 : 13.24 (Moderately aligned)
ILE 14 - GLY 14 : 17.35 (Moderately aligned)
GLY 15 - VAL 15 : 18.15 (Moderately aligned)
ARG 16 - GLU 16 : 17.25 (Moderately aligned)
LEU 17 - TYR 17 : 16.35 (Moderately aligned)
VAL 18 - LEU 18 : 14.46 (Moderately aligned)
PHE 19 - TYR 19 : 12.13 (Moderately aligned)
ARG 20 - SER 20 : 14.81 (Moderately aligned)
ALA 21 - MET 21 : 15.36 (Moderately aligned)
ALA 22 - GLU 22 : 19.97 (Moderately aligned)
PHE 23 - LEU 23 : 20.

#Color-Coded Alignment Quality:

  - Green: Well aligned
  - Yellow: Moderately aligned
  - Magenta: Weakly aligned
  - Red: Poorly aligned

# **Next Step: Identifying Unique Target Proteins**

- In this phase, our objective is to determine the uniqueness of proteins that are targeted by each drug. By identifying unique target proteins, we can streamline the processing by eliminating redundant entries. Additionally, to track which drug interacts with each protein, we will maintain this information in a dictionary format.

- The procedure involves creating a dictionary where each unique protein serves as a key. The values associated with these keys will be lists containing the names of drugs that target these proteins. While the primary focus remains on the keys representing the proteins, the inclusion of drug names ensures that no pertinent information is lost.

In [13]:
import pandas as pd
import csv
df = pd.read_csv('/content/drive/MyDrive/Human_target_results.csv')

In [15]:
df.head()

Unnamed: 0,PubChem ID,Target Names,Accession IDs,Target Gene Name
0,444810,['Homo'],['W8R5U2'],[]
1,135421339,"['Phosphatidylinositol', 'AKT3', 'Protooncogen...","['O08967', 'P56279', 'Q95M86', 'L8GL10', 'P013...","['AKT3', 'TRIM24', 'PRKAB1', 'MAP2K7', 'CSNK1D..."
2,9939609,"['Homo', 'NTMT1', 'IFNG', 'Chain', 'GNMT', 'GA...","['W8R5U2', 'S4R3J7', 'P01579', 'P0C023', 'Q147...","['NTMT1', 'IFNG', 'GNMT', 'GAMT', 'COMT', 'IP6..."
3,42627755,"['IFNB1', 'NTMT1', 'Chain', 'GNMT', 'GAMT', 'C...","['P01574', 'S4R3J7', 'P0C023', 'Q14749', 'A0A0...","['IFNB1', 'NTMT1', 'GNMT', 'GAMT', 'COMT', 'IP..."
4,53464483,"['NR1I2', 'TRPC6', 'NTMT1', 'Chain', 'GNMT', '...","['Q9CRZ0', 'Q99N78', 'S4R3J7', 'P0C023', 'Q147...","['NR1I2', 'TRPC6', 'NTMT1', 'GNMT', 'TRPC3', '..."


In [23]:
import ast


df = pd.read_csv('/content/drive/MyDrive/Human_target_results.csv')

# Convert the strings of lists into actual lists if necessary
df['Target Names'] = df['Accession IDs'].apply(ast.literal_eval)

# Create a dictionary to hold the proteins as keys and drugs as values
protein_drug_dict = {}

# Iterate through the DataFrame rows
for index, row in df.iterrows():
    drug = row['PubChem ID']
    protein_targets = row['Target Names']

    for protein in protein_targets:
        if protein not in protein_drug_dict:
            protein_drug_dict[protein] = [drug]
        else:
            if drug not in protein_drug_dict[protein]:
                protein_drug_dict[protein].append(drug)

# Display the dictionary
for protein, drugs in protein_drug_dict.items():
    print(f"Protein: {protein}, Drugs: {drugs}")


Protein: W8R5U2, Drugs: [444810, 9939609, 41867, 6918837, 11978790, 5583, 6197, 30323, 441074, 4735, 3034034, 49855250, 154257]
Protein: O08967, Drugs: [135421339, 24756910, 24795070, 11978790, 6445562, 51358113, 16007391, 11511120]
Protein: P56279, Drugs: [135421339, 9549305, 16007391, 3499]
Protein: Q95M86, Drugs: [135421339, 24756910, 24795070, 11978790, 6445562, 16007391, 5330175, 11511120]
Protein: L8GL10, Drugs: [135421339, 24756910, 24795070, 11978790, 6445562, 9549305, 16007391, 11511120]
Protein: P01308, Drugs: [135421339, 24756910, 24795070, 11978790, 6445562, 16007391, 11511120]
Protein: P45452, Drugs: [135421339, 24756910, 24795070, 11978790, 6445562, 16007391, 11511120]
Protein: O15164, Drugs: [135421339, 10219, 9914412, 16007391, 9829836, 11511120, 154257]
Protein: A0A1V9YMQ6, Drugs: [135421339, 10219, 24756910, 51000408, 41867, 24795070, 11978790, 439530, 6445562, 9549305, 6197, 51358113, 16007391, 30323, 11511120, 4735, 3499]
Protein: Q3U5I9, Drugs: [135421339, 9549305,

In [24]:
protein_drug_dict

{'W8R5U2': [444810,
  9939609,
  41867,
  6918837,
  11978790,
  5583,
  6197,
  30323,
  441074,
  4735,
  3034034,
  49855250,
  154257],
 'O08967': [135421339,
  24756910,
  24795070,
  11978790,
  6445562,
  51358113,
  16007391,
  11511120],
 'P56279': [135421339, 9549305, 16007391, 3499],
 'Q95M86': [135421339,
  24756910,
  24795070,
  11978790,
  6445562,
  16007391,
  5330175,
  11511120],
 'L8GL10': [135421339,
  24756910,
  24795070,
  11978790,
  6445562,
  9549305,
  16007391,
  11511120],
 'P01308': [135421339,
  24756910,
  24795070,
  11978790,
  6445562,
  16007391,
  11511120],
 'P45452': [135421339,
  24756910,
  24795070,
  11978790,
  6445562,
  16007391,
  11511120],
 'O15164': [135421339, 10219, 9914412, 16007391, 9829836, 11511120, 154257],
 'A0A1V9YMQ6': [135421339,
  10219,
  24756910,
  51000408,
  41867,
  24795070,
  11978790,
  439530,
  6445562,
  9549305,
  6197,
  51358113,
  16007391,
  30323,
  11511120,
  4735,
  3499],
 'Q3U5I9': [135421339, 9549305

In [25]:
# Sort the dictionary by the number of drugs per protein, in descending order
sorted_protein_drug_dict = sorted(protein_drug_dict.items(), key=lambda item: len(item[1]), reverse=True)

# Extract the top 10 most targeted proteins
top_10_targeted_proteins = sorted_protein_drug_dict[:10]

# Display the top 10 proteins and the count of drugs targeting each
for protein, drugs in top_10_targeted_proteins:
    print(f"Protein: {protein}, Number of Drugs: {len(drugs)}")


Protein: Q9Y294, Number of Drugs: 47
Protein: P0C023, Number of Drugs: 45
Protein: O96028, Number of Drugs: 45
Protein: S4R3J7, Number of Drugs: 44
Protein: P11352, Number of Drugs: 44
Protein: K7EJ20, Number of Drugs: 42
Protein: P07954, Number of Drugs: 41
Protein: H3BU54, Number of Drugs: 39
Protein: D3YY46, Number of Drugs: 39
Protein: P01579, Number of Drugs: 36


In [26]:
#from Q9Y294 first result: A0A024W454
path_to_target_5 = '/content/drive/MyDrive/Blast Target Proteins/Histone_chaperone_ASF1.pdb'

In [28]:
import numpy as np
import Bio.PDB
import py3Dmol
from termcolor import colored

# Specify downloaded PDB files
pdb_file1 = path_to_target_5
pdb_file2 = path_to_5833

# Load the structures
parser = Bio.PDB.PDBParser(QUIET=True)
structure1 = parser.get_structure("Protein1", pdb_file1)
structure2 = parser.get_structure("Protein2", pdb_file2)

# Select chains and get only matching residues
model1 = structure1[0]
model2 = structure2[0]

chain1 = list(model1.get_chains())[0]
chain2 = list(model2.get_chains())[0]

atoms1 = []
atoms2 = []
residue_pairs = []
distances = []

# Collect atoms and calculate distances
for res1, res2 in zip(chain1.get_residues(), chain2.get_residues()):
    if res1.has_id('CA') and res2.has_id('CA'):
        atom1 = res1['CA']
        atom2 = res2['CA']
        atoms1.append(atom1)
        atoms2.append(atom2)
        residue_pairs.append((res1, res2))
        distances.append(atom1 - atom2)  # Calculate distance between C-alpha atoms

# Check if atoms were found before aligning
if not atoms1 or not atoms2:
    raise ValueError("No matching alpha carbon atoms (CA) found in the provided chains.")

# Perform the alignment
super_imposer = Bio.PDB.Superimposer()
super_imposer.set_atoms(atoms1, atoms2)
super_imposer.apply(structure2.get_atoms())

# Prepare PDB strings for visualization
io = Bio.PDB.PDBIO()
io.set_structure(structure1)
with open("aligned1.pdb", "w") as f:
    io.save(f)

io.set_structure(structure2)
with open("aligned2.pdb", "w") as f:
    io.save(f)

# Print a clear alignment summary with aligned regions
print("\nAlignment Summary (Residue1 - Residue2 : Distance)")

min_dist = min(distances)
max_dist = max(distances)
alignment_regions = []
current_region = None

for (res1, res2), distance in zip(residue_pairs, distances):
    res1_info = f"{res1.get_resname()} {res1.get_id()[1]}"
    res2_info = f"{res2.get_resname()} {res2.get_id()[1]}"

    # Determine alignment quality
    if distance < (max_dist * 0.25):
        color = 'green'
        alignment_quality = "Well aligned"
    elif distance < (max_dist * 0.5):
        color = 'yellow'
        alignment_quality = "Moderately aligned"
    elif distance < (max_dist * 0.75):
        color = 'magenta'
        alignment_quality = "Weakly aligned"
    else:
        color = 'red'
        alignment_quality = "Poorly aligned"

    # Print detailed alignment info
    print(colored(f"{res1_info} - {res2_info} : {distance:.2f} ({alignment_quality})", color))

    # Detect and store aligned regions
    if color in ['green', 'yellow']:  # Only track good alignment regions
        if not current_region:
            current_region = [res1_info, res2_info]
        else:
            current_region[1] = res2_info
    else:
        if current_region:
            alignment_regions.append(current_region)
            current_region = None

if current_region:
    alignment_regions.append(current_region)

# Display the aligned regions
print("\nAligned Regions (Well/Moderately Aligned):")
for start, end in alignment_regions:
    print(f"From {start} to {end}")

# Visualize the alignment using py3Dmol with color based on alignment quality
view = py3Dmol.view(width=800, height=600)

# Add the first model (reference in blue)
with open("aligned1.pdb", "r") as f:
    view.addModel(f.read(), "pdb")
    view.setStyle({'model': 0}, {'cartoon': {'color': 'blue'}})

# Add the second model (aligned in gradient colors)
with open("aligned2.pdb", "r") as f:
    view.addModel(f.read(), "pdb")

# Map distances to colors for visualization
colors = [f"rgb({int(255 * (d - min_dist) / (max_dist - min_dist))}, {int(255 * (1 - (d - min_dist) / (max_dist - min_dist)))}, 0)" for d in distances]

for i, color in enumerate(colors):
    view.setStyle({'model': 1, 'resi': i + 1}, {'cartoon': {'color': color}})

view.zoomTo()
view.show()



Alignment Summary (Residue1 - Residue2 : Distance)
MET 1 - LEU 1 : 35.88 (Moderately aligned)
SER 2 - PRO 2 : 31.22 (Moderately aligned)
GLU 3 - PHE 3 : 28.73 (Moderately aligned)
VAL 4 - PRO 4 : 23.60 (Moderately aligned)
ASN 5 - LEU 5 : 21.77 (Well aligned)
VAL 6 - VAL 6 : 16.40 (Well aligned)
THR 7 - PHE 7 : 16.98 (Well aligned)
LYS 8 - CYS 8 : 14.90 (Well aligned)
VAL 9 - ILE 9 : 14.25 (Well aligned)
ILE 10 - GLY 10 : 16.15 (Well aligned)
VAL 11 - GLY 11 : 14.76 (Well aligned)
ASN 12 - PHE 12 : 21.09 (Well aligned)
ASN 13 - ASP 13 : 22.96 (Moderately aligned)
PRO 14 - GLY 14 : 22.38 (Moderately aligned)
ILE 15 - VAL 15 : 22.98 (Moderately aligned)
CYS 16 - GLU 16 : 20.62 (Well aligned)
ASP 17 - TYR 17 : 16.62 (Well aligned)
ILE 18 - LEU 18 : 13.28 (Well aligned)
LEU 19 - TYR 19 : 15.07 (Well aligned)
ASP 20 - SER 20 : 18.22 (Well aligned)
PRO 21 - MET 21 : 18.37 (Well aligned)
PHE 22 - GLU 22 : 18.08 (Well aligned)
VAL 23 - LEU 23 : 18.74 (Well aligned)
PHE 24 - LEU 24 : 18.56 (We