<a href="https://colab.research.google.com/github/jyryu3161/DrugDiscovery/blob/main/lec4_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas biopython requests pypdb


Collecting pypdb
  Downloading pypdb-2.4-py3-none-any.whl.metadata (3.0 kB)
Downloading pypdb-2.4-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdb
Successfully installed pypdb-2.4


### Kinase information

http://www.kinhub.org/kinases.html

In [None]:
!git clone https://github.com/jyryu3161/DrugDiscovery.git

Cloning into 'DrugDiscovery'...
remote: Enumerating objects: 108, done.[K
remote: Counting objects: 100% (108/108), done.[K
remote: Compressing objects: 100% (105/105), done.[K
remote: Total 108 (delta 52), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (108/108), 107.73 KiB | 2.03 MiB/s, done.
Resolving deltas: 100% (52/52), done.


DR   PDB; 8VF6; X-ray; 2.70 A; A/B=99-383.
['8VF6; X-ray; 2.70 A; A/B=99-383.']


In [None]:
from Bio import SwissProt
from Bio import PDB
from urllib.request import urlopen
from io import StringIO
import pandas as pd
import tqdm
import time

def get_pdb_ids_from_uniprot(uniprot_id, timeout=30):
    """Return the PDB cross-references listed in a UniProtKB flat-file entry.

    Downloads ``https://rest.uniprot.org/uniprotkb/<uniprot_id>.txt`` and
    collects every ``DR`` (database cross-reference) line whose database
    field is ``PDB``.

    Args:
        uniprot_id: UniProtKB accession inserted into the request URL.
        timeout: Seconds to wait on the server before giving up, so that a
            stalled connection cannot hang the whole run.

    Returns:
        A list of strings, one per PDB cross-reference, each holding the text
        that follows the ``PDB;`` tag, e.g.
        ``'8VF6; X-ray; 2.70 A; A/B=99-383.'``. The list is empty when the
        entry has no PDB cross-reference.

    Raises:
        Any exception raised by ``urlopen`` (for example on an HTTP error or a
        network failure) is propagated to the caller.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.txt"
    # The context manager closes the connection even if reading/decoding fails.
    with urlopen(url, timeout=timeout) as response:
        record_text = response.read().decode("utf-8")
    # Short pause so that calling this function in a loop spaces out requests.
    time.sleep(0.1)

    pdb_ids = []
    for line in record_text.splitlines():
        # Accept only "DR   PDB; ..." lines; a plain substring test would also
        # pick up free-text lines that merely mention "PDB;".
        if line.startswith("DR") and line[2:].lstrip().startswith("PDB;"):
            pdb_ids.append(line.split("PDB;", 1)[1].strip())
    return pdb_ids

if __name__ == "__main__":
    # Annotate every kinase in the table with whether its UniProt entry lists
    # at least one PDB structure, then save the annotated table.
    df = pd.read_csv('./DrugDiscovery/KinaseData.csv')
    structure_info_list = []

    for index, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        uniprot_id = row['UniprotID']
        try:
            pdb_ids = get_pdb_ids_from_uniprot(uniprot_id)
        except Exception as exc:
            # Best effort: a failed lookup is still recorded as "no structure",
            # but it is reported so it can be told apart from a genuine miss.
            # `except Exception` (not a bare except) keeps Ctrl-C working.
            tqdm.tqdm.write(f"UniProt lookup failed for {uniprot_id}: {exc}")
            pdb_ids = []

        # True when at least one PDB cross-reference was found.
        structure_info_list.append(bool(pdb_ids))

    df['Structure Info'] = structure_info_list
    df.to_csv('./KinaseData_StructureInfo.csv', index=False)
    print(df)

100%|██████████| 536/536 [05:52<00:00,  1.52it/s]

        xName Manning Name HGNC Name  \
0        ABL1          ABL      ABL1   
1         ACK          ACK      TNK2   
2       ACTR2        ACTR2    ACVR2A   
3      ACTR2B       ACTR2B    ACVR2B   
4       ADCK4        ADCK4     ADCK4   
..        ...          ...       ...   
531    GTF2F1       GTF2F1       NaN   
532  Col4A3BP     Col4A3BP  COL4A3BP   
533     BLVRA        BLVRA     BLVRA   
534     BAZ1A        BAZ1A     BAZ1A   
535     BAZ1B        BAZ1B     BAZ1B   

                                           Kinase Name     Group    Family  \
0                         Tyrosine-protein kinase ABL1        TK       Abl   
1                             Activated CDC42 kinase 1        TK       Ack   
2                             Activin receptor type-2A       TKL      STKR   
3                             Activin receptor type-2B       TKL      STKR   
4    Uncharacterized aarF domain-containing protein...  Atypical      ABC1   
..                                                 




### Note: To avoid overloading the PubTator3 server, please send no more than three requests per second.

In [None]:
import pandas as pd
import requests
import json
import time
import tqdm

def get_disease_info_from_pubtator(gene, delay=0.34, timeout=30):
    """Look up diseases related to a gene through the PubTator3 relations API.

    Sends one GET request asking for ``stimulate`` relations between
    ``@GENE_<gene>`` and ``Disease`` entities, then turns the ``source`` field
    of every returned relation into a readable disease name.

    Args:
        gene: Gene symbol inserted into the query as ``@GENE_<gene>``. Values
            that are not non-empty strings (for example the NaN pandas yields
            for a missing cell) are not queried.
        delay: Seconds to pause after each request. The default keeps a loop
            of calls at or below three requests per second.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A sorted list of unique disease names with the ``@DISEASE_`` prefix
        removed and underscores replaced by spaces. Empty when ``gene`` is not
        usable or no relation is returned.

    Raises:
        Exceptions from ``requests`` (including the HTTP error raised by
        ``raise_for_status`` for a non-success status) and from JSON decoding
        are propagated to the caller.
    """
    # Skip missing/blank symbols instead of querying a meaningless "@GENE_nan".
    if not isinstance(gene, str) or not gene.strip():
        return []

    # Request URL
    url = "https://www.ncbi.nlm.nih.gov/research/pubtator3-api/relations?e1=@GENE_%s&type=stimulate&e2=Disease"%(gene)
    # Call the API; pause afterwards even on failure so retries stay throttled.
    try:
        response = requests.get(url, timeout=timeout)
    finally:
        time.sleep(delay)
    # Fail loudly on HTTP errors rather than trying to parse an error page.
    response.raise_for_status()

    data = response.json()  # parse the JSON response
    # Extract only the disease name from each relation's 'source' field.
    diseases = set()
    for each_data in data:
        each_disease = each_data['source']
        each_disease = each_disease.replace('@DISEASE_', '')
        each_disease = each_disease.replace('_', ' ')
        diseases.add(each_disease)

    # Sorted so that repeated runs produce the same order.
    return sorted(diseases)

# Annotate the kinases that have no known structure with the diseases PubTator3
# relates to them, save the table, and list every distinct disease found.
df = pd.read_csv('./KinaseData_StructureInfo.csv')
# .copy() yields an independent frame, so adding a column below does not
# raise pandas' SettingWithCopyWarning.
df_wo_structure = df[~df['Structure Info'].astype(bool)].copy()

disease_list = []
all_diseases = set()
for index, each_df in tqdm.tqdm(df_wo_structure.iterrows(), total=len(df_wo_structure)):
    gene = each_df['HGNC Name']

    try:
        diseases = get_disease_info_from_pubtator(gene)
    except Exception as exc:
        # Best effort: keep going with an empty annotation, but report the
        # failure. `except Exception` (not a bare except) keeps Ctrl-C working.
        tqdm.tqdm.write(f"PubTator lookup failed for {gene}: {exc}")
        diseases = []

    all_diseases.update(diseases)
    # One ';'-separated string per kinase; empty when nothing was found.
    disease_list.append(';'.join(diseases))


df_wo_structure['Disease Information'] = disease_list
df_wo_structure.to_csv('./KinaseData_WO_StructureInfo_DiseaseInfo.csv')

# Sorted so the printed list is identical from run to run.
all_disease_list = sorted(all_diseases)
for disease in all_disease_list:
    print(disease)


100%|██████████| 140/140 [00:29<00:00,  4.80it/s]

Alzheimer Disease
Inflammation
Tertiary Lymphoid Structures
Diabetes Mellitus
Reperfusion Injury
Transfusion Reaction
Arthropathy progressive pseudorheumatoid of childhood
Osteosarcoma
Hereditary Breast and Ovarian Cancer Syndrome
Erythema Infectiosum
Death
Edema
Radiation Injuries
Noble Bass Sherman syndrome
Diabetic Retinopathy
Intracranial Hemorrhages
Enteropathy Associated T Cell Lymphoma
Long QT Syndrome
Oto palato digital syndrome type 2
Anemia Hemolytic
Pulmonary Fibrosis
Basal Ganglia Diseases
Silicosis
Cystitis
Hypothermia
Lymphoma B Cell
Carcinoma Renal Cell
Fibrosis
Prostatic Neoplasms Castration Resistant
Alveolitis Extrinsic Allergic
Neoplastic Syndromes Hereditary
Anemia Aplastic
Neurobehavioral Manifestations
Urinary Bladder Neoplasms
Subarachnoid Hemorrhage
Hyperglycemia
Idiopathic Noncirrhotic Portal Hypertension
Breast Neoplasms
Leber Congenital Amaurosis
Osteoglophonic dwarfism
Calcinosis
Ring Chromosome 20 Syndrome
Enterocolitis Necrotizing
Sepsis
Asthma
Autism Spec


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wo_structure['Disease Information'] = disease_list


Which of the diseases listed below are non-genetic and are either classified as rare diseases or have a high unmet medical need?

Inflammation
Tertiary Lymphoid Structures
Diabetes Mellitus
Reperfusion Injury
Transfusion Reaction
Arthropathy progressive pseudorheumatoid of childhood
Osteosarcoma
Hereditary Breast and Ovarian Cancer Syndrome
Erythema Infectiosum
Death
Edema
Radiation Injuries
Noble Bass Sherman syndrome
Diabetic Retinopathy
Intracranial Hemorrhages
Enteropathy Associated T Cell Lymphoma
Long QT Syndrome
Oto palato digital syndrome type 2
Anemia Hemolytic
Pulmonary Fibrosis
Basal Ganglia Diseases
Silicosis
Cystitis
Hypothermia
Lymphoma B Cell
Carcinoma Renal Cell
Fibrosis
Prostatic Neoplasms Castration Resistant
Alveolitis Extrinsic Allergic
Neoplastic Syndromes Hereditary
Anemia Aplastic
Neurobehavioral Manifestations
Urinary Bladder Neoplasms
Subarachnoid Hemorrhage
Hyperglycemia
Idiopathic Noncirrhotic Portal Hypertension
Breast Neoplasms
Leber Congenital Amaurosis
Osteoglophonic dwarfism
Calcinosis
Ring Chromosome 20 Syndrome
Enterocolitis Necrotizing
Sepsis
Asthma
Autism Spectrum Disorder
Apical Hypertrophic Cardiomyopathy
Wounds and Injuries
Arthritis Psoriatic
Anemia
Cytokine Release Syndrome
Lymphatic Metastasis
Osteoarthritis
Demyelinating Diseases
Stress Disorders Post Traumatic
Carcinoma Ductal
Atherosclerosis
Systemic Inflammatory Response Syndrome
Uterine Cervical Neoplasms
Arthritis
Neoplasms
Atrial Fibrillation
Choline Deficiency
Neurodegenerative Diseases
Carcinoma Signet Ring Cell
Pain
Fever
Esophageal Squamous Cell Carcinoma
Microcystic adnexal carcinoma
Pulmonary Embolism
Hepatic Encephalopathy
Skin Diseases
Liver Failure
Squamous Intraepithelial Lesions
Ischemia
Multiple Trauma
Carcinoma Non Small Cell Lung
Iron Overload
Mitochondrial Myopathies
Meningioma
Insulin Resistance
Depressive Disorder
Rumination Syndrome
Hypertension
Prostatic Intraepithelial Neoplasia
Sleep Deprivation
Epilepsy Post Traumatic
Nervous System Diseases
Ischemic Attack Transient
Renal Insufficiency
Protoporphyria Erythropoietic
Myocardial Ischemia
Brain Injury Chronic
Melanoma
Squamous Cell Carcinoma of Head and Neck
Gliosis
Glomerulonephritis Membranous
Leukemia Lymphocytic Chronic B Cell
Hyperglycemic Hyperosmolar Nonketotic Coma
Lymphoma Follicular
Tarlov Cysts
Chromosome Aberrations
Wounds Stab
Pulmonary Arterial Hypertension
Cystic Fibrosis
Epstein Barr Virus Infections
Lung Neoplasms
Keratosis Actinic
Cerebral Infarction
Mitochondrial cytopathy
Burns
Diabetic Nephropathies
Miosis
Neoplasm Metastasis
Hyperplasia
Nasopharyngeal Carcinoma
Carcinoma Basal Cell
Carcinoma Hepatocellular
Multiple Myeloma
Exercise Induced Allergies
Jejunal Diseases
Brain Ischemia
Kidney Failure Chronic
Neuralgia
Acute Kidney Injury
Recurrence
Thymus Neoplasms
Abortion Spontaneous
Colitis
Sciatic Neuropathy
Anodontia
Hepatitis Chronic
Neurofibromatosis 1
Parkinsonian Disorders
Mastocytosis Systemic
Adenocarcinoma of Lung
Cell Transformation Neoplastic
Carcinoma Pancreatic Ductal
Status Asthmaticus
Personality Disorders
Pancreatic Neoplasms
Glioma
Cognition Disorders
Sarcoma Kaposi
Attention Deficit Disorder with Hyperactivity
Diabetes Mellitus Type 1
Carcinoma Squamous Cell
Heart Diseases
Diabetes Mellitus Type 2
Carcinogenesis
Harlequin type ichthyosis
Leukemia Eosinophilic Acute
Immunologic Deficiency Syndromes
Infarction
Small Cell Lung Carcinoma
Adenomyosis
Spherocytosis Type 1
Mitochondrial Diseases
Kidney Diseases
Neurotoxicity Syndromes
Brain Infarction
Adenomatous Polyps
Preeclamptic toxemia
Myocardial Infarction
Nerve Degeneration
Mullerian aplasia
Status Epilepticus
Myocardial Reperfusion Injury
Gallbladder Neoplasms
Obesity
Calcinosis Cutis
Lupus Erythematosus Systemic
Hepatitis B
Carcinoma Ovarian Epithelial
Hyperoxia
Neoplasms Basal Cell
Necrosis
Adenoma Oxyphilic
Vasculitis
Lung Injury
Carcinoma Embryonal
Ureteral Obstruction
Seizures
Esophageal Neoplasms
Ventricular Remodeling
Chromosomal Instability
Colitis Ulcerative
Parkinson Disease Secondary
Glucose Intolerance
Fibromyalgia
Glioblastoma
Fetal Diseases
Arthritis Rheumatoid
Magnesium Deficiency
Escherichia coli Infections
Non alcoholic Fatty Liver Disease
Brain Edema
Cognitive Dysfunction
Blindness
Myelitis Transverse
Cone Dystrophy
Anemia Sickle Cell
Hypoxia Ischemia Brain
Myopathies Structural Congenital
Leukemia Promyelocytic Acute
Renal Insufficiency Chronic
Factor X Deficiency
Cleft Palate
Fetal Growth Retardation
Cardiotoxicity
Pulmonary Disease Chronic Obstructive
Lipoma
Hepatitis C
Hypoxia Brain
Ventricular Dysfunction Left
Liver Neoplasms
Post Acute COVID 19 Syndrome
Drug Related Side Effects and Adverse Reactions
Scleroderma Systemic
Virus Diseases
Arthritis Experimental
Vascular Diseases
Mouth Diseases
Triple Negative Breast Neoplasms
Brain Concussion
Corneal Endothelial Cell Loss
Prodromal Symptoms
Heart Failure
Hyper IgM Immunodeficiency Syndrome
Adamantinoma
Adenoma
Hypoglycemia
Colorectal Neoplasms
Retinal Degeneration
Weight Gain
Huntington Disease
Psoriasis
Reticulocytosis
Bites and Stings
Ovarian Neoplasms
Cystitis Interstitial
Hyperinsulinism
Retinitis
Prostatic Neoplasms
Idiopathic Pulmonary Fibrosis
Malaria Cerebral
Arthralgia
Mandibular Nerve Injuries
Myotonic Dystrophy
Benign non infected urachal cyst
Esophagitis
Stomach Neoplasms
Epilepsy Partial with Variable Foci
Rectal Neoplasms
Lens Diseases
Leukemia Myeloid
Leukemia
Autoimmune Diseases
Brain Injuries Traumatic
16p11.2 Deletion Syndrome
Micrognathism
Behcet Syndrome
Retinal Detachment
Pre Eclampsia
Neutrophil Actin Dysfunction
Leukopenia
Celiac Disease
Malformations of Cortical Development Group I
Syringoma
Hydrops Fetalis
Hemorrhage
613882
Memory Disorders
Uterine Cervicitis
Leukemia Biphenotypic Acute
Hypoxia
Infections
Psychological Distress
Pancreatitis
Mosaic variegated aneuploidy syndrome
248310
Arthritis Infectious
Hyperhomocysteinemia
Cardiomyopathy Hypertrophic
Cardiomyopathy Dilated
Mastitis
Crohn Disease
Prostatic Hyperplasia
Seminoma
Bone Diseases
Infarction Middle Cerebral Artery
Adenocarcinoma
Abdominal Injuries
Learning Disabilities
Hyperimmunoglobulin G1(A1) Syndrome
Parkinson Disease