In [122]:
import shlex
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import holoviews as hv
#import plotly.graph_objects as go



In [2]:
text_data = pd.read_csv('clinical_trials_dataset.csv')
#text_data = text_data.dropna(axis=0,how='any').reset_index(drop=True)
text_data = text_data[['OfficialTitle','StudyType','StartDate',
           'Condition','ConditionAncestorTerm',
           'ArmGroupInterventionName','InterventionArmGroupLabel','InterventionDescription']].copy()
#text_data[text_data['ArmGroupInterventionName'].str.contains('Genetic')]
text_data.isna().sum()

OfficialTitle                  53
StudyType                       0
StartDate                      42
Condition                       0
ConditionAncestorTerm         571
ArmGroupInterventionName     1128
InterventionArmGroupLabel    1124
InterventionDescription      1158
dtype: int64

In [3]:
#deal with nan values column by column
test = text_data.copy()
test = test.fillna({'StartDate':'unknown'}).copy()
test = test.fillna({'OfficialTitle':'unknown'}).copy()
test['ConditionAncestorTerm'] = test['ConditionAncestorTerm'].fillna(test['Condition'])
test = test.fillna({'ArmGroupInterventionName':'other or NA'}).copy()
test = test.fillna({'InterventionArmGroupLabel':'other or NA'}).copy()
test = test.fillna({'InterventionDescription':'other or NA'}).copy()
text_data = test.copy()

del test

In [4]:
#replace nan values with other appropriate values
text_data.isna().sum()

OfficialTitle                0
StudyType                    0
StartDate                    0
Condition                    0
ConditionAncestorTerm        0
ArmGroupInterventionName     0
InterventionArmGroupLabel    0
InterventionDescription      0
dtype: int64

In [5]:
text_data[(text_data['ArmGroupInterventionName'].str.contains('Genetic:')) & (text_data['StudyType']=='Interventional')]

Unnamed: 0,OfficialTitle,StudyType,StartDate,Condition,ConditionAncestorTerm,ArmGroupInterventionName,InterventionArmGroupLabel,InterventionDescription
0,"Phase I/IIa, First-in-human, Open-label, Singl...",Interventional,September 2024,Drug Resistant Epilepsy,Brain Diseases|Central Nervous System Diseases...,Genetic: lentiviral gene therapy,lentiviral gene therapy treatment (Interventio...,lentiviral gene therapy to treat drug resistan...
5,"A Phase I Study to Assess the Safety, Tolerabi...",Interventional,"February 7, 2022",Alzheimer's Disease|Mild Cognitive Impairment,Dementia|Brain Diseases|Central Nervous System...,Genetic: AAV2-BDNF Gene Therapy|Biological: AA...,Gene transfer of AAV2-BDNF|Gene transfer of AA...,AAV2-BDNF is a genetically engineered adeno-as...
7,A Phase I/II Clinical Trial of Hematopoietic S...,Interventional,"April 9, 2010",Lysosomal Storage Disease|Metachromatic Leukod...,"Metabolism, Inborn Errors|Genetic Diseases, In...",Genetic: OTL-200 Gene Therapy,OTL-200 Gene Therapy,Autologous hematopoietic stem/progenitor cells...
8,Phase I/II Clinical Trial of Autologous Hemato...,Interventional,"July 23, 2021",Severe Combined Immunodeficiency Due to RAG1 D...,Immune System Diseases|Primary Immunodeficienc...,Genetic: Gene therapy,Gene therapy,Patients will be infused with autologous CD34+...
12,A Phase I/II Gene Therapy Trial for X-CGD With...,Interventional,July 2013,X-linked Chronic Granulomatous Disease,Pathologic Processes|Phagocyte Bactericidal Dy...,Genetic: ex-vivo gene-therapy,ex-vivo gene-therapy,"transplantation autologous CD34+ cells, transd..."
...,...,...,...,...,...,...,...,...
5804,"An Open-label, Dose-escalation Study to Evalua...",Interventional,"May 5, 2023",Neovascular Age-related Macular Degeneration,Retinal Degeneration|Retinal Diseases|Eye Dise...,Genetic: SKG0106|Genetic: SKG0106|Genetic: SKG...,Dose Level 1|Dose Level 2|Dose Level 3,SKG0106 is a recombinant adeno-associated viru...
5858,WEST-KOaST Study: WES (Whole Exome Sequencing)...,Interventional,"December 1, 2023",Testicular Cancer,Endocrine Gland Neoplasms|Neoplasms by Site|Ne...,Genetic: Analysis of biological samples of sal...,First-degree family members of patients with t...,The project aims to collect biological samples...
5864,Phase II Study of Capecitabine in Combination ...,Interventional,November 2006,Esophageal Cancer,Gastrointestinal Neoplasms|Digestive System Ne...,Drug: capecitabine|Drug: oxaliplatin|Genetic: ...,Chemo|Chemo|Chemo|Chemo|Chemo|Chemo|Chemo|Chem...,Oral|IV|Correlative Study|Correlative Study|Co...
5887,Neoadjuvant Intratumoral Injection of Dendriti...,Interventional,"May 1, 2006",Breast Cancer,Neoplasms by Site|Neoplasms|Breast Diseases|Sk...,Biological: therapeutic autologous dendritic c...,Vaccine|Vaccine|Vaccine|Vaccine|Vaccine|Vaccin...,injected into the primary breast mass or palpa...


In [6]:
#this is for condition names
#the split('|') must be already applied to cond_list, function does not do that
def minimum_cond_name(cond_list):
    con_size=[]
    for elem in text_data['ConditionAncestorTerm'][0].split('|'):
        con_size.append(elem.count(' '))
    return cond_list[con_size.index(min(con_size))]

In [7]:
#add column with modified conditions
text_data['mod_ConditionAncestorTerm'] = [minimum_cond_name(con_term.split('|')) for con_term in text_data['ConditionAncestorTerm']]

In [8]:
condword_list = [' '.join(word.split(',')) if ',' in word else word for word in text_data['mod_ConditionAncestorTerm'].values]
cond_word_count = []
for word in condword_list:
    for each_word in word.split(' '):
        cond_word_count.append(each_word)

frequency_map = {}
for item in cond_word_count:
    if item in frequency_map:
        frequency_map[item] += 1
    else:
        frequency_map[item] = 1

frequency_map_df = pd.DataFrame(frequency_map.items(),columns=['word','frequency']).sort_values(by='frequency',ascending=False)

In [9]:
pick_words= ['Diseases']
frequency_map_df

Unnamed: 0,word,frequency
8,Neoplasms,2204
3,,1330
1,Diseases,942
27,by,914
76,Type,521
...,...,...
518,Metastasis,1
515,Frontline,1
514,From,1
512,Complete,1


In [10]:
# even filter out conditions based on popular words
selected_cond_words = []
for elem in text_data['mod_ConditionAncestorTerm'].values:
    if len(list(set(elem.split(' ')) & set(pick_words)))!=0:
        selected_cond_words.append(elem)
    else:
        selected_cond_words.append('others')

text_data['selected_conditionterms'] = selected_cond_words

In [18]:
text_data['mod_ConditionAncestorTerm'].value_counts()

mod_ConditionAncestorTerm
Neoplasms by Histologic Type             517
Neoplasms by Site                        394
Neoplasms, Glandular and Epithelial      269
Respiratory Tract Neoplasms              243
Pathologic Processes                     191
                                        ... 
Asthma in Children                         1
Vasculitis                                 1
Advanced Nonhaematologic Malignancies      1
Acute Leukaemia                            1
Infertility, Male                          1
Name: count, Length: 720, dtype: int64

In [12]:
modified_armgroup_interventionName=[]
for elem_arm in text_data['ArmGroupInterventionName']:
    if 'Genetic:' in elem_arm:
        if '|' in elem_arm:
            for elem_arm_gene in elem_arm.split('|'):
                if 'Genetic:' in elem_arm_gene and ('therapy' in elem_arm_gene.lower() or 'vector' in elem_arm_gene.lower()):
                    modified_armgroup_interventionName.append(elem_arm_gene)
                    break
                elif 'Genetic:' in elem_arm_gene:
                    modified_armgroup_interventionName.append('other genetic')
                    print(elem_arm)
                    break
        elif '|' not in elem_arm and ('therapy' in elem_arm.lower() or 'vector' in elem_arm.lower()):
            modified_armgroup_interventionName.append(elem_arm)
        else:
            print(elem_arm)
            modified_armgroup_interventionName.append('other genetic')
    else:
        modified_armgroup_interventionName.append('other') 


Genetic: gene transduced PBL and/or gene transduced HSC
Genetic: rAAV1.tMCK.human-alpha-sarcoglycan- First cohort|Genetic: Genetic: rAAV1.tMCK.human-alpha-sarcoglycan- Second cohort
Genetic: rAAV1.tMCK.human-alpha-sarcoglycan- First cohort|Genetic: Genetic: rAAV1.tMCK.human-alpha-sarcoglycan- Second cohort
Genetic: OTL-103
Genetic: Strimvelis
Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314
Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314
Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314
Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314
Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314
Genetic: Ad5-yCD/mutTKSR39rep-ADP
Genetic: Intervention on primary cultured cells
Genetic: OTL-103
Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314
Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX-314|Genetic: RGX

In [43]:
#np.unique(np.array(modified_armgroup_interventionName),return_counts=True)
onlygen_modified_armgroup_interventionName = [elem for elem in modified_armgroup_interventionName if 'Genetic:' in elem]
len(onlygen_modified_armgroup_interventionName)

47

In [55]:
select_gene_therapy_cases = [elem for elem in text_data['ArmGroupInterventionName'].values for word in onlygen_modified_armgroup_interventionName if elem.find(word)!=-1]
select_gene_therapy_cases

['Genetic: lentiviral gene therapy',
 'Genetic: Gene therapy',
 'Genetic: Gene therapy',
 'Genetic: AAV2-BDNF Gene Therapy|Biological: AAV2-BDNF Gene Therapy',
 'Genetic: OTL-200 Gene Therapy',
 'Genetic: Gene therapy',
 'Genetic: Gene therapy',
 'Genetic: ex-vivo gene-therapy',
 'Genetic: Gene Therapy product CYL-02 = plasmid DNA pre-complexed to linear polyethylenimine encoding sst2 + dck::umk genes',
 'Genetic: Gene Therapy product CYL-02 = plasmid DNA pre-complexed to linear polyethylenimine encoding sst2 + dck::umk genes',
 'Genetic: NG101 AAV gene therapy|Genetic: NG101 AAV gene therapy|Genetic: NG101 AAV gene therapy',
 'Genetic: NG101 AAV gene therapy|Genetic: NG101 AAV gene therapy|Genetic: NG101 AAV gene therapy',
 'Genetic: NG101 AAV gene therapy|Genetic: NG101 AAV gene therapy|Genetic: NG101 AAV gene therapy',
 'Genetic: Intracerebral LV gene therapy',
 'Genetic: Gene Therapy|Drug: Busulfan',
 'Genetic: Single infusion of autologous CD34+ cells transduced with the self-inac

In [68]:
sankey_data = text_data[text_data['ArmGroupInterventionName'].isin(select_gene_therapy_cases)].copy()
sankey_data['no_of_cases'] = np.repeat(f'{sankey_data.shape[0]} cases',sankey_data.shape[0])
sankey_data

Unnamed: 0,OfficialTitle,StudyType,StartDate,Condition,ConditionAncestorTerm,ArmGroupInterventionName,InterventionArmGroupLabel,InterventionDescription,mod_ConditionAncestorTerm,selected_conditionterms,no_of_cases
0,"Phase I/IIa, First-in-human, Open-label, Singl...",Interventional,September 2024,Drug Resistant Epilepsy,Brain Diseases|Central Nervous System Diseases...,Genetic: lentiviral gene therapy,lentiviral gene therapy treatment (Interventio...,lentiviral gene therapy to treat drug resistan...,Brain Diseases,Brain Diseases,31 cases
1,"Growth and Development, Health-related Quality...",Observational,"June 5, 2023",Transfusion-dependent Beta-Thalassemia|Gene Th...,"Anemia, Hemolytic, Congenital|Anemia, Hemolyti...",Genetic: Gene therapy,Gene therapy group,Autologous edited hematopoietic stem cell tran...,"Anemia, Hemolytic, Congenital",others,31 cases
5,"A Phase I Study to Assess the Safety, Tolerabi...",Interventional,"February 7, 2022",Alzheimer's Disease|Mild Cognitive Impairment,Dementia|Brain Diseases|Central Nervous System...,Genetic: AAV2-BDNF Gene Therapy|Biological: AA...,Gene transfer of AAV2-BDNF|Gene transfer of AA...,AAV2-BDNF is a genetically engineered adeno-as...,Dementia,others,31 cases
7,A Phase I/II Clinical Trial of Hematopoietic S...,Interventional,"April 9, 2010",Lysosomal Storage Disease|Metachromatic Leukod...,"Metabolism, Inborn Errors|Genetic Diseases, In...",Genetic: OTL-200 Gene Therapy,OTL-200 Gene Therapy,Autologous hematopoietic stem/progenitor cells...,"Metabolism, Inborn Errors",others,31 cases
8,Phase I/II Clinical Trial of Autologous Hemato...,Interventional,"July 23, 2021",Severe Combined Immunodeficiency Due to RAG1 D...,Immune System Diseases|Primary Immunodeficienc...,Genetic: Gene therapy,Gene therapy,Patients will be infused with autologous CD34+...,Immune System Diseases,Immune System Diseases,31 cases
12,A Phase I/II Gene Therapy Trial for X-CGD With...,Interventional,July 2013,X-linked Chronic Granulomatous Disease,Pathologic Processes|Phagocyte Bactericidal Dy...,Genetic: ex-vivo gene-therapy,ex-vivo gene-therapy,"transplantation autologous CD34+ cells, transd...",Pathologic Processes,others,31 cases
13,PILOT STUDY OF GENE THERAPY FOR LOCALLY ADVANC...,Interventional,December 2010,Pancreatic Adenocarcinoma,"Carcinoma|Neoplasms, Glandular and Epithelial|...",Genetic: Gene Therapy product CYL-02 = plasmid...,Therapy,Intratumoral injection of the gene therapy pro...,Carcinoma,others,31 cases
21,A Phase 1/2a Open-label Study to Evaluate Safe...,Interventional,"September 8, 2023",Age-Related Macular Degeneration,Retinal Degeneration|Retinal Diseases|Eye Dise...,Genetic: NG101 AAV gene therapy|Genetic: NG101...,NG101 Gene Therapy Group 1|NG101 Gene Therapy ...,Sub retinal injection of NG101 (a non-replicat...,Retinal Degeneration,others,31 cases
24,Lentiviral Gene Therapy for X-linked Adrenoleu...,Interventional,"October 30, 2018",X-linked Adrenoleukodystrophy,"Brain Diseases, Metabolic, Inborn|Brain Diseas...",Genetic: Intracerebral LV gene therapy,Lentivirus-mediated delivery of ABCD1 to the CNS.,Intracerebral LV gene therapy to deliver high ...,"Brain Diseases, Metabolic, Inborn",others,31 cases
29,ADA Gene Transfer Into Hematopoietic Stem/Prog...,Interventional,"October 2, 2002",Immunologic Deficiency Syndromes,Immune System Diseases,Genetic: Gene Therapy|Drug: Busulfan,Gene Therapy|Gene Therapy,Infusion of autologous CD34+ cells transduced ...,Immune System Diseases,Immune System Diseases,31 cases


In [129]:
sankey_diagram = sankey_data[['mod_ConditionAncestorTerm','no_of_cases','StudyType','ArmGroupInterventionName']].reset_index(drop=True).copy()


In [130]:
sankey_diagram

Unnamed: 0,mod_ConditionAncestorTerm,no_of_cases,StudyType,ArmGroupInterventionName
0,Brain Diseases,31 cases,Interventional,Genetic: lentiviral gene therapy
1,"Anemia, Hemolytic, Congenital",31 cases,Observational,Genetic: Gene therapy
2,Dementia,31 cases,Interventional,Genetic: AAV2-BDNF Gene Therapy|Biological: AA...
3,"Metabolism, Inborn Errors",31 cases,Interventional,Genetic: OTL-200 Gene Therapy
4,Immune System Diseases,31 cases,Interventional,Genetic: Gene therapy
5,Pathologic Processes,31 cases,Interventional,Genetic: ex-vivo gene-therapy
6,Carcinoma,31 cases,Interventional,Genetic: Gene Therapy product CYL-02 = plasmid...
7,Retinal Degeneration,31 cases,Interventional,Genetic: NG101 AAV gene therapy|Genetic: NG101...
8,"Brain Diseases, Metabolic, Inborn",31 cases,Interventional,Genetic: Intracerebral LV gene therapy
9,Immune System Diseases,31 cases,Interventional,Genetic: Gene Therapy|Drug: Busulfan


In [107]:
test = sankey_diagram[['mod_ConditionAncestorTerm','no_of_cases']].copy()
test.columns = ['Source','Target']
test['Value'] = [test['Source'].value_counts()[disease_type] for disease_type in test['Source']]
test = test.drop_duplicates(subset='Source').copy()
test

Unnamed: 0,Source,Target,Value
0,Brain Diseases,31 cases,1
1,"Anemia, Hemolytic, Congenital",31 cases,2
2,Dementia,31 cases,1
3,"Metabolism, Inborn Errors",31 cases,1
4,Immune System Diseases,31 cases,3
5,Pathologic Processes,31 cases,1
6,Carcinoma,31 cases,1
7,Retinal Degeneration,31 cases,1
8,"Brain Diseases, Metabolic, Inborn",31 cases,1
11,Chronic Disease,31 cases,1


In [131]:
test2 = sankey_diagram[['no_of_cases','StudyType']].copy()
test2.columns = ['Source','Target']
test2['Value'] = [test2['Target'].value_counts()[gene_the] for gene_the in test2['Target']]
test2 = test2.drop_duplicates(subset='Target').copy()
test2


Unnamed: 0,Source,Target,Value
0,31 cases,Interventional,28
1,31 cases,Observational,3


In [135]:
test3 = sankey_diagram[['StudyType','ArmGroupInterventionName']].copy()
test3.columns = ['Source','Target']
test3['Value'] = [test3['Source'].value_counts()[gene_the] for gene_the in test3['Source']]
#test3 = test3.drop_duplicates(subset='Source').copy()
test3


Unnamed: 0,Source,Target,Value
0,Interventional,Genetic: lentiviral gene therapy,28
1,Observational,Genetic: Gene therapy,3
2,Interventional,Genetic: AAV2-BDNF Gene Therapy|Biological: AA...,28
3,Interventional,Genetic: OTL-200 Gene Therapy,28
4,Interventional,Genetic: Gene therapy,28
5,Interventional,Genetic: ex-vivo gene-therapy,28
6,Interventional,Genetic: Gene Therapy product CYL-02 = plasmid...,28
7,Interventional,Genetic: NG101 AAV gene therapy|Genetic: NG101...,28
8,Interventional,Genetic: Intracerebral LV gene therapy,28
9,Interventional,Genetic: Gene Therapy|Drug: Busulfan,28


In [136]:
test_combined = pd.concat([test,test2,test3]).copy()
test_combined['Value'] = test_combined['Value'].values+5
test_combined

Unnamed: 0,Source,Target,Value
0,Brain Diseases,31 cases,6
1,"Anemia, Hemolytic, Congenital",31 cases,7
2,Dementia,31 cases,6
3,"Metabolism, Inborn Errors",31 cases,6
4,Immune System Diseases,31 cases,8
5,Pathologic Processes,31 cases,6
6,Carcinoma,31 cases,6
7,Retinal Degeneration,31 cases,6
8,"Brain Diseases, Metabolic, Inborn",31 cases,6
11,Chronic Disease,31 cases,6


In [137]:
#hv.extension('bokeh')
sankey = hv.Sankey(test_combined)
sankey
#sankey.opts(label_position='left', edge_color='target', node_color='index', cmap='tab20')
