In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
import os # file directory handling
import xml.etree.ElementTree as ET # create, parse, modify XML data

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

"""
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
"""
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

"\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n"

# **Introduction**
Clinical trials play a pivotal role in advancing medical research, shaping healthcare policies and improving patient outcomes worldwide. Understanding the global landscape of clinical trials provides valuable insights into emerging health challenges, treatment trends, and research priorities.

**Key Objectives:**
This project analyzes and visualizes worldwide patterns, trends, and disparities in clinical trials. Ultimately, the project aims to provide a comprehensive view of the state of medical research worldwide.

**Status:** This project is in progress as of March 14, 2024.

# **Part I. Importing the Dataset** 

**About the Dataset**:
This dataset is originally sourced from ClinicalTrials.gov and made available on Kaggle. ClinicalTrials.gov is maintained by the U.S. National Library of Medicine (NLM) and is a public database of clinical trial research studies and their results. The Kaggle dataset was last updated on May 8th, 2020 and contains 338k clinical trials. Each entry is stored in XML format and contains information about the study, such as its status, phase, eligibility criteria, and the health condition in question.

In [2]:
# Directory containing all the folders with the XML data files
main_directory = "/kaggle/input/all-clinical-trials"

# Column name -> XPath of the element whose text fills that column.
# `Element.find` returns only the first match, so tags that can repeat in a
# study (e.g. <condition>, <facility>) contribute their first occurrence only.
SINGLE_VALUE_FIELDS = {
    "NCT_ID": ".//nct_id",
    "Brief_Title": ".//brief_title",
    "Overall_Status": ".//overall_status",
    "Phase": ".//phase",
    "Study_Type": ".//study_type",
    "Has_Expanded_Access": ".//has_expanded_access",
    "Intervention_Model": ".//intervention_model",
    "Primary_Purpose": ".//primary_purpose",
    "Masking": ".//masking",
    "Condition": ".//condition",
    "Intervention_Name": ".//intervention_name",
    "Minimum_Age": ".//minimum_age",
    "Maximum_Age": ".//maximum_age",
    "Gender": ".//gender",
    "Healthy_Volunteers": ".//healthy_volunteers",
    "Facility_Name": ".//facility/name",
    "Facility_City": ".//facility/address/city",
    "Facility_State": ".//facility/address/state",
    "Facility_Country": ".//facility/address/country",
    "Verification_Date": ".//verification_date",
    "Study_First_Submitted": ".//study_first_submitted",
    "Study_First_Posted": ".//study_first_posted",
    "Last_Update_Submitted": ".//last_update_submitted",
    "Last_Update_Posted": ".//last_update_posted",
}

# Column name -> XPath of repeated elements whose texts are collected in a list.
LIST_VALUE_FIELDS = {
    "Condition_Browse": ".//condition_browse/mesh_term",
    "Intervention_Browse": ".//intervention_browse/mesh_term",
}


def extract_trial_record(root):
    """Build one row of the trials table from a parsed study document.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of a parsed study XML file.

    Returns
    -------
    dict
        One entry per key of SINGLE_VALUE_FIELDS (the text of the first
        matching element, or None when the element is absent) followed by
        one entry per key of LIST_VALUE_FIELDS (a list with the text of
        every matching element, empty when there are none).
    """
    record = {}
    for column, xpath in SINGLE_VALUE_FIELDS.items():
        # Look the element up once; a missing tag becomes None instead of
        # raising AttributeError on `.text`.
        element = root.find(xpath)
        record[column] = element.text if element is not None else None
    for column, xpath in LIST_VALUE_FIELDS.items():
        record[column] = [elem.text for elem in root.findall(xpath)]
    return record


# List to store extracted data (one dict per study)
data_list = []
# (file_path, error message) for every XML file that could not be parsed
unparsable_files = []

# Iterate through each folder (NCTXXXX) in the main directory
for folder in os.listdir(main_directory):
    folder_path = os.path.join(main_directory, folder)  # ex:/all-clinical-trials/NCT0000
    if not os.path.isdir(folder_path):
        continue
    # Iterate through each XML file in the folder
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".xml"):
            continue
        file_path = os.path.join(folder_path, file_name)  # ex:/NCT0000/NCT0000102.xml
        if os.path.getsize(file_path) == 0:
            continue  # Skip empty files
        try:
            root = ET.parse(file_path).getroot()  # root element of the study document
        except ET.ParseError as exc:
            # A single malformed file should not abort the whole load;
            # remember it so it can be inspected afterwards.
            unparsable_files.append((file_path, str(exc)))
            continue
        data_list.append(extract_trial_record(root))

if unparsable_files:
    print(f"Skipped {len(unparsable_files)} malformed XML file(s); see `unparsable_files` for details.")

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data_list)
df.head()


Unnamed: 0,NCT_ID,Brief_Title,Overall_Status,Phase,Study_Type,Has_Expanded_Access,Intervention_Model,Primary_Purpose,Masking,Condition,...,Facility_City,Facility_State,Facility_Country,Verification_Date,Study_First_Submitted,Study_First_Posted,Last_Update_Submitted,Last_Update_Posted,Condition_Browse,Intervention_Browse
0,NCT01051180,Is Doppler Necessary in Haemorrhoidal Artery L...,Terminated,,Observational,No,,,,Piles,...,Poole,Dorset,United Kingdom,April 2017,"January 15, 2010","January 18, 2010","April 3, 2017","April 5, 2017",[Hemorrhage],[]
1,NCT01056120,Long Term Safety Profile of the PRO-Kinetic EN...,Completed,,Observational,No,,,,De Novo and Re-stenosed Coronary Artery Lesions,...,Bruck an der Mur,NRW,Austria,January 2016,"January 25, 2010","January 26, 2010","January 29, 2016","February 1, 2016",[],[]
2,NCT01059903,Study in Healthy Volunteers to Prove That Two ...,Completed,Phase 1,Interventional,No,Crossover Assignment,Basic Science,None (Open Label),Healthy Volunteers,...,Moenchengladbach,NRW,Germany,May 2012,"January 28, 2010","February 1, 2010","May 15, 2012","May 22, 2012",[],[Rotigotine]
3,NCT01058031,Neural Correlates of PTSD Prevention With Mind...,Completed,,Interventional,No,Parallel Assignment,Treatment,None (Open Label),"Stress Disorders, Posttraumatic",...,Decatur,Georgia,United States,November 2013,"January 26, 2010","January 28, 2010","November 4, 2013","November 5, 2013","[Stress Disorders, Post-Traumatic]",[]
4,NCT01056835,Effects of PGI2 Analogue Use on the Developmen...,Completed,Phase 3,Interventional,No,Parallel Assignment,Prevention,None (Open Label),Chronic Allograft Nephropathy,...,Seoul,,"Korea, Republic of",May 2015,"January 24, 2010","January 26, 2010","May 24, 2015","May 27, 2015",[Kidney Diseases],"[Epoprostenol, Tezosentan]"


**Cleaning the Dataset**

In [3]:
# Narrow the trials table to its "Condition" column only (kept as a
# one-column DataFrame) and show it as the cell output.
conditions = df.loc[:, ['Condition']]
conditions

Unnamed: 0,Condition
0,Piles
1,De Novo and Re-stenosed Coronary Artery Lesions
2,Healthy Volunteers
3,"Stress Disorders, Posttraumatic"
4,Chronic Allograft Nephropathy
...,...
103502,Insomnia
103503,"Diabetic Neuropathy, Painful"
103504,Thyroid Neoplasm
103505,Gastroesophageal Reflux
