In [15]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import glob
import os
from zipfile import ZipFile
import csv
import xml.etree.ElementTree as ET
from lxml import etree


# Clinical Trials Exploration

H1 is dedicated to bringing down the cost and improving the outcomes of the drug discovery pipeline.  As such we are very interested in understanding the drivers behind successful and unsuccessful clinical trials.



## Phase 1:
https://www.clinicaltrials.gov/
Contains all 270k clinical trials performed in the US.  Each of these clinical trials will have papers associated with it.  Grab the clinical trials and the associated papers and store them in a way that links the two.  You may need to figure out how to grab the associated papers (they will not be stored in the same repository, and you may need to grab them using a different REST API).


## Phase 2:
Once we have the data all downloaded and linked, we’ll want to start looking for features that may drive whether or not the clinical trial was a success or a failure.  This will mean combing through the trial itself for “features” that we can use (e.g. number of citations on the associated papers, which drugs were under study, phase of the trial, etc . . .).

Some pointers:
Start by understanding the clinical trials themselves.  Google what you can, and if you still have specific questions around what particular things mean I can get those questions answered by people we have over here.  It will be very important that you understand what the fields in the trial mean in order to make good decisions on what features might make a difference.

I’m most interested in doing some deep text-based analysis on the papers associated with the trials themselves.  Try to set things up so that this analysis will be easy.  We’ll want to ask questions like “for all the successful trials, let’s run this algorithm against the associated papers”.  We’ll provide the algorithm (or some pointers to where you can download it).  

Focus on the trials from Jan 2000 – present.  You can ignore anything before that.

Try to do as much of this as possible in python.  As we move forward we can give you some analytical tools in python, but we don’t have any perl stuff.



In [83]:
from pathlib import Path
from pathlib import PureWindowsPath
#zipfile = './AllPublicXML.zip'
rootdir = './NCT_files/'
nct_files = []
for subddir, dirs, files in os.walk(rootdir):
    for file in files:
        nct_files.append(os.path.join(subddir,file).replace('\\', '/'))


In [None]:
nct_files[1]


In [17]:
with ZipFile(zipfile, 'r') as myzip:
    myzip.extractall()

NameError: name 'zipfile' is not defined

In [16]:
all_xmls = []
zipfile = './AllPublicXML.zip'
with ZipFile(zipfile, 'r') as myzip:
    all_xmls.append(zip.namelist())
    

In [100]:
parsed_xml = []
rootdir = 'C:/Users/Jeffery Rosario/DataScienceFlatiron/dsc-data-science-env-setup-v2-1/Capstone Project/NCT_files\/' 
for subddir, dirs, files in os.walk(rootdir):
    for file in files:
        name = os.path.join(subddir,file)
        parsed_xml.append(XML(name))
        
        

In [4]:
xml_file = './NCT_files/NCT0011xxxx/NCT00110162.xml'
tree = etree.parse(xml_file)
root = tree.getroot()
cols = ['citation','nct_id' ]
for child in root:
    for child in child:
        print(child.tag, child.text)
   # for grandchild in child:
    #    print(grandchild.tag, grandchild.attrib)

download_date ClinicalTrials.gov processed this data on October 14, 2020
link_text Link to the current ClinicalTrials.gov record.
url https://clinicaltrials.gov/show/NCT00110162
org_study_id PMCC-VCOG-PR-0103
secondary_id CDR0000413706
secondary_id PMCC-TROG-0306
nct_id NCT00110162
lead_sponsor 
      
textblock 
      RATIONALE: Androgens can cause the growth of prostate cancer cells. Androgen deprivation
      therapy may stop the adrenal glands from making androgens.

      PURPOSE: This randomized phase III trial is studying how well androgen deprivation therapy
      works in treating patients with prostate cancer.
    
textblock 
      OBJECTIVES:

      Primary

        -  Compare overall survival (with acceptable morbidity) of patients with prostate cancer
           treated with delayed vs immediate androgen deprivation therapy (ADT).

      Secondary

        -  Compare cancer-specific survival of patients treated with these regimens.

        -  Compare clinical progression 

1. Download the dataset. 

2. Analyze XML file. Determine the columns to use for analysis:

first layer

brief_title
source
overall_status
study_type
has_expanded_access
condition
study_first_posted
verification_date
last_update_posted
keyword
results
phase
enrollment_type

second_layer

textblock
observational_model
time_perspective
intervention_type
intervention_name
gender
minimum_age
maximum_age
healthy_volunteers
country
mesh_term
allocation
intervention_model
primary_purpose
masking



third layer

textblock



In [94]:
child_list = ['brief_title', 'source', 'overall_status', 'study_type','has_expanded_access','condition',
                'study_first_posted', 'verification_date', 'last_update_posted', 'keyword', 'phase',
              'enrollment', 'intervention_browse']
gchild_list = ['textblock', 'nct_id','observational_model', 'time_perspective', 'intervention_type',
               'is_fda_regulated_drug', 'is_fda_regulated_device', 
               'defined Population', 'intervention_type', 'intervention_name', 
               'gender', 'minimum_age', 'maximum_age','healthy_volunteers', 'country',
               'mesh_term', 'allocation', 'intervention_model', 'primary_purpose', 'masking']

xml_file = './NCT_files/NCT0011xxxx/NCT00110162.xml'
tree = etree.parse(xml_file)
root = tree.getroot()
first_layer_tag = []
first_layer_text = []
for child in root.iter([child_list]):
    first_layer_tag.append(child.tag)
    first_layer_text.append(child.text)
for gchild in root.iter([gchild_list]):
    first_layer_tag.append(gchild.tag)
    first_layer_text.append(gchild.text)
        
#first_dict = 

df = pd.DataFrame(data = [first_layer_text],columns = first_layer_tag)
df['nct_id']

0    NCT00110162
Name: nct_id, dtype: object

In [38]:
xml_file = './NCT_files/NCT0005xxxx/NCT00050037.xml'
tree = etree.parse(xml_file)
root = tree.getroot()
first_layer_tag = []
first_layer_text = []
for child in root.iter([child_list]):
    first_layer_tag.append(child.tag)
    first_layer_text.append(child.text)
for gchild in root.iter([gchild_list]):
    first_layer_tag.append(gchild.tag)
    first_layer_text.append(gchild.text)

df2 = pd.DataFrame(data = [first_layer_text],columns = first_layer_tag)


In [39]:
df = df.loc[:,~df.columns.duplicated()]
df2 = df2.loc[:,~df2.columns.duplicated()]


In [46]:
pd.concat([df,df2], sort=True)

Unnamed: 0,allocation,brief_title,condition,country,gender,has_expanded_access,healthy_volunteers,intervention_browse,intervention_model,intervention_name,...,minimum_age,nct_id,overall_status,phase,primary_purpose,source,study_first_posted,study_type,textblock,verification_date
0,Randomized,Androgen Deprivation Therapy in Treating Patie...,Prostate Cancer,Australia,Male,No,No,\n,,antiandrogen therapy,...,,NCT00110162,Unknown status,Phase 3,Treatment,National Cancer Institute (NCI),"May 4, 2005",Interventional,\n RATIONALE: Androgens can cause the gro...,June 2009
0,Randomized,Cognitive Therapy for Binge-Eating Disorder,Obesity,United States,All,No,No,,Parallel Assignment,CBT,...,18 Years,NCT00050037,Completed,,Treatment,"University of North Carolina, Chapel Hill","November 21, 2002",Interventional,\n The purpose of this study is to compar...,April 2013


In [135]:
child_list = ['brief_title', 'source', 'overall_status', 'study_type','has_expanded_access','condition',
                'study_first_posted', 'verification_date', 'last_update_posted', 'keyword', 'phase',
              'enrollment', 'intervention_browse']
gchild_list = ['textblock', 'nct_id','observational_model', 'time_perspective', 'intervention_type',
               'is_fda_regulated_drug', 'is_fda_regulated_device', 
               'defined Population', 'intervention_type', 'intervention_name', 
               'gender', 'minimum_age', 'maximum_age','healthy_volunteers', 'country',
               'mesh_term', 'allocation', 'intervention_model', 'primary_purpose', 'masking']

def parse_files(nct):
    dataframes = []
    for f in nct:
            dataframes.append(parse_xml(f))
    df = pd.concat(dataframes)
    return df
      
  

def parse_xml(xml):
        #create empty lists for the tag and text
        first_layer_tag = []
        first_layer_text = []
        
        tree = etree.parse(xml)
        root = tree.getroot()
        
        for child in root.iter([child_list]):
            first_layer_tag.append(child.tag)
            first_layer_text.append(child.text)
        for gchild in root.iter([gchild_list]):
            first_layer_tag.append(gchild.tag)
            first_layer_text.append(gchild.text)
        df = pd.DataFrame(data = [first_layer_text],
                          columns = first_layer_tag
                          )
        df = df.loc[:,~df.columns.duplicated()]
        return df
    
dataframe = parse_files(nct_files)
            

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [138]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 354534 entries, 0 to 0
Data columns (total 31 columns):
allocation                 271090 non-null object
brief_title                354534 non-null object
condition                  353674 non-null object
country                    318275 non-null object
enrollment                 347826 non-null object
gender                     353673 non-null object
has_expanded_access        348493 non-null object
healthy_volunteers         347490 non-null object
intervention_browse        125840 non-null object
intervention_model         270778 non-null object
intervention_name          317024 non-null object
intervention_type          317024 non-null object
is_fda_regulated_device    130883 non-null object
is_fda_regulated_drug      130891 non-null object
keyword                    231168 non-null object
last_update_posted         354534 non-null object
masking                    272151 non-null object
maximum_age                353673 non-null o

In [147]:
dataframe['allocation'].value_counts()

Randomized        180412
N/A                59740
Non-Randomized     30938
Name: allocation, dtype: int64

In [148]:
dataframe['overall_status'].value_counts()

Completed                    190536
Recruiting                    53662
Unknown status                39301
Terminated                    20081
Active, not recruiting        17734
Not yet recruiting            17393
Withdrawn                      9108
Enrolling by invitation        3323
Suspended                      1860
Withheld                        842
No longer available             287
Available                       233
Approved for marketing          151
Temporarily not available        23
Name: overall_status, dtype: int64

In [257]:
def parse_xml(xml):
        #create empty lists for the tag and text
    first_layer_tag = []
    first_layer_text = []
        
    tree = etree.parse(xml)
    root = tree.getroot()
        
    for child in root.iter([child_list]):
        first_layer_tag.append(child.tag)
        first_layer_text.append(child.text)
    for gchild in root.iter([gchild_list]):
        first_layer_tag.append(gchild.tag)
        first_layer_text.append(gchild.text)
    df = pd.DataFrame(data = [first_layer_text],
                      columns = first_layer_tag)
    if 'PMID' in df.columns:
        pmids_df = df.loc[:, ['PMID']]
        transpose_df = pmids_df.transpose()
        np_pmids = transpose_df.to_numpy()
        flattened = list(np_pmids.reshape(1,len(np_pmids)).flatten())
        pmids = ', '.join(flattened)
        df['PMIDs'] = pmids
        
    
    df = df.loc[:,~df.columns.duplicated()]
    return df

In [258]:
dataframe = parse_files(nct_files)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [298]:
dataframe['PMIDs'].dropna().to_numpy()

array(['12657584, 15223788, 16009841', '8512476, 8675413', '8684794', ...,
       '28332101, 30356639, 14985292, 26062039, 30001713, 23672971, 26372583, 21807669, 29175972',
       '29099872, 32102255',
       '32843804, 32563073, 32439926, 32023174, 31862217, 30590914'],
      dtype=object)

In [315]:
PMIDs = []
PMIDs_to_scrape = dataframe['PMIDs'].dropna().to_numpy()
#PMIDs_to_scrape = list(PMIDs_to_scrape)
PMIDs_to_scrape = [l.replace(",", "") for l in PMIDs_to_scrape]
PMIDs_to_scrape = [l.split(' ')for l in PMIDs_to_scrape]

for pmids in PMIDs_to_scrape:
    for pmid in pmids:
        PMIDs.append(pmid)
        



In [318]:
PMIDs = set(PMIDs)


353946

In [328]:
PMIDs

['24071672',
 '21463281',
 '15519768',
 '17944159',
 '12403940',
 '15817679',
 '23336065',
 '15509320',
 '9467544',
 '21975772',
 '22928184',
 '20409726',
 '19675323',
 '19296815',
 '26777263',
 '28763412',
 '15114946',
 '20961763',
 '15745776',
 '24273468',
 '15337162',
 '24044140',
 '15657323',
 '19592786',
 '17275456',
 '21437975',
 '25559097',
 '8043732',
 '24899268',
 '22253233',
 '16108583',
 '10960927',
 '24190696',
 '10968512',
 '28253225',
 '16303575',
 '8830912',
 '8192370',
 '26274338',
 '18062768',
 '17058240',
 '11876575',
 '25008465',
 '21576633',
 '12847385',
 '2001087',
 '12624570',
 '23905877',
 '24019860',
 '11549629',
 '31216376',
 '17197128',
 '29037005',
 '20421121',
 '24068783',
 '7057593',
 '23835180',
 '26543927',
 '10807464',
 '27704001',
 '20049301',
 '10651597',
 '11313084',
 '15557505',
 '14687813',
 '29067332',
 '30060291',
 '26963058',
 '21799064',
 '21071287',
 '29297497',
 '18295629',
 '23991951',
 '16636129',
 '14510321',
 '16399292',
 '23671401',
 '221

In [331]:
import csv
PMIDs = list(PMIDs)

pmid_df = pd.DataFrame(PMIDs, columns = ['PMIDs'])
pmid_df.to_csv('./list_of_PMIDs.csv')

In [346]:
def split_list(alist, wanted_parts=1):
    length = len(alist)
    return [ alist[i*length // wanted_parts: (i+1)*length // wanted_parts] 
             for i in range(wanted_parts) ]

PMIDs_split = split_list(PMIDs,30)


for i, pmid in enumerate(PMIDs_split):
    pd.DataFrame(pmid).to_csv('./list_of_PMIDs'+str(i)+'.csv', index = False)


    

In [350]:
pip install PyPDF2

Collecting PyPDF2
  Downloading https://files.pythonhosted.org/packages/b4/01/68fcc0d43daf4c6bdbc6b33cc3f77bda531c86b174cac56ef0ffdb96faab/PyPDF2-1.26.0.tar.gz (77kB)
Building wheels for collected packages: PyPDF2
  Running setup.py bdist_wheel for PyPDF2: started
  Running setup.py bdist_wheel for PyPDF2: finished with status 'done'
  Stored in directory: C:\Users\Jeffery Rosario\AppData\Local\pip\Cache\wheels\53\84\19\35bc977c8bf5f0c23a8a011aa958acd4da4bbd7a229315c1b7
Successfully built PyPDF2
Installing collected packages: PyPDF2
Successfully installed PyPDF2-1.26.0
Note: you may need to restart the kernel to use updated packages.


'DOSKEY' is not recognized as an internal or external command,
operable program or batch file.
You are using pip version 18.0, however version 20.2.4 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [357]:
import PyPDF2
def isFullPdf(f):
    end_content = ''
    start_content = ''
    size = os.path.getsize(f)
    if size < 1024: return False 
    with open(f, 'rb') as fin: 
        #start content 
        fin.seek(0, 0)
        start_content = fin.read(1024)
        start_content = start_content.decode("ascii", 'ignore' )
        fin.seek(-1024, 2)
        end_content = fin.read()
        end_content = end_content.decode("ascii", 'ignore' )
    start_flag = False
    #%PDF
    if start_content.count('%PDF') > 0:
        start_flag = True
    
        
    if end_content.count('%%EOF') and start_flag > 0:
        return True
    eof = bytes([0])
    eof = eof.decode("ascii")
    if end_content.endswith(eof) and start_flag:
        return True
    return False

isFullPdf('./fetched_pdfs/6360414.pdf')

True

In [249]:
file = './NCT_files/NCT0000xxxx/NCT00000151.xml'
child_list = ['brief_title', 'source', 'overall_status', 'study_type','has_expanded_access','condition',
                'study_first_posted', 'verification_date', 'last_update_posted', 'keyword', 'phase',
              'enrollment', 'intervention_browse', 'PMID']
gchild_list = ['textblock', 'nct_id','observational_model', 'time_perspective', 'intervention_type',
               'is_fda_regulated_drug', 'is_fda_regulated_device', 
               'defined Population', 'intervention_type', 'intervention_name', 
               'gender', 'minimum_age', 'maximum_age','healthy_volunteers', 'country',
               'mesh_term', 'allocation', 'intervention_model', 'primary_purpose', 'masking']
#file = './NCT_files/NCT0458xxxx/NCT04580004.xml'
#df = parse_xml(file)

#tree = etree.parse(file)
#root = tree.getroot()
df = parse_xml(file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 24 columns):
brief_title            1 non-null object
source                 1 non-null object
overall_status         1 non-null object
phase                  1 non-null object
study_type             1 non-null object
has_expanded_access    1 non-null object
condition              1 non-null object
PMID                   1 non-null object
verification_date      1 non-null object
study_first_posted     1 non-null object
last_update_posted     1 non-null object
intervention_browse    1 non-null object
nct_id                 1 non-null object
textblock              1 non-null object
allocation             1 non-null object
primary_purpose        1 non-null object
intervention_type      1 non-null object
intervention_name      1 non-null object
gender                 1 non-null object
minimum_age            1 non-null object
maximum_age            1 non-null object
healthy_volunteers     1 non-null obje

In [239]:
pmids_df = df.loc[:, ['PMID']]
transpose_df = pmids_df.transpose()
np_pmids = transpose_df.to_numpy()
flattened = list(np_pmids.reshape(1,20).flatten())
pmids = ', '.join(flattened)
df['PMIDs'] = pmids

In [240]:
np_pmids = transpose_df.to_numpy()

In [241]:
flattened = list(np_pmids.reshape(1,20).flatten())
pmids = ', '.join(flattened)
df['PMIDs'] = pmids

In [256]:
len(np_pmids)

20

In [358]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 354534 entries, 0 to 0
Data columns (total 33 columns):
PMID                       71530 non-null object
PMIDs                      71530 non-null object
allocation                 271090 non-null object
brief_title                354534 non-null object
condition                  353674 non-null object
country                    318275 non-null object
enrollment                 347826 non-null object
gender                     353673 non-null object
has_expanded_access        348493 non-null object
healthy_volunteers         347490 non-null object
intervention_browse        125840 non-null object
intervention_model         270778 non-null object
intervention_name          317024 non-null object
intervention_type          317024 non-null object
is_fda_regulated_device    130883 non-null object
is_fda_regulated_drug      130891 non-null object
keyword                    231168 non-null object
last_update_posted         354534 non-null obj

In [360]:
dataframe['maximum_age'].value_counts()

N/A           168868
65 Years       22849
80 Years       18246
75 Years       17514
70 Years       14851
               ...  
196 Days           1
41 Days            1
143 Months         1
44 Months          1
30 Hours           1
Name: maximum_age, Length: 449, dtype: int64