# Validation.ipynb
Reproduce XML schema validation and enumerated field results from "Obstacles to the Use of Study Metadata in ClinicalTrials.gov"

In [None]:
from lxml import etree
from io import StringIO
import sys, re, os, pdb, pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Root directory of the ClinicalTrials.gov download: the XML records live under
# AllPublicXML/ and the schema (public.xsd) is read from this directory as well.
root_data_dir = '/Users/lauramiron/Large_files/CTData/'
xml_dir = root_data_dir+'AllPublicXML/'
results_dir = 'results/'
val_results_csv = results_dir+'xml_val_results.csv'

# Collect every *.xml file below xml_dir. The extension is taken with
# os.path.splitext so that names without a dot do not raise IndexError and
# names containing several dots (e.g. "a.b.xml") are still recognised.
xml_files = [
    os.path.join(root, f)
    for root, dirs, files in os.walk(xml_dir)
    for f in files
    if os.path.splitext(f)[1] == '.xml'
]
# Denominator used for the percentages printed in the enumerated-field reports.
NUM_RECORDS = len(xml_files)
# When True, the long-running loops print a periodic progress counter.
print_progress=True

In [None]:
def load_xml(filename_xml):
    """Parse an XML file and return the resulting lxml document.

    Parameters:
        filename_xml: path of the XML file to read.

    Returns:
        The parsed document, or None when the file cannot be read, cannot be
        decoded as text, or is not well-formed XML.
    """
    # Reading happens inside the try so that an unreadable or undecodable file
    # is reported as None instead of aborting a long batch run.
    try:
        with open(filename_xml, 'r') as xml_file:
            xml_to_check = xml_file.read()
    except (OSError, UnicodeDecodeError):
        return None
    try:
        return etree.parse(StringIO(xml_to_check))
    except (OSError, etree.XMLSyntaxError):
        return None

# Schema Validation
Check whether XML records validate against the provided XML schema (public.xsd).

Note: For simplicity, the paper states that all records validate correctly against the schema. In fact, two records out of 302,091 fail to validate. NCT02321462.xml and NCT02493517.xml both contain the value 'non-inferiority' instead of 'Non-Inferiority' for the field 'non_inferiority_type', which is an optional clinical results element.

In [None]:
def open_rmenc(filename):
    """Read a text file and return its content without the XML encoding declaration.

    Only the ``encoding="..."`` attribute of a leading ``<?xml ... ?>``
    declaration is removed; the rest of the declaration and the document are
    returned unchanged. A file without such a declaration is returned as-is.

    NOTE(review): presumably needed because lxml rejects already-decoded text
    that still carries an encoding declaration -- confirm against lxml docs.

    Parameters:
        filename: path of the file to read.

    Returns:
        The file content as a string.
    """
    with open(filename,'r') as f:
        xml_string = f.read()
    declaration = re.match(
        r'<\?xml[^>]+\s+(encoding\s*=\s*["\'][^"\']*["\'])\s*\?>',
        xml_string,
    )
    if declaration is None:
        # No encoding attribute at the start of the file: nothing to strip.
        return xml_string
    # Cut out exactly the matched attribute rather than re-using its text as a
    # regex, which would also delete identical text elsewhere in the document.
    start, end = declaration.span(1)
    return xml_string[:start] + xml_string[end:]
    
def load_schema(filename_xsd):
    """Build an lxml XMLSchema validator from the XSD file at *filename_xsd*.

    The file is read through open_rmenc, parsed as an XML document, and that
    document is handed to etree.XMLSchema.
    """
    xsd_text = open_rmenc(filename_xsd)
    parsed_xsd = etree.parse(StringIO(xsd_text))
    return etree.XMLSchema(parsed_xsd)

def _is_valid_xml(xml_doc, xmlschema):
    try:
        xmlschema.assertValid(xml_doc)
        return True
    except etree.DocumentInvalid as err:
        print(err)
        return False
    except Exception as err:
        print(err)
        return False

In [None]:
# Validate every record in xml_files against public.xsd, append one
# "<file name>,<result>" line per record to val_results_csv, and print totals.
filename_xsd = root_data_dir+'public.xsd'

valid = 0
invalid = 0
malformed = 0

xmlschema = load_schema(filename_xsd)
print("Running validation on ",len(xml_files)," xml files...")
# Make sure the output directory exists before the long run starts.
os.makedirs(os.path.dirname(val_results_csv) or '.', exist_ok=True)
# Open the results file once (still in append mode, so earlier results are
# kept) instead of re-opening it for every record.
with open(val_results_csv,'a+') as of:
    for i, xmlf in enumerate(xml_files):
        if (print_progress==True) and (i % 1000 == 0): print(i,'/',len(xml_files))
        # os.path.basename handles whichever separator os.path.join produced.
        record_name = os.path.basename(xmlf)
        xml_doc = load_xml(xmlf)
        # load_xml signals an unreadable/ill-formed file with None.
        if xml_doc is None:
            of.write(record_name+',Malformed XML\n')
            malformed += 1
        else:
            validates = _is_valid_xml(xml_doc,xmlschema)
            of.write(record_name+','+str(validates)+'\n')
            if validates: valid += 1
            else: invalid += 1
print('Total: ',len(xml_files),', Valid: ',valid,', Invalid: ',invalid,', Malformed: ',malformed)

# Enumerated Fields

In [None]:
# Uncomment the xml_to_dataframe(...) calls at the end of this cell to re-generate the
# pickled dataframes of values for the enumerated fields.
# Our dataframes are already located in CTData
# Directory where xml_to_dataframe writes one pickle file per scraped field.
enumerated_data_dir = 'scraped/'

def xml_to_dataframe(col,recursive=False,stop=None):
    """Scrape one element from every file in xml_files into a pickled DataFrame.

    Parameters:
        col: element name to look up; also used as the DataFrame column name
            and as the stem of the pickle file written to enumerated_data_dir.
        recursive: when True, search the whole document ('.//' + col) instead
            of using col as the search path directly.
        stop: optional index limit; files whose index in xml_files is greater
            than stop are not processed (so at most stop + 1 files are read).

    Returns:
        A DataFrame with columns ['filename', col] holding one row per matching
        element: the file's base name and the element's text.

    Files that cannot be parsed, or that raise while being processed, are
    reported on stdout and skipped.
    """
    data = []
    findstr = './/'+col if recursive else col
    # Create the output directory first so a missing directory is noticed
    # before the scrape rather than when the pickle is written.
    if enumerated_data_dir:
        os.makedirs(enumerated_data_dir, exist_ok=True)
    print(f'Scraping xml to make dataframe for col {col}...')
    for count, xmlf in enumerate(xml_files,0):
        if stop is not None and count>stop:
            break
        try:
            if (print_progress==True) and (count % 10000 == 0): print(f'{count}/{len(xml_files)}')
            xml_doc = load_xml(xmlf)
            if xml_doc is None:
                # load_xml returns None for unreadable or ill-formed files.
                print(f'Skipping {xmlf}: could not be parsed as XML')
                continue
            record_name = os.path.basename(xmlf)
            data.extend((record_name, el.text) for el in xml_doc.findall(findstr))
        except Exception as err:
            # Best effort: report the problem and carry on with the next file.
            print(err)
    df = pd.DataFrame(data,columns=['filename',col])
    df.to_pickle(enumerated_data_dir+col+'.p')
    return df

# scrape xml files, make pandas dataframe for each field that should be an enum
# df1 = xml_to_dataframe('primary_purpose',recursive=True)
# df2 = xml_to_dataframe('intervention_model',recursive=True)
# df3 = xml_to_dataframe('masking',recursive=True)
# df4 = xml_to_dataframe('allocation',recursive=True)
# df5 = xml_to_dataframe('arm_group_type',recursive=True)
# df6 = xml_to_dataframe('observational_model',recursive=True)
# df7 = xml_to_dataframe('time_perspective', recursive=True)

In [None]:
# Allowed values for each enumerated field, as listed on clinicaltrials.gov;
# the scraped XML string values are checked against these lists.
int_model_choices = [
    'Single Group Assignment',
    'Parallel Assignment',
    'Crossover Assignment',
    'Factorial Assignment',
    'Sequential Assignment',
]
mask_choices = [
    'Participant',
    'Care Provider',
    'Investigator',
    'Outcomes Assessor',
    'No Masking',
]
primary_purpose_choices = [
    'Treatment',
    'Prevention',
    'Diagnostic',
    'Supportive Care',
    'Screening',
    'Health Services Research',
    'Basic Science',
    'Device Feasibility',
    'Other',
    'Educational/Counseling/Training',
]
alloc_choices = [
    'N/A',
    'Randomized',
    'Non-Randomized',
]
arm_group_type_choices = [
    'Experimental',
    'Active Comparator',
    'Placebo Comparator',
    'Sham Comparator',
    'No Intervention',
    'Other',
]
obs_model_choices = [
    'Cohort',
    'Case-Control',
    'Case-Only',
    'Case-Crossover',
    'Ecologic or Community',
    'Family-Based',
    'Other',
]
time_perspective_choices = [
    'Prospective',
    'Cross-Sectional',
    'Retrospective',
    'Other',
]


# Directory the pickled per-field dataframes are read from.
enumerated_data_dir = 'scraped/'

def _report_rogue(title, col, is_valid, per_record=False):
    """Print the value counts and rogue-value summary for one enumerated field.

    Parameters:
        title: heading printed before the report.
        col: column to check; also the stem of the pickle file that is read
            from enumerated_data_dir.
        is_valid: callable that takes the column as a Series and returns a
            boolean Series that is True where the value is allowed.
        per_record: when True, count distinct file names among the rogue rows
            instead of counting rows (a record may contain several elements).

    Returns:
        (frame, rogue_rows, num_rogue) for further inspection.
    """
    print(title)
    # NOTE: pickle files can execute code when loaded; only read pickles that
    # were produced locally or come from a trusted source.
    frame = pd.read_pickle(enumerated_data_dir+col+'.p')
    print(frame[col].value_counts())
    rogue = frame[~is_valid(frame[col])]
    num_rogue = rogue['filename'].nunique() if per_record else len(rogue)
    print(f'Num records with rogue values: {num_rogue}, {float(num_rogue)/NUM_RECORDS:.2%}')
    return frame, rogue, num_rogue


# Interventional Study Model
df, df_rogue, num_rogue = _report_rogue(
    'INTERVENTIONAL STUDY MODEL', 'intervention_model',
    lambda values: values.isin(int_model_choices))

# Masking
# Full form: a level name followed by the matching number of masked roles in
# parentheses, or 'None (Open Label)'. Simple form: just the level name.
mask_pattern = r'(None \(Open Label\))|(Single \((Participant(, )?|Care Provider(, )?|Investigator(, )?|Outcomes Assessor(, )?){1}\))|(Double \((Participant(, )?|Care Provider(, )?|Investigator(, )?|Outcomes Assessor(, )?){2}\))|(Triple \((Participant(, )?|Care Provider(, )?|Investigator(, )?|Outcomes Assessor(, )?){3}\))|(Quadruple \((Participant(, )?|Care Provider(, )?|Investigator(, )?|Outcomes Assessor(, )?){4}\))'
simple_mask_pattern = r'(Single|Double|Triple|Quadruple)'
# na=False keeps missing values out of the boolean mask (they count as rogue).
df, df_rogue, num_rogue = _report_rogue(
    'MASKING', 'masking',
    lambda values: (values.str.match(mask_pattern, na=False)
                    | values.str.match(simple_mask_pattern, na=False)))

# Primary Purpose
df, df_rogue, num_rogue = _report_rogue(
    'PRIMARY PURPOSE', 'primary_purpose',
    lambda values: values.isin(primary_purpose_choices))

# Allocation
df, df_rogue, num_rogue = _report_rogue(
    'ALLOCATION', 'allocation',
    lambda values: values.isin(alloc_choices))

# Arm Type
# may be more than one rogue arm_group_type per record, count num records with rogue values
# for consistency with rest of data table
df, df_rogue, num_rogue = _report_rogue(
    'ARM TYPE', 'arm_group_type',
    lambda values: values.isin(arm_group_type_choices),
    per_record=True)

# Observational Model
# may be more than one rogue observational_model per record, count num records with rogue values
# for consistency with rest of data table
df, df_rogue, num_rogue = _report_rogue(
    'OBSERVATIONAL MODEL', 'observational_model',
    lambda values: values.isin(obs_model_choices),
    per_record=True)

# Time Perspective
def _valid_time_perspective(row):
    value = row['time_perspective']
    for term in value.split(','):
        if (term.strip()!= '') and term.strip() not in time_perspective_choices:
            return False
    return True

# Report value counts for time_perspective, then count the rows that
# _valid_time_perspective rejects.
print('TIME PERSPECTIVE')
time_perspective_pickle = enumerated_data_dir+'time_perspective.p'
df = pd.read_pickle(time_perspective_pickle)
print(df['time_perspective'].value_counts())
row_is_valid = df.apply(_valid_time_perspective, axis=1)
df_rogue = df[~row_is_valid]
num_rogue = len(df_rogue)
print(f'Num records with rogue values: {num_rogue}, {float(num_rogue)/NUM_RECORDS:.2%}')