In [1]:
import requests
import pandas as pd
from datetime import date
import sqlalchemy as db

In [2]:
# Date stamp (MM/DD/YYYY) stored with every extracted row as "statusDate".
today = date.today()
dateval = today.strftime("%m/%d/%Y")

# Paging window: studies are requested by rank, page_size ranks per request,
# up to (and including) rank `total`.
page_size = 100
min_rank = 1
max_rnk = page_size
total = 1000
responses = []

# `<=` so the final window (ranks 901-1000) is requested as well.
while max_rnk <= total:
    url = f"https://clinicaltrials.gov/api/query/full_studies?expr=COVID-19+AND+SEARCH%5BLocation%5D%28AREA%5BLocationCountry%5D+United+States%29&min_rnk={min_rank}&max_rnk={max_rnk}&fmt=json"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Keep whatever was collected so far, but say why paging stopped
        # instead of failing silently.
        print(f"Stopped fetching at ranks {min_rank}-{max_rnk}: {exc}")
        break

    studies = payload.get('FullStudiesResponse', {}).get('FullStudies', [])
    if not studies:
        # No studies in this window: nothing further to page through.
        # NOTE(review): presumably the API omits 'FullStudies' once the ranks
        # run past the last match - confirm against the API documentation.
        break

    responses.extend(studies)
    min_rank += page_size
    max_rnk += page_size




In [3]:
# Number of raw study records collected into `responses`.
len(responses)

194

In [4]:
# Inspect the first collected record to see how a study is nested.
responses[0]

{'Rank': 1,
 'Study': {'ProtocolSection': {'IdentificationModule': {'NCTId': 'NCT04349202',
    'OrgStudyIdInfo': {'OrgStudyId': '2020-134'},
    'Organization': {'OrgFullName': 'William Beaumont Hospitals',
     'OrgClass': 'OTHER'},
    'BriefTitle': 'Beaumont Health Large-scale Automated Serologic Testing for COVID-19',
    'OfficialTitle': 'Beaumont Health Large-scale Automated Serologic Testing for COVID-19',
    'Acronym': 'BLAST COVID-19'},
   'StatusModule': {'StatusVerifiedDate': 'April 2020',
    'OverallStatus': 'Recruiting',
    'ExpandedAccessInfo': {'HasExpandedAccess': 'No'},
    'StartDateStruct': {'StartDate': 'April 2020',
     'StartDateType': 'Anticipated'},
    'PrimaryCompletionDateStruct': {'PrimaryCompletionDate': 'June 2021',
     'PrimaryCompletionDateType': 'Anticipated'},
    'CompletionDateStruct': {'CompletionDate': 'June 2021',
     'CompletionDateType': 'Anticipated'},
    'StudyFirstSubmitDate': 'April 13, 2020',
    'StudyFirstSubmitQCDate': 'April 13,

In [5]:
# Flatten each raw study record into three row lists:
#   main_list          - one dict per study (identification, status, first contact,
#                        first location, ...)
#   intervention_list  - one dict per intervention of a kept study
#   phase_list         - one dict per phase of a kept study
# A study is kept only when every field of the main row is present.
main_list = []
intervention_list = []
phase_list = []

# Errors that a missing key, an empty list, or a value of an unexpected type
# raise during the nested lookups below. Anything else is allowed to propagate.
missing_field_errors = (KeyError, IndexError, TypeError)

for study_record in responses:
    try:
        protocol = study_record["Study"]["ProtocolSection"]
        identification = protocol["IdentificationModule"]
        status = protocol["StatusModule"]
        contacts_locations = protocol["ContactsLocationsModule"]
        # Only the first central contact and the first location are recorded.
        contact = contacts_locations["CentralContactList"]["CentralContact"][0]
        location = contacts_locations["LocationList"]["Location"][0]

        nctid = identification["NCTId"]
        main_dict = {
            "nctid": nctid,
            "brieftitle": identification["BriefTitle"],
            "start_date": status["StartDateStruct"]["StartDate"],
            "completiondate": status["CompletionDateStruct"]["CompletionDate"],
            "overallstatus": status["OverallStatus"],
            "leadsponsor": protocol["SponsorCollaboratorsModule"]["LeadSponsor"]["LeadSponsorName"],
            "contactname": contact["CentralContactName"],
            "contactphone": contact["CentralContactPhone"],
            "contactemail": contact["CentralContactEMail"],
            "briefsummary": protocol["DescriptionModule"]["BriefSummary"],
            "enrollmentcount": protocol["DesignModule"]["EnrollmentInfo"]["EnrollmentCount"],
            "primaryoutcomedesc": protocol["OutcomesModule"]["PrimaryOutcomeList"]["PrimaryOutcome"][0]["PrimaryOutcomeMeasure"],
            "facility": location["LocationFacility"],
            "locationcity": location["LocationCity"],
            "locationstate": location["LocationState"],
            "locationzip": location["LocationZip"],
            "locationcountry": location["LocationCountry"],
            "statusDate": dateval,
        }
    except missing_field_errors:
        # Incomplete study: leave it out of all three tables.
        continue
    main_list.append(main_dict)

    # Interventions are optional. Their absence must not prevent the phases
    # below from being recorded for this study.
    try:
        interventions = protocol["ArmsInterventionsModule"]["InterventionList"]["Intervention"]
    except missing_field_errors:
        interventions = []
    for intervention in interventions:
        try:
            intervention_dict = {
                "nctid": nctid,
                "interventionname": intervention["InterventionName"],
                "interventiontype": intervention["InterventionType"],
                "interventiondesc": intervention["InterventionDescription"],
                "statusDate": dateval,
            }
        except missing_field_errors:
            # Skip only the incomplete intervention, not the rest of the study.
            continue
        intervention_list.append(intervention_dict)

    # Phases are optional as well.
    try:
        for phase in protocol["DesignModule"]["PhaseList"]["Phase"]:
            phase_list.append({
                "nctid": nctid,
                "phase": phase,
                "statusDate": dateval,
            })
    except missing_field_errors:
        pass


In [6]:
# Number of studies that produced a row in main_list.
trials_found = len(main_list)
trials_found

153

In [7]:
# Turn each list of row dicts into its own DataFrame (studies, interventions, phases).
covid_df, intervention_df, phase_df = (
    pd.DataFrame(rows) for rows in (main_list, intervention_list, phase_list)
)

In [8]:
# Preview the first rows of the per-study table.
covid_df.head()

Unnamed: 0,nctid,brieftitle,start_date,completiondate,overallstatus,leadsponsor,contactname,contactphone,contactemail,briefsummary,enrollmentcount,primaryoutcomedesc,facility,locationcity,locationstate,locationzip,locationcountry,statusDate
0,NCT04349202,Beaumont Health Large-scale Automated Serologi...,April 2020,June 2021,Recruiting,William Beaumont Hospitals,"Maureen Cooney, RN, BSN",248-551-0099,Maureen.Cooney@beaumont.org,The purpose of this study is to determine how ...,50000,Prevalence COVID antibodies in employees of Be...,Beaumont Health System,Royal Oak,Michigan,48073,United States,04/26/2020
1,NCT04329832,Hydroxychloroquine vs. Azithromycin for Hospit...,"March 30, 2020","December 31, 2021",Recruiting,"Intermountain Health Care, Inc.","Valerie T Aston, MBA",8015074606,Valerie.Aston@imail.org,This study will compare two drugs (hydroxychlo...,300,COVID Ordinal Outcomes Scale at 14 days,Intermountain Medical Center,Murray,Utah,84107,United States,04/26/2020
2,NCT04334382,Hydroxychloroquine vs. Azithromycin for Outpat...,"April 2, 2020","December 31, 2021",Recruiting,"Intermountain Health Care, Inc.","Valerie T Aston, MBA",8015074606,Valerie.Aston@imail.org,This study will compare two drugs (hydroxychlo...,1550,Hospitalization within 14 days of enrollment,Intermountain Medical Center,Murray,Utah,84107,United States,04/26/2020
3,NCT04339998,Assessment of Exam Findings in Coronavirus Dis...,"April 15, 2020",October 2020,Not yet recruiting,University of Minnesota,"Matthew Yocum, MD",612-626-8015,yocum007@umn.edu,Specific Aims:\n\nThe investigators will prosp...,500,POCUS Score - Lungs,University of Minnesota Medical Center (UMMC),Minneapolis,Minnesota,55455,United States,04/26/2020
4,NCT04331366,Bidirectional Oxygenation Valve in the Managem...,"April 8, 2020",May 2020,Recruiting,Emory University,"Jeffrey Miller, MD",404-778-7200,jeffrey.miller@emory.edu,The objective of this study is to determine th...,5,Change in Oxygen Saturation by Pulse Oximetry,Emory University Hospital,Atlanta,Georgia,30322,United States,04/26/2020


In [9]:
# Preview the first rows of the phase table (one row per study/phase pair).
phase_df.head()

Unnamed: 0,nctid,phase,statusDate
0,NCT04329832,Phase 2,04/26/2020
1,NCT04334382,Phase 3,04/26/2020
2,NCT04331366,Not Applicable,04/26/2020
3,NCT04342663,Phase 2,04/26/2020
4,NCT04351243,Phase 2,04/26/2020


In [10]:
# Preview the first rows of the intervention table (one row per study/intervention pair).
intervention_df.head()

Unnamed: 0,nctid,interventionname,interventiontype,interventiondesc,statusDate
0,NCT04349202,EUROIMMUN assay,Diagnostic Test,Serology testing to detect SARS-CoV-2 antibodies,04/26/2020
1,NCT04329832,Hydroxychloroquine,Drug,Patients in the hydroxychloroquine arm will re...,04/26/2020
2,NCT04329832,Azithromycin,Drug,Patients in the azithromycin arm will receive ...,04/26/2020
3,NCT04334382,Hydroxychloroquine,Drug,Patients in the hydroxychloroquine arm will re...,04/26/2020
4,NCT04334382,Azithromycin,Drug,Patients in the azithromycin arm will receive ...,04/26/2020


In [11]:
# Per-column summary of the per-study table.
covid_df.describe()

Unnamed: 0,nctid,brieftitle,start_date,completiondate,overallstatus,leadsponsor,contactname,contactphone,contactemail,briefsummary,enrollmentcount,primaryoutcomedesc,facility,locationcity,locationstate,locationzip,locationcountry,statusDate
count,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153,153
unique,153,152,54,89,2,97,144,143,143,153,77,150,120,57,28,89,2,1
top,NCT04322682,COVID-19 Plasma Collection,April 2020,April 2021,Recruiting,Massachusetts General Hospital,"Sabine Hazan, MD",312-947-0065,Valerie.Aston@imail.org,This is a clinical study for the prevention of...,500,Mortality,Massachusetts General Hospital,New York,New York,10032,United States,04/26/2020
freq,1,2,24,10,107,6,3,2,2,1,9,3,6,15,22,6,152,153


In [12]:
# Check whether any cell in covid_df is null; yields a single boolean for the whole frame.
covid_df.isnull().values.any()

False

In [13]:
# look at datatypes
# covid_df.dtypes

In [14]:
def drop_day(date_text):
    """Return `date_text` without its day-of-month token.

    A value made of more than two whitespace-separated tokens, such as
    "March 30, 2020", loses its second token and becomes "March 2020".
    Values with one or two tokens ("April 2020") are returned with their
    tokens joined by single spaces. Non-string values are returned unchanged.
    """
    if not isinstance(date_text, str):
        return date_text
    tokens = date_text.split()
    if len(tokens) > 2:
        tokens = tokens[:1] + tokens[2:]
    return " ".join(tokens)


# Raw date column -> name of the day-less column that replaces it.
date_columns = {"start_date": "starting_date", "completiondate": "completion_date"}

# Only convert raw columns that are still present, so running this cell a
# second time is a no-op instead of a KeyError on the already-dropped columns.
pending = {raw: clean for raw, clean in date_columns.items() if raw in covid_df.columns}

covid_df = (
    covid_df
    .assign(**{clean: covid_df[raw].map(drop_day) for raw, clean in pending.items()})
    .drop(columns=list(pending))
)
covid_df


Unnamed: 0,nctid,brieftitle,overallstatus,leadsponsor,contactname,contactphone,contactemail,briefsummary,enrollmentcount,primaryoutcomedesc,facility,locationcity,locationstate,locationzip,locationcountry,statusDate,starting_date,completion_date
0,NCT04349202,Beaumont Health Large-scale Automated Serologi...,Recruiting,William Beaumont Hospitals,"Maureen Cooney, RN, BSN",248-551-0099,Maureen.Cooney@beaumont.org,The purpose of this study is to determine how ...,50000,Prevalence COVID antibodies in employees of Be...,Beaumont Health System,Royal Oak,Michigan,48073,United States,04/26/2020,April 2020,June 2021
1,NCT04329832,Hydroxychloroquine vs. Azithromycin for Hospit...,Recruiting,"Intermountain Health Care, Inc.","Valerie T Aston, MBA",8015074606,Valerie.Aston@imail.org,This study will compare two drugs (hydroxychlo...,300,COVID Ordinal Outcomes Scale at 14 days,Intermountain Medical Center,Murray,Utah,84107,United States,04/26/2020,March 2020,December 2021
2,NCT04334382,Hydroxychloroquine vs. Azithromycin for Outpat...,Recruiting,"Intermountain Health Care, Inc.","Valerie T Aston, MBA",8015074606,Valerie.Aston@imail.org,This study will compare two drugs (hydroxychlo...,1550,Hospitalization within 14 days of enrollment,Intermountain Medical Center,Murray,Utah,84107,United States,04/26/2020,April 2020,December 2021
3,NCT04339998,Assessment of Exam Findings in Coronavirus Dis...,Not yet recruiting,University of Minnesota,"Matthew Yocum, MD",612-626-8015,yocum007@umn.edu,Specific Aims:\n\nThe investigators will prosp...,500,POCUS Score - Lungs,University of Minnesota Medical Center (UMMC),Minneapolis,Minnesota,55455,United States,04/26/2020,April 2020,October 2020
4,NCT04331366,Bidirectional Oxygenation Valve in the Managem...,Recruiting,Emory University,"Jeffrey Miller, MD",404-778-7200,jeffrey.miller@emory.edu,The objective of this study is to determine th...,5,Change in Oxygen Saturation by Pulse Oximetry,Emory University Hospital,Atlanta,Georgia,30322,United States,04/26/2020,April 2020,May 2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,NCT04333732,CROWN CORONATION: Chloroquine RepurpOsing to h...,Not yet recruiting,Washington University School of Medicine,"Linda Yun, BS",314-273-2240,lindayun@wustl.edu,Healthcare workers are at the frontline of the...,55000,Symptomatic COVID-19,Washington University School of Medicine,Saint Louis,Missouri,63110,United States,04/26/2020,April 2020,February 2021
149,NCT04320472,Acute Encephalopathy in Critically Ill Patient...,Recruiting,Ictal Group,"Stephane LEGRIEL, MD, PhD",33139638839,neurocovid19study@ictalgroup.org,Infection with SARS-CoV-2 or severe acute resp...,250,prevalence,Jackson Memorial Health System; University of ...,Miami,Florida,33136,United States,04/26/2020,March 2020,December 2020
150,NCT03323463,Major De-escalation to 30 Gy for Select Human ...,Recruiting,Memorial Sloan Kettering Cancer Center,"Nancy Lee, MD",212-639-3341,leen2@mskcc.org,The purpose of this study is to demonstrate th...,300,Effectiveness of study treatment for participa...,Memoral Sloan Kettering Basking Ridge,Basking Ridge,New Jersey,07920,United States,04/26/2020,October 2017,October 2021
151,NCT02915198,Investigation of Metformin in Pre-Diabetes on ...,Recruiting,VA Office of Research and Development,"Gregory G Schwartz, PhD MD",(720) 723-6070,Gregory.Schwartz@va.gov,This research will help us to learn if the med...,7868,"Time in days to death, non-fatal myocardial in...","Phoenix VA Health Care System, Phoenix, AZ",Phoenix,Arizona,85012,United States,04/26/2020,February 2019,August 2024


In [15]:
# SQLite database file that receives the three tables.
engine = db.create_engine('sqlite:///COVID-Clinical-Trials.sqlite')

metadata = db.MetaData()

# (table name, DataFrame) pairs; each table is replaced if it already exists.
tables_to_write = (
    ("COVID_ClinicalTrials", covid_df),
    ("Interventions", intervention_df),
    ("Phase_Recordings", phase_df),
)
for table_name, frame in tables_to_write:
    frame.to_sql(table_name, con=engine, if_exists="replace")

In [16]:
# Open a connection and select each distinct (city, state, zip) combination
# from the per-study table. The connection is left open here; it is closed
# after the rows have been read.
connection = engine.connect()
# Wrap the SQL in text(): passing a plain string to execute() is deprecated in
# SQLAlchemy 1.4 and rejected in 2.0, while text() works on older versions too.
loclist = connection.execute(
    db.text("SELECT DISTINCT locationcity, locationstate, locationzip FROM COVID_ClinicalTrials")
)


In [17]:
# Show the result object returned by connection.execute (only its repr is displayed here).
loclist

<sqlalchemy.engine.result.ResultProxy at 0x1847199cac8>

In [18]:
# Convert every result row into its own {column: value} dict.
a = []
try:
    for rowproxy in loclist:
        # Newer SQLAlchemy rows expose their column->value view as `_mapping`;
        # older row objects provide .items() themselves, so fall back to the row.
        row_mapping = getattr(rowproxy, "_mapping", rowproxy)
        # A fresh dict per row, so no value can carry over from the previous row.
        a.append(dict(row_mapping.items()))
finally:
    # Release the connection even if reading the rows fails part-way.
    connection.close()
a

[{'locationcity': 'Royal Oak',
  'locationstate': 'Michigan',
  'locationzip': '48073'},
 {'locationcity': 'Murray', 'locationstate': 'Utah', 'locationzip': '84107'},
 {'locationcity': 'Minneapolis',
  'locationstate': 'Minnesota',
  'locationzip': '55455'},
 {'locationcity': 'Atlanta',
  'locationstate': 'Georgia',
  'locationzip': '30322'},
 {'locationcity': 'Belleville',
  'locationstate': 'Illinois',
  'locationzip': '62220'},
 {'locationcity': 'New York',
  'locationstate': 'New York',
  'locationzip': '10029'},
 {'locationcity': 'Austin', 'locationstate': 'Texas', 'locationzip': '78705'},
 {'locationcity': 'Durham',
  'locationstate': 'North Carolina',
  'locationzip': '27710'},
 {'locationcity': 'Scottsdale',
  'locationstate': 'Arizona',
  'locationzip': '85254'},
 {'locationcity': 'Hackensack',
  'locationstate': 'New Jersey',
  'locationzip': '07601'},
 {'locationcity': 'Honolulu',
  'locationstate': 'Hawaii',
  'locationzip': '96813'},
 {'locationcity': 'San Francisco',
  'l

In [19]:
# Build one "City, State Zip" search string per distinct location.
# An empty `a` simply yields an empty list.
querylist = [
    f"{loc['locationcity']}, {loc['locationstate']} {loc['locationzip']}"
    for loc in a
]
querylist

['Royal Oak, Michigan 48073',
 'Murray, Utah 84107',
 'Minneapolis, Minnesota 55455',
 'Atlanta, Georgia 30322',
 'Belleville, Illinois 62220',
 'New York, New York 10029',
 'Austin, Texas 78705',
 'Durham, North Carolina 27710',
 'Scottsdale, Arizona 85254',
 'Hackensack, New Jersey 07601',
 'Honolulu, Hawaii 96813',
 'San Francisco, California 94022',
 'Chicago, Illinois 60452',
 'Philadelphia, Pennsylvania 19107',
 'Chicago, Illinois 60637',
 'Kansas City, Kansas 66160',
 'Northglenn, Colorado 80260',
 'Seattle, Washington 98195',
 'Aurora, Colorado 80045',
 'New York, New York 10032',
 'New York, New York 10022',
 'Nashville, Tennessee 37232',
 'Stanford, California 94305',
 'Philadelphia, Pennsylvania 19104',
 'Scottsdale, Arizona 85258',
 'Houston, Texas 77027',
 'Bethesda, Maryland 20892',
 'Kansas City, Missouri 64111',
 'Ventura, California 93003',
 'Boston, Massachusetts 02114',
 'New York, New York 10016',
 'San Francisco, California 94143',
 'Saint Louis, Missouri 63110',
 

In [20]:
# Look up latitude/longitude for every location string in `querylist`.
# KEY is the geocoding API key, kept in a local `info` module rather than in the notebook.
from info import KEY
url = "http://open.mapquestapi.com/geocoding/v1/address?"
results = []

for location_query in querylist:
    # Let requests build and percent-encode the query string (the location
    # contains spaces and commas) instead of formatting it by hand.
    response = requests.get(
        url,
        params={"key": KEY, "location": location_query},
        timeout=30,
    )
    # Fail loudly on an HTTP error (e.g. a rejected key) rather than on a
    # confusing lookup error further down.
    response.raise_for_status()
    payload = response.json()

    try:
        first_result = payload["results"][0]
        lat_lng = first_result["locations"][0]["latLng"]
        latitude = lat_lng["lat"]
        longitude = lat_lng["lng"]
        # Despite the key name, this is the location string the service reports
        # as provided, i.e. the full "City, State Zip" text.
        locationzip = first_result["providedLocation"]["location"]
    except (KeyError, IndexError):
        # No usable match for this location: skip it and keep the rest.
        print(f"No geocoding result for {location_query!r}; skipping")
        continue

    results.append({
        "locationzip": locationzip,
        "longitude": longitude,
        "latitude": latitude
    })
results

[{'locationzip': 'Royal Oak, Michigan 48073',
  'longitude': -83.16537,
  'latitude': 42.510516},
 {'locationzip': 'Murray, Utah 84107',
  'longitude': -111.888417,
  'latitude': 40.666784},
 {'locationzip': 'Minneapolis, Minnesota 55455',
  'longitude': -93.265469,
  'latitude': 44.9773},
 {'locationzip': 'Atlanta, Georgia 30322',
  'longitude': -84.390185,
  'latitude': 33.749099},
 {'locationzip': 'Belleville, Illinois 62220',
  'longitude': -89.983993,
  'latitude': 38.52005},
 {'locationzip': 'New York, New York 10029',
  'longitude': -73.986614,
  'latitude': 40.730646},
 {'locationzip': 'Austin, Texas 78705',
  'longitude': -97.727404,
  'latitude': 30.288272},
 {'locationzip': 'Durham, North Carolina 27710',
  'longitude': -78.901805,
  'latitude': 35.996653},
 {'locationzip': 'Scottsdale, Arizona 85254',
  'longitude': -111.899236,
  'latitude': 33.509122},
 {'locationzip': 'Hackensack, New Jersey 07601',
  'longitude': -74.043474,
  'latitude': 40.885933},
 {'locationzip': 'H

In [21]:
# One row per geocoded location. Note that the "locationzip" column holds the
# whole "City, State Zip" string, not just the zip code.
results_df = pd.DataFrame(results)
results_df.head()

Unnamed: 0,locationzip,longitude,latitude
0,"Royal Oak, Michigan 48073",-83.16537,42.510516
1,"Murray, Utah 84107",-111.888417,40.666784
2,"Minneapolis, Minnesota 55455",-93.265469,44.9773
3,"Atlanta, Georgia 30322",-84.390185,33.749099
4,"Belleville, Illinois 62220",-89.983993,38.52005


In [22]:
# Engine for the same SQLite file the trial tables were written to.
engine = db.create_engine('sqlite:///COVID-Clinical-Trials.sqlite')

# NOTE(review): metadata is not used by any code visible in this notebook.
metadata = db.MetaData()

# Store the geocoding results as table "lat_long", replacing it if it already exists.
results_df.to_sql("lat_long", con=engine, if_exists="replace")