In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re

In [2]:
def search(NCTid, timeout=30):
    """Scrape one ClinicalTrials.gov record into a one-row DataFrame.

    The page ``https://clinicaltrials.gov/ct2/show/<NCTid>`` is downloaded and
    parsed with BeautifulSoup. Every field is located by CSS class and by
    position in the document, so the function is tied to the exact page
    layout it was written against.

    Parameters
    ----------
    NCTid : str
        Identifier appended to the record URL, e.g. ``'NCT01649765'``.
    timeout : float, optional
        Seconds to wait for the HTTP response (default 30).

    Returns
    -------
    pandas.DataFrame
        A single row. ``compound``, ``sponsor``, ``study``, ``phase``,
        ``NCT.id``, ``start.time``, ``online``, ``disease.inclusion`` and
        ``duration.RCT/db.Follow-up`` are filled from the page; every other
        column holds a single space as a placeholder.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    AttributeError, IndexError, TypeError
        If an element the scraper expects is missing or shaped differently.
    """
    online = "https://clinicaltrials.gov/ct2/show/" + NCTid
    # Without a timeout a single stalled connection blocks the whole crawl.
    response = requests.get(online, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    sponsor = soup.find('div', {'class': 'tr-info-text'}).string.strip()

    # Fourth child node of the first data table; both the compound and the
    # phase are read from it, so look it up once.
    table_row = soup.find('table', class_='ct-data_table tr-data_table').contents[3]

    # Keep only spans whose text does not mention "placebo" (any case); the
    # second word of the second such span is used as the compound name.
    non_placebo_spans = table_row.find_all('span', text=re.compile(r'^((?!placebo).)*$', re.I))
    compound = non_placebo_spans[1].string.split(' ')[1]

    # Text after the last "(" in the page title, minus its final character.
    study = soup.find('h1', class_='tr-h1 ct-sans-serif tr-solo_record').string.split('(')[-1][:-1]

    development_stage = table_row.contents[5].span.string
    phase = development_stage.split(' ')[-1]

    # The start year is the last whitespace/comma separated token of the
    # third-from-last info cell; if that token is not numeric, fall back to
    # the second-from-last cell.
    info_cells = soup.find_all('td', headers="studyInfoColData")
    start_time = re.split(r'\s|,', info_cells[-3].string)[-1]
    if not start_time.isdigit():
        start_time = re.split(r'\s|,', info_cells[-2].string)[-1]

    format1 = soup.find('p', text=re.compile(r'Inclusion criteria', re.I))
    if format1:
        # Layout 1: criteria are sibling <p>/<li> elements. Collect them up to
        # the first element mentioning "exclusion" (case-insensitive, matching
        # the case-insensitive patterns used elsewhere in this function).
        inclusion_parts = []
        for node in format1.parent.find_all(['p', 'li']):
            if node.string:
                if 'exclusion' in node.string.lower():
                    break
                inclusion_parts.append(node.string + ';')
        disease_inclusion = ''.join(inclusion_parts)
    else:
        # Layout 2: one <div> holds everything; keep what precedes
        # "exclusion criteria".
        criteria_text = soup.find('div', text=re.compile(r'Inclusion criteria', re.I)).string
        disease_inclusion = re.split(re.compile(r'exclusion criteria', re.I), criteria_text)[0]

    # First "<number> weeks/days/years/months" style phrase in the first list
    # item that follows the "Primary Outcome Measures " label.
    outcome_text = soup.find(text='Primary Outcome Measures ').parent.nextSibling.nextSibling.li.contents[0]
    duration = re.search(
        '([0-9]+)?( |-)?((W|w)eeks?|(D|d)ays?|(Y|y)ears?|(M|m)onths?) ?([0-9]+)?',
        outcome_text,
    ).group(0).strip()

    # NOTE(review): the 'calss' key looks like a typo for 'class'; it is kept
    # unchanged because it becomes a CSV header that existing files already use.
    return pd.DataFrame({
        'key.readouts': [' '],
        'compound': [compound],
        'calss': [' '],
        'development.stage': [' '],
        'brand': [' '],
        'sponsor': [sponsor],
        'study': [study],
        'phase': [phase],
        'NCT.id': [NCTid],
        'start.time': [start_time],
        'online': [online],
        'local': [' '],
        'supp/notes': [' '],
        'disease.inclusion': [disease_inclusion],
        'inclusion.notes': [' '],
        'primary.endpoints': [' '],
        'duration.RCT/db.Follow-up': [duration],
    })

In [250]:
# ClinicalTrials.gov identifiers to crawl for the `sle` table.
sle_list = [
    'NCT01649765', 'NCT01345253', 'NCT00424476', 'NCT00410384', 'NCT00071487', 'NCT01632241',
    'NCT01705977', 'NCT01484496', 'NCT02446912', 'NCT02446899', 'NCT01438489',
    'NCT02708095', 'NCT02962960', 'NCT02847598', 'NCT02804763', 'NCT02504645', 'NCT01135459',
    'NCT02885610', 'NCT02349061', 'NCT02908100', 'NCT03161483', 'NCT02660944',
    'NCT02725515', 'NCT02554019', 'NCT02472795', 'NCT02465580', 'NCT02665364', 'NCT01972568',
    'NCT00624338', 'NCT00137969', 'NCT02975336', 'NCT02533570', 'NCT00119678',
    'NCT02437890', 'NCT01777256', 'NCT02265744', 'NCT01395745', 'NCT01162681', 'NCT01405196',
    'NCT01283139', 'NCT01205438', 'NCT01196091', 'NCT01262365', 'NCT01261793',
    'NCT00111306', 'NCT00383214', 'NCT00624351', 'NCT00962832', 'NCT00539838',
]

# Fetch every trial in order and stack the one-row frames in a single concat.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# each iteration; concat keeps the same rows, column order and index values.
sle = pd.concat([search(nct_id) for nct_id in sle_list])

In [251]:
# Every row of `sle` carries index 0 (one-row frames stacked without
# re-indexing), so the index is omitted, as in the RRMS export.
sle.to_csv('SLE_webcrawler.csv', index=False)

In [3]:
# ClinicalTrials.gov identifiers to crawl for the `rrms` table.
rrms_list = [
    'NCT00676715', 'NCT01247324', 'NCT01412333', 'NCT00420212', 'NCT00451451', 'NCT01838668',
    'NCT00835770', 'NCT00168701', 'NCT02634307', 'NCT03093324', 'NCT00906399',
    'NCT00027300', 'NCT00030966', 'NCT01405820', 'NCT01440101', 'NCT00213135', 'NCT00436826',
    'NCT02294058', 'NCT01628393', 'NCT02047734', 'NCT02425644', 'NCT02907177',
    'NCT01006265', 'NCT01457924', 'NCT00640328', 'NCT02792218', 'NCT02792231', 'NCT01185821',
    'NCT00879658', 'NCT01665144', 'NCT00289978', 'NCT00355134', 'NCT00537082',
    'NCT00333138', 'NCT00340834',
]

# Fetch every trial in order and stack the one-row frames in a single concat.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# each iteration; concat keeps the same rows, column order and index values.
rrms = pd.concat([search(nct_id) for nct_id in rrms_list])

In [7]:
# Write the RRMS table to disk without the row index.
rrms.to_csv(path_or_buf='RRMS_webcrawler.csv', index=False)