In [1]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import time
import pandas as pd
import numpy as np
import glob

In [2]:
# Pull in results from previous webscraping to get drugs.
# The first CSV column is used as the row index of the dataframe.
medsDF = pd.read_csv('Medications_SideEffects.csv', index_col=0)

In [3]:
# Discard medications without any side-effect information, i.e. rows where all
# three side-effect columns are missing; the remaining gaps become empty strings.
# DISCLAIMER: All of the medications dropped have been checked, and they all
# have fewer than 30 reviews on webMD (and are for schizophrenia).
medsDF = (
    medsDF
    .dropna(
        axis=0,
        how='all',
        subset=['More common', 'Less common', 'Incidence not known'],
    )
    .fillna('')
)

In [4]:
# Replace the current index with a fresh default integer index so that rows
# appended to the end of the dataframe later do not break the organization.
# drop=True discards the old index directly instead of first inserting it as
# an 'index' column that then has to be dropped again (which also fails if the
# old index happens to carry a name).
medsDF = medsDF.reset_index(drop=True)

In [5]:
# Hand-tuned renames so the names match what webMD uses.
# Each write goes through a single .loc[row, column] call: chained indexing
# (medsDF.loc[ind]['Medication'] = ...) assigns into a temporary row object and
# is not guaranteed to change medsDF itself.

# Renaming Daytrana because the plain name had issues on webMD
ind = medsDF[medsDF['Medication'].eq('Daytrana')].index[0]
medsDF.loc[ind, 'Medication'] = 'Daytrana transdermal'

# Renaming lithium, because webMD calls it lithium carbonate
ind = medsDF[medsDF['Medication'].eq('Lithium')].index[0]
medsDF.loc[ind, 'Medication'] = 'Lithium carbonate'

In [6]:
# Next, we want to pull the detailed "Drug Results" pages from WebMD
def searchWebMD(searchterm, rootUrl='https://www.webmd.com/drugs/2/search?type=drugs&query='):
    """Run a WebMD drug search and collect the result links.

    Parameters
    ----------
    searchterm : str
        Medication name to search for. Spaces are replaced with hyphens
        before the term is appended to ``rootUrl``.
    rootUrl : str
        Search URL prefix; the hyphenated search term is appended to it.

    Returns
    -------
    (links, medications)
        ``(False, False)`` when the page contains a ``<p class="no-matches">``
        element. Otherwise two parallel lists: the absolute URL of every
        result link and the text sliced out of the same link. Both lists are
        empty if the page has no result links.
    """
    # str.replace returns a new string, so the result must be kept; calling it
    # without assignment (as before) left the spaces in the query.
    searchterm = searchterm.replace(' ', '-')
    response = requests.get(rootUrl + searchterm)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Checking if there are matches for the drug
    if soup.find('p', attrs={'class': 'no-matches'}):
        return False, False

    # Going through every match and collecting link + name in parallel lists
    links = []
    medications = []

    content = soup.find('ul', attrs={'class': 'exact-match'})
    if not content:
        content = soup  # In case there is no 'exact match' section

    for link in content.find_all('a', href=True):
        markup = str(link)
        # Only anchors whose markup contains 'result_' are search results
        if markup.find('result_') == -1:
            continue

        # Name: the text between the first details"> (expected to close the
        # opening tag) and the closing </a>
        medications.append(markup[markup.find('details">') + 9:markup.rfind('</a>')])

        # Link: from just after href=" up to the last double quote in the markup
        href = markup[markup.find('href="') + 6:markup.rfind('"')]
        links.append('https://www.webmd.com' + href)

    return links, medications


In [7]:
# Going through each medication and finding every search result, both for the
# medication name itself and for each of its alternate names.
webMDParent = []
webMDExactMeds = []
for medication, alt_names in zip(medsDF['Medication'], medsDF['Alternate names']):
    AllLinks = []
    AllExact = []

    # Searching for the medication itself
    searchLinks, exactMeds = searchWebMD(medication)
    if searchLinks:
        print(searchLinks)
        AllLinks.extend(searchLinks)
        AllExact.extend(exactMeds)

    # Alternate names are always queried too (comma-separated, spaces removed),
    # whether or not the main search already found something
    if alt_names:
        for name in alt_names.split(','):
            searchLinks, exactMeds = searchWebMD(name.replace(' ', ''))
            if searchLinks:
                print(searchLinks)
                AllLinks.extend(searchLinks)
                AllExact.extend(exactMeds)
            time.sleep(0.1)

    # Nothing found under any name: store a single NaN instead of a list.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    if not AllLinks:
        AllLinks = np.nan
        AllExact = np.nan

    webMDParent.append(AllLinks)
    webMDExactMeds.append(AllExact)
    time.sleep(0.1)

medsDF['Parent links'] = webMDParent
medsDF['Exact medications'] = webMDExactMeds

['https://www.webmd.com/drugs/2/drug-12114-94/methylphenidate-hcl/details', 'https://www.webmd.com/drugs/2/drug-19735/methylphenidate-powder/details', 'https://www.webmd.com/drugs/2/drug-12114-4094/methylphenidate-hcl-cd/details', 'https://www.webmd.com/drugs/2/drug-12114-439/methylphenidate-hcl-solution/details', 'https://www.webmd.com/drugs/2/drug-12114-438/methylphenidate-hcl-tablet-chewable/details', 'https://www.webmd.com/drugs/2/drug-144064-641/methylphenidate-patch-24-hours/details', 'https://www.webmd.com/drugs/2/drug-12114-8094/methylphenidate-er-tablet/details', 'https://www.webmd.com/drugs/2/drug-12114-5094/methylphenidate-er-tablet-24-hr/details', 'https://www.webmd.com/drugs/2/drug-173889-1850/methylphenidate-tablet-disintegrating-er-biphasic-24-hr-tablet-disintegrating-er-hr/details', 'https://www.webmd.com/drugs/2/drug-12114-1516/methylphenidate-suspension-24-hr-reconstituted-suspension-er-reconstituted/details']
['https://www.webmd.com/drugs/2/drug-16869-439/methylin/de

['https://www.webmd.com/drugs/2/drug-1774-95/fluoxetine-hcl/details', 'https://www.webmd.com/drugs/2/drug-1774-5095/fluoxetine-dr/details', 'https://www.webmd.com/drugs/2/drug-78206-1274/olanzapine-fluoxetine-hcl/details', 'https://www.webmd.com/drugs/2/drug-94049/fluoxetine-bulk-100-powder/details', 'https://www.webmd.com/drugs/2/drug-148135/fluoxetine-dietary-supp-no-8-capsule/details', 'https://www.webmd.com/drugs/2/drug-148137/fluoxetine-dietary-supp-no-17-capsule/details']
['https://www.webmd.com/drugs/2/drug-1774-95/fluoxetine-hcl/details', 'https://www.webmd.com/drugs/2/drug-1774-5095/fluoxetine-dr/details', 'https://www.webmd.com/drugs/2/drug-78206-1274/olanzapine-fluoxetine-hcl/details', 'https://www.webmd.com/drugs/2/drug-94049/fluoxetine-bulk-100-powder/details', 'https://www.webmd.com/drugs/2/drug-148135/fluoxetine-dietary-supp-no-8-capsule/details', 'https://www.webmd.com/drugs/2/drug-148137/fluoxetine-dietary-supp-no-17-capsule/details']
['https://www.webmd.com/drugs/2/dr

['https://www.webmd.com/drugs/2/drug-8876-140/buspirone-hcl/details', 'https://www.webmd.com/drugs/2/drug-74939/buspirone-bulk-100-powder/details']
['https://www.webmd.com/drugs/2/drug-920-6006/klonopin/details', 'https://www.webmd.com/drugs/2/drug-920-4005/klonopin-tablet-disintegrating/details']
['https://www.webmd.com/drugs/2/drug-14403-6006/clonazepam/details', 'https://www.webmd.com/drugs/2/drug-14403-4005/clonazepam-tablet-disintegrating/details', 'https://www.webmd.com/drugs/2/drug-155265/clonazepam-bulk-100-powder/details']
['https://www.webmd.com/drugs/2/drug-9824-7244/xanax/details', 'https://www.webmd.com/drugs/2/drug-75324-367/xanax-xr/details']
['https://www.webmd.com/drugs/2/drug-8171-7244/alprazolam/details', 'https://www.webmd.com/drugs/2/drug-8171-367/alprazolam-er/details', 'https://www.webmd.com/drugs/2/drug-8171-510/alprazolam-odt/details', 'https://www.webmd.com/drugs/2/drug-8171-6367/alprazolam-concentrate/details', 'https://www.webmd.com/drugs/2/drug-8115-6367/al

In [8]:
# Some webMD pages don't have anything under Drug Results, they just land on the
# med page (ugh)...fixing by hand.
# Maps the medication name as it appears in medsDF to
# (detail page URL, exact medication name).
manual_parent_links = {
    'Citalopram': ('https://www.webmd.com/drugs/2/drug-1701/citalopram-oral/details',
                   'citalopram-oral'),
    # This name carries a trailing space in the data; it is stripped below
    'Escitalopram ': ('https://www.webmd.com/drugs/2/drug-63989/escitalopram-oxalate-oral/details',
                      'escitalopram-oxalate-oral'),
    'Tranylcypromine': ('https://www.webmd.com/drugs/2/drug-6966/tranylcypromine-oral/details',
                        'tranylcypromine-oral'),
    'Clozapine': ('https://www.webmd.com/drugs/2/drug-5200/clozapine-oral/details',
                  'clozapine-oral'),
    'Iloperidone': ('https://www.webmd.com/drugs/2/drug-153411/iloperidone-oral/details',
                    'iloperidone-oral'),
    'Lurasidone': ('https://www.webmd.com/drugs/2/drug-155126/lurasidone-oral/details',
                   'lurasidone-oral'),
}

for med_name, (parent_url, exact_med) in manual_parent_links.items():
    # Only the first row with this medication name is updated
    ind = medsDF[medsDF['Medication'].eq(med_name)].index[0]
    # .at writes one cell of medsDF in place and stores the list as a single
    # value (both columns hold lists / NaN, i.e. are object-typed). Chained
    # indexing (medsDF.loc[ind][col] = ...) is not guaranteed to modify medsDF.
    medsDF.at[ind, 'Parent links'] = [parent_url]
    medsDF.at[ind, 'Exact medications'] = [exact_med]
    medsDF.at[ind, 'Medication'] = med_name.strip()


In [9]:
# My data is biased towards chemical name, searching for brand name to get more reviews
def check4brandname(url):
    """Fetch a WebMD drug page and return the brand names listed on it.

    The names are read from the first plain ``<p>`` element inside
    ``<div class="drug-names">`` whose markup contains ``COMMON``: everything
    from two characters after the first ``:`` up to the closing ``</p>`` is
    taken as a comma-separated list.

    Parameters
    ----------
    url : str
        URL of the drug page to fetch.

    Returns
    -------
    list of str, or ''
        The brand names with ALL spaces removed (so a two-word brand name comes
        back run together); empty entries are dropped. ``''`` is returned when
        the page has no drug-names section or no matching paragraph. Both
        "nothing found" results are falsy.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('div', attrs={'class': 'drug-names'})
    if content is None:
        return ''

    for child in content.findChildren():
        markup = str(child)
        if markup[:3] == '<p>' and markup.find('COMMON') != -1:
            names = markup[markup.find(':') + 2:markup.rfind('</p>')]
            # Always split into a list, also for a single name. The old check
            # `if names.find(','):` tested the truthiness of an index (-1 when
            # absent is truthy), so it split in practically every case anyway;
            # callers iterate over the result and need a list, not a string.
            return [name for name in names.replace(' ', '').split(',') if name]
    return ''
                
# Search WebMD for every brand name of every medication and collect one new
# row per brand name; the rows become a dataframe in the next cell.
results = []
for ind, medication, link in zip(medsDF.index, medsDF['Medication'], medsDF['Parent links']):
    # 'Parent links' holds a list of URLs, or NaN when nothing was found;
    # the brand names are read from the first parent page only
    brandname = None
    if isinstance(link, list) and link:
        brandname = check4brandname(link[0])

    if brandname:
        source_row = medsDF.loc[ind]
        for name in brandname:
            print(medication, name)
            searchLinks, exactMeds = searchWebMD(name)
            if not searchLinks:
                # Placeholder that is resolved by hand further down
                searchLinks = 'checkme'
                exactMeds = np.nan
            # Append inside the loop so that EVERY brand name gets its own
            # row; appending after the loop kept only the last brand name of
            # each medication. The side effects are copied from the generic row.
            results.append({'Condition': source_row['Condition'],
                            'Medication': name,
                            'Alternate names': medication,
                            'More common': source_row['More common'],
                            'Less common': source_row['Less common'],
                            'Incidence not known': source_row['Incidence not known'],
                            'Parent links': searchLinks,
                            'Exact medications': exactMeds})
    time.sleep(0.1)

Dexmethylphenidate  Focalin
Dextroamphetamine  Adderall
Amphetamine  Evekeo
Guanfacine  Tenex
Clonidine Hydrochloride  Catapres
Nortriptyline  Aventyl
Nortriptyline  Pamelor
Desipramine  Norpramin
Imipramine  Tofranil
Bupropion  Aplenzin
Bupropion  WellbutrinXL
Lithium carbonate Eskalith
Valproic acid  Depakene
carbamazepine Tegretol
Lamotrigine Lamictal
quetiapine Seroquel
olanzapine Zyprexa
fluoxetine Prozac
fluoxetine Sarafem
Fluoxetine Prozac
Fluoxetine Sarafem
Sertraline Zoloft
Citalopram Celexa
Escitalopram Lexapro
venlafaxine Effexor
Duloxetine Cymbalta
Bupropion Aplenzin
Bupropion WellbutrinXL
Mirtazapine RemeronSoltab
Aripiprazole  Abilify
Quetiapine Seroquel
Phenelzine Nardil
Tranylcypromine Parnate
Selegiline patch  Emsam
Thiothixene  Navane
Aripiprazole Abilify
Asenapine Saphris
Clozapine Clozaril
Clozapine Versacloz
Iloperidone Fanapt
Lurasidone Latuda
Olanzapine Zyprexa
Paliperidone  Invega
Risperidone  Risperdal
Quetiapine Seroquel
Ziprasidone Geodon


In [10]:
# Creating a data frame from those results: each dict collected in `results`
# becomes one row, its keys are the column names
brandnamesDF = pd.DataFrame(results)

In [11]:
# Dropping the duplicate rows that arose due to the existence of many parent links.
# Rows count as duplicates when Condition, Medication and Alternate names all match;
# the first occurrence of each combination is kept (pandas default).
brandnamesDF = brandnamesDF.drop_duplicates(subset=['Condition', 'Medication', 'Alternate names'])

In [12]:
# Making a deep copy of medsDF as a backup, just in case the concatenation in
# the next cell needs to be repeated
medsDFbackup = medsDF.copy(deep=True)

In [13]:
# Concatenating the brand names dataframe onto the original medications dataframe.
# ignore_index=True renumbers the combined rows; sort=False leaves the columns unsorted.
medsDF = pd.concat([medsDF, brandnamesDF], ignore_index=True, sort=False)

# Some hand-tuned accessing of data that didn't appear in the first sweep

In [14]:
# Review-page URLs grabbed by hand because of a weird webMD issue.
# The next cell searches this list (case-insensitively, by medication name) to
# fill in rows whose parent link is still the 'checkme' placeholder.
review_pages = [
    'https://www.webmd.com/drugs/drugreview-1820-Pamelor-oral.aspx?drugid=1820&drugname=Pamelor-oral',
    'https://www.webmd.com/drugs/drugreview-76851-Wellbutrin-XL-oral.aspx?drugid=76851&drugname=Wellbutrin-XL-oral',
    'https://www.webmd.com/drugs/drugreview-19825-Sarafem-oral.aspx?drugid=19825&drugname=Sarafem-oral',
    'https://www.webmd.com/drugs/drugreview-35-Zoloft-oral.aspx?drugid=35&drugname=Zoloft-oral',
    'https://www.webmd.com/drugs/drugreview-91491-Cymbalta-oral.aspx?drugid=91491&drugname=Cymbalta-oral',
    'https://www.webmd.com/drugs/drugreview-13707-Remeron-oral.aspx?drugid=13707&drugname=Remeron-oral',
    'https://www.webmd.com/drugs/drugreview-9353-Nardil-oral.aspx?drugid=9353&drugname=Nardil-oral',
    'https://www.webmd.com/drugs/drugreview-95354-Emsam-transdermal.aspx?drugid=95354&drugname=Emsam-transdermal',
    'https://www.webmd.com/drugs/drugreview-6936-Norpramin-oral.aspx?drugid=6936&drugname=Norpramin-oral',
    'https://www.webmd.com/drugs/drugreview-8603-Celexa-oral.aspx?drugid=8603&drugname=Celexa-oral',
    'https://www.webmd.com/drugs/drugreview-6965-Parnate-oral.aspx?drugid=6965&drugname=Parnate-oral',
    'https://www.webmd.com/drugs/drugreview-155134-Latuda-oral.aspx?drugid=155134&drugname=Latuda-oral',
    'https://www.webmd.com/drugs/drugreview-153413-Fanapt-oral.aspx?drugid=153413&drugname=Fanapt-oral',
    'https://www.webmd.com/drugs/drugreview-63990-Lexapro-oral.aspx?drugid=63990&drugname=Lexapro-oral',
    'https://www.webmd.com/drugs/drugreview-165442-Versacloz-oral.aspx?drugid=165442&drugname=Versacloz-oral'
]

In [15]:
# Replacing checkme with parent links
def getParentfromReview(revLink):
    """Return the parent (drug page) URL that a WebMD review page links back to.

    The link is read from the first ``<a href=...>`` inside the page's
    ``<h1 class="backToDetails">`` element and prefixed with the site root.

    Parameters
    ----------
    revLink : str
        URL of the review page to fetch.

    Returns
    -------
    str
        ``'https://www.webmd.com'`` followed by the link's href.

    Raises
    ------
    ValueError
        If the page has no such heading or the heading contains no link.
        (Previously a missing link silently produced the bare site root, and
        a missing heading failed with an opaque AttributeError.)
    """
    response = requests.get(revLink)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('h1', attrs={'class': 'backToDetails'})
    anchor = content.find('a', href=True) if content is not None else None
    if anchor is None:
        raise ValueError('No backToDetails link found on review page: ' + revLink)

    # Read the attribute itself instead of slicing the tag's markup: slicing
    # breaks when the tag has further attributes after href and keeps HTML
    # escapes (e.g. &amp;) inside the URL.
    return 'https://www.webmd.com' + anchor['href']

def getExactMed(revLink):
    """Return everything after the last '=' of a review-page URL.

    For the hand-collected review URLs this is the value of the trailing
    ``drugname`` parameter. A string without any '=' is returned unchanged.
    """
    _, _, exact_med = revLink.rpartition('=')
    return exact_med
    
def _fill_checkme_row(row_label, search_key):
    """Resolve one 'checkme' placeholder in medsDF from the hand-picked review pages.

    Does nothing unless the row's 'Parent links' cell equals 'checkme' and at
    least one URL in ``review_pages`` contains ``search_key`` (compared
    case-insensitively). The first matching review page is used.
    """
    if medsDF.at[row_label, 'Parent links'] != 'checkme':
        return
    key = search_key.lower()
    matches = [link for link in review_pages if key in link.lower()]
    if not matches:
        return
    rev = matches[0]
    # .at writes the cell of medsDF in place and stores the list as one value;
    # chained indexing (medsDF.loc[i][col] = ...) is not guaranteed to do so.
    medsDF.at[row_label, 'Parent links'] = [getParentfromReview(rev)]
    medsDF.at[row_label, 'Exact medications'] = [getExactMed(rev)]


# First pass: look every 'checkme' row up by its own medication name
for ind, medication in zip(medsDF.index, medsDF['Medication']):
    _fill_checkme_row(ind, medication)

# Handtuning for some formatting weirdness ---> NEEDS TO BE CHECKED AND FIXED
# These names were stored with the spaces removed and therefore do not appear
# verbatim in the review URLs; each is looked up under the spelling on the right.
for medorig, medication in zip(['WellbutrinXL', 'RemeronSoltab'],
                               ['Wellbutrin-XL', 'Remeron']):
    for i in medsDF[medsDF['Medication'].eq(medorig)].index:
        _fill_checkme_row(i, medication)
            

In [16]:
# Adding rows to account for a major medication in treatment guidelines that's not on NAMI.
# Depakote and Divalproex get one row each, listed as each other's alternate
# name; both rows carry the same side-effect text and differ only in the names
# and webMD links. Columns not given here are left missing for these rows.
depakote_more_common = 'Acid or sour stomach; belching; body aches or pain; change in vision; congestion; continuing ringing or buzzing or other unexplained noise in the ears; hair loss or thinning of the hair; hearing loss; heartburn; impaired vision; lack or loss of strength; loss of memory; problems with memory; rash; seeing double; tender, swollen glands in the neck; trouble with swallowing; uncontrolled eye movements; voice changes; weight gain; weight loss'
depakote_less_common = 'Absent, missed, or irregular menstrual periods; back pain; burning, dry, or itching eyes; change in taste or bad unusual or unpleasant (after) taste; coin-shaped lesions on the skin; cough producing mucus; cramps; dandruff; discharge or excessive tearing; dry skin; earache; excess air or gas in the stomach or intestines; eye pain; feeling of constant movement of self or surroundings; full feeling; heavy bleeding; increased appetite; itching of the vagina or genital area; itching skin; loss of bowel control; neck pain; oily skin; pain; pain during sexual intercourse; pain or tenderness around the eyes and cheekbones ; passing gas; rash with flat lesions or small raised lesions on the skin; redness or swelling in the ear; redness, pain, swelling of the eye, eyelid, or inner lining of the eyelid; redness, swelling, or soreness of the tongue; sensation of spinning; sneezing; stiff neck; stopping of menstrual bleeding; thick, white vaginal discharge with no odor or with a mild odor'

rows_missing = [
    {'Condition': 'Bipolar-Disorder',
     'Medication': 'Depakote',
     'Alternate names': 'Divalproex',
     'More common': depakote_more_common,
     'Less common': depakote_less_common,
     'Parent links': ['https://www.webmd.com/drugs/2/drug-1788/depakote-oral/details'],
     'Exact medications': ['Depakote-oral']},
    {'Condition': 'Bipolar-Disorder',
     'Medication': 'Divalproex',
     'Alternate names': 'Depakote',
     'More common': depakote_more_common,
     'Less common': depakote_less_common,
     'Parent links': ['https://www.webmd.com/drugs/2/drug-6000-8019/divalproex-oral/divalproex-sodium-enteric-coated-tablet-oral/details'],
     'Exact medications': ['divalproex-oral']},
]

# DataFrame.append was removed in pandas 2.0; building a small frame from the
# row dicts and concatenating it adds the same two rows at the end.
medsDF = pd.concat([medsDF, pd.DataFrame(rows_missing)], ignore_index=True, sort=False)

In [17]:
# Saving final result: medications (generic + brand names), their side effects
# and the webMD parent links, written to a new CSV file
medsDF.to_csv('Medications_SideEffects_brandAndlinks.csv')