In [1]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import time
import pandas as pd

In [2]:
# Start by collecting the conditions and their associated drugs from NAMI.
# Link-extraction approach adapted from:
# https://stackoverflow.com/questions/1080411/retrieve-links-from-web-page-using-python-and-beautifulsoup
root_site = 'https://www.nami.org'
conditions_page = 'https://www.nami.org/learn-more/mental-health-conditions'
response = requests.get(conditions_page)
# Every later cell is derived from this page, so stop here on an HTTP error
# instead of silently parsing an error page into empty link lists.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [3]:
# Keep the HTML of every anchor that mentions both a condition page and its
# "Treatment" tab; these are the per-condition treatment links.
treatment_pages = [
    anchor_html
    for anchor_html in map(str, soup.find_all('a', href=True))
    if 'Treatment' in anchor_html and 'Mental-Health-Conditions' in anchor_html
]

In [4]:
# Turn each anchor's HTML into an absolute URL: take the text between the
# first and the last double quote of the tag and prefix it with the site root.
tpages_cleaned = [
    root_site + anchor_html[first_quote + 1:last_quote]
    for anchor_html, first_quote, last_quote in (
        (page, page.find('"'), page.rfind('"')) for page in treatment_pages
    )
]

In [5]:
# Next step: visit each treatment page (carefully) and pull its medications.
# The condition name is recovered from the URL itself.

# Conditions first: the path segment(s) between
# 'Mental-Health-Conditions/' and '/Treatment'.
lenMHC = len('Mental-Health-Conditions/')
conditions = [
    page_url[marker_start + lenMHC:page_url.rfind('/Treatment')]
    for page_url, marker_start in (
        (page, page.find('Mental-Health-Conditions/')) for page in tpages_cleaned
    )
]

In [6]:
def process_medication_bullets(listOfBullets):
    """Extract the visible text of every ``<li>`` item in the given HTML strings.

    Parameters
    ----------
    listOfBullets : list of str
        HTML fragments, each possibly containing several ``<li>`` items.

    Returns
    -------
    list of str
        One entry per ``<li>...</li>`` item, in order of appearance. For an
        item containing ``</a>`` the text between ``"_blank">`` and the last
        ``</a>`` is used; otherwise the text inside ``<strong>`` if present;
        otherwise everything before the last ``</li>``.
    """
    link_open = '"_blank">'
    link_close = '</a>'
    strong_open = '<strong>'

    def _visible_text(fragment):
        # Hyperlinked bullet: keep only the link text.
        if link_close in fragment:
            return fragment[fragment.find(link_open) + len(link_open):fragment.rfind(link_close)]
        # Bold bullet: keep the text inside the <strong> tag.
        if strong_open in fragment:
            return fragment[fragment.find(strong_open) + 8:fragment.rfind('</strong>')]
        # Plain bullet: everything up to the closing tag.
        return fragment[:fragment.rfind('</li>')]

    return [
        _visible_text(fragment)
        for bullet_html in listOfBullets
        for fragment in bullet_html.split('<li>')
        if '</li>' in fragment
    ]
    
def pull_medications(url):
    """Scrape the medication names listed on a NAMI treatment page.

    Parameters
    ----------
    url : str
        Address of the treatment page to download.

    Returns
    -------
    list of str
        Medication names taken from the bullet lists inside the treatments
        tab. If that yields nothing (including when the tab container is
        absent from the page), the text of every link on the page that points
        at a 'Mental-Health-Medications' address is returned instead.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    medications = []
    content = soup.find('div', attrs={'class': 'treatments-content tab-content'})
    # soup.find returns None when the container is missing; skip straight to
    # the link-based fallback instead of failing on None.findChildren().
    if content is not None:
        bulletLists = [
            child_html
            for child_html in map(str, content.findChildren())
            if '<ul>' in child_html and 'Mental-Health-Medications' in child_html
        ]
        medications = process_medication_bullets(bulletLists)

    if not medications:
        # Fallback: take the link text of every medication link on the page.
        bulletLists = [
            link_html
            for link_html in map(str, soup.find_all('a', href=True))
            if 'Mental-Health-Medications' in link_html
        ]
        medications = [link[link.find('_blank">') + 8:link.rfind('</a>')] for link in bulletLists]
    return medications

In [7]:
# Map each condition to the medications scraped from its treatment page,
# logging progress and pausing briefly between requests.
conditionsAndTreatments = dict()
for condition, url in zip(conditions, tpages_cleaned):
    print(condition, url, sep='\n')
    conditionsAndTreatments.update({condition: pull_medications(url)})
    time.sleep(0.1)

ADHD
https://www.nami.org/Learn-More/Mental-Health-Conditions/ADHD/Treatment
Anxiety-Disorders
https://www.nami.org/Learn-More/Mental-Health-Conditions/Anxiety-Disorders/Treatment
Related-Conditions/Autism
https://www.nami.org/Learn-More/Mental-Health-Conditions/Related-Conditions/Autism/Treatment
Bipolar-Disorder
https://www.nami.org/Learn-More/Mental-Health-Conditions/Bipolar-Disorder/Treatment
Borderline-Personality-Disorder
https://www.nami.org/Learn-More/Mental-Health-Conditions/Borderline-Personality-Disorder/Treatment
Depression
https://www.nami.org/Learn-More/Mental-Health-Conditions/Depression/Treatment
Dissociative-Disorders
https://www.nami.org/Learn-More/Mental-Health-Conditions/Dissociative-Disorders/Treatment
Early-Psychosis-and-Psychosis
https://www.nami.org/Learn-More/Mental-Health-Conditions/Early-Psychosis-and-Psychosis/Treatment
Eating-Disorders
https://www.nami.org/Learn-More/Mental-Health-Conditions/Eating-Disorders/Treatment
Obsessive-compulsive-Disorder
https://w

In [8]:
# Building the dataframe: one row per (condition, medication) pair
conditionsList = []
medsList = []
for condition, meds in conditionsAndTreatments.items():
    conditionsList.extend([condition] * len(meds))
    medsList.extend(meds)
df = pd.DataFrame({'Condition': conditionsList, 'Medication': medsList})

# Cleaning the dataframe a little bit by removing the "medication" entries.
# A boolean mask keeps the original index labels, which the hard-coded row
# fixes below rely on; .copy() makes the result an independent frame.
df = df[~df['Medication'].str.contains('medication', regex=False)].copy()

# Removing an ambiguous medication name ("transdermal")
df = df.drop(index=1)

# The manual fixes address rows by index label, so they are tied to the scrape
# order. Fail loudly if the labels are gone rather than letting .loc create
# new rows through setting-with-enlargement.
assert 2 in df.index and 6 in df.index, 'hard-coded row fixes no longer match the scraped data'

# Assign through a single .loc[row, column] call: the chained form
# df.loc[row][column] = value may write to a temporary copy and leave df unchanged.
df.loc[2, 'Medication'] = 'Daytrana'


def _split_alternate_names(med):
    """Split 'Name (Alt1, Alt2)' into ('Name', 'Alt1, Alt2'); no parentheses -> (med, '')."""
    if '(' not in med:
        return med, ''
    opening = med.index('(')
    # strip() removes the space that precedes the opening parenthesis.
    return med[:opening].strip(), med[opening + 1:med.rfind(')')]


# Splitting off alternate names for the medication
split_names = [_split_alternate_names(med) for med in df['Medication']]
df['Medication'] = [name for name, _ in split_names]
df['Alternate names'] = [alternates for _, alternates in split_names]

# Altering one entry that contains an "or"
df.loc[6, 'Alternate names'] += ', {:s}'.format('Dimesylate')
# partition() keeps the text before the first ' or' and, unlike slicing with
# find(), leaves the name intact when ' or' is absent.
df.loc[6, 'Medication'] = df.loc[6, 'Medication'].partition(' or')[0]



In [9]:
# Additional medications for anxiety, sourced from:
# https://guardianadlitem.org/wp-content/uploads/2015/09/4-Appendix-A-Psychotropic-Medications.pdf
df_anxiety = pd.DataFrame([
    {'Condition': 'Anxiety', 'Medication': 'Ativan', 'Alternate names': 'Lorazepam'},
    {'Condition': 'Anxiety', 'Medication': 'Buspar', 'Alternate names': 'Buspirone'},
    {'Condition': 'Anxiety', 'Medication': 'Klonopin', 'Alternate names': 'Clonazepam'},
    {'Condition': 'Anxiety', 'Medication': 'Xanax', 'Alternate names': 'Alprazolam'},
    {'Condition': 'Anxiety', 'Medication': 'Valium', 'Alternate names': 'Diazepam'},
])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames.
df = pd.concat([df, df_anxiety], ignore_index=True, sort=False)

In [11]:
# Certain medications are misspelled in the scraped data
misspelled = ['olanzepine', 'Venlafazine', 'Desvenlafazine']
correct = ['olanzapine', 'venlafaxine', 'desvenlafaxine']

for miss, right in zip(misspelled, correct):
    # A single .loc[mask, column] assignment writes into df itself; the chained
    # form df.loc[ind]['Medication'] = ... may only modify a temporary copy.
    df.loc[df['Medication'].eq(miss), 'Medication'] = right

In [13]:
# Next, we want to pull side effects from drugs.com
def searchDrugsdotCom(searchterm, rootUrl='https://www.drugs.com/search.php?searchterm='):
    """Search drugs.com for a term and return the first "Side Effects" link.

    Parameters
    ----------
    searchterm : str
        Medication name to search for. Surrounding whitespace is removed and
        inner spaces are replaced with hyphens before the request is made.
    rootUrl : str, optional
        Search URL prefix to which the term is appended.

    Returns
    -------
    str or bool
        Absolute URL of the first link on the results page whose markup
        contains '>Side Effects', or False when no such link exists.
    """
    # str.replace returns a new string, so the result has to be kept; the
    # strip() keeps a trailing space from turning into a dangling hyphen.
    query = searchterm.strip().replace(' ', '-')
    response = requests.get(rootUrl + query)
    soup = BeautifulSoup(response.content, 'html.parser')

    for link in soup.find_all('a', href=True):
        linkHtml = str(link)
        if '>Side Effects' in linkHtml:
            print(linkHtml)
            # Read the href attribute directly instead of slicing between the
            # first and last quote, which breaks on tags with more attributes.
            return 'https://www.drugs.com' + link['href']

    return False

def _linkText(effect):
    """Return the text enclosed by the (innermost) first link in an HTML snippet."""
    close = effect.find('</a>')
    if close == -1:
        # No closing tag to anchor on; leave the snippet untouched.
        return effect
    # Start right after the '>' that ends the nearest opening tag, so nested
    # links resolve to the inner text in a single pass.
    return effect[effect.rfind('>', 0, close) + 1:close]


def processSideEffects(bulletList):
    """Group side-effect bullets by their frequency heading.

    Parameters
    ----------
    bulletList : list of str
        HTML fragments in page order: italic headings (``<i>...</i>``) and the
        bullet lists that follow them.

    Returns
    -------
    dict
        Always has the keys 'More common', 'Less common' and
        'Incidence not known'. Each value is a string in which every effect is
        followed by '; ' (empty string when there are none). When a heading
        occurs more than once, the last occurrence wins. 'Incidence not known'
        is blanked when that heading occurs exactly once.
    """
    labels = ('<i>More common</i>', '<i>Less common</i>', '<i>Incidence not known</i>')

    # Each heading of interest owns the fragments up to the next italic
    # element (heading or not); the last heading owns everything to the end
    # of the list when no italic element follows it.
    spans = []
    openLabel = None
    for i, string in enumerate(bulletList):
        if string[:3] == '<i>' and string[-4:] == '</i>':
            if openLabel is not None:
                spans.append((openLabel, i - openLabel))
                openLabel = None
            if string in labels:
                openLabel = i
    if openLabel is not None:
        spans.append((openLabel, len(bulletList) - openLabel))

    sideEffects = {}
    countINK = 0
    for start, length in spans:
        label = bulletList[start]
        label = label[label.find('<i>') + 3:label.rfind('</i>')]
        if label == 'Incidence not known':
            countINK += 1

        sEffects = []
        for fragment in bulletList[start:start + length]:
            # Only bullet-list fragments carry effects; paragraphs are skipped.
            if 'ul' in fragment and '<p>' not in fragment:
                for item in fragment.split('<li>')[1:]:
                    effect = item[:item.find('</li>')]
                    # Linked effects: keep the link text only.
                    if '<a' in effect:
                        effect = _linkText(effect)
                    sEffects.append(effect)

        # Because the non-worrying side effects always come second, a repeated
        # heading overwrites the earlier entry in the dictionary.
        sEffectStr = ''.join('{:s}; '.format(sEffect) for sEffect in sEffects)
        sideEffects[label] = sEffectStr.replace('>', '')

    if 'More common' not in sideEffects:
        sideEffects['More common'] = ''
    if 'Less common' not in sideEffects:
        sideEffects['Less common'] = ''
    # NOTE(review): a single 'Incidence not known' heading is discarded,
    # presumably because it is taken to belong to the urgent section - confirm.
    if 'Incidence not known' not in sideEffects or countINK == 1:
        sideEffects['Incidence not known'] = ''

    return sideEffects
            
            
def getSideEffects(url):
    """Download a side-effects page and return its effects grouped by heading.

    Parameters
    ----------
    url : str
        Address of the side-effects page.

    Returns
    -------
    dict
        Whatever processSideEffects returns for the fragments of the page's
        'contentBox' div that contain '<ul>' or '<i>'. When the page has no
        such div, processSideEffects receives an empty list.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('div', attrs={'class': 'contentBox'})
    # soup.find returns None when the div is missing; treat that as a page
    # without bullets instead of failing on None.findChildren().
    children = content.findChildren() if content is not None else []
    bulletLists = [
        child_html
        for child_html in map(str, children)
        if '<ul>' in child_html or '<i>' in child_html
    ]

    return processSideEffects(bulletLists)

In [14]:
# One list per side-effect class, filled in dataframe row order
more_common = []
less_common = []
inc_not_known = []

# Walk the dataframe row by row: search by medication name first and fall
# back to each alternate name until a side-effects link turns up.
for medication, alternates in zip(df['Medication'], df['Alternate names']):
    searchLink = searchDrugsdotCom(medication)
    if not searchLink and alternates:
        for name in alternates.split(','):
            searchLink = searchDrugsdotCom(name.replace(' ', ''))
            if searchLink:
                break

    if searchLink:
        print(searchLink)
        side_effects = getSideEffects(searchLink)
        more_common.append(side_effects['More common'])
        less_common.append(side_effects['Less common'])
        inc_not_known.append(side_effects['Incidence not known'])
    else:
        # Nothing found under any name: record blanks to keep rows aligned.
        more_common.append('')
        less_common.append('')
        inc_not_known.append('')

    time.sleep(0.1)

df['More common'] = more_common
df['Less common'] = less_common
df['Incidence not known'] = inc_not_known

<a href="/sfx/methylphenidate-side-effects.html">Side Effects</a>
https://www.drugs.com/sfx/methylphenidate-side-effects.html
<a href="/sfx/methylphenidate-side-effects.html">Side Effects</a>
https://www.drugs.com/sfx/methylphenidate-side-effects.html
<a href="/sfx/dexmethylphenidate-side-effects.html">Side Effects</a>
https://www.drugs.com/sfx/dexmethylphenidate-side-effects.html
<a href="/sfx/dextroamphetamine-side-effects.html">Side Effects</a>
https://www.drugs.com/sfx/dextroamphetamine-side-effects.html
<a href="/sfx/amphetamine-side-effects.html">Side Effects</a>
https://www.drugs.com/sfx/amphetamine-side-effects.html
<a href="/sfx/vyvanse-side-effects.html">Side Effects</a>
https://www.drugs.com/sfx/vyvanse-side-effects.html
<a href="/sfx/atomoxetine-side-effects.html">Side Effects</a>
https://www.drugs.com/sfx/atomoxetine-side-effects.html
<a href="/sfx/guanfacine-side-effects.html">Side Effects</a>
https://www.drugs.com/sfx/guanfacine-side-effects.html
<a href="/sfx/kapvay-sid

In [16]:
# Write the assembled medication / side-effect table to a CSV file; the path
# is relative, so the file lands in the current working directory.
df.to_csv('Medications_SideEffects.csv')