In [1]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import time
import pandas as pd
import numpy as np

In [2]:
# Pull in results from previous webscraping to get drugs
medsDF = pd.read_csv('Medications_SideEffects.csv', index_col=0)
medsDF = medsDF.dropna(axis=0, how='all', subset=['More common', 'Less common'])
medsDF = medsDF.fillna('')

In [3]:
medsDF

Unnamed: 0,Condition,Medication,Alternate names,More common,Less common
0,ADHD,Methylphenidate,"Methylin, Ritalin, Concerta",Headache; loss of appetite; nervousness; stoma...,Anger; decreased appetite; dizziness; drowsine...
2,ADHD,Daytrana,,Headache; loss of appetite; nervousness; stoma...,Anger; decreased appetite; dizziness; drowsine...
3,ADHD,Dexmethylphenidate,"Focalin XR, SR Focalin",Acid or sour stomach; belching; heartburn; ind...,Twitching;
5,ADHD,Amphetamine,"Adderall, Adderall XR",Dry mouth; loss of appetite; sore throat; stom...,Belching; dizziness; heartburn; indigestion; s...
6,ADHD,Lisdexamfetemine,"Vyvanse, Dimesylate",Decreased appetite; headache; nausea; upper ab...,Crying; depersonalization; dry mouth; dysphori...
7,ADHD,Atomoxetine,Strattera,Acid or sour stomach; belching; bleeding betwe...,Abnormal dreams; abnormal orgasm; back pain; b...
8,ADHD,Guanfacine,Intuniv,Constipation; dizziness; dryness of the mouth;...,Belching; decreased appetite; decreased sexual...
9,ADHD,Clonidine Hydrochloride,Kapvay,Constipation;,Darkening of the skin; decreased sexual abilit...
13,ADHD,Bupropion,"Wellbutrin, Wellbutrin XL, Welbutrin SR",Constipation; decrease in appetite; dizziness;...,Blurred vision; change in sense of taste; drow...
14,ADHD,Venlafazine,"Effexor, Effexor XR",Abnormal dreams; chills; constipation; decreas...,Change in taste; muscle tension; yawning;


In [4]:
# Next, we want to pull side effects from drugs.com
def searchWebMD(searchterm, rootUrl='https://www.webmd.com/drugs/2/search?type=drugs&query='):
    if searchterm.find(' ') != -1:
        searchterm.replace(' ', '-')
    response = requests.get(rootUrl+searchterm)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    

    content = soup.find('p', attrs={'class':'no-matches'})
    if content:
        return False
    else:
        searchLink = False
        for link in soup.find_all('a', href=True):
            if str(link).find('result_1') != -1:
                searchLink = str(link)
                searchLink = searchLink[searchLink.find('href="')+6:searchLink.rfind('"')]
                searchLink = 'https://www.webmd.com'+searchLink
                break
        return searchLink


In [5]:
webMDParent = []
for ind, medication in zip(medsDF.index, medsDF['Medication']):
    searchLink = searchWebMD(medication)
    if not searchLink: 
        if medsDF.loc[ind]['Alternate names']:
            for name in medsDF.loc[ind]['Alternate names'].split(','):
                name = name.replace(' ','')
                searchLink = searchWebMD(name)
                if searchLink:
                    print(searchLink)
                    webMDParent.append(searchLink)
                    break
                time.sleep(1)
            if not searchLink: webMDParent.append(np.NaN)
        else:
            webMDParent.append(np.NaN)
    else:
        print(searchLink)
        webMDParent.append(searchLink)
    time.sleep(1)
    
medsDF['Parent links'] = webMDParent

https://www.webmd.com/drugs/2/drug-12114-94/methylphenidate-hcl/details
https://www.webmd.com/drugs/2/drug-22251-2094/dexmethylphenidate-hcl/details
https://www.webmd.com/drugs/2/drug-167647-1644/amphetamine-sulfate-tablet/details
https://www.webmd.com/drugs/2/drug-5481-8024/guanfacine-hcl/details
https://www.webmd.com/drugs/2/drug-11754-24/clonidine-hcl/details
https://www.webmd.com/drugs/2/drug-13507-7155/bupropion-xl/details
https://www.webmd.com/drugs/2/drug-1836-5047/effexor-tablet/details
https://www.webmd.com/drugs/2/drug-14374-42/lithium/details
https://www.webmd.com/drugs/2/drug-8885-19/valproic-acid/details
https://www.webmd.com/drugs/2/drug-1493-5/carbamazepine/details
https://www.webmd.com/drugs/2/drug-4582-4217/lamotrigine/details
https://www.webmd.com/drugs/2/drug-4689-8274/quetiapine-fumarate/details
https://www.webmd.com/drugs/2/drug-1774-95/fluoxetine-hcl/details
https://www.webmd.com/drugs/2/drug-1774-95/fluoxetine-hcl/details
https://www.webmd.com/drugs/2/drug-1-8095

In [6]:
medsDF = medsDF.dropna(axis=0, how='all', subset=['Parent links'])

In [7]:
# My data is biased towards chemical name, searching for brand name to get more reviews
def check4brandname(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('div', attrs={'class':'drug-names'})
    brandname = ''
    if content:
        for child in content.findChildren():
            if str(child)[:3] == '<p>' and str(child).find('COMMON') != -1:
                brandname = str(child)
                brandname = brandname[brandname.find(':')+2:brandname.rfind('</p>')]
                if brandname.find(','):
                    brandname = brandname.replace(' ','')
                    brandname = brandname.split(',')
                break
    return brandname
                
for ind, medication, link in zip(medsDF.index, medsDF['Medication'], medsDF['Parent links']):
    brandname = check4brandname(link)
    
    if brandname:
        for name in brandname:
            print(medication, name)
            blank_row = medsDF.loc[ind]
            searchLink = searchWebMD(medication)
            if not searchLink: searchLink = np.NaN
            row_copy = {'Condition': medsDF.loc[ind]['Condition'],
                        'Medication': name,
                        'Alternate names': medication,
                        'More common': medsDF.loc[ind]['More common'],
                        'Less common': medsDF.loc[ind]['Less common'],
                        'Parent links': searchLink}
            medsDF = medsDF.append(row_copy, ignore_index=True)
    
    # Search for brandname links, add to dataframe
    time.sleep(1)

Dexmethylphenidate  Focalin
Amphetamine  Evekeo
Guanfacine  Tenex
Clonidine Hydrochloride  Catapres
Bupropion  Aplenzin
Bupropion  WellbutrinXL
Valproic acid  Depakene
carbamazepine Tegretol
Lamotrigine Lamictal
quetiapine Seroquel
fluoxetine Prozac
fluoxetine Sarafem
Fluoxetine Prozac
Fluoxetine Sarafem
Sertraline Zoloft
Paroxetine Paxil
Duloxetine Cymbalta
Bupropion Aplenzin
Bupropion WellbutrinXL
Mirtazapine RemeronSoltab
Aripiprazole  Abilify
Quetiapine Seroquel
Phenelzine Nardil
Selegiline patch  Emsam
Aripiprazole Abilify
Asenapine Saphris
Olanzapine Zyprexa
Paliperidone  Invega
Risperidone  Risperdal
Quetiapine Seroquel
Ziprasidone Geodon


In [8]:
medsDF

Unnamed: 0,Condition,Medication,Alternate names,More common,Less common,Parent links
0,ADHD,Methylphenidate,"Methylin, Ritalin, Concerta",Headache; loss of appetite; nervousness; stoma...,Anger; decreased appetite; dizziness; drowsine...,https://www.webmd.com/drugs/2/drug-12114-94/me...
1,ADHD,Dexmethylphenidate,"Focalin XR, SR Focalin",Acid or sour stomach; belching; heartburn; ind...,Twitching;,https://www.webmd.com/drugs/2/drug-22251-2094/...
2,ADHD,Amphetamine,"Adderall, Adderall XR",Dry mouth; loss of appetite; sore throat; stom...,Belching; dizziness; heartburn; indigestion; s...,https://www.webmd.com/drugs/2/drug-167647-1644...
3,ADHD,Guanfacine,Intuniv,Constipation; dizziness; dryness of the mouth;...,Belching; decreased appetite; decreased sexual...,https://www.webmd.com/drugs/2/drug-5481-8024/g...
4,ADHD,Clonidine Hydrochloride,Kapvay,Constipation;,Darkening of the skin; decreased sexual abilit...,https://www.webmd.com/drugs/2/drug-11754-24/cl...
...,...,...,...,...,...,...
58,Bipolar-Disorder,Zyprexa,Olanzapine,Acid or sour stomach; belching; body aches or ...,"Absent, missed, or irregular menstrual periods...",https://www.webmd.com/drugs/2/drug-1644-9274/o...
59,Bipolar-Disorder,Invega,Paliperidone,Dizziness; drowsiness (mild); lightheadedness;...,Actions that are out of control; behavioral ch...,https://www.webmd.com/drugs/2/drug-146718-882/...
60,Bipolar-Disorder,Risperdal,Risperidone,Acid or sour stomach; belching; body aches or ...,"Absent, missed, or irregular menstrual periods...",https://www.webmd.com/drugs/2/drug-6283-2034/r...
61,Bipolar-Disorder,Seroquel,Quetiapine,Dizziness; drowsiness (mild); lightheadedness;...,Actions that are out of control; behavioral ch...,https://www.webmd.com/drugs/2/drug-4689-8274/q...


In [26]:
def pull_review(userPost, revnum=1):
    info = {}
    
    # Pulling basic post info
    info['conditionInfo'] = userPost.find('div', attrs={'class':'conditionInfo'}).text
    info['date'] = userPost.find('div', attrs={'class':'date'}).text
    info['reviewer'] = userPost.find('p', attrs={'class':'reviewerInfo'}).text
    
    # Pulling stars info
    content = userPost.find('div', attrs={'id':'ctnStars'})
    catsAndScores = []
    for child in content.findChildren():
        nextisScore=False
        for grandChild in child.findChildren():
            if nextisScore:
                score = grandChild.find('span', attrs={'class':'current-rating'}).text
                score = score[score.rfind(' ')+1:]
                catsAndScores.append((category, score))
                nextisScore=False
            if str(grandChild).find('category') != -1:
                category = str(grandChild)
                category = category[category.find('gory">')+6:category.rfind('</p>')]
                nextisScore=True
    catsAndScores = dict(catsAndScores)
    info['Effectiveness'] = catsAndScores['Effectiveness']
    info['Satisfaction'] = catsAndScores['Satisfaction']
    info['Ease of Use'] = catsAndScores['Ease of Use']
    
    # Pulling comment and cleaning it up
    text = userPost.find('p', attrs={'id':'comTrunc{:g}'.format(revnum)}).text
    text = text.replace('<strong>Comment:</strong>','').replace('<br','')
    info['Comment'] = text
    
    return info

def proceed2NextPage(url):
    num = int(url[url.find('pageIndex=')+10:url.rfind('&sortby')])
    newurl = url.replace('pageIndex={:g}'.format(num+1))
    return newurl

def scrollReviews(reviewPage0, reviewsPerPage=5):
    # Going to the reviews page and finding the total number of reviews
    response = requests.get(reviewPage0)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('span', attrs={'class':'totalreviews'})
    totalreviews = content.text
    totalreviews = totalreviews[:totalreviews.rfind('Total')-1]

    # Counting the number of pages
    totalreviews = int(totalreviews)
    num_pages = totalreviews // reviewsPerPage
    if totalreviews % reviewsPerPage: num_pages += 1
    
    # Iterating through pages and grabbing review data
    all_reviews = []
    counter=0
    for npage in range(num_pages):
        # Finding the relevant page
        if npage == 0:
            url = reviewPage0
        else:
            url = proceed2NextPage(url)
    
        # Get heading above userPosts
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        content = soup.find('div', attrs={'id':'ratings_fmt'})
    
        revnum = 1
        for child in content.findChildren():
            if str(child).find('userPost') != -1:
                child_info = pull_review(child, revnum=revnum)
                all_reviews.append(child_info)
                revnum += 1
        
        if counter == 0:
            break
            
        time.sleep(1)
    # Temporary
    return all_reviews
        
    # Process list all_reviews into dataframe and return
        
        
        
def getReviewsLink(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('div', attrs={'class':'drug-review-lowest'})
    searchLink = ''
    for child in content.findChildren():
        if str(child).find('"drug-review"') != -1:
            searchLink = str(child)
            searchLink = searchLink[searchLink.find('href="')+6:searchLink.rfind('">')]
            searchLink += '&pageIndex=0&sortby=3' # grabs reviews for every condition
            searchLink = 'https://www.webmd.com'+searchLink
            break
            
    return searchLink

In [27]:
link = medsDF.loc[0]['Parent links']
reviewLink = getReviewsLink(link)
testvar = scrollReviews(reviewLink)

In [28]:
testvar


[{'conditionInfo': '\r\n\t\t\t\t\tCondition: Attention Deficit Disorder with Hyperactivity',
  'date': '12/30/2019 10:57:43 AM',
  'reviewer': 'Reviewer: Anonymous, 13-18 on Treatment for 1 to 6 months (Patient) ',
  'Effectiveness': '5',
  'Satisfaction': '3',
  'Ease of Use': '5',
  'Comment': 'Comment: Very powerful for fixing attention problems. It makes it much easier to work, however the side effects are strange and concerning. There is a significant loss of appetite and dry mouth, but these are not terribly strange. While the medicine wears off (which takes hours), paranoia and confusion can set in. While these happen, there is a feeling of freezing despite the actual temperature, possibly d\r\n\t\t\t\t\t...\r\n\t\t\t\t\tShow Full Comment'},
 {'conditionInfo': '\r\n\t\t\t\t\tCondition: Attention Deficit Disorder with Hyperactivity',
  'date': '6/18/2019 2:27:43 AM',
  'reviewer': 'Reviewer: 25-34 Female  on Treatment for 1 to less than 2 years (Patient) ',
  'Effectiveness': '4'