# A cleaned-up, separate file for scraping, kept apart from the code that alters the metadata file (Medications_SideEffects_brandAndlinks.csv)

# In[1]:
import ast
import glob
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, SoupStrainer

# In[2]:
# Grabbing original medications/SE dataframe
def makeListofStr(s: str) -> list:
    """Parse the text form of a list of strings back into a list.

    The text is first read as a Python literal, which copes with elements
    quoted with double quotes (as ``repr`` does for strings that contain an
    apostrophe). Text that is not a valid list literal falls back to the
    original quote-splitting heuristic.

    Args:
        s: Text such as ``"['a', 'b']"``.

    Returns:
        The elements as a list of strings, with empty strings dropped.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        # Empty elements are dropped, as the quote-splitting version did.
        return [str(item) for item in parsed if item != '']

    # Fallback: strip the surrounding brackets, split on single quotes and
    # discard the empty pieces and the ', ' separators left between elements.
    pieces = s[1:-1].split("'")
    return [piece for piece in pieces if piece and piece != ', ']

# The 'Parent links' and 'Exact medications' cells are run through
# makeListofStr while the file is read, so both columns hold lists.
medsDF = pd.read_csv(
    'Medications_SideEffects_brandAndlinks.csv',
    index_col=0,
    converters={
        'Parent links': makeListofStr,
        'Exact medications': makeListofStr,
    },
)

# In[3]:
def pull_review(userPost, revnum=1):
    """Collect the fields of a single review into a dictionary.

    Args:
        userPost: Parsed element of one review; it is queried with ``find``
            and ``findChildren``.
        revnum: Number used to build the id ``comFull<revnum>`` of the hidden
            paragraph that holds the full comment.

    Returns:
        A dict with the keys 'conditionInfo', 'date', 'reviewer',
        'Effectiveness', 'Satisfaction', 'Ease of Use' and 'Comment'.
    """
    def _class_text(tag, css_class):
        # Text of the first <tag> element carrying the given class.
        return userPost.find(tag, attrs={'class': css_class}).text

    # Basic post info
    info = {
        'conditionInfo': _class_text('div', 'conditionInfo').replace('\r\n\t\t\t\t\t', ''),
        'date': _class_text('div', 'date'),
        'reviewer': _class_text('p', 'reviewerInfo'),
    }

    # Star ratings: an element whose markup mentions 'category' names a
    # rating, and the element visited right after it carries the score.
    stars = userPost.find('div', attrs={'id': 'ctnStars'})
    scores = {}
    for node in stars.findChildren():
        expecting_score = False
        for sub in node.findChildren():
            if expecting_score:
                rating = sub.find('span', attrs={'class': 'current-rating'}).text
                # Only the last space-separated token of the rating text is kept.
                scores[category] = rating[rating.rfind(' ') + 1:]
                expecting_score = False
            markup = str(sub)
            if 'category' in markup:
                category = markup[markup.find('gory">') + 6:markup.rfind('</p>')]
                expecting_score = True
    for rating_name in ('Effectiveness', 'Satisfaction', 'Ease of Use'):
        info[rating_name] = scores[rating_name]

    # Full comment text with the label and link text removed
    hidden = userPost.find('p', attrs={'style': 'display:none',
                                       'id': 'comFull{:g}'.format(revnum)})
    comment = hidden.text
    for leftover in ('<strong>Comment:</strong>', '<br', 'Hide Full Comment', 'Comment:'):
        comment = comment.replace(leftover, '')
    info['Comment'] = comment

    return info

def proceed2NextPage(url):
    """Return ``url`` with its ``pageIndex`` query value increased by one.

    The index is located with a regular expression, so it is read correctly
    wherever the parameter sits in the URL and however many digits it has.

    Args:
        url: Review-page URL containing ``pageIndex=<number>``.

    Returns:
        The URL of the following page.

    Raises:
        ValueError: If ``url`` has no numeric ``pageIndex=`` parameter.
    """
    match = re.search(r'pageIndex=(\d+)', url)
    if match is None:
        raise ValueError(
            "URL has no numeric 'pageIndex=' parameter: {!r}".format(url))
    next_index = int(match.group(1)) + 1
    # Rebuild around the match so only this one parameter is touched.
    return '{}pageIndex={:d}{}'.format(
        url[:match.start()], next_index, url[match.end():])

def scrollReviews(reviewPage0, reviewsPerPage=5, timeout=30):
    """Collect every review reachable from the first reviews page.

    Args:
        reviewPage0: URL of the first reviews page; it must contain a
            ``pageIndex`` parameter so the following pages can be derived.
        reviewsPerPage: Number of reviews per page, used to work out how many
            pages to visit.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot block the scrape indefinitely.

    Returns:
        A DataFrame with one row per review, or None when the first page has
        no 'totalreviews' element.
    """
    # Going to the reviews page and finding the total number of reviews
    response = requests.get(reviewPage0, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('span', attrs={'class': 'totalreviews'})
    if not content:  # In case there are no reviews
        return None
    totalreviews = content.text
    totalreviews = totalreviews[:totalreviews.rfind('Total') - 1]

    # Counting the number of pages, rounding up for a partial last page.
    # Commas are removed so a count with a thousands separator still parses.
    totalreviews = int(totalreviews.replace(',', ''))
    num_pages, leftover = divmod(totalreviews, reviewsPerPage)
    if leftover:
        num_pages += 1

    # Iterating through pages and grabbing review data
    all_reviews = []
    url = reviewPage0
    for npage in range(num_pages):
        # The first page was already downloaded above, so only the
        # following pages need a new request.
        if npage > 0:
            url = proceed2NextPage(url)
            response = requests.get(url, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')

        # Get heading above userPosts
        content = soup.find('div', attrs={'id': 'ratings_fmt'})

        # revnum restarts on every page; pull_review uses it to find the
        # hidden paragraph holding the full comment.
        revnum = 1
        for child in content.findChildren():
            if 'userPost' in str(child):
                all_reviews.append(pull_review(child, revnum=revnum))
                revnum += 1

        time.sleep(0.1)  # short pause between pages

    # Process list all_reviews into dataframe and return
    return pd.DataFrame(all_reviews)
        
        
def getReviewsLink(url, timeout=30):
    """Find the link to the reviews page on a medication page.

    Args:
        url: URL of the medication page.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot block the scrape indefinitely.

    Returns:
        False when the page has no 'drug-review-lowest' div, an empty string
        when that div has no element mentioning "drug-review", and otherwise
        the absolute URL of the first reviews page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    content = soup.find('div', attrs={'class': 'drug-review-lowest'})
    if not content:
        return False

    searchLink = ''
    for child in content.findChildren():
        markup = str(child)
        if '"drug-review"' in markup:
            # NOTE(review): the href is sliced out of the serialized markup,
            # so any entity-escaped characters (e.g. '&amp;') would be carried
            # into the URL - confirm against real pages.
            path = markup[markup.find('href="') + 6:markup.rfind('">')]
            # The appended query grabs reviews for every condition.
            searchLink = ('https://www.webmd.com' + path
                          + '&pageIndex=0&sortby=3&conditionFilter=-1')
            break

    return searchLink
    
def formatExactMed(s: str)->str:
    """Turn a medication name into text that can be embedded in a file name.

    Surrounding whitespace is removed, commas and spaces become '-', and '/'
    becomes 'per' so the result does not look like a path.
    """
    substitutions = str.maketrans({',': '-', ' ': '-', '/': 'per'})
    return s.strip().translate(substitutions)

# In[4]:
# Scrapes WebMD and pulls reviews for every medication it can. A medication
# whose reviews file already exists is skipped, so the cell can be re-run to
# resume an interrupted scrape.
os.makedirs('Reviews', exist_ok=True)  # to_csv does not create the folder
for ind, medication in zip(medsDF.index, medsDF['Medication']):
    row = medsDF.loc[ind]
    for link, exactMed in zip(row['Parent links'], row['Exact medications']):
        outPath = 'Reviews/{:s}_reviews.csv'.format(formatExactMed(exactMed))
        # A plain existence check: glob would treat characters such as '['
        # or '*' in a medication name as wildcards and miss the saved file.
        if os.path.exists(outPath):
            continue

        reviewLink = getReviewsLink(link)
        if not reviewLink:  # False or '' when no reviews link was found
            continue

        reviewsDF = scrollReviews(reviewLink)
        if reviewsDF is not None:
            reviewsDF.to_csv(outPath, sep='$')