# Scrape Drug Reviews from Drugs.com

In [3]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd

## Scrape drug reviews from Drugs.com

We create a dataframe with the following format:

#### | date | sentiment | review |

In [4]:
def main():
    '''Scrape reviews for the contraceptives listed on drugs.com.

    Fetches the "contraceptives" drug-class page, walks its sortable table,
    and for every drug with at least 100 reviews collects that drug's
    reviews through get_reviews (using '2021' as the date cutoff).

    Output: A pandas DataFrame with the columns 'date', 'sentiment' and
        'review', holding one row per collected review across all
        qualifying drugs. The frame is empty when no drug qualifies.
    Raises: ValueError if the drug table cannot be found on the page.
    '''
    # 1. Get data from the "contraceptives" page on drugs.com
    url = "https://www.drugs.com/drug-class/contraceptives.html"
    _, webpage = get_webpage(url)

    # 2. Get table with drug names, reviews, etc.
    table_body = webpage.find("table", {"class": "ddc-table-sortable"})
    if table_body is None:
        # fail with a clear message instead of an AttributeError on None
        raise ValueError(f"Could not find the drug table on {url}")

    # rows gathered from every qualifying drug: [date, review, sentiment]
    all_reviews = []

    # loop through the table with drug names (first and last rows are skipped)
    for row in table_body.find_all("tr")[1:-1]:
        # get the drug name
        drug_name = row.td.a.b.string

        # the "#### reviews" hyperlink; drugs without reviews have none
        reviews_link = row.find("a", {"class": "ddc-text-nowrap"}, href=True)
        if reviews_link is None:
            continue

        # grab the number and convert the string into an integer
        drug_reviews = int(reviews_link.string.split(' ')[0].replace(',', ''))

        # only drugs with at least 100 reviews are collected
        if drug_reviews < 100:
            continue

        url_ = "https://www.drugs.com" + str(reviews_link['href'])
        # accumulate instead of overwriting, so every drug's reviews are kept
        all_reviews.extend(get_reviews(url_, drug_name, drug_reviews, '2021'))

    # get_reviews yields rows ordered [date, review, sentiment]; label them
    # accordingly, then present the columns in the documented order
    drug_data = pd.DataFrame(all_reviews, columns=['date', 'review', 'sentiment'])
    return drug_data[['date', 'sentiment', 'review']]

In [None]:
def get_webpage(link, timeout=30):
    '''Get the contents of a webpage.
    Input:
        link = Link to the desired webpage
        timeout = Seconds to wait for the server before giving up
    Output: A tuple (response, webpage) where response is the requests
        Response and webpage is the Beautiful Soup object containing the
        parsed HTML data
    '''
    # load the webpage content; without a timeout a stalled server
    # would block the whole scrape indefinitely
    r = requests.get(link, timeout=timeout)
    # convert to a beautiful soup object, naming the parser explicitly so
    # the parse does not depend on which optional parsers are installed
    webpage = bs(r.content, "html.parser")
    return r, webpage


def _strip_wrapping_quotes(text):
    '''Trim whitespace and at most one quotation mark from each end of text.'''
    quote_chars = ('"', '\u201c', '\u201d')
    text = text.strip()
    if text.startswith(quote_chars):
        text = text[1:]
    if text.endswith(quote_chars):
        text = text[:-1]
    return text


def get_reviews(url_, drug_name, drug_reviews, date_cutoff=False):
    '''Get all of the reviews of a specific drug
    Input:
        url_ = link to the first page of the drug's reviews
        drug_name = name of the drug (only used in the progress message)
        drug_reviews = number of reviews (currently unused)
        date_cutoff = Do you want to limit the date of reviews collected?
            When given (e.g. '2021'), collection stops at the first review
            whose date ends with this text. A falsy value disables the limit.
    Output: A list containing a list for each rated review:
        [date, review text, sentiment], where sentiment is 'pos' or 'neg'.
        Reviews without a rating are skipped.
    '''
    #------------------------#
    # 1. initalize variables #
    #------------------------#
    reviews, page, reached_cutoff = [], 0, False
    date_pattern = re.compile(", ")
    # NOTE(review): paging starts at 0 as in the original; confirm whether
    # the site treats page=0 and page=1 as the same page.

    #---------------------------------------------------------#
    # 2. "clicking" the "#### Reviews" hyperlink in the table #
    #---------------------------------------------------------#
    review_page_url = url_

    #---------------------------------------#
    # 3. cycle through each page of reviews #
    #---------------------------------------#
    while True:

        # access the page and sort the reviews by most recent reviews
        _, review_page_content = get_webpage(review_page_url + f"?sort_reviews=most_recent&page={page}")

        # TO-DO: There is an issue here with grabbing reviews after page 5
        list_of_review_boxes = review_page_content.find_all("div", {"class":"ddc-comment ddc-box ddc-mgb-2"})

        # a page without review boxes means there is nothing left to read;
        # stopping here also guarantees the loop cannot run forever
        if not list_of_review_boxes:
            break

        # grab the date, review paragraph, and rating
        for review in list_of_review_boxes:
            # 1. find the date, if it is at the cutoff year, stop collecting
            head = review.find("div", {"class":"ddc-comment-header"})
            date_tag = head.find("span", string=date_pattern) if head else None
            if date_tag is None:
                continue
            # plain str so the stored value does not keep the parse tree alive
            date = str(date_tag.string)
            if date_cutoff and date.endswith(str(date_cutoff)):
                reached_cutoff = True
                break

            # 2. find the review rating, if it exists, if not, ignore
            rating_box = review.find("div", {"class":"ddc-rating-summary"})
            if rating_box is None:
                continue
            rating = int(rating_box.span.b.string)
            # assumes the site's scale runs low-to-high (higher is better),
            # so ratings below 5 are the negative ones
            sentiment = 'neg' if rating < 5 else 'pos'

            # 3. get the review paragraph
            paragraph = review.p
            if paragraph is None:
                continue
            if paragraph.b:
                # the text follows the bold prefix inside the paragraph
                following = paragraph.b.next_sibling
                raw_text = '' if following is None else str(following)
            else:
                # get_text() yields the text itself rather than "<p>...</p>" markup
                raw_text = paragraph.get_text()
            review_paragraph = _strip_wrapping_quotes(raw_text)

            reviews.append([date, review_paragraph, sentiment])

        # go to next page if not on the last page; find() returns None when
        # the "next" control is absent (find_all() never does)
        if reached_cutoff or review_page_content.find("li", class_='ddc-paging-item-next') is None:
            break
        page += 1
    print("Done with ", drug_name)
    return reviews
    

if __name__ == "__main__":
    reviews = main()
    # a bare expression nested in an if-block is evaluated and discarded,
    # so the preview has to be printed explicitly
    print(reviews.head())