# Part 1: Extract list of all contraceptives listed on WebMD with reviews. 

In [0]:
import pandas as pd
import html5lib
import requests
from bs4 import BeautifulSoup
import re
import urllib

In [0]:
#extract list of all contraceptives in webmd and save them into a DataFrame
contraception_url = "https://www.webmd.com/drugs/2/condition-3454/pregnancy%20contraception"

header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

r = requests.get(contraception_url, headers=header)

contraceptives = pd.read_html(r.text)[0]

In [35]:
#going to extract links to drug reviews
req = urllib.request.Request(contraception_url, headers={'User-Agent' : "Magic Browser"}) 
html_page = urllib.request.urlopen( req )
soup = BeautifulSoup(html_page)

contraceptive_table = soup.find('table', {'class': 'drugs-treatments-table'})
review_links = []

for drug_link in contraceptive_table.findAll('a'):
    link = drug_link.get('href')
    if "drugreview" in link:
      review_links.append("https://www.webmd.com" + link)

contraceptives['Review_Link'] = pd.Series(review_links)

['https://www.webmd.com/drugs/drugreview-20420-mirena-device.aspx?drugid=20420&drugname=mirena-device', 'https://www.webmd.com/drugs/drugreview-64195-sprintec.aspx?drugid=64195&drugname=sprintec', 'https://www.webmd.com/drugs/drugreview-95358-yaz.aspx?drugid=95358&drugname=yaz', 'https://www.webmd.com/drugs/drugreview-78136-tri-sprintec.aspx?drugid=78136&drugname=tri-sprintec', 'https://www.webmd.com/drugs/drugreview-95194-loestrin-24-fe-tablet.aspx?drugid=95194&drugname=loestrin-24-fe-tablet', 'https://www.webmd.com/drugs/drugreview-63410-nuvaring-ring.aspx?drugid=63410&drugname=nuvaring-ring', 'https://www.webmd.com/drugs/drugreview-13028-paragard-t-380-a-device.aspx?drugid=13028&drugname=paragard-t-380-a-device', 'https://www.webmd.com/drugs/drugreview-144857-implanon-implant.aspx?drugid=144857&drugname=implanon-implant', 'https://www.webmd.com/drugs/drugreview-53357-depo-provera-contraceptive-suspension.aspx?drugid=53357&drugname=depo-provera-contraceptive-suspension', 'https://www

In [36]:
contraceptives.head(10)

Unnamed: 0,Drug Name,Indication,Type,User Reviews,Review_Link
0,Mirena Intrauterine Device,On Label,RX,1763 Reviews,https://www.webmd.com/drugs/drugreview-20420-m...
1,Sprintec,On Label,RX,960 Reviews,https://www.webmd.com/drugs/drugreview-64195-s...
2,YAZ,On Label,RX,683 Reviews,https://www.webmd.com/drugs/drugreview-95358-y...
3,Tri-Sprintec,On Label,RX,640 Reviews,https://www.webmd.com/drugs/drugreview-78136-t...
4,Loestrin 24 Fe tablet,On Label,RX,620 Reviews,https://www.webmd.com/drugs/drugreview-95194-l...
5,"NuvaRing Ring, Vaginal",On Label,RX,591 Reviews,https://www.webmd.com/drugs/drugreview-63410-n...
6,ParaGard T 380-A Intrauterine Device,On Label,RX,516 Reviews,https://www.webmd.com/drugs/drugreview-13028-p...
7,Implanon Implant,On Label,RX,475 Reviews,https://www.webmd.com/drugs/drugreview-144857-...
8,Depo-Provera Contraceptive Suspension,On Label,RX,452 Reviews,https://www.webmd.com/drugs/drugreview-53357-d...
9,Nexplanon Implant,On Label,RX,415 Reviews,https://www.webmd.com/drugs/drugreview-156597-...


In [0]:
reviews = contraceptives["User Reviews"].str.split(" ", n = 1, expand = True)
contraceptives["Num_Reviews"] = reviews[0]
contraceptives = contraceptives[contraceptives["Num_Reviews"] > "0"]

In [40]:
contraceptives

Unnamed: 0,Drug Name,Indication,Type,User Reviews,Review_Link,Num_Reviews
0,Mirena Intrauterine Device,On Label,RX,1763 Reviews,https://www.webmd.com/drugs/drugreview-20420-m...,1763
1,Sprintec,On Label,RX,960 Reviews,https://www.webmd.com/drugs/drugreview-64195-s...,960
2,YAZ,On Label,RX,683 Reviews,https://www.webmd.com/drugs/drugreview-95358-y...,683
3,Tri-Sprintec,On Label,RX,640 Reviews,https://www.webmd.com/drugs/drugreview-78136-t...,640
4,Loestrin 24 Fe tablet,On Label,RX,620 Reviews,https://www.webmd.com/drugs/drugreview-95194-l...,620
...,...,...,...,...,...,...
273,Gynol II Extra Strength Gel,On Label,OTC,1 Reviews,https://www.webmd.com/drugs/drugreview-9226-gy...,1
274,norgestrel tablet,On Label,RX,1 Reviews,https://www.webmd.com/drugs/drugreview-8820-no...,1
275,ETHYNODIOL-ETHINYL estradiol,On Label,RX,1 Reviews,https://www.webmd.com/drugs/drugreview-6749-et...,1
276,Tri-Levlen (21) tablet,On Label,RX,1 Reviews,https://www.webmd.com/drugs/drugreview-17178-t...,1


# Part 2: Follow links to contraceptive reviews to extract reviews for each contraceptive

In [0]:
def get_reviews(url, header, num_reviews):
  contraceptive_reviews = []

  for page_ind in range((num_reviews // 5) + 1):
    reviews_url = url + "&pageIndex={ind}&sortby=3&conditionFilter=-1".format(ind = page_ind)
    req = urllib.request.Request(reviews_url, headers=header)
    html_page = urllib.request.urlopen( req )
    soup = BeautifulSoup(html_page)

    ratings = soup.findAll('div', {'class': 'userPost'})

    for i in range(len(ratings)):
      review = ratings[i]
      reviewer_info = review.find('p', {'class': 'reviewerInfo'}).getText()
      comment = review.find('p', {'id': 'comFull{num}'.format(num=i+1)}).getText()

      contraceptive_reviews.append([reviewer_info, comment])
    
  return contraceptive_reviews

In [43]:
contraceptives['Review'] = contraceptives.apply(lambda row: get_reviews(row.Review_Link, header, int(row.Num_Reviews)), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [44]:
contraceptives.head()

Unnamed: 0,Drug Name,Indication,Type,User Reviews,Review_Link,Num_Reviews,Review
0,Mirena Intrauterine Device,On Label,RX,1763 Reviews,https://www.webmd.com/drugs/drugreview-20420-m...,1763,"[[Reviewer: Carouselambra, 25-34 on Treatment ..."
1,Sprintec,On Label,RX,960 Reviews,https://www.webmd.com/drugs/drugreview-64195-s...,960,"[[Reviewer: HowInTheHeck, 25-34 Female on Tre..."
2,YAZ,On Label,RX,683 Reviews,https://www.webmd.com/drugs/drugreview-95358-y...,683,"[[Reviewer: Tish, 35-44 on Treatment for less ..."
3,Tri-Sprintec,On Label,RX,640 Reviews,https://www.webmd.com/drugs/drugreview-78136-t...,640,[[Reviewer: 35-44 Female on Treatment for 10 ...
4,Loestrin 24 Fe tablet,On Label,RX,620 Reviews,https://www.webmd.com/drugs/drugreview-95194-l...,620,"[[Reviewer: NV, 35-44 Female on Treatment for..."


# Part 3: Extract the age range of the reviewer, their contraceptive treatment length, and comment

In [0]:
def get_treatment_length(reviewer_info):
  rev_info_split = reviewer_info.split('Treatment for')
  if rev_info_split[0] != reviewer_info:
    return rev_info_split[1].split('(Patient)')[0]
  return ""

In [0]:
webmd_contraceptive_reviews = contraceptives.explode("Review")
webmd_contraceptive_reviews[['Reviewer_Info','Comment']] = pd.DataFrame(webmd_contraceptive_reviews.Review.tolist(), index= webmd_contraceptive_reviews.index)
webmd_contraceptive_reviews['Reviewer_Age_Range'] = webmd_contraceptive_reviews['Reviewer_Info'].str.extract(r'([0-9][0-9]\-[0-9][0-9])')
webmd_contraceptive_reviews['Treatment_Length'] = webmd_contraceptive_reviews.apply(lambda row: get_treatment_length(row.Reviewer_Info), axis = 1)
webmd_contraceptive_reviews['Comment'] = webmd_contraceptive_reviews['Comment'].str.replace('Comment:', '').str.replace('Hide Full Comment', '')

In [94]:
webmd_contraceptive_reviews.drop(columns=['Type', 'User Reviews', 'Review_Link', 'Review', 'Num_Reviews', 'Indication', 'Reviewer_Info'], inplace=True)
webmd_contraceptive_reviews.reset_index(inplace=True, drop=True)
webmd_contraceptive_reviews.head()

Unnamed: 0,Drug Name,Comment,Reviewer_Age_Range,Treatment_Length
0,Mirena Intrauterine Device,"I did initially experience excruciating, breat...",25-34,5 to less than 10 years
1,Mirena Intrauterine Device,Memory Loss!!! Never again on any form of birt...,35-44,2 to less than 5 years
2,Mirena Intrauterine Device,Horrible experience would not recommend to nob...,,
3,Mirena Intrauterine Device,I had a Mirena placed in me in 2006. I had maj...,45-54,5 to less than 10 years
4,Mirena Intrauterine Device,The Mirena IUD was the worst thing to ever hap...,19-24,2 to less than 5 years


In [0]:
webmd_contraceptive_reviews.to_csv('webmd_contraceptive_reviews.csv')
!cp webmd_contraceptive_reviews.csv "drive/My Drive/"