# Part 1: Extract the list of all contraceptives on WebMD that have reviews

In [0]:
import io
import re
import urllib
import urllib.request

import html5lib
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [0]:
# Extract the list of all contraceptives on WebMD and save it into a DataFrame.
contraception_url = "https://www.webmd.com/drugs/2/condition-3454/pregnancy%20contraception"

# Browser-like request headers; the later scraping cells reuse this dict.
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error (e.g. a blocked request) here rather
# than as a confusing "No tables found" from read_html below.
r = requests.get(contraception_url, headers=header, timeout=30)
r.raise_for_status()

# Recent pandas versions deprecate passing a literal HTML string to read_html,
# so hand it a file-like object. The first table on the page is taken as the
# drug list.
contraceptives = pd.read_html(io.StringIO(r.text))[0]

In [0]:
# Extract the link to each drug's review page from the listing table.
req = urllib.request.Request(contraception_url, headers={'User-Agent' : "Magic Browser"})
# The context manager closes the HTTP response once the page has been parsed.
with urllib.request.urlopen(req) as html_page:
  soup = BeautifulSoup(html_page)

contraceptive_table = soup.find('table', {'class': 'drugs-treatments-table'})
review_links = []

for drug_link in contraceptive_table.findAll('a'):
  link = drug_link.get('href')
  # An anchor without an href yields None; skip it instead of letting the
  # membership test raise TypeError.
  if link and "drugreview" in link:
    review_links.append("https://www.webmd.com" + link)

# NOTE(review): links are attached to rows purely by position (0, 1, 2, ...).
# This assumes the table lists its review links in row order and that any
# drug without a review link comes after all drugs that have one - confirm.
contraceptives['Review_Link'] = pd.Series(review_links)
contraceptives = contraceptives.rename(columns={"Drug Name": "Drug_Name", "User Reviews": "User_Reviews"})

In [0]:
# "User_Reviews" holds text such as "1763 Reviews"; the first token is the count.
reviews = contraceptives["User_Reviews"].str.split(" ", n = 1, expand = True)
contraceptives["Num_Reviews"] = reviews[0]

# Compare numerically: a string comparison against "0" is lexicographic, so any
# non-numeric text (e.g. one starting with a letter) would pass the filter and
# later fail in int(). Values that cannot be parsed become NaN and are dropped.
has_reviews = pd.to_numeric(contraceptives["Num_Reviews"], errors="coerce") > 0

# .copy() makes the filtered frame independent, so the columns added in later
# cells do not trigger SettingWithCopyWarning.
contraceptives = contraceptives[has_reviews].copy()

In [5]:
# Display the contraceptives that remain after the review-count filter.
contraceptives

Unnamed: 0,Drug_Name,Indication,Type,User_Reviews,Review_Link,Num_Reviews
0,Mirena Intrauterine Device,On Label,RX,1763 Reviews,https://www.webmd.com/drugs/drugreview-20420-m...,1763
1,Sprintec,On Label,RX,960 Reviews,https://www.webmd.com/drugs/drugreview-64195-s...,960
2,YAZ,On Label,RX,683 Reviews,https://www.webmd.com/drugs/drugreview-95358-y...,683
3,Tri-Sprintec,On Label,RX,640 Reviews,https://www.webmd.com/drugs/drugreview-78136-t...,640
4,Loestrin 24 Fe tablet,On Label,RX,620 Reviews,https://www.webmd.com/drugs/drugreview-95194-l...,620
...,...,...,...,...,...,...
273,VCF Gel with Prefilled Applicator,On Label,OTC,1 Reviews,https://www.webmd.com/drugs/drugreview-9226-gy...,1
274,Femynor,On Label,RX,1 Reviews,https://www.webmd.com/drugs/drugreview-8820-no...,1
275,TriNessa Lo,On Label,RX,1 Reviews,https://www.webmd.com/drugs/drugreview-6749-et...,1
276,Tri-Lo-Estarylla,On Label,RX,1 Reviews,https://www.webmd.com/drugs/drugreview-17178-t...,1


# Part 2: Identify short term and long term contraceptives

According to the DHS Program survey report, long-term contraceptive methods are **male or female sterilization, Norplant, and the IUD** (https://dhsprogram.com/pubs/pdf/FR19/FR19.pdf?fbclid=IwAR0On3a_QrRgD3zem8DSpaVRfKBB3fxArdYEzryDViHdj2hB9zdmkub1CXs, page 56). Since sterilization is not a *drug*, we mark only Norplant-type implants and IUDs as long term; all other contraceptives are treated as short term.

In [6]:
# Get the list of IUD names from WebMD's drug search.
iud_url = "https://www.webmd.com/drugs/2/search?type=drugs&query=Intrauterine%20Device"

req = urllib.request.Request(iud_url, headers=header)
# The context manager closes the HTTP response once the page has been parsed.
with urllib.request.urlopen(req) as html_page:
  soup = BeautifulSoup(html_page)

# The drug names are the anchor texts inside the "exact-match" result list.
iuds = soup.find('ul', {'class': 'exact-match'}).findAll('a')
iuds_list = [iud.getText() for iud in iuds]

iuds_list

['Progestasert Intrauterine Device',
 'progesterone Intrauterine Device',
 'Mirena Intrauterine Device',
 'Skyla Intrauterine Device',
 'copper Intrauterine Device',
 'levonorgestrel Intrauterine Device',
 'Kyleena Intrauterine Device',
 'ParaGard T 380-A Intrauterine Device',
 'levonorgestrel 14 mcg/24 hour (3 years) intrauterine device',
 'Liletta 19.5 mcg/24 hrs (5 yrs) 52 mg intrauterine device Contraceptives']

In [7]:
# Get the list of Norplant-style implant names from WebMD's drug search.
norplant_url = "https://www.webmd.com/drugs/2/search?type=drugs&query=Norplant%20System%20implant"

req = urllib.request.Request(norplant_url, headers=header)
# The context manager closes the HTTP response once the page has been parsed.
with urllib.request.urlopen(req) as html_page:
  soup = BeautifulSoup(html_page)

# This query reads the "partial-match" result list (the IUD query reads
# "exact-match").
# NOTE(review): the partial-match results include implants that do not look
# like contraceptives (see the output below). They only matter if one of those
# names also appears in the contraceptives table - confirm.
norplants = soup.find('ul', {'class': 'partial-match'}).findAll('a')
norplants_list = [norplant.getText() for norplant in norplants]

norplants_list

['Norplant System Kit',
 'Nexplanon Implant',
 'goserelin Implant',
 'Sinuva Implant',
 'fluocinolone Implant',
 'Ozurdex Implant',
 'Retisert Implant',
 'Zoladex Implant',
 'Yutiq Implant',
 'Iluvien Implant']

In [8]:
# Every scraped IUD name followed by every scraped implant name counts as long term.
long_term_contraceptives = [*iuds_list, *norplants_list]

def is_long_term(contraceptive, long_term_lst):
  """Return 1 if `contraceptive` is an element of `long_term_lst`, else 0.

  Membership is an exact match against the elements of `long_term_lst`.
  """
  return 1 if contraceptive in long_term_lst else 0

def is_short_term(contraceptive, long_term_lst):
  """Return 0 if `contraceptive` is an element of `long_term_lst`, else 1.

  Anything not found in `long_term_lst` (exact match) counts as short term.
  """
  return 0 if contraceptive in long_term_lst else 1

# Add one 0/1 flag column per classifier, computed row by row from Drug_Name.
for flag_column, classify in (('Long_Term', is_long_term), ('Short_Term', is_short_term)):
  contraceptives[flag_column] = contraceptives.apply(
      # Bind the classifier as a default argument so each lambda keeps its own.
      lambda row, classify=classify: classify(row.Drug_Name, long_term_contraceptives),
      axis = 1,
  )
contraceptives.head()

Unnamed: 0,Drug_Name,Indication,Type,User_Reviews,Review_Link,Num_Reviews,Long_Term,Short_Term
0,Mirena Intrauterine Device,On Label,RX,1763 Reviews,https://www.webmd.com/drugs/drugreview-20420-m...,1763,1,0
1,Sprintec,On Label,RX,960 Reviews,https://www.webmd.com/drugs/drugreview-64195-s...,960,0,1
2,YAZ,On Label,RX,683 Reviews,https://www.webmd.com/drugs/drugreview-95358-y...,683,0,1
3,Tri-Sprintec,On Label,RX,640 Reviews,https://www.webmd.com/drugs/drugreview-78136-t...,640,0,1
4,Loestrin 24 Fe tablet,On Label,RX,620 Reviews,https://www.webmd.com/drugs/drugreview-95194-l...,620,0,1


# Part 3: Follow links to contraceptive reviews to extract reviews for each contraceptive

In [0]:
def get_reviews(url, header, num_reviews):
  """Scrape the reviewer info and comment text of every review for one drug.

  Args:
    url: Review-page URL of the drug; the paging, sorting and filter query
      parameters are appended to it.
    header: Dict of HTTP request headers sent with every page request.
    num_reviews: Number of reviews listed for the drug. It determines how many
      pages are requested (the paging arithmetic assumes 5 reviews per page).

  Returns:
    A list of [reviewer_info, comment] pairs of strings, in page order. An
    element that is missing from a post is recorded as "".
  """
  contraceptive_reviews = []

  # num_reviews // 5 full pages, plus one more for a partial last page.
  for page_ind in range((num_reviews // 5) + 1):
    reviews_url = url + "&pageIndex={ind}&sortby=3&conditionFilter=-1".format(ind = page_ind)
    req = urllib.request.Request(reviews_url, headers=header)
    # Many pages are fetched per drug, so release each HTTP response as soon
    # as its page has been parsed instead of leaving it open.
    with urllib.request.urlopen(req) as html_page:
      soup = BeautifulSoup(html_page)

    ratings = soup.findAll('div', {'class': 'userPost'})

    # The full-comment paragraph is looked up by the post's 1-based position
    # on the page: comFull1, comFull2, ...
    for position, review in enumerate(ratings, start=1):
      reviewer_tag = review.find('p', {'class': 'reviewerInfo'})
      comment_tag = review.find('p', {'id': 'comFull{num}'.format(num=position)})

      # find() returns None for a missing element; calling getText() on it
      # would raise AttributeError and abort the whole scrape, so fall back
      # to an empty string.
      reviewer_info = reviewer_tag.getText() if reviewer_tag is not None else ""
      comment = comment_tag.getText() if comment_tag is not None else ""

      contraceptive_reviews.append([reviewer_info, comment])

  return contraceptive_reviews

In [0]:
# Scrape the reviews of every drug; each 'Review' cell receives the list
# returned by get_reviews for that row.
def _reviews_for_row(row):
  return get_reviews(row.Review_Link, header, int(row.Num_Reviews))

contraceptives['Review'] = contraceptives.apply(_reviews_for_row, axis = 1)

In [11]:
# Preview the first rows, now including the scraped 'Review' column.
contraceptives.head()

Unnamed: 0,Drug_Name,Indication,Type,User_Reviews,Review_Link,Num_Reviews,Long_Term,Short_Term,Review
0,Mirena Intrauterine Device,On Label,RX,1763 Reviews,https://www.webmd.com/drugs/drugreview-20420-m...,1763,1,0,"[[Reviewer: Carouselambra, 25-34 on Treatment ..."
1,Sprintec,On Label,RX,960 Reviews,https://www.webmd.com/drugs/drugreview-64195-s...,960,0,1,"[[Reviewer: HowInTheHeck, 25-34 Female on Tre..."
2,YAZ,On Label,RX,683 Reviews,https://www.webmd.com/drugs/drugreview-95358-y...,683,0,1,"[[Reviewer: Tish, 35-44 on Treatment for less ..."
3,Tri-Sprintec,On Label,RX,640 Reviews,https://www.webmd.com/drugs/drugreview-78136-t...,640,0,1,[[Reviewer: 35-44 Female on Treatment for 10 ...
4,Loestrin 24 Fe tablet,On Label,RX,620 Reviews,https://www.webmd.com/drugs/drugreview-95194-l...,620,0,1,"[[Reviewer: NV, 35-44 Female on Treatment for..."


# Part 4: Extract the age range of the reviewer, their contraceptive treatment length, and comment

In [0]:
def get_treatment_length(reviewer_info):
  """Extract the treatment-duration text from a reviewer-info string.

  Args:
    reviewer_info: Reviewer description text, e.g. (illustrative)
      "Reviewer: NV, 35-44 Female on Treatment for 2 to less than 5 years
      (Patient)". Non-string values such as None or NaN are treated as
      carrying no duration.

  Returns:
    The text between the first "Treatment for" and the following "(Patient)",
    with its surrounding whitespace left untouched, or "" when the input has
    no "Treatment for" marker or is not a string.
  """
  # A missing value arrives as None/NaN rather than str, and calling .split
  # on it would raise AttributeError.
  if not isinstance(reviewer_info, str):
    return ""
  if 'Treatment for' not in reviewer_info:
    return ""
  after_marker = reviewer_info.split('Treatment for')[1]
  return after_marker.split('(Patient)')[0]

In [0]:
# One row per review. A drug whose scrape returned an empty list explodes to a
# single NaN row; those are dropped because they have no
# [reviewer_info, comment] pair to unpack. explode() repeats each drug's index
# label, so the rows are renumbered to keep the column assignments below from
# aligning on duplicated labels.
webmd_contraceptive_reviews = (
    contraceptives
    .explode("Review")
    .dropna(subset=["Review"])
    .reset_index(drop=True)
)

# Split each [reviewer_info, comment] pair into two columns.
review_parts = pd.DataFrame(
    webmd_contraceptive_reviews["Review"].tolist(),
    columns=["Reviewer_Info", "Comment"],
    index=webmd_contraceptive_reviews.index,
)
webmd_contraceptive_reviews[["Reviewer_Info", "Comment"]] = review_parts

# First "NN-NN" pattern in the reviewer info; expand=False returns a Series.
# NOTE(review): ranges that are not two digits on both sides are not matched.
webmd_contraceptive_reviews['Reviewer_Age_Range'] = webmd_contraceptive_reviews['Reviewer_Info'].str.extract(r'([0-9][0-9]\-[0-9][0-9])', expand=False)
webmd_contraceptive_reviews['Treatment_Length'] = webmd_contraceptive_reviews['Reviewer_Info'].apply(get_treatment_length)

# Strip the page boilerplate from the comment text. regex=False makes the
# literal match explicit instead of relying on the pandas version's default.
webmd_contraceptive_reviews['Comment'] = (
    webmd_contraceptive_reviews['Comment']
    .str.replace('Comment:', '', regex=False)
    .str.replace('Hide Full Comment', '', regex=False)
)

In [14]:
# Drop the columns that were only needed for scraping/parsing, then renumber
# the rows from 0.
scrape_only_columns = ['Type', 'User_Reviews', 'Review_Link', 'Review', 'Num_Reviews', 'Indication', 'Reviewer_Info']
webmd_contraceptive_reviews = (
    webmd_contraceptive_reviews
    .drop(columns=scrape_only_columns)
    .reset_index(drop=True)
)
webmd_contraceptive_reviews.head()

Unnamed: 0,Drug_Name,Long_Term,Short_Term,Comment,Reviewer_Age_Range,Treatment_Length
0,Mirena Intrauterine Device,1,0,"I did initially experience excruciating, breat...",25-34,5 to less than 10 years
1,Mirena Intrauterine Device,1,0,Memory Loss!!! Never again on any form of birt...,35-44,2 to less than 5 years
2,Mirena Intrauterine Device,1,0,Horrible experience would not recommend to nob...,,
3,Mirena Intrauterine Device,1,0,I had a Mirena placed in me in 2006. I had maj...,45-54,5 to less than 10 years
4,Mirena Intrauterine Device,1,0,The Mirena IUD was the worst thing to ever hap...,19-24,2 to less than 5 years


In [0]:
# Write the per-review table to a CSV in the working directory. `index` is left
# at its default, so the row index is written as the first column.
webmd_contraceptive_reviews.to_csv('webmd_contraceptive_reviews.csv')
# Copy the CSV into "drive/My Drive/" via the shell.
# NOTE(review): presumably a mounted Google Drive; the mount step is not shown
# in this notebook - confirm it exists before running.
!cp webmd_contraceptive_reviews.csv "drive/My Drive/"