# BeautifulSoup Scraping

In [1]:
# import jtplot module in notebook
from jupyterthemes import jtplot

# apply the plotting style of one jupyterthemes theme to the figures in this notebook
# available themes: onedork | grade3 | oceans16 | chesterish | monokai | solarizedl | solarizedd
plot_theme = 'onedork'
jtplot.style(theme=plot_theme)

In [2]:
# import the libraries
import csv
import getpass
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from robobrowser import RoboBrowser

<h2>Structure</h2>
<ul>
    <li>We can't access product reviews unless we're logged in as a registered user. So we have to handle login.</li>
    <li>We will be using the most reviewed products page</li>
    <li>We will focus on "Foundation" products</li>
    <li>There are 7 different "Foundation" subcategories, each with its own unique "CategoryID" in its URL
        <ol>
            <li><strong>Powder</strong> - CategoryID: 503</li>
            <li><strong>Primer/Corrector</strong> - CategoryID: 504</li>
            <li><strong>Stick</strong> - CategoryID: 506</li>
            <li><strong>Tinted Moisturizer</strong> - CategoryID: 505</li>
            <li><strong>Liquid</strong> - CategoryID: 502</li>
            <li><strong>Crème</strong> - CategoryID: 501</li>
            <li><strong>BB Cream</strong> - CategoryID: 507</li>
        </ol>
    </li>
    <li>Each subcategory lists products, and for each product we will scrape
        <ul>
            <li><strong>Brand Name</strong> </li>
            <li><strong>Product Name</strong> </li>
            <li><strong>Category</strong> </li>
            <li><strong>Average Rating</strong> </li>
            <li><strong>Total Number of Reviews</strong> </li>
            <li><strong>% That Would Buy Again</strong></li>
        </ul>
    </li>
    <li>We will scrape the first 5 pages of products for each subcategory, except "Stick", which only has 2 pages of products, so for "Stick" we will scrape just those 2 pages</li>
    <li>Each product page lists its reviews at the bottom. For each review we will scrape
        <ul>
            <li><strong>Rating</strong> - 1 to 5 lipsticks (aka stars)</li>
            <li><strong>username</strong> </li>
            <li><strong>date</strong> - date that the review was posted</li>
            <li><strong>age</strong> - age range of the reviewer</li>
            <li><strong>skin</strong> - skin information of the reviewer</li>
            <li><strong>hair</strong> - hair information of the reviewer</li>
            <li><strong>eyes</strong> - eye color of the reviewer</li>
            <li><strong>review</strong> - the review itself</li>
        </ul>
    </li>
</ul>


### Login

In [3]:
# address of the MakeupAlley login page; the login cell opens it and also sends it as the Referer header
login_url = "https://www.makeupalley.com/account/login.asp"

In [4]:
# Log in through RoboBrowser so that browser.session holds the cookies of the logged-in user.
# Passing the parser explicitly avoids BeautifulSoup's "no parser was explicitly specified" warning.
browser = RoboBrowser(history=True, parser='html.parser')
browser.open(login_url)
form = browser.get_form(action='/account/login.asp')
if form is None:
    # get_form returns None when no matching form exists; fail here instead of on form["UserName"]
    raise RuntimeError("Login form not found at " + login_url)

form["UserName"] = input("Enter Username: ")
# getpass does not echo what is typed, so the password never ends up in the saved cell output
form["Password"] = getpass.getpass("Enter Password: ")
browser.session.headers['Referer'] = login_url

browser.submit_form(form)

# NOTE(review): this is only an HTTP-level check; a rejected password may still come back
# with a 200 response - confirm what the site returns on a failed login and test for that too.
if not browser.response.ok:
    raise RuntimeError("Login request failed with HTTP status " + str(browser.response.status_code))

print("Login Successful!")



  features=self.browser.parser,


Enter Username: ugca
Enter Password: ········
Login Successful!


### Begin Scraper

In [5]:
# The seven "Foundation" subcategories, in the order they will be scraped
categories = [
    "Powder", "Primer/Corrector", "Stick", "Tinted Moisturizer",
    "Liquid", "Crème", "BB Cream",
]

In [6]:
# Map each subcategory name to the CategoryID that appears in its listing url
cat_id = {
    "Powder": 503,
    "Primer/Corrector": 504,
    "Stick": 506,
    "Tinted Moisturizer": 505,
    "Liquid": 502,
    "Crème": 501,
    "BB Cream": 507,
}

In [7]:
# one entry per product, filled by the scraper:
# brand_name, product_name, category, average_rating, total_reviews, buy_again_percentage%
product_info = list()

In [8]:
# one entry per review, filled by the scraper:
# product_name, rating, username, date, age, skin, hair, eyes, review
reviews = list()

In [None]:
# skeleton urls: the empty "page=", "CategoryId=" and "ItemID=" slots are filled in per request
base_url = "https://www.makeupalley.com/product/browse.asp/page=/pagesize=15/CategoryId=/topten=reviewed/AgeRange=0/"
base = "https://www.makeupalley.com"
base_product_url = "https://www.makeupalley.com/product/showreview.asp/ItemID=/page=/"

# how many listing pages to scrape per category; "Stick" only has 2 pages of products
DEFAULT_LISTING_PAGES = 5
LISTING_PAGES = {"Stick": 2}

# every review row stores exactly four reviewer traits: age, skin, hair, eyes
TRAIT_COUNT = 4


def fetch_soup(url):
    """Download `url` and return the page parsed with html.parser.

    The request goes through `browser.session`, the session that submitted the
    login form, so its cookies are sent along. A bare `requests.get` starts from
    an empty cookie jar and would fetch the page as an anonymous visitor.
    """
    response = browser.session.get(url)
    return BeautifulSoup(response.text, 'html.parser')


def parse_product_row(product):
    """Extract one product from a <tr> of the listing table.

    Returns a tuple (row_product_data, product_name, product_link).
    row_product_data holds, in column order: brand, product name, category,
    average rating (float), number of reviews (int) and the buy-again
    percentage as a fraction (float). It is empty when the row has no <td>
    cells; such rows are skipped by the caller.
    """
    columns = product.find_all("td")
    row_product_data = []
    product_name = None
    product_link = ""
    # only the first six columns carry data that is stored
    for position, cell in enumerate(columns[:6]):
        if position == 0:
            # first column is the brand
            row_product_data.append(cell.text)
        elif position == 1:
            # second column is the product; its second link holds the name and the product page url
            name_anchor = cell.find_all("a", href=True)[1]
            product_name = name_anchor.text
            product_link = base + name_anchor["href"]
            row_product_data.append(product_name)
        elif position == 2:
            # third column is the category name
            row_product_data.append(cell.text)
        elif position == 3:
            # fourth column is the average rating
            row_product_data.append(float(cell.text))
        elif position == 4:
            # fifth column is the number of reviews, written with thousands separators
            row_product_data.append(int(cell.text.replace(",", "")))
        else:
            # sixth column is the buy-again percentage, stored as a fraction
            row_product_data.append(float(cell.text.replace("%", "")) / 100.0)
    return row_product_data, product_name, product_link


def find_last_review_page(product_soup):
    """Return the number of the last review page, or None if it cannot be read.

    The last page is taken from the href of the second-to-last element of
    class "track_Paging_".
    """
    page_trackers = product_soup.find_all(class_="track_Paging_", href=True)
    if len(page_trackers) < 2:
        # NOTE(review): assumes a product page without paging links has a single
        # page of reviews - confirm against the site markup.
        return 1
    match = re.search(r"page=(\d+)", page_trackers[-2]["href"])
    return int(match.group(1)) if match else None


def parse_review(comment, product_name):
    """Build one review row from a "comments" element.

    Returns [product_name, rating, username, date, age, skin, hair, eyes, review],
    or None (after printing a message) when the review text cannot be found.
    """
    row = [product_name]

    # the rating is the third character of the first css class of the span inside "lipies"
    score = comment.find(class_="lipies").find("span")
    row.append(score["class"][0][2])

    row.append(comment.find(class_="user-name").text.replace("\t", ""))
    row.append(comment.find(class_="date").text)

    # age, skin, hair and eyes: drop the labels, then split on the remaining colons
    traits_text = comment.find(class_="important").text
    for label in ("Age:", "Skin", "Hair", "Eyes"):
        traits_text = traits_text.replace(label, "")
    trait_values = [value.strip() for value in traits_text.split(":")]
    # pad or truncate to exactly four values so every row matches the 9-column reviews table
    row.extend((trait_values + [""] * TRAIT_COUNT)[:TRAIT_COUNT])

    # the review body normally has class "break-word"; fall back to "1break-word"
    review = comment.find(class_="break-word")
    if review is None:
        review = comment.find(class_="1break-word")
    if review is None:
        print("Skipping a review of", product_name, "- no review text found")
        return None
    row.append(review.text.replace("\t", ""))
    return row


def scrape_product_reviews(product_name, product_link):
    """Scrape every review page of one product and append the rows to `reviews`."""
    product_soup = fetch_soup(product_link)

    # the review urls are built from the product id shown on the product page
    product_id_tag = product_soup.find("div", {"id": "ItemId"})
    if product_id_tag is None:
        print("Skipping", product_link, "- no ItemId element on the product page")
        return
    product_id = product_id_tag.text
    print("product_id:", product_id, "Scraping")

    last_page_number = find_last_review_page(product_soup)
    if last_page_number is None:
        print("Skipping product_id:", product_id, "- could not read the last review page number")
        return

    for review_page in range(1, last_page_number + 1):
        new_product_url = base_product_url.replace("ItemID=", "ItemID=" + product_id)
        new_product_url = new_product_url.replace("page=", "page=" + str(review_page))

        review_soup = fetch_soup(new_product_url)
        comment_list = review_soup.find(id="reviews-wrapper")
        if comment_list is None:
            # one unreadable page should not abort a scrape that runs for hours
            print("Skipping", new_product_url, "- no reviews-wrapper element")
            continue

        for comment in comment_list.find_all(class_="comments"):
            row = parse_review(comment, product_name)
            if row is not None:
                reviews.append(row)

    print("product_id:", product_id, "Complete")


# iterate through each category of foundation and through its listing pages
for cat in categories:
    listing_pages = LISTING_PAGES.get(cat, DEFAULT_LISTING_PAGES)
    for listing_page in range(1, listing_pages + 1):
        # adjust the urls
        product_url = base_url.replace("page=", "page=" + str(listing_page))
        product_url = product_url.replace("CategoryId=", "CategoryId=" + str(cat_id[cat]))

        soup = fetch_soup(product_url)
        print(product_url)

        # every <tr> inside the "search-results" element is one candidate product row
        product_search_results = soup.find(class_="search-results")
        for product in product_search_results.find_all("tr"):
            row_product_data, product_name, product_link = parse_product_row(product)
            if not row_product_data:
                continue

            # throw this row data into the product_info list, then follow the product link
            product_info.append(row_product_data)
            scrape_product_reviews(product_name, product_link)
                        
                        
                        
    

https://www.makeupalley.com/product/browse.asp/page=1/pagesize=15/CategoryId=503/topten=reviewed/AgeRange=0/
product_id: 9246 Scraping
product_id: 9246 Complete
product_id: 25754 Scraping
product_id: 25754 Complete
product_id: 15111 Scraping
product_id: 15111 Complete
product_id: 78607 Scraping
product_id: 78607 Complete
product_id: 74290 Scraping
product_id: 74290 Complete
product_id: 120907 Scraping
product_id: 120907 Complete
product_id: 13145 Scraping
product_id: 13145 Complete
product_id: 100359 Scraping
product_id: 100359 Complete
product_id: 89705 Scraping
product_id: 89705 Complete
product_id: 67857 Scraping
product_id: 67857 Complete
product_id: 102364 Scraping
product_id: 102364 Complete
product_id: 70540 Scraping
product_id: 70540 Complete
product_id: 11317 Scraping
product_id: 11317 Complete
product_id: 116379 Scraping
product_id: 116379 Complete
product_id: 921 Scraping
product_id: 921 Complete
https://www.makeupalley.com/product/browse.asp/page=2/pagesize=15/CategoryId=50

In [None]:
# one row per scraped product; the column order matches the rows appended to product_info
product_columns = [
    "Brand",
    "Product",
    "Type",
    "Avg Rating",
    "Number of Reviews",
    "Percentage Would Buy Again",
]
products_df = pd.DataFrame(data=product_info, columns=product_columns)

In [None]:
# display the scraped product table
products_df

In [None]:
# one row per scraped review; the column order matches the rows appended to reviews
review_columns = [
    "Product",
    "Rating",
    "Username",
    "Date",
    "Age",
    "Skin",
    "Hair",
    "Eyes",
    "Review",
]
reviews_df = pd.DataFrame(data=reviews, columns=review_columns)

In [None]:
# display the scraped review table
reviews_df

In [None]:
# save the reviews next to the notebook; the DataFrame index is written as the first, unnamed column
reviews_csv_path = "reviews.csv"
reviews_df.to_csv(reviews_csv_path)

In [None]:
# save the products next to the notebook; the DataFrame index is written as the first, unnamed column
products_csv_path = "products.csv"
products_df.to_csv(products_csv_path)