## Scraping
There are two sorts of things that I want to scrape:
1. Reviews 
2. Dress details

In [1]:
import json
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup

# Open a connection to the remote MongoDB server and keep module-level
# handles to the database and the two collections the scraper writes to.
client = MongoClient(
    host='ec2-34-198-179-91.compute-1.amazonaws.com',
    port=27017,
)
db = client['fletcher']
rev_col = db['rtr_reviews']      # receives one document per scraped review
dress_col = db['rtr_dresses']    # receives one document per scraped dress page

In [44]:
def insert_reviews(soup):
    """Insert every review found on a parsed product page into ``rev_col``.

    Each element with class ``pdp-ind-review`` becomes one document with:

    * ``author``, ``review``, ``date``, ``title`` -- text/attributes read
      from the review markup,
    * one key per ``review-detail-label`` element (the label text), whose
      value is the text of the node that follows the label,
    * ``rating`` -- the third token of the rating element's ``class``
      attribute, stored only when that attribute has more than two tokens,
    * ``url`` -- the page's ``og:url`` meta content,
    * ``top_contribute`` -- set to ``True`` only when the reviewer nickname
      block contains a ``top-contributer`` element.

    Args:
        soup: Parsed page (BeautifulSoup object) that contains an ``og:url``
            meta tag.
    """
    reviews = soup.find_all(class_="pdp-ind-review")
    url = soup.find('meta', {"property": "og:url"})['content']
    for rev in reviews:
        d = {}
        d['author'] = rev.find("span", {"itemprop": "author"}).text
        for detail in rev.find_all(class_='review-detail-label'):
            # The value lives in the node right after its label.
            d[detail.text] = detail.next_sibling.text
        review_content = rev.find(class_="review-content")
        d['review'] = review_content.find(class_='review-text').text
        rating = review_content.find(class_="review-rating")
        if rating and rating['class'] and len(rating['class']) > 2:
            # NOTE(review): the third class token presumably encodes the
            # star rating -- confirm against the page markup.
            d['rating'] = rating['class'][2]
        d['date'] = review_content.find("meta", {"itemprop": "datePublished"})['content']
        d['title'] = review_content.find(class_='review-title').text
        d['url'] = url
        nickname = rev.find('div', class_='reviewer-nickname')
        # find() returns None when the nickname block is absent; guard so one
        # such review does not abort the rest of the page with AttributeError.
        if nickname is not None and nickname.find(class_='top-contributer'):
            d['top_contribute'] = True
        rev_col.insert_one(d)

In [7]:
def insert_dress(soup, url):
    """Build one document describing a dress and insert it into ``dress_col``.

    The document holds the original price text, the second class token of
    the aggregate-rating element, the designer's link and name, the dress
    name, one entry per ``<details>`` section of the product-details block
    (keyed by its ``<summary>`` text), and finally ``url``.

    Args:
        soup: Parsed product page (BeautifulSoup object).
        url: URL stored on the document under ``'url'``.
    """
    record = {}
    record['price'] = soup.find(class_='product-price__original').text
    stars = soup.find(class_="product-aggregate-rating__stars")
    record['overall_rating'] = stars['class'][1]

    # The same <h1> supplies both the designer link and the designer name.
    designer_heading = soup.find("h1", class_='product-designer')
    record['designer_url'] = designer_heading.a['href']
    record['designer_name'] = designer_heading.text

    record['dress_name'] = soup.find("h2", class_="display-name").text

    details_block = soup.find("div", class_="product-details")
    for section in details_block.find_all('details'):
        summary = section.find('summary')
        label = summary.text
        # The section's content is the node directly after its <summary>.
        record[label] = summary.next_sibling.text

    record['url'] = url
    dress_col.insert_one(record)

In [81]:
def insert_all_data(url):
    """Scrape one dress page: its details plus every page of its reviews.

    The page is downloaded and the URL is replaced by the page's ``og:url``
    meta content. If ``exist_check`` reports that URL as already stored,
    nothing is inserted. Otherwise the first page's reviews and the dress
    details are inserted, followed by the reviews of each further review
    page (``<url>/review-p2`` up to the last page number shown in the
    ``review-pagination`` block).

    Args:
        url: Address of the dress page to download.
    """
    first_page = BeautifulSoup(requests.get(url).text, 'html.parser')
    url = first_page.find('meta', {"property": "og:url"})['content']
    if exist_check(url):
        return
    print('inserting {}'.format(url))

    # The last <span> of the pagination block carries the highest page
    # number; without a pagination block there are no extra pages.
    pager = first_page.find('div', class_='review-pagination')
    last_page = int(pager.find_all('span')[-1].text) if pager else 0

    insert_reviews(first_page)
    insert_dress(first_page, url)

    # range() is empty when last_page is below 2, so no explicit guard.
    for page_number in range(2, last_page + 1):
        response = requests.get('{}/review-p{}'.format(url, page_number))
        insert_reviews(BeautifulSoup(response.text, "html.parser"))

In [90]:
def exist_check(url):
    """Return True when ``dress_col`` already holds a document for *url*.

    Also prints an error line when more than one matching document exists.

    Args:
        url: Value matched against the ``url`` field of stored dresses.

    Returns:
        bool: True if at least one matching document exists.
    """
    # Cursor.count() was removed in PyMongo 4; count_documents() is the
    # supported way to count the documents matching a filter.
    count = dress_col.count_documents({"url": url})
    if count > 1:
        print("ERROR: {} is already a duplicate".format(url))
    return count > 0

In [83]:
def find_dress_links(soup):
    """Return the ``href`` of every product-card anchor on a listing page.

    Args:
        soup: Parsed listing page (BeautifulSoup object).

    Returns:
        list: ``href`` values of all ``<a class="grid-product-card-inner">``
        elements, in the order ``find_all`` yields them.
    """
    cards = soup.find_all('a', class_='grid-product-card-inner')
    return [card['href'] for card in cards]

In [95]:
def fetch_page(i=None, url='https://www.renttherunway.com/products/dress?action=click_all_dresses&nav_location=mainmenu&object_type=top_nav&page='):
    """Scrape every dress linked from one listing page.

    Args:
        i: Page number appended to *url*. Any falsy value (``None`` or
            ``0``) leaves *url* unchanged.
        url: Listing URL, requested as-is or with ``str(i)`` appended.
    """
    page_url = url + str(i) if i else url
    listing = BeautifulSoup(requests.get(page_url).text, 'html.parser')
    # Listing hrefs are prefixed with the site origin before being fetched.
    for href in find_dress_links(listing):
        insert_all_data('https://www.renttherunway.com' + href)

In [92]:
def print_counts():
    """Print the number of stored reviews, then the number of stored dresses."""
    # Collection.count() was removed in PyMongo 4; count_documents({})
    # counts every document in the collection.
    print(rev_col.count_documents({}))
    print(dress_col.count_documents({}))

In [None]:
# Scrape listing pages for i = 0..6. fetch_page treats i == 0 as "no page
# number", so the first call requests the listing URL with nothing appended.
for i in range(7):
    fetch_page(i)

In [98]:
# Further listing URLs kept alongside the main crawl.
# NOTE(review): the second entry is an empty string and nothing visible in
# this notebook reads the list -- looks like an unfinished placeholder.
additional_urls = [
    'https://www.renttherunway.com/shop/long_sleeve_dreeses/products?nav_location=submenu&action=click_long_sleeve_dreeses&object_type=top_nav',
    '',
]

inserting https://www.renttherunway.com/shop/designers/parker/hanging_beads_dress
inserting https://www.renttherunway.com/shop/designers/nicole_miller/blue_jasmine_dress
inserting https://www.renttherunway.com/shop/designers/cut_25/keep_him_guessing_gown
inserting https://www.renttherunway.com/shop/designers/shoshanna/lace_daria_dress
inserting https://www.renttherunway.com/shop/designers/dsquared2/dorothy_dress
inserting https://www.renttherunway.com/shop/designers/elizabeth_and_james/such_a_tease_dress
inserting https://www.renttherunway.com/shop/designers/mark__james_by_badgley_mischka/mini_sequin_pixie_dress
inserting https://www.renttherunway.com/shop/designers/bcbgmaxazria/rose_mist_sheath
inserting https://www.renttherunway.com/shop/designers/tracy_reese/fluttering_lace_sheath
inserting https://www.renttherunway.com/shop/designers/catherine_deane/roslyn_sheath
inserting https://www.renttherunway.com/shop/designers/cut_25/hard_rock_sheath


In [100]:
# Show how many reviews and dresses are stored (reviews first, then dresses).
print_counts()

31280
90
