In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Code adapted from JournalDev (https://www.journaldev.com/44473/scrape-amazon-product-information-beautiful-soup)

# Takes: amazon url (str)
# Returns: soup (str)
def create_soup(url):
    HEADERS = ({'User-Agent':
                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
                    'Accept-Language': 'en-US, en;q=0.5'})
    r = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(r.content, 'lxml')
    return soup 

In [3]:
# Takes: soup (str)
# Returns: title (str)
def get_title(soup):
    try:
        title = soup.find('span', attrs= {'id': 'productTitle'})
        title_value = title.string.strip()
    except AttributeError:
        title_value = ''
    return title_value

In [4]:
# Takes: soup (str)
# Returns: asin (str)
def get_asin(soup):
    try:
        product_detail = soup.find('div', attrs= {'id': 'detailBullets_feature_div'})
        product_detail_bullets = product_detail.find_all('span', attrs= {'class': 'a-list-item'})
        
        # order of detail bullets change, but ASIN always seems to be last
        asin = product_detail_bullets[-1]
        asin_value = asin.text.split()[-1]
    except AttributeError:
        asin_value = ''
    return asin_value  

In [5]:
# Takes: soup (str)
# Returns: rating 
def get_rating(soup):
    try:
        rating = soup.find('span', attrs= {'data-hook': 'rating-out-of-text'})
        rating_value = rating.text.split()[0]
    except AttributeError:
        rating_value = ''
    return rating_value

In [6]:
# Takes: soup (str)
# Returns: price
def get_price(soup):
    try:
        price = soup.find('span', attrs= {'class': 'a-price aok-align-center'}).text
        price_value = price.split('$')[1]
    except AttributeError:
        price_value = ''
    return price_value   

In [7]:
# Takes: soup (str)
# Returns: description (str)
def get_description(soup):
    try:
        description = soup.find('div', attrs= {'id': 'feature-bullets'})
        description_value = description.text.strip()
    except AttributeError:
        description_value = ''
    return description_value  

In [8]:
# Takes: soup (str)
# Returns: reviews (lst)
def get_reviews(soup):
    try:
        review_section = description = soup.find('div', attrs= {'id': 'customer-reviews_feature_div'})
        reviews = review_section.find_all('div', attrs= {'class': 'a-expander-content reviewText review-text-content a-expander-partial-collapse-content'})
        reviews_value = []
        for review in reviews:
            reviews_value.append(review.text.strip())
    except AttributeError:
        reviews_value = ''
    return reviews_value

In [9]:
### TEST ####

In [10]:
# Test dataframe
url_list = ['https://www.amazon.com/Under-Armour-Charged-Assert-Running/dp/B087Z1Y7S7/ref=sr_1_2?crid=6MKV7UDI3LJE&keywords=shoes&qid=1650656388&sprefix=shoe%2Caps%2C145&sr=8-2&th=1&psc=1',
            'https://www.amazon.com/Amazon-Essentials-Regular-Fit-Cotton-X-Large/dp/B01IXFQTXU/ref=sr_1_5?crid=3V8O96NX7578Q&keywords=shirt&qid=1650656396&sprefix=shir%2Caps%2C221&sr=8-5&th=1&psc=1',
            'https://www.amazon.com/MEROKEETY-Womens-Leopard-Pleated-Skirts/dp/B083HSXMRF/ref=sr_1_11?crid=2CI1SFIXCJ1EJ&keywords=skirt&qid=1650656408&sprefix=skirt%2Caps%2C137&sr=8-11&th=1&psc=1',
            'https://www.amazon.com/Columbia-Girls-Switchback-Jacket-Medium/dp/B07L4NYW3Q/ref=sr_1_5?crid=OK03MXADVJC&keywords=raincoat&qid=1650656428&sprefix=raincoa%2Caps%2C272&sr=8-5'
           ]

test_df = pd.DataFrame(url_list, columns=['amazon_url'])

In [11]:
# Fill in dataframe 
def add_columns():
    test_df['soup'] = test_df['amazon_url'].apply(create_soup)
    test_df['title'] = test_df['soup'].apply(get_title)
    test_df['asin'] = test_df['soup'].apply(get_asin)
    test_df['rating'] = test_df['soup'].apply(get_rating)
    test_df['price'] = test_df['soup'].apply(get_price)
    test_df['description'] = test_df['soup'].apply(get_description)
    test_df['review'] = test_df['soup'].apply(get_reviews)

In [12]:
add_columns()

In [13]:
test_df

Unnamed: 0,amazon_url,soup,title,asin,rating,price,description,review
0,https://www.amazon.com/Under-Armour-Charged-As...,"[html, [if lt IE 7]> <html lang=""en-us"" class=...",,,,,,
1,https://www.amazon.com/Amazon-Essentials-Regul...,"[html, [if lt IE 7]> <html lang=""en-us"" class=...",,,,,,
2,https://www.amazon.com/MEROKEETY-Womens-Leopar...,"[html, [if lt IE 7]> <html lang=""en-us"" class=...",,,,,,
3,https://www.amazon.com/Columbia-Girls-Switchba...,"[html, [if lt IE 7]> <html lang=""en-us"" class=...",,,,,,


In [27]:
test_df.to_csv('test_soup.csv')