# Amazon Product Details

This code scrapes product details from Amazon

In [33]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import pandas as pd
import datetime
import os.path

**Function** `get_product_data`

    Returns product information from a particular URL

    Input: Product Page URL
    Output: Scraped product information in dict

In [34]:
def _element_value(browser, element_id, extract=lambda element: element.text):
    """Return ``extract(element)`` for the element with id ``element_id``, or '' on failure.

    Any ordinary exception raised while locating the element or extracting the
    value is treated as "field not present on this page".  ``KeyboardInterrupt``
    and ``SystemExit`` are deliberately NOT caught, so a running scrape can
    still be stopped by the user.
    """
    try:
        # The generic find_element(by, value) form is used instead of
        # find_element_by_id, which newer Selenium releases no longer provide.
        return extract(browser.find_element("id", element_id))
    except Exception:
        return ''


def get_product_data(url, browser=None):
    """Scrape the details of a single product page.

    Input:
        url     - product page URL to load
        browser - optional WebDriver-like object exposing ``get``,
                  ``find_element(by, value)`` and ``find_elements(by, value)``;
                  when omitted, the notebook-level ``driver`` is used

    Output: dict with the keys ``prod_title``, ``by_info``, ``price``,
        ``avg_cust_review``, ``num_reviews``, ``seller_name``, ``details``
        and ``url``.  Every field except ``prod_title`` and ``url`` falls back
        to '' when it cannot be read from the page.

    Raises: whatever the browser raises when the page cannot be loaded or the
        ``productTitle`` element is missing (the title is treated as mandatory).
    """
    if browser is None:
        browser = driver

    browser.get(url)

    # The title lookup is intentionally unguarded: a page without it is not
    # handled as a product page.
    prod_title = browser.find_element("id", "productTitle").text

    by_info = _element_value(browser, "bylineInfo")
    avg_cust_review = _element_value(
        browser, "acrPopover", lambda element: element.get_attribute("title"))
    # Keep only the first space-separated token of the review-count text.
    num_reviews = _element_value(
        browser, "acrCustomerReviewText", lambda element: element.text.split(" ")[0])

    # Try the regular price element first; fall back to the sale price element
    # when the first one is missing or empty.
    price = (_element_value(browser, "priceblock_ourprice")
             or _element_value(browser, "priceblock_saleprice"))

    seller_name = _element_value(browser, 'sellerProfileTriggerId')

    try:
        icons = browser.find_elements(
            "xpath", '//div[@id="icon-farm-container"]//span[@class="a-size-small"]')
        details = ", ".join(icon.text for icon in icons)
    except Exception:
        details = ''

    product_data = {'prod_title': prod_title,
                    'by_info' : by_info,
                    'price': price,
                    'avg_cust_review': avg_cust_review,
                    'num_reviews': num_reviews,
                    'seller_name': seller_name,
                    'details': details,
                    'url' : url}

    print("Scraped: " + prod_title, by_info, price, avg_cust_review, num_reviews, seller_name, details, '\n')

    return product_data

**Function** `get_search_result_urls`

    Returns list of URLs from a search page

    Input: None
    Output: Links of product URLs from the search page

In [35]:
#if "slredirect" in link:
    #    prod_links_sponsored.append(link)

def get_search_result_urls(browser=None):
    """Collect the product links from every page of the current search results.

    Starting from the results page the browser is currently showing, gathers the
    ``href`` of each result title link, then follows the "next page" link until
    no such link can be found.

    Input: browser - optional WebDriver-like object exposing ``get``,
        ``find_element(by, value)`` and ``find_elements(by, value)``; when
        omitted, the notebook-level ``driver`` is used
    Output: list of product URLs in the order they were found, without links
        that contain "redirect" (presumably sponsored results - confirm) and
        without missing/empty hrefs
    """
    if browser is None:
        browser = driver

    urls = []
    count = 1

    while True:
        # Get list of all urls on the page
        urls += [result.get_attribute('href')
                 for result in browser.find_elements(
                     "xpath", '//div[@data-component-type="s-search-result"]//h2/a')]

        print('Urls till Page' + str(count) + ': ' + str(len(urls)) + '')

        # Follow the "next" link; failing to find or load it ends the crawl.
        # `except Exception` (not a bare except) keeps Ctrl-C working.
        try:
            next_page_link = browser.find_element(
                "xpath", '//li[@class="a-last"]/a').get_attribute('href')
            browser.get(next_page_link)
            count += 1
        except Exception:
            break

    print("\n\n")

    # Build a new list instead of calling urls.remove() while iterating:
    # removing during iteration skips the element after each removal, which
    # left consecutive redirect links in the result.
    urls = [url for url in urls if url and "redirect" not in url]

    print("Total URLs: " + str(len(urls)) + "\n\n")

    return urls

#### User Inputs

1. Term which would be searched on Amazon.in
2. The output file is of the format: `amazon_{search_term}.csv`

In [36]:
# Phrase that will be typed into the Amazon.in search bar
search_term = "redmi 6a phone"

# Results go to ../data/amazon_{search_term}.csv
output_file_name = '../data/amazon_{}.csv'.format(search_term)

#### Output File

If the file exists then read it, else create the file

In [44]:
# Resume support: when the output CSV already exists, the number of rows it
# holds is taken as the number of URLs already scraped; otherwise a CSV
# containing only the header row is created.
if os.path.isfile(output_file_name):
    print("File exists")
    product_data_df = pd.read_csv(output_file_name)
    num_urls_scraped = len(product_data_df)

else:
    print("File not exists")
    product_data_df = pd.DataFrame(columns=['prod_title','by_info','price','avg_cust_review',
                                  'num_reviews','seller_name','details','url'])

    # to_csv does not create missing parent directories, so make sure the
    # target folder exists before writing the header-only file.
    output_dir = os.path.dirname(output_file_name)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    product_data_df.to_csv(output_file_name, index=False)
    num_urls_scraped = 0

num_urls_scraped

File exists


171

Initialize the Chrome driver

In [38]:
# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH
# environment variable to point at a different binary; the fallback is the
# path that was previously hard-coded here.
chromedriver_path = os.environ.get("CHROMEDRIVER_PATH",
                                   "/home/harsh/Documents/Web_Scraping/chromedriver")
driver = webdriver.Chrome(chromedriver_path)

Open Amazon.in website

In [39]:
# Point the browser at the Amazon India home page
amazon_home_url = 'https://www.amazon.in/'
driver.get(amazon_home_url)

Put the search term in search bar and go to results page

In [40]:
# The implicit wait applies to element lookups made after it is set, so it is
# configured before the first lookup rather than after the search is submitted.
driver.implicitly_wait(5)

# Generic find_element(by, value) is used because the find_element_by_*
# helpers are not available in newer Selenium releases.
search_box = driver.find_element("id", 'twotabsearchtextbox')
search_box.clear()
search_box.click()
search_box.send_keys(search_term)

# Submit the search by clicking the first submit input on the page
driver.find_element("xpath", '//input[@type="submit"]').click()

Get the list of URLs from the search results

In [41]:
# Collect the product links from the search results currently open in the browser
urls = get_search_result_urls()

Urls till Page1: 13
Urls till Page2: 30
Urls till Page3: 47
Urls till Page4: 64
Urls till Page5: 80
Urls till Page6: 96
Urls till Page7: 112
Urls till Page8: 128
Urls till Page9: 144
Urls till Page10: 160
Urls till Page11: 176
Urls till Page12: 192
Urls till Page13: 208
Urls till Page14: 224
Urls till Page15: 226



Total URLs: 221




Scrape the data from the product URLs by looping through the list of URLs

In [45]:
# Continue after the rows already present in the output CSV.
i = num_urls_scraped

while i < len(urls):
    url = urls[i]
    product_data = get_product_data(url)

    # Write each product as soon as it is scraped so an interrupted run loses
    # nothing. The row gets its own name (product_data_df is left intact) and
    # is aligned to product_data_df's columns so the values line up with the
    # CSV header regardless of the dict's key order.
    row_df = pd.DataFrame([product_data]).reindex(columns=product_data_df.columns)
    row_df.to_csv(output_file_name, mode='a', index=False, header=False)

    i += 1
    # Keep the resume counter in step with the file so re-running this cell
    # continues where it stopped instead of appending duplicate rows.
    num_urls_scraped = i

Scraped: Gionee P7 (White, 16 GB)- Brand: Gionee  2.0 out of 5 stars 9  7 Days Replacement 

Scraped: itel itL6005 Vision1 (Gradation Blue, 2GB RAM, 32GB Storage) Brand: itel  5.0 out of 5 stars 2  7 Days Replacement, 1 Year Warranty 

Scraped: Douzo Knight D51 (8GB ROM) (1GB RAM) (Gold) Brand: Douzo ₹ 4,900.00 3.9 out of 5 stars 3 Shivansh mobile No-Contact Delivery, 7 Days Replacement, Amazon Delivered, 1 Year Warranty 

Scraped: Vivo Y15 4 GB ROM (White) Brand: Vivo  3.0 out of 5 stars 3  7 Days Replacement, Warranty Policy 

Scraped: Nillkin Case for Apple iPhone X (5.8" Inch) Air Case PC Material Matte Finish Ventilated Scratch Resist with Logo Cut Red Color Brand: Nillkin ₹ 1,799.00   Daily Shoppers 10 days Returnable, 1 Year Warranty 

Scraped: Gionee P5 Mini (Blue) Brand: Gionee  3.3 out of 5 stars 13  7 Days Replacement, 1 Year Warranty 

Scraped: Micromax Canvas 4 Plus A315 (White-Gold, 16GB)-3G Visit the Micromax Store  3.1 out of 5 stars 218  7 Days Replacement, 1 Year Warr