<a href="https://colab.research.google.com/github/kajendiran-s/amazon_web_scrapping/blob/main/amazon_web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
#Function to extract product title
def get_title(soup):
  """Return the product title found in a parsed product page.

  Looks up the <span> whose id is "productTitle" and returns its text with
  surrounding whitespace removed.

  Args:
    soup: Parsed page object exposing ``find(name, attrs=...)``.

  Returns:
    The stripped title text, or "" when the lookup raises AttributeError
    (for example when no matching element is found).
  """
  try:
    #Locate the tag, read its inner text and trim it in one expression
    return soup.find("span", attrs={"id": "productTitle"}).text.strip()
  except AttributeError:
    #No usable title element on this page
    return ""

#Function to extract Product Price
def get_price(soup):
  """Return the product price string found in a parsed product page.

  Tries the <span> with id "priceblock_ourprice" first and, if that lookup
  fails, the <span> with id "priceblock_dealprice" (used when there is a
  deal price).

  Args:
    soup: Parsed page object exposing ``find(name, attrs=...)``.

  Returns:
    The stripped price text of the first lookup that succeeds, or "" when
    neither element yields a string.
  """
  for price_id in ("priceblock_ourprice", "priceblock_dealprice"):
    try:
      return soup.find("span", attrs={"id": price_id}).string.strip()
    except AttributeError:
      #Element missing (find returned None) or it has no single string;
      #only this expected failure is swallowed, so interrupts and
      #unexpected errors still propagate
      continue

  return ""

#Function to extract Product Rating
def get_rating(soup):
  """Return the product rating string found in a parsed product page.

  Tries the <i> element whose class is "a-icon a-icon-star a-star-4-5"
  first and, if that lookup fails, the first <span> with class
  "a-icon-alt".

  Args:
    soup: Parsed page object exposing ``find(name, attrs=...)``.

  Returns:
    The stripped rating text of the first lookup that succeeds, or "" when
    neither element yields a string.
  """
  lookups = (
    ("i", "a-icon a-icon-star a-star-4-5"),
    ("span", "a-icon-alt"),
  )

  for tag_name, css_class in lookups:
    try:
      return soup.find(tag_name, attrs={"class": css_class}).string.strip()
    except AttributeError:
      #Element missing (find returned None) or it has no single string;
      #only this expected failure is swallowed, so interrupts and
      #unexpected errors still propagate
      continue

  return ""

#Function to extract the number of user reviews
def get_review_count(soup):
  """Return the review-count text found in a parsed product page.

  Reads the <span> whose id is "acrCustomerReviewText" and returns its
  string with surrounding whitespace removed.

  Args:
    soup: Parsed page object exposing ``find(name, attrs=...)``.

  Returns:
    The stripped text, or "" when any step of the lookup raises
    AttributeError (for example when the element is missing).
  """
  try:
    #Outer tag object
    review_tag = soup.find("span", attrs={'id':'acrCustomerReviewText'})

    #Inner string of the tag
    review_text = review_tag.string

    #Trim the space around it
    review_count = review_text.strip()

  except AttributeError:
    review_count = ""

  return review_count



#Function to extract the product availability status
def get_availability(soup):
  """Return the availability text found in a parsed product page.

  Finds the <div> whose id is "availability", then the first <span> inside
  it, and returns that span's string with surrounding whitespace removed.

  Args:
    soup: Parsed page object exposing ``find(name, attrs=...)``.

  Returns:
    The stripped availability text, or "Not available" when any step of the
    lookup raises AttributeError (for example when the div or span is
    missing).
  """
  try:
    availability_box = soup.find("div", attrs={"id": "availability"})
    return availability_box.find("span").string.strip()
  except AttributeError:
    return "Not available"


In [None]:
#main function
if __name__ == "__main__":
  #Fetch the search results page at URL, visit every product link found on
  #it, extract title/price/rating/review count/availability from each
  #product page, and write the rows that have a title to amazon_data.csv.

  #Headers for request (browser-like User-Agent plus language preference)
  HEADERS=({'user-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0','Accept-Language':'en-US, en;q=0.5'})

  #Seconds to wait on each request; without a timeout a stalled connection
  #would hang the notebook indefinitely
  REQUEST_TIMEOUT = 30

  #URL of the search results page
  URL = "https://www.amazon.in/s?k=playstation+4&ref=nb_sb_noss_2"

  #Request to the website
  website = requests.get(URL,headers=HEADERS,timeout=REQUEST_TIMEOUT)

  #Fail loudly if the search page itself was refused; otherwise the run
  #would silently produce an empty CSV
  website.raise_for_status()

  #Soup object contain all the data
  soup = BeautifulSoup(website.content,"html.parser")

  #Fetch the product link tags as list
  links = soup.find_all("a",attrs={'class':'a-link-normal s-no-outline'})

  #List of product hrefs; anchors without an href are skipped because they
  #cannot be turned into a URL
  links_list = [link.get("href") for link in links if link.get("href")]

  #dictionary to store the values
  d = {"title":[],"price":[],"rating":[],"review":[],"availability":[]}

  #loop to get the items on the page
  for link in links_list:
    #Resolve the href against the search page URL so each product page is
    #requested from the same site (and scheme) the link was scraped from;
    #hrefs that are already absolute are left unchanged by urljoin
    new_webpage = requests.get(urljoin(URL, link), headers = HEADERS, timeout=REQUEST_TIMEOUT)

    new_soup = BeautifulSoup(new_webpage.content,"html.parser")

    #Function call to get all necessary data from webpage
    d['title'].append(get_title(new_soup))
    d['price'].append(get_price(new_soup))
    d['rating'].append(get_rating(new_soup))
    d['review'].append(get_review_count(new_soup))
    d['availability'].append(get_availability(new_soup))

  #Panda dataframe
  amazon_df = pd.DataFrame.from_dict(d)

  #Assign the result back instead of calling replace(..., inplace=True) on
  #the column selection: under pandas Copy-on-Write the chained in-place
  #call does not modify amazon_df, so empty titles would never be dropped
  amazon_df["title"] = amazon_df["title"].replace('',np.nan)
  amazon_df = amazon_df.dropna(subset=["title"])
  amazon_df.to_csv("amazon_data.csv", header = True, index = False)


In [None]:
# Display the DataFrame of scraped products built in the scraping cell
amazon_df

Unnamed: 0,title,price,rating,review,availability
7,Ghost Of Tsushima/PS4,,4.7 out of 5 stars,"1,870 ratings",Currently unavailable.
8,HyperX Cloud Stinger Core - Gaming Headset for...,$19.99,4.5 out of 5 stars,"16,340 ratings",In Stock
11,God Of War PS4,$29.97,4.7 out of 5 stars,"3,326 ratings",Not available
12,Call of Duty: Infinite Warfare - Standard Edit...,$22.11,4.6 out of 5 stars,"6,376 ratings",Not available
