In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time

In [40]:
# Function to extract product information
def extract_product_info(page_source):
    """Parse a product listing page into one DataFrame row per product card.

    Parameters
    ----------
    page_source : str
        HTML of a listing page (for example ``driver.page_source``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Price``, ``Sku_code`` and ``Discount_applied``.
        A cell is ``None`` when the matching element is absent from a card.
        The frame has these four columns even when no cards are found.
    """
    def _text_of(element):
        # Missing elements map to None so every row keeps all four fields.
        return element.text.strip() if element is not None else None

    soup = BeautifulSoup(page_source, 'html.parser')
    cards = soup.find_all('div', class_='plp-card-container')  # Ensure this matches the correct class

    rows = []
    for card in cards:
        # The SKU lives in an attribute value, not in element text, so match
        # on the attribute itself. (BeautifulSoup only special-cases `class_`;
        # any other keyword such as `target_` is taken literally as an
        # attribute name and would never match.)
        sku_holder = card.find(attrs={'data-sku': True})

        rows.append({
            'Name': _text_of(card.find('div', class_='plp-card-details-name')),
            'Price': _text_of(card.find('span', class_='jm-heading-xxs')),
            'Sku_code': sku_holder.get('data-sku') if sku_holder is not None else None,
            'Discount_applied': _text_of(card.find('span', class_='jm-badge')),
        })

    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=['Name', 'Price', 'Sku_code', 'Discount_applied'])

In [41]:
# Function to set delivery location pincode
def set_delivery_location(driver, pincode, timeout=10):
    """Enter a delivery pincode through the site's "Deliver to" controls.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session that already has the home page loaded.
    pincode : str
        Pincode typed into the ``rel_pincode`` input.
    timeout : int or float, optional
        Seconds to wait for each control to become clickable (default 10).

    Returns
    -------
    bool
        ``True`` when every step ran without raising, ``False`` otherwise.
        Failures are reported with ``print`` rather than re-raised, so the
        caller can carry on without a delivery location.
    """
    # Controls clicked in order: "Deliver to" button, "Enter Pin Code" button,
    # then the pin code text bar.
    click_sequence = (
        '/html/body/header/section[1]/div/section[1]/div[2]/button',
        '/html/body/header/section[1]/div/section[1]/div[2]/div/section/div[1]/div[4]/div[1]/button',
        '/html/body/header/section[1]/div/section[1]/div[2]/div/section/div[2]/form/div/div[1]/div/input',
    )

    try:
        wait = WebDriverWait(driver, timeout)

        # Each click reveals the next control, so wait for it explicitly
        # instead of assuming it is already in the DOM.
        for xpath in click_sequence:
            wait.until(EC.element_to_be_clickable((By.XPATH, xpath))).click()

        pincode_input = wait.until(EC.element_to_be_clickable((By.ID, 'rel_pincode')))
        pincode_input.clear()
        pincode_input.send_keys(pincode)

        # Submit the form (if necessary)
        pincode_input.submit()

        # Add a short delay to ensure the page loads with the new pincode (optional)
        time.sleep(4)

        print(f"Successfully set delivery location to {pincode}")
        return True
    except Exception as e:
        print(f"Failed to set delivery location: {e}")
        return False

In [42]:
# Set up Selenium WebDriver
service = Service('C:/Program Files (x86)/chromedriver-win64/chromedriver.exe')  # Update with the path to your ChromeDriver
driver = webdriver.Chrome(service=service)

# Categories to scrape: URL slug -> numeric id used as the last URL segment
category = {
    'biscuits-drinks-packaged-foods': 28997,
    'fruits-vegetables': 219,
    'cooking-essentials': 28984,
    'dairy-bakery': 61,
    'personal-care-beauty': 29020,
    'home-care': 36,
    'kitchen-care': 29012,
    'mom-baby-care': 2551
}

# One DataFrame per successfully scraped category
all_data_frames = []

# try/finally guarantees the browser is closed even if scraping is
# interrupted or an unexpected error escapes the per-category handler.
try:
    # Load the home page and set delivery location
    driver.get('https://www.jiomart.com')
    set_delivery_location(driver, '834001')  # Replace with your desired pincode

    # Loop through each category and scrape data
    for cat_type, cat_id in category.items():
        url = f'https://www.jiomart.com/c/groceries/{cat_type}/{cat_id}'
        print(f"Requesting URL: {url}")  # Debug: Print the URL being requested

        try:
            # Navigation is inside the handler so that one category failing
            # to load does not abort the remaining categories.
            driver.get(url)

            # Wait for the initial products to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'plp-card-container'))
            )

            # Keep scrolling to the bottom until the page height stops
            # growing, i.e. no further products were appended.
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)  # Wait to load page
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # Extract data after all products are loaded
            page_data = extract_product_info(driver.page_source)
            page_data['Category'] = cat_type  # Add a column with the category name
            all_data_frames.append(page_data)
        except Exception as e:
            # Best-effort: report the failure and move on to the next category.
            print(f"Failed to retrieve data from {url}: {e}")
finally:
    driver.quit()

Successfully set delivery location to 834001
Requesting URL: https://www.jiomart.com/c/groceries/biscuits-drinks-packaged-foods/28997
Requesting URL: https://www.jiomart.com/c/groceries/fruits-vegetables/219
Requesting URL: https://www.jiomart.com/c/groceries/cooking-essentials/28984
Requesting URL: https://www.jiomart.com/c/groceries/dairy-bakery/61
Requesting URL: https://www.jiomart.com/c/groceries/personal-care-beauty/29020
Requesting URL: https://www.jiomart.com/c/groceries/home-care/36
Requesting URL: https://www.jiomart.com/c/groceries/kitchen-care/29012
Requesting URL: https://www.jiomart.com/c/groceries/mom-baby-care/2551


In [43]:
# Stack the per-category frames row-wise into one table with a fresh 0..n-1 index
JioMart_Data = pd.concat(objs=all_data_frames, axis=0, ignore_index=True)
JioMart_Data

Unnamed: 0,Name,Price,Sku_code,Discount_applied,Category
0,Maaza Mango Drink 1.2 L,₹69.00,490001795,8% OFF,biscuits-drinks-packaged-foods
1,Sprite 2.25 L,₹95.00,490004166,5% OFF,biscuits-drinks-packaged-foods
2,Thums Up 2.25 L,₹95.00,490005134,5% OFF,biscuits-drinks-packaged-foods
3,Pepsi 2.25 L,₹86.00,490004176,14% OFF,biscuits-drinks-packaged-foods
4,Britannia Jimjam Sandwich Biscuits 138 g,₹24.00,490876695,31% OFF,biscuits-drinks-packaged-foods
...,...,...,...,...,...
3128,Daddy's Care Ultra-Thin Disposable Pull-Up Dia...,₹810.00,591141056,32% OFF,mom-baby-care
3129,"Scenty 10In1 Baby Grooming Kit,Portable Baby G...",₹499.00,608114656,61% OFF,mom-baby-care
3130,Chicco Body Lotion 500 ml,₹527.00,607727249,12% OFF,mom-baby-care
3131,Bumtum Baby Body Lotion Natural For Babies/Chi...,₹209.00,607699263,47% OFF,mom-baby-care


In [44]:
# JioMart_Data.to_csv('JioMart_Data.csv', index = False)

In [46]:
# # Get the unique categories
# unique_categories = JioMart_Data['Category'].unique()

# # Save each category to a separate CSV file
# for category in unique_categories:
#     category_data = JioMart_Data[JioMart_Data['Category'] == category]
#     category_data.to_csv(f'{category}_JioMart.csv', index=False)

In [1]:
# Pinned to the versions recorded in this cell's install log; %pip installs into the running kernel's environment.
%pip install gspread==6.1.2 gspread-dataframe==4.0.0 oauth2client==4.1.3

Collecting gspread
  Downloading gspread-6.1.2-py3-none-any.whl (57 kB)
     -------------------------------------- 57.5/57.5 kB 606.0 kB/s eta 0:00:00
Collecting gspread-dataframe
  Downloading gspread_dataframe-4.0.0-py2.py3-none-any.whl (9.0 kB)
Collecting oauth2client
  Downloading oauth2client-4.1.3-py2.py3-none-any.whl (98 kB)
     -------------------------------------- 98.2/98.2 kB 935.8 kB/s eta 0:00:00
Collecting google-auth-oauthlib>=0.4.1
  Downloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl (24 kB)
Collecting google-auth>=1.12.0
  Downloading google_auth-2.30.0-py2.py3-none-any.whl (193 kB)
     -------------------------------------- 193.7/193.7 kB 1.5 MB/s eta 0:00:00
Collecting rsa>=3.1.4
  Downloading rsa-4.9-py3-none-any.whl (34 kB)
Collecting httplib2>=0.9.1
  Downloading httplib2-0.22.0-py3-none-any.whl (96 kB)
     ---------------------------------------- 96.9/96.9 kB 1.8 MB/s eta 0:00:00
Collecting cachetools<6.0,>=2.0.0
  Downloading cachetools-5.3.3-py3-no

In [12]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
import numpy as np

# Function to authenticate and get the Google Sheet
def get_google_sheet(sheet_url, json_keyfile_name):
    """Authorize with a service-account key file and return the first worksheet.

    Parameters
    ----------
    sheet_url : str
        Spreadsheet URL containing a ``/d/<spreadsheet id>`` segment.
    json_keyfile_name : str
        Path to the service-account JSON key file.

    Returns
    -------
    The ``sheet1`` attribute of the spreadsheet opened by its ID.

    Raises
    ------
    ValueError
        If no spreadsheet ID can be found in ``sheet_url``.
    """
    # Extract the ID before touching credentials so a bad URL fails fast.
    # The ID ends at the next path separator, query string or fragment.
    _, marker, tail = sheet_url.partition("/d/")
    for terminator in "/?#":
        tail = tail.split(terminator, 1)[0]
    if not marker or not tail:
        raise ValueError(f"Could not find a spreadsheet ID in URL: {sheet_url!r}")
    sheet_id = tail

    scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/spreadsheets",
             "https://www.googleapis.com/auth/drive.file", "https://www.googleapis.com/auth/drive"]

    credentials = ServiceAccountCredentials.from_json_keyfile_name(json_keyfile_name, scope)
    client = gspread.authorize(credentials)

    return client.open_by_key(sheet_id).sheet1

# Function to upload data to the Google Sheet
def upload_to_google_sheet(sheet, data):
    """Replace the worksheet's contents with ``data`` (header row first).

    Parameters
    ----------
    sheet : worksheet-like object
        Must provide ``clear()`` and ``update(values)``.
    data : pandas.DataFrame
        Table to upload. It is not modified; infinite and missing values are
        written as empty strings.
    """
    # Sanitise and build the whole payload before clearing, so a problem while
    # preparing the data cannot leave the sheet emptied.
    cleaned = data.replace([float('inf'), float('-inf')], float('nan')).fillna('')
    rows = [cleaned.columns.tolist()] + cleaned.values.tolist()

    sheet.clear()  # Clear existing data
    sheet.update(rows)  # Update with new data

# Destination spreadsheet and the service-account key file used to reach it
sheet_url = 'https://docs.google.com/spreadsheets/d/1xIJt9B8HAQTE-JyCBGh_O_OYmPnPw4UX9hg7qbJ3oV8/edit?gid=0#gid=0'
json_keyfile_name = r'C:\Users\Hp\Downloads\harshal1074-568be4008809.json'  # Replace with your JSON key file path

# Normalise missing values in one chain: +/-inf -> NaN, then every NaN -> ''
JioMart_Data = JioMart_Data.replace([np.inf, -np.inf], np.nan).fillna('')

# Open the target worksheet, then overwrite it with the cleaned table
sheet = get_google_sheet(sheet_url, json_keyfile_name)
upload_to_google_sheet(sheet, JioMart_Data)