# Script for extracting search result product details 

The script takes as input a .txt file containing a list of search keywords and then iterates through all the keywords in that list to find all their respective search results.

For each keyword in the keyword list, it uses it to search for products on Amazon's Website and then for each product it saves its search result data and product page data in a .jsonl file.

In [None]:
import urllib.request # FOR URL ENCODING 
import requests # For making requests to download a webpage content
from selectorlib import Extractor # For extracting specific fileds from downloaded webpage
import json 
import random
import re
from time import sleep
import os
import jsonlines
import pandas as pd
import datetime
import re

#### Step 1: Read KeywordList txt file and store all keywords in a list

**NOTE:** Before running this, change the path variable 'keywords' to point to the KeywordList .txt file. 

The following code loads a KeywordList file and stores each keyword in a list.

In [None]:
!ls ./../DATASET/KeywordLists

In [None]:
# Read one search keyword per line from the KeywordList file.
# The `with` block closes the file handle as soon as the list is built,
# instead of leaving it open for the rest of the session.
with open('./../DATASET/KeywordLists/top_100_keyword_list.txt', 'r') as keywords:
    # Only newline characters are stripped; any other whitespace in a
    # keyword is kept exactly as written in the file.
    keyword_list = [k.strip("\n") for k in keywords]

# Quick sanity check of what was loaded.
print('Keyword List: ', keyword_list[:10])
print('Keyword Count: ', len(keyword_list))

#### Step 2: Define Headers

Each header is a unique user agent which will be used to request the data from the website to be scraped. We use multiple user agents to ensure that if our request is rejected, we can retry.

To create more headers, simply copy any one of the old headers and replace the 'user-agent' string with a new 'user-agent' string, which can be found online. (Eg. https://developer.chrome.com/multidevice/user-agent)

In [None]:
# User-agent strings to rotate through; one full header dict is built per entry.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
    'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36 OPR/68.0.3618.165',
    'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36 Edg/83.0.478.37',
)


def _make_header(user_agent):
    """Return a fresh request-header dict that differs only in its 'user-agent'."""
    return {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': user_agent,
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }


# One independent header dict per user agent, in the order listed above.
# To add another header, append a new user-agent string to _USER_AGENTS.
headers = [_make_header(agent) for agent in _USER_AGENTS]

#### Step 3: Read Extractor Files

The extractor (.yml) files contain *css id* information about the fields which we intend to extract from the scraped website. Here, the three extractor files are:
##### 1. keyword_search_product_list.yml
From the scraped webpage, this extractor file extracts the main *css division* which contains all the individual (child) products. Once the main div is scraped, it extracts all the child divisions (products) contained in it.
##### 2. keyword_search_product_page.yml 
From the scraped product page, this extractor file extracts all the fields that are relevant to the product on the given page.
##### 3. nextpg.yml
Extracts the 'next' button from the website, to check if it is disabled. If it is disabled, it means that we have reached the end of the product list for the current keyword. We then move on to the next keyword to continue our scraping.

In [None]:
# Extractor definitions, in the order they are bound below:
#   e -> product list on a search result page
#   l -> 'next page' button on a search result page
#   p -> fields of an individual product page
_EXTRACTOR_FILES = (
    './Extractor/keyword_search_product_list.yml',
    './Extractor/nextpg.yml',
    './Extractor/keyword_search_product_page.yml',
)
e, l, p = map(Extractor.from_yaml_file, _EXTRACTOR_FILES)

#### Step 4: Define scrape function for search results scraping

**NOTE:** Set the variables MAX_TRIALS & ERROR_THRESHHOLD according to your preferences. 

A high MAX_TRIALS will slow down the scraping, because pages that genuinely contain no data will also be requested multiple times, but it will reduce the chances of error. 
A low ERROR_THRESHHOLD will also slow down the scraping, as the VPN will need to be changed more often. However, it will reduce the chances of missing data due to errors. 

The function scrape_SearchResult(url) downloads the webpage at the given url (here: search result page) using the requests module, and looks for products on the page. If it finds any product, it returns the html text of the page. If no product is found, it randomly selects a new header and retries until the limit MAX_TRIALS is reached, at which point it concludes that the page does not contain any data.

These multiple trials are required, as Amazon often blocks a user for repeatedly making requests using the same user agent. 

In [None]:
MAX_TRIALS_A = 25  # Maximum number of download attempts per search result page.
# Running count of pages that yielded no data. It starts at 1 (not 0) so the
# modulo check below does not ask for a VPN change before anything has failed.
ERROR_COUNT_A = 1
ERROR_THRESHHOLD_A = 5  # Ask for a VPN change whenever ERROR_COUNT_A reaches a multiple of this.


def scrape_SearchResult(url):
    '''
    Download the search result page at the given url using the requests module.

    Each attempt uses a randomly chosen entry of `headers`. A response is
    accepted only if the product-list extractor `e` finds a non-empty
    'products' field in it; otherwise another attempt is made, up to
    MAX_TRIALS_A attempts. Network-level errors are retried every 30 seconds
    without counting towards MAX_TRIALS_A.

    Parameters:
    url (string): URL of webpage to scrape
    Returns:
    string: If the page contains products, returns the html of the webpage as
    text, else returns the string 'False'.
    '''
    global ERROR_COUNT_A

    # Download the page using requests
    print("Downloading %s"%url)
    trial = 0
    while True:
        # Ask to change vpn every ERROR_THRESHHOLD_A pages without results to
        # ensure data is not missed because of being blocked. The counter is
        # bumped right away so the prompt is not repeated on the next trial.
        if ERROR_COUNT_A % ERROR_THRESHHOLD_A == 0:
            _ = input('Please Change VPN and enter \'DONE\' to continue')
            ERROR_COUNT_A += 1
        if trial == MAX_TRIALS_A:
            print("Max trials exceeded yet no Data found on this page!")
            ERROR_COUNT_A += 1
            return 'False'
        trial = trial + 1
        print("Trial no:", trial)

        # Get the html data from the url. RequestException is the base class
        # of HTTPError, ConnectionError and Timeout, so this single handler
        # covers all of them.
        while True:
            try:
                r = requests.get(url, headers=random.choice(headers), timeout=15)
            except requests.exceptions.RequestException as err:
                print('Error Detected: ', err)
                print('Retrying after 30 seconds')
                sleep(30)
                continue
            break

        # Use the product-list extractor to look for products in the html text.
        data = e.extract(r.text)
        if data['products'] is not None:
            return r.text
        # Empty product list: go round the outer loop with a new user agent.
        print("Retrying with new user agent!")

#### Step 5: Define scrape function for product page scraping

**NOTE:** Set the variables MAX_TRIALS & ERROR_THRESHHOLD according to your preferences. 

A high MAX_TRIALS will slow down the scraping, because pages that genuinely contain no data will also be requested multiple times, but it will reduce the chances of error. 
A low ERROR_THRESHHOLD will also slow down the scraping, as the VPN will need to be changed more often. However, it will reduce the chances of missing data due to errors. 

The function scrape_ProductPage(url) downloads the webpage at the given url (here: product page) using the requests module, and looks for the specific fields defined in the extractor file keyword_search_product_page.yml. If a Title for the product on the page is not found, it randomly selects a new header and retries until the limit MAX_TRIALS is reached, at which point it reports that the page does not contain any data.

These multiple trials are required, as amazon often blocks a user for repeatedly making requests using the same user agent. 

In [None]:
MAX_TRIALS_B = 20  # Maximum number of download attempts per product page.
# Running count of pages that yielded no data. It starts at 1 (not 0) so the
# modulo check below does not ask for a VPN change before anything has failed.
ERROR_COUNT_B = 1
ERROR_THRESHHOLD_B = 25  # Ask for a VPN change whenever ERROR_COUNT_B reaches a multiple of this.


def scrape_ProductPage(url):
    '''
    Download the product page at the given url and extract its fields.

    Each attempt uses a randomly chosen entry of `headers`. A response is
    accepted only if the product-page extractor `p` finds a non-empty 'Title'
    in it; otherwise another attempt is made, up to MAX_TRIALS_B attempts.
    Network-level errors are retried every 30 seconds without counting
    towards MAX_TRIALS_B.

    Parameters:
    url (string): URL of webpage to scrape
    Returns:
    dict: The fields extracted by `p` if a product title was found, else the
    string 'False'.
    '''
    global ERROR_COUNT_B

    # Download the page using requests
    print("Downloading %s"%url)
    trial = 0
    while True:
        # Ask to change vpn every ERROR_THRESHHOLD_B pages without results to
        # ensure data is not missed because of being blocked. The counter is
        # bumped right away so the prompt is not repeated on the next trial.
        if ERROR_COUNT_B % ERROR_THRESHHOLD_B == 0:
            _ = input('Please Change VPN and press enter')
            ERROR_COUNT_B += 1
        if trial == MAX_TRIALS_B:
            print("Max trials exceeded yet no Data found on this page!")
            ERROR_COUNT_B += 1
            return 'False'
        trial = trial + 1
        print("Trial no:", trial)

        # Get the html data from the url. RequestException is the base class
        # of HTTPError, ConnectionError and Timeout, so this single handler
        # covers all of them.
        while True:
            try:
                r = requests.get(url, headers=random.choice(headers), timeout=15)
            except requests.exceptions.RequestException as err:
                print('Error Detected: ', err)
                print('Retrying after 30 seconds')
                sleep(30)
                continue
            break

        # Use the product-page extractor to pull the product details from the
        # html text, and reuse that result instead of extracting a second time.
        data = p.extract(r.text)
        if data['Title'] is not None:
            return data
        # No title found: go round the outer loop with a new user agent.
        print("Retrying with new user agent!")

#### Step 6: Initialise path of output file

**NOTE:** Set the File Name according to what is being scraped here

Eg: SearchResult_ApparioGeneric or SearchResult_Top100India

In [None]:
# Ask for the base name of the output file; input() always returns a string.
FileName = input('Enter a Filename for output file!\n')

# Scraped records are appended to this JSON Lines file.
outfile_path = './ScriptOutput/DATASET/' + FileName + '.jsonl'

#### Step 7: Define cleaning functions

In [None]:
def CleanRating(s):
    '''
    Convert a rating string such as "3.3 out of 5 stars" to a float (3.3).

    The text before the first space is parsed as the rating. Returns None
    when the input is None, is not a string, or does not start with a number.
    '''
    if s is None:
        return None
    try:
        leading_token = s.split(' ')[0]
        return float(leading_token)
    except (ValueError, AttributeError):
        # ValueError: leading token is not numeric; AttributeError: not a string.
        return None

def CleanRatingCount(s):
    '''
    Convert a rating count string such as "1,336 ratings" to a float (1336.0).

    The text before the first space is parsed after removing thousands
    separators. Returns 0.0 when the input is None, is not a string, or does
    not start with a number, matching CleanAnsweredQuestionsCount, so that one
    malformed value does not abort the whole scraping run.
    '''
    if s is None:
        return float(0)
    try:
        return float(s.split(' ')[0].replace(',', ''))
    except (ValueError, AttributeError):
        # ValueError: leading token is not numeric; AttributeError: not a string.
        return float(0)

def CleanAnsweredQuestionsCount(s):
    '''
    Convert a string such as "336 answered questions" to a float (336.0).

    The text before the first space is parsed after dropping ',' and '+'
    (so "1,000+ answered questions" gives 1000.0). Returns 0.0 when the input
    is None, is not a string, or does not start with a number.
    '''
    if s is None:
        return float(0)
    try:
        leading_token = s.split(' ')[0]
        digits_only = leading_token.replace(',', '').replace('+', '')
        return float(digits_only)
    except (ValueError, AttributeError):
        # ValueError: leading token is not numeric; AttributeError: not a string.
        return float(0)
    
def CleanAmazonPrice(s):
    '''
    Convert an AmazonPrice string such as "₹ 336.00" to a float (336.0).

    Thousands separators are removed and the first number found in the text
    is returned, so currency symbols, ordinary or non-breaking spaces and
    price ranges ("₹ 1,299.00 - ₹ 1,999.00" gives 1299.0) are all tolerated.
    Returns None when the input is None or contains no number.
    '''
    if s is None:
        return s
    print(s)  # Echo the raw value, as the scraper always has.
    # Searching for the number directly avoids depending on which kind of
    # whitespace (plain space or '\xa0') separates the symbol from the amount.
    match = re.search(r'\d*\.?\d+', s.replace(',', ''))
    if match is None:
        return None
    return float(match.group(0))
    
def CleanMRP(s):
    '''
    Convert an MRP string such as "₹ 336.00" to a float (336.0).

    Thousands separators are removed and the first number found in the text
    is returned, so currency symbols and ordinary or non-breaking spaces
    around the amount are tolerated.
    Returns None when the input is None or contains no number.
    '''
    if s is None:
        return s
    print(s)  # Echo the raw value, as the scraper always has.
    # Searching for the number directly avoids depending on which kind of
    # whitespace (plain space or '\xa0') separates the symbol from the amount.
    match = re.search(r'\d*\.?\d+', s.replace(',', ''))
    if match is None:
        return None
    return float(match.group(0))
def CleanDiscount(s):
    '''
    Extract the discount percentage from a Savings string.

    For input such as "₹ 336.00 (50% Off)" or "₹ 336.00 (50%)" the
    parenthesised percentage is returned as an int (50). Any text after the
    '%' sign inside the parentheses is ignored.
    When the input is None, or holds no parenthesised percentage, the input
    is returned unchanged.
    '''
    if s is None:
        return s
    # Capture only the number in front of '%', so trailing words such as
    # "Off" cannot break the int() conversion.
    match = re.search(r'\(\s*([-+]?\d[\d,]*)\s*%[^)]*\)', s)
    if match is None:
        return s
    return int(match.group(1).replace(',', ''))

def CleanSavings(s):
    '''
    Extract the saved amount from a Savings string.

    For input such as "₹ 336.00 (50% Off)" the amount before the parenthesis
    is returned as a float (336.0). Thousands separators are removed and the
    first number found in the text is used, so a space (ordinary or
    non-breaking) between the currency symbol and the amount is tolerated.
    Returns None when the input is None or contains no number.
    '''
    if s is None:
        return s
    # Searching for the number directly avoids an empty first token when the
    # currency symbol is followed by a space.
    match = re.search(r'\d*\.?\d+', s.replace(',', ''))
    if match is None:
        return None
    return float(match.group(0))
    
def CleanKeywords(s):
    '''
    Split a Breadcrumbs string into a list of its parts.

    eg: 'Electronics  > Home Audio  > Speakers  > 10.or Crafted for Amazon Rave Portable Wireless Bluetooth Speaker'
    The string is split on '›' when that character is present, otherwise on
    '> '. The parts are returned as they are, without trimming whitespace.
    Returns None when the input is None or a float (e.g. NaN for a missing
    value).
    '''
    # Every float, NaN or not, maps to None, so no NaN test (and therefore no
    # `math` import) is needed.
    if s is None or isinstance(s, float):
        return None
    separator = '›' if '›' in s else '> '
    return s.split(separator)

#### Step 8: Define Keyword Type:

Eg: Top 100 Germany 

Eg: Generic Appario

In [None]:
# Keyword Type: a free-text label for the keyword list being scraped.
# The value entered here is stored unchanged in every output record under
# the 'KeywordType' key.
# Example: APPARIO GENERIC
# Example: TOP 100 UK
# Example: TOP 100 INDIA
KeywordType = input("Enter Keyword Type\n")

#### Step 9: Begin main scraping:

In [None]:
from urllib.parse import quote_plus

# Paging for a keyword stops once at least this many search results have been
# saved; the page in progress is always finished, so more may be written.
MIN_NUM_OF_PRODUCTS_TO_SCRAPE = 80
AMAZON_BASE_URL = 'https://www.amazon.in'

# Record fields set to None when the product page could not be scraped.
_PRODUCT_PAGE_FIELDS = (
    'Brand', 'MRP', 'AmazonPrice', 'DiscountPercentage', 'Rating',
    'RatingCount', 'AnsweredQuestionsCount', 'Savings', 'ShortDescription',
    'ProductDescription', 'BestSellerRank', 'DateFirstAvailable',
    'Breadcrumbs', 'Availability', 'Seller', 'Keywords', 'FullfilledBy',
)


def _build_search_url(keyword, pg_number):
    """Return the search URL for `keyword`; pages after the first carry '&page='."""
    # quote_plus keeps characters such as '&', '#' or spaces inside the keyword
    # from being read as URL syntax.
    search_url = AMAZON_BASE_URL + '/s?k=' + quote_plus(str(keyword))
    if pg_number > 1:
        search_url += '&page=' + str(pg_number)
    return search_url


def _extract_asin(product_url):
    """Return the ASIN found in `product_url`, or None when there is none."""
    if not product_url:
        return None
    match = re.search(r'B0.{8}', product_url)
    if match:
        return match.group(0)
    # Fallback: an all-digit identifier in a '/dp/<digits>/' path segment.
    match = re.search(r'/dp/(\d*)/', product_url)
    return match.group(1) if match else None


def _fill_missing_details(product):
    """Set every product-page field of `product` to None."""
    for field in _PRODUCT_PAGE_FIELDS:
        product[field] = None


def _fill_product_details(product, page_data):
    """Copy the fields of `page_data` into `product`, cleaning them on the way."""
    product['Brand'] = page_data['Brand']
    product['MRP'] = CleanMRP(page_data['MRP'])
    product['Rating'] = CleanRating(page_data['Rating'])
    product['RatingCount'] = CleanRatingCount(page_data['RatingCount'])
    product['AnsweredQuestionsCount'] = CleanAnsweredQuestionsCount(page_data['AnsweredQuestionsCount'])
    product['AmazonPrice'] = CleanAmazonPrice(page_data['AmazonPrice'])
    # The percentage and the amount are both derived from the 'Savings' text.
    product['DiscountPercentage'] = CleanDiscount(page_data['Savings'])
    product['Savings'] = CleanSavings(page_data['Savings'])
    product['ShortDescription'] = page_data['ShortDescription']
    product['ProductDescription'] = page_data['ProductDescription']
    product['BestSellerRank'] = page_data['BestSellerRank']
    product['DateFirstAvailable'] = page_data['DateFirstAvailable']
    product['Breadcrumbs'] = page_data['Breadcrumbs']
    product['Keywords'] = CleanKeywords(page_data['Breadcrumbs'])
    product['Seller'] = page_data['Seller']
    product['FullfilledBy'] = page_data['FullfilledBy']
    # A product without a price is recorded as unavailable.
    if product['AmazonPrice'] is not None:
        product['Availability'] = 'Available'
    else:
        product['Availability'] = 'Currently Unavailable'


# Make sure the output directory exists before opening the file in append mode.
_outfile_dir = os.path.dirname(outfile_path)
if _outfile_dir:
    os.makedirs(_outfile_dir, exist_ok=True)

with open(outfile_path, 'a') as outfile:
    for k in keyword_list:
        # An 'EOF' entry in the keyword list ends the run early.
        if k == 'EOF':
            break
        # Blank lines in the keyword file are not search keywords.
        if not k.strip():
            continue

        pg_number = 1
        search_rank = 1

        while search_rank <= MIN_NUM_OF_PRODUCTS_TO_SCRAPE:
            url = _build_search_url(k, pg_number)

            data_text = scrape_SearchResult(url)
            if data_text == 'False':
                # No products could be found on this page: next keyword.
                break

            # Extract all products of this result page using the extractor file
            search_data = e.extract(data_text)

            # Save html text to file
            html_files_path = str('./ScriptOutput/HTML/'+ str(FileName) + '/' + str(k) +'/Page_'+str(pg_number)+'.html')
            os.makedirs(os.path.dirname(html_files_path), exist_ok=True)
            # Explicit encoding so the write does not depend on the platform default.
            with open(html_files_path, 'w', encoding='utf-8') as file:
                file.write(data_text)

            for product in search_data['products']:
                product['SearchResultPosition'] = search_rank
                product['KeywordType'] = KeywordType
                search_rank += 1
                product['SearchKeyword'] = k
                product['SearchUrl'] = url
                product['Timestamp'] = datetime.datetime.now().strftime("%c")
                if product['Label'] == 'Amazon\'s':
                    product['Label'] = 'Amazon\'s Choice'

                if product['ProductPageUrl']:
                    # Relative links are made absolute before being fetched and stored.
                    if 'www.amazon.in' not in product['ProductPageUrl']:
                        product['ProductPageUrl'] = AMAZON_BASE_URL + product['ProductPageUrl']
                    page_data = scrape_ProductPage(product['ProductPageUrl'])
                else:
                    # No link to follow: treat it like a page that yielded no data.
                    page_data = 'False'

                if page_data == 'False':
                    _fill_missing_details(product)
                else:
                    _fill_product_details(product, page_data)
                product['ASIN'] = _extract_asin(product['ProductPageUrl'])

                print("Saving Product: %s"%product['Title'])
                print(product)
                json.dump(product, outfile)
                outfile.write("\n")

            # Stop paging for this keyword when the next-page extractor
            # reports 'Next →' in its 'last' field; otherwise go to the next page.
            if l.extract(data_text)['last'] == 'Next →':
                break
            pg_number += 1


## Step 10: Read Jsonl File

In [None]:
# Load every record written to the output file back into memory. The `with`
# block closes the file handle once all lines have been read.
with open(outfile_path, 'r') as Search_Result_file:
    reader = jsonlines.Reader(Search_Result_file)
    Search_Result_List = list(reader.iter())

# One row per scraped product; count() shows the non-null values per column.
df = pd.DataFrame(Search_Result_List)
print(df.count())
df