In [None]:
from logging import exception
from typing_extensions import Self
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException #used to debug the program
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
from pandas import DataFrame
import os
import requests
from bs4 import BeautifulSoup
import uuid
from uuid import UUID
import json
import urllib

class Scraper():
    """Selenium-driven scraper for LEGO products of a single theme.

    Example themes are 'Minions', 'DUPLO', 'Technic' and 'Disney'. The
    methods are meant to be called in sequence: dismiss the cookie prompts,
    search for the theme, filter the results, collect the product links and
    finally scrape every product page.

    Attributes:
        selected_theme: theme name that is typed into the site's search box.
        driver: Chrome WebDriver session opened by ``__init__``.
        lego_links: product page URLs, filled by ``lego_product_links``.
        Lego_dict: column-oriented product records (one list per field),
            filled by ``lego_product_info``.
    """

    # Marker stored whenever a product page does not expose a field.
    _MISSING = 'N/A'

    def __init__(self, selected_theme, url:str = 'https://www.lego.com/en-gb'):
        """Open Chrome, load ``url`` and maximise the browser window.

        Args:
            selected_theme: theme name to search for later on.
            url: landing page to open.
        """
        self.selected_theme = selected_theme
        # Start with empty results so the reporting methods can be called
        # before any scraping has happened.
        self.lego_links = []
        self.Lego_dict = self._empty_records()
        # NOTE(review): passing the driver path positionally is the older
        # Selenium calling convention; newer Selenium releases expect
        # Chrome(service=Service(path)) - confirm against the installed version.
        self.driver = Chrome(ChromeDriverManager().install())
        self.driver.get(url)
        self.driver.maximize_window()

    @staticmethod
    def _empty_records():
        """Return a fresh, empty column dictionary for the product records."""
        return {'Product_name': [], 'Prices': [], 'Product_link': [], 'Ratings': [], 'UUID': []}

    def _click_when_present(self, xpath, not_found_message='no elements found', pause=0, timeout=10):
        """Wait for the element at ``xpath`` to be present, then click it.

        Args:
            xpath: XPath of the element to click.
            not_found_message: text printed when the wait times out.
            pause: fixed number of seconds to sleep before waiting.
            timeout: maximum number of seconds to wait for the element.

        Returns:
            True when the element was found and clicked, False on timeout.
        """
        if pause:
            time.sleep(pause)
        try:
            element = WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, xpath))
            )
        except TimeoutException:
            print(not_found_message)
            return False
        # Click the element the wait returned instead of looking it up again.
        element.click()
        return True

    def _text_or_missing(self, xpath):
        """Return the text of the element at ``xpath``, or 'N/A' if it is absent."""
        try:
            return self.driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            return self._MISSING

    def lego_continue(self):
        """Click the 'continue' button of the first prompt shown on the page."""
        xpath = '//*[@id="root"]/div[5]/div/div/div[1]/div[1]/div/button'
        self._click_when_present(xpath, pause=2)

    def necessary_cookies(self):
        """Click the 'necessary cookies' button of the cookie prompt."""
        xpath = '//button[contains(@class,"Button__Base-sc-1jdmsyi-0 eCVPKR")]'
        self._click_when_present(xpath)

    def look_for_search_bar(self):
        """Click the search button in the page header to open the search bar.

        Prints 'No search bar found' when the button does not show up in
        time. The original code caught ``TimeoutException`` without ever
        waiting, so a missing button raised ``NoSuchElementException``.
        """
        xpath = '//*[@id="root"]/div[2]/header/div[2]/div[2]/div/div[5]/div/button'
        self._click_when_present(xpath, not_found_message='No search bar found', pause=2)
        return None

    def send_keys_to_search_bar(self):
        """Type the selected theme into the search input and submit it."""
        xpath1 = '//*[@id="desktop-search-search-input"]'
        search_input = self.driver.find_element(By.XPATH, xpath1)
        search_input.send_keys(self.selected_theme)
        search_input.send_keys(Keys.ENTER)

    def Availability(self):
        """Click the 'Availability' filter heading to expand its options."""
        xpath = '//*[@id="product-facet-availability-accordion-title"]/div'
        self._click_when_present(xpath)

    def check_available_now(self):
        """Click the first option in the expanded availability filter."""
        xpath ='//*[@id="product-facet-availability-accordion-content"]/div/div/ul/li[1]/label/span'
        self._click_when_present(xpath, pause=2)

    def show_all(self):
        """Click the 'show all' link so every search result is on one page."""
        xpath = '//*[@id="blt5881a9b7772d3176"]/section/div/div/div[3]/a'
        self._click_when_present(
            xpath,
            not_found_message="only one page lego product is available. No Show all button' is displayed",
            pause=2,
        )

    def lego_product_links(self):
        """Collect the product page URL of every product shown on the page.

        Each element marked ``data-test="product-item"`` is one product
        container; the ``href`` of the first anchor inside it is the link.

        Returns:
            The list of URLs, which is also stored in ``self.lego_links``.
        """
        self.list_items = self.driver.find_elements(By.XPATH,'//*[@data-test = "product-item"]')
        print(self.list_items)
        self.lego_links = [
            item.find_element(By.TAG_NAME, 'a').get_attribute('href')
            for item in self.list_items
        ]
        return self.lego_links

    def lego_product_info(self):
        """Visit every collected product link and record its details.

        For each URL in ``self.lego_links`` the name, price and rating are
        read from the product page and stored in ``self.Lego_dict`` together
        with the link and a freshly generated UUID. A field that cannot be
        found is recorded as 'N/A', so all columns keep the same length.
        """
        self.Lego_dict = self._empty_records()

        for link in self.lego_links:
            self.driver.get(link)
            time.sleep(2)  # give the product page time to render

            price = self._text_or_missing('//div[@data-test="product-leaf-price"]')
            product_name = self._text_or_missing('//h1[@data-test="product-overview-name"]')

            try:
                rating_element = self.driver.find_element(By.XPATH,'//div[@class="RatingBarstyles__RatingContainer-sc-11ujyfe-2 fgbdIf"]')
            except NoSuchElementException:
                rating = self._MISSING
            else:
                title = rating_element.get_attribute('title')
                print(title)
                # A missing title attribute comes back as None; record it
                # with the usual marker rather than the string 'None'.
                rating = self._MISSING if title is None else str(title)

            self.Lego_dict['Product_link'].append(link)
            self.Lego_dict['Prices'].append(price)
            self.Lego_dict['Product_name'].append(product_name)
            self.Lego_dict['Ratings'].append(rating)
            self.Lego_dict['UUID'].append(str(uuid.uuid4()))

    def Data_list(self):
        """Print the scraped product records as a pandas table.

        Returns:
            The ``pandas.DataFrame`` built from ``self.Lego_dict`` (the
            original version returned None because it returned ``print``'s
            result).
        """
        table = pd.DataFrame(self.Lego_dict)
        print(table)
        return table

    def data_JSON(self, path='/home/lakshmi/Documents/DS/Selenium/Lego/raw_data'):
        """Write ``self.Lego_dict`` to ``data.json`` inside the ``path`` folder.

        Args:
            path: folder that receives ``data.json``. It is created together
                with any missing parent folders; an existing folder is reused.
        """
        # Resolve first so the later chdir cannot change what a relative
        # path refers to.
        path = os.path.abspath(path)
        os.makedirs(path, exist_ok=True)
        with open(os.path.join(path, 'data.json'), 'w') as f:
            json.dump(self.Lego_dict, f, indent=4, sort_keys=True)
        # NOTE(review): the working-directory change is kept from the
        # original; lego_image_downloader builds its output folder from
        # os.getcwd(), so it presumably relies on this - confirm.
        os.chdir(path)
# Create the scraper for the 'Technic' theme; constructing it opens Chrome on
# the default LEGO URL. The cells below use this module-level `bot`.
if __name__ == '__main__' : 
   bot = Scraper('Technic') 

In [None]:
# Drive the browser to the filtered search results. The steps act on the
# current page, so they must run in this order: dismiss the two prompts,
# open the search bar, submit the theme, then apply the availability filter.
bot.lego_continue()
bot.necessary_cookies()
bot.look_for_search_bar()   
bot.send_keys_to_search_bar()
bot.Availability()
bot.check_available_now()



In [None]:
# Collect the product page links from the results currently shown; they are
# returned and also kept on bot.lego_links for the next step.
bot.lego_product_links() 

In [None]:
# Visit every collected link and fill bot.Lego_dict with name, price, link,
# rating and a UUID per product.
bot.lego_product_info()

In [None]:
# Print the scraped records as a pandas DataFrame.
bot.Data_list()

In [None]:
# Write bot.Lego_dict to data.json in the raw_data folder.
bot.data_JSON()

In [None]:
def lego_image_downloader(self):
    """Download the gallery images of every collected product.

    A folder named after ``self.selected_theme`` is created in the current
    working directory, with one sub-folder per product. For each URL in
    ``self.lego_links`` the product page is opened and every ``<img>`` found
    inside the gallery list is saved as ``<product><index>.jpg``.

    The results are stored in ``self.Image_dict`` with two parallel lists:
    ``'Lego_images'`` (image URLs) and ``'Image_UUID'`` (one generated UUID
    per saved image). Only successfully saved images are recorded.

    Args:
        self: object providing ``selected_theme``, ``lego_links`` and
            ``driver`` - in this file, a ``Scraper`` instance.
    """
    # Existing folders are reused so the function can be run repeatedly. The
    # working directory is deliberately left unchanged: every path below is
    # built from ``category_folder``, and changing directory would nest the
    # theme folder one level deeper on each call.
    category_folder = os.path.join(os.getcwd(), self.selected_theme)
    os.makedirs(category_folder, exist_ok=True)

    self.Image_dict = {'Lego_images': [], 'Image_UUID': []}

    for link in self.lego_links:
        self.driver.get(link)
        time.sleep(1)  # give the product page time to render

        try:
            product_title = self.driver.find_element(By.XPATH,'//h1[@data-test="product-overview-name"]').text
            gallery = self.driver.find_element(By.XPATH,'//*[@id="main-content"]/div/div[1]/div/div[1]/div[1]/div/div/div/div[1]/div/div[1]/ul')
        except NoSuchElementException:
            print('No images found')
            continue

        # Build a folder-safe name; '/' would otherwise be read as a path
        # separator.
        lego_product_name = (
            product_title.replace(' ', '_').replace(',', '').replace('-', '').replace('/', '_')
        )
        product_folder = os.path.join(category_folder, lego_product_name)
        os.makedirs(product_folder, exist_ok=True)

        # The leading '.' scopes the search to the gallery list; a bare
        # '//li//img' searches the whole document.
        images = gallery.find_elements(By.XPATH, './/li//img')
        if not images:
            print('No images found')
            continue

        # enumerate gives every image its own file name; the original
        # counter was never incremented, so each image overwrote the last.
        for index, image in enumerate(images):
            image_url = image.get_attribute('src')
            if not image_url:
                continue  # an <img> without a src has nothing to download
            print(image_url)

            # Download before opening the file so that a failed request
            # does not leave an empty .jpg behind.
            try:
                response = requests.get(image_url, timeout=30)
                response.raise_for_status()
            except requests.RequestException as error:
                print(f'Could not download {image_url}: {error}')
                continue

            image_path = os.path.join(product_folder, f'{lego_product_name}{index}.jpg')
            with open(image_path, 'wb') as f:
                f.write(response.content)

            # Store and print the same UUID, one per image, so both lists
            # in Image_dict stay the same length.
            image_uuid = str(uuid.uuid4())
            self.Image_dict['Lego_images'].append(image_url)
            self.Image_dict['Image_UUID'].append(image_uuid)
            print('UUID is', image_uuid)

In [8]:
import pandas as pd

# Show the pandas package docstring (the text captured below this cell).
print(pd.__doc__)


pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data.
  - Size mutability: columns can be inserted and deleted from DataFrame and
    higher dimensional objects
  - Automatic and explicit data alignment: objects can be explicitly aligned
    to a set of labels, or the user can simply ignore the labels and