# Web Scraping Project
#### (Fetching products from an e-commerce website)

## Import Required Libraries

In [31]:
import pandas as pd 
import numpy as np
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import os

## Build the `scrapper` Class for Scraping

In [33]:
class scrapper:
    """Scrape product listings, product details and images from urbansole.com.pk.

    Intended call order (each step consumes what the previous one stored)::

        s = scrapper()
        s.fetch_products_urls()    # fills Product_id / Category / Product_link
        s.fetch_data_from_urls()   # fills Product_Name / Price / Image_link
        s.save_dataset()           # writes ``dataset_path``
        s.download_images()        # writes PNG files to ``images_directory_path``

    Collected data lives in ``self.dataset``, a dict mapping column name to a
    list.  All column lists are kept at the same length; a value that could
    not be fetched is stored as ``None`` so rows never shift relative to each
    other and ``pd.DataFrame(self.dataset)`` always succeeds.

    Network and parsing failures are reported with ``print`` and the affected
    page/row is skipped; they do not abort the run.
    """

    base_url = "https://www.urbansole.com.pk/collections/"
    # Collection slug -> number of listing pages to walk for that collection.
    links_pages = {'sport-shoes':1,'urbansole-footwear':5, 'women':1, 'kids':1, 'accessories':1, 'technology':1}
    # Adjacent literals are used instead of a backslash continuation inside the
    # string, which would embed the next line's indentation in the header.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.124 Safari/537.36'
        ),
    }
    # Column template only: every instance gets its own fresh copy in
    # __init__, so this class-level dict is never mutated.
    dataset = {'Product_id':[], 'Product_Name':[], 'Category':[], 'Price':[], 'Image_link':[], 'Product_link':[]}
    images_directory_path = "./Images"
    dataset_path = "dataset.csv"
    url_first_part = "https://www.urbansole.com.pk"
    # Seconds to wait for the server before giving up on a request.
    request_timeout = 30

    def __init__(self) -> None:
        """Create a scraper with its own empty dataset (no network access)."""
        self.dataset = self._empty_dataset()

    @classmethod
    def _empty_dataset(cls) -> dict:
        """Return a new dict with one empty list per column of the template."""
        return {column: [] for column in cls.dataset}

    @staticmethod
    def _safe_filename(name: str) -> str:
        """Replace characters that are unsafe in file names with ``_``."""
        forbidden = '<>:"/\\|?*'
        return "".join(
            "_" if (ch in forbidden or ord(ch) < 32) else ch for ch in name
        )

    def _get(self, url: str):
        """GET *url* with the scraper's headers and timeout.

        Returns the ``requests`` response, or ``None`` (after printing the
        error) when the request itself fails, so callers never touch an
        unbound or stale response object.
        """
        try:
            return requests.get(url, headers=self.headers, timeout=self.request_timeout)
        except requests.RequestException as e:
            print("URL Error: ", e)
            return None

    def _fetch_product_details(self, product_link: str) -> tuple:
        """Return ``(name, price, image_link)`` scraped from one product page.

        Returns ``(None, None, None)`` when the page cannot be fetched or any
        of the three elements is missing, so a row is either complete or
        entirely empty.
        """
        missing = (None, None, None)
        response = self._get(product_link)
        if response is None:
            return missing
        if response.status_code != 200:
            print("page not found")
            return missing
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            name = soup.find('h1', class_="product-title").text.strip()
            price = soup.find('div', class_="price__current").text.strip()
            gallery = soup.find('div', class_="product-gallery--image-background")
            src = gallery.find('img').get('src')
            # A src without a scheme gets "https:" prepended; an absolute
            # URL is kept as it is.
            if src.startswith(('http://', 'https://')):
                image_link = src
            else:
                image_link = 'https:' + src
        except Exception as e:
            # Best-effort boundary: a page with unexpected markup must not
            # stop the remaining products from being scraped.
            print(f"Soup Error : {e}")
            return missing
        return name, price, image_link

    def fetch_products_urls(self) -> None:
        """Collect link, category and sequential id for every listed product.

        Walks ``links_pages`` page by page.  The dataset is rebuilt from
        scratch on each call, so calling this twice does not duplicate rows.
        The detail columns get ``None`` placeholders until
        :meth:`fetch_data_from_urls` fills them.
        """
        self.dataset = self._empty_dataset()
        next_id = 1
        for category, page_count in self.links_pages.items():
            for page in range(1, page_count + 1):
                url = f"{self.base_url}{category}?page={page}"
                response = self._get(url)
                if response is None:
                    continue
                if response.status_code != 200:
                    print(f"Listing page skipped. Status code: {response.status_code} ({url})")
                    continue
                try:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    items = soup.find_all('li', class_="productgrid--item")
                except Exception as e:
                    print("Soup Error: ", e)
                    continue
                for item in items:
                    anchor = item.find('a', class_="productitem--image-link")
                    href = anchor.get('href') if anchor is not None else None
                    if not href:
                        # Skip only this item; the rest of the page is still usable.
                        print(f"Soup Error:  product link missing on {url}")
                        continue
                    self.dataset['Product_link'].append(self.url_first_part + href)
                    self.dataset['Category'].append(category)
                    self.dataset['Product_id'].append(str(next_id))
                    self.dataset['Product_Name'].append(None)
                    self.dataset['Price'].append(None)
                    self.dataset['Image_link'].append(None)
                    next_id += 1
        print("URLs are fetched")

    def fetch_data_from_urls(self) -> None:
        """Fill name, price and image link for every stored product link.

        The three detail columns are rebuilt in one go and always end up with
        exactly one entry per ``Product_link`` (``None`` where a page failed).
        """
        names, prices, image_links = [], [], []
        for product_link in self.dataset['Product_link']:
            name, price, image_link = self._fetch_product_details(product_link)
            names.append(name)
            prices.append(price)
            image_links.append(image_link)
        self.dataset['Product_Name'] = names
        self.dataset['Price'] = prices
        self.dataset['Image_link'] = image_links
        print("Data is fetched")

    def download_images(self) -> None:
        """Download every product image as ``<id>_<name>_<category>.PNG``.

        Files go to ``images_directory_path`` (created if absent).  Rows
        without an image link are skipped; a failed download or an image that
        cannot be decoded/saved is reported and skipped.
        """
        os.makedirs(self.images_directory_path, exist_ok=True)
        rows = zip(self.dataset['Product_Name'],
                   self.dataset['Image_link'],
                   self.dataset['Category'],
                   self.dataset['Product_id'])
        for product_name, image_url, category, product_id in rows:
            if not image_url:
                continue
            # Product names come from the website and may contain path
            # separators or other characters that are invalid in file names.
            filename = self._safe_filename(f"{product_id}_{product_name}_{category}") + '.PNG'
            path = os.path.join(self.images_directory_path, filename)
            response = self._get(image_url)
            if response is None:
                continue
            if response.status_code != 200:
                print(f"Failed to download image. Status code: {response.status_code}")
                continue
            try:
                with Image.open(BytesIO(response.content)) as image:
                    image.save(path, 'PNG')
            except OSError as e:
                # Pillow signals undecodable data and unsupported modes via OSError.
                print(f"Image Error: {e} ({image_url})")

    def save_dataset(self):
        """Write the collected dataset to ``dataset_path`` as CSV (no index column)."""
        pd.DataFrame(self.dataset).to_csv(self.dataset_path, index=False)
        print("Dataset is saved")


## Create Object and Call Methods

In [34]:
# Instantiate the scraper; construction performs no network requests.
scrapper_obj = scrapper()

In [35]:
# Walk the listing pages configured in links_pages and record each product's
# link, category and sequential id in scrapper_obj.dataset.
scrapper_obj.fetch_products_urls() 

URLs are fetched


In [36]:
# Request every stored product link and collect its name, price and image link.
scrapper_obj.fetch_data_from_urls()

Data is fetched


In [37]:
# Write the collected columns to the CSV file named by dataset_path ("dataset.csv").
scrapper_obj.save_dataset()

Dataset is saved


In [38]:
# Download each product image and save it as a PNG under images_directory_path ("./Images").
scrapper_obj.download_images()

In [39]:
# Build a DataFrame from the in-memory dataset and preview the first rows.
df = pd.DataFrame(scrapper_obj.dataset)
df.head()

Unnamed: 0,Product_id,Product_Name,Category,Price,Image_link,Product_link
0,1,Rush US-EX-3204,sport-shoes,"Rs.13,499",https://www.urbansole.com.pk/cdn/shop/files/US...,https://www.urbansole.com.pk/collections/sport...
1,2,Raptor US-EX-3206,sport-shoes,"Rs.16,999",https://www.urbansole.com.pk/cdn/shop/files/US...,https://www.urbansole.com.pk/collections/sport...
2,3,MEN'S SPORTS SHOES US-EX-3203,sport-shoes,"Rs.13,999",https://www.urbansole.com.pk/cdn/shop/files/US...,https://www.urbansole.com.pk/collections/sport...
3,4,Pulse US-EX-3210,sport-shoes,"Current price\n\nRs.9,799",https://www.urbansole.com.pk/cdn/shop/files/US...,https://www.urbansole.com.pk/collections/sport...
4,5,Fury US-EX-3208,sport-shoes,"Rs.13,999",https://www.urbansole.com.pk/cdn/shop/files/US...,https://www.urbansole.com.pk/collections/sport...


In [40]:
# Show the number of rows and the column names of the DataFrame.
len(df), list(df.columns)

(340,
 ['Product_id',
  'Product_Name',
  'Category',
  'Price',
  'Image_link',
  'Product_link'])

In [41]:
# Pack the Images directory into a gzip-compressed archive, Images.tar.gz.
!tar -zcf Images.tar.gz Images

In [42]:
# Pack the CSV into a gzip-compressed archive, dataset.tar.gz.
# NOTE(review): no visible cell creates cleaned_dataset_new.csv; save_dataset()
# writes dataset.csv. Confirm the cleaned file exists before running this cell.
!tar -zcf dataset.tar.gz cleaned_dataset_new.csv