# Web Scraping using the Konga Site

## Install Selenium

In [3]:
!pip install selenium



## Import Libraries

In [5]:
import requests  # make a request to a url
from bs4 import BeautifulSoup  # parse the requests as html
import pandas as pd  # data manipulation
from selenium import webdriver
from time import sleep
import re

## Scrape Data

In [18]:
# Column-oriented store for the scraped products: one independent, initially
# empty list per output column, in the order the columns appear in the frame.
product_data = {
    column_name: []
    for column_name in (
        "Product Name",
        "Current Price",
        "Old Price",
        "Discount",
        "Rating",
        "URL",
        "Photo",
        "Vendor",
    )
}


chrome_options = webdriver.ChromeOptions()


def _text_or_empty(tag):
    """Return the text of a BeautifulSoup tag, or "" when the tag was not found."""
    return tag.text if tag is not None else ""


def _attr_or_none(tag, attribute):
    """Return the given attribute of a tag, or None when the tag or attribute is missing."""
    return tag.get(attribute) if tag is not None else None


def visit_site(url, wait_seconds=20):
    """Load one listing page in Chrome and append its products to ``product_data``.

    For every product card found on the page, exactly one entry is appended to
    each list in the module-level ``product_data`` dict, so all columns stay
    the same length. A field that is missing from a card is stored as "".

    Parameters
    ----------
    url : str
        Address of the listing page to open.
    wait_seconds : int or float, optional
        How long to sleep after requesting the page before its HTML is read.
        Defaults to 20, the delay that was previously hard-coded.

    Returns
    -------
    None
        The results are written into ``product_data``.
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)  # visit url
        sleep(wait_seconds)
        html = driver.page_source
    finally:
        # quit() (not close()) ends the whole browser session, and the finally
        # block makes sure that happens even when loading the page fails.
        driver.quit()

    page_soup = BeautifulSoup(html, "html.parser")

    # product container
    product_cards = page_soup.find_all('li', {'class': 'bbe45_3oExY _22339_3gQb9'})

    for card in product_cards:
        product_name = card.find('h3')
        current_price = card.find("span", class_="d7c0f_sJAqi")
        old_price = card.find("span", class_="f6eb3_1MyTu")
        discount = card.find("span", class_="false _6c244_q2qap _6977e_X5mZi")
        rating = card.find("span", class_="eea9b_1Ma8-")
        # A card without a link or an image must not abort the whole page.
        href = _attr_or_none(card.find("a"), "href")
        photo = _attr_or_none(card.find('img'), "src")

        product_data["Product Name"].append(_text_or_empty(product_name))
        product_data["Current Price"].append(_text_or_empty(current_price))
        product_data["Old Price"].append(_text_or_empty(old_price))
        product_data["Rating"].append(_text_or_empty(rating))
        product_data["Discount"].append(_text_or_empty(discount))

        # hrefs are joined onto the site root to form the stored URL
        if href is not None:
            product_data["URL"].append(f"https://www.konga.com{href}")
        else:
            product_data["URL"].append("")

        product_data["Photo"].append(photo if photo is not None else "")

        # ecommerce site: one vendor label per row, keeping "Vendor" a list
        # like every other column instead of overwriting it with a string
        product_data["Vendor"].append("Konga Site")

## Visit Pages

In [20]:
BASE_URL = 'https://www.konga.com/category/laptops-5230?'

# Addresses of listing pages 1 through 25 of the category, built up front.
page_urls = [f"{BASE_URL}page={page_num}" for page_num in range(1, 26)]

# Scrape the pages one after another, reporting progress before and after each.
for page_url in page_urls:
    print(f"Collecting data from {page_url} ...")
    visit_site(page_url)
    print('Done Collecting Data from', page_url)
    

Collecting data from https://www.konga.com/category/laptops-5230?page=1 ...
Done Collecting Data from https://www.konga.com/category/laptops-5230?page=1
Collecting data from https://www.konga.com/category/laptops-5230?page=2 ...
Done Collecting Data from https://www.konga.com/category/laptops-5230?page=2
Collecting data from https://www.konga.com/category/laptops-5230?page=3 ...
Done Collecting Data from https://www.konga.com/category/laptops-5230?page=3
Collecting data from https://www.konga.com/category/laptops-5230?page=4 ...
Done Collecting Data from https://www.konga.com/category/laptops-5230?page=4
Collecting data from https://www.konga.com/category/laptops-5230?page=5 ...
Done Collecting Data from https://www.konga.com/category/laptops-5230?page=5
Collecting data from https://www.konga.com/category/laptops-5230?page=6 ...
Done Collecting Data from https://www.konga.com/category/laptops-5230?page=6
Collecting data from https://www.konga.com/category/laptops-5230?page=7 ...
Done C

## Create DataFrame

In [22]:
# Turn the column-oriented dict of scraped values into a DataFrame
# (each dict key becomes one column) and display it.
konga_laptop_df = pd.DataFrame(product_data)
konga_laptop_df

Unnamed: 0,Product Name,Current Price,Old Price,Discount,Rating,URL,Photo,Vendor
0,HP Laptop 15s-fq5268nia 7c7p4ea ...,"₦753,480",,,No reviews yet,https://www.konga.com/product/hp-laptop-15s-fq...,https://www-konga-com-res.cloudinary.com/w_aut...,Konga Site
1,HP Eb650g9 I7-1255u 15 - 512GB S...,"₦1,678,508",,,No reviews yet,https://www.konga.com/product/hp-eb650g9-i7-12...,https://www-konga-com-res.cloudinary.com/w_aut...,Konga Site
2,HP Spectre X360 Laptop 14-ef2017...,"₦1,771,432",,,No reviews yet,https://www.konga.com/product/hp-spectre-x360-...,https://www-konga-com-res.cloudinary.com/w_aut...,Konga Site
3,HP Victus Gaming Laptop 15-fa114...,"₦1,140,163","₦1,213,527",- 6%,No reviews yet,https://www.konga.com/product/hp-victus-gaming...,https://www-konga-com-res.cloudinary.com/w_aut...,Konga Site
4,HP Victus Gaming Laptop 15-fa115...,"₦1,291,067","₦1,374,130",- 6%,No reviews yet,https://www.konga.com/product/hp-victus-gaming...,,Konga Site
...,...,...,...,...,...,...,...,...
995,HP Elitebook 840 G5 - 12GB RAM -...,"₦485,000","₦555,000",- 13%,No reviews yet,https://www.konga.com/product/hp-elitebook-840...,,Konga Site
996,HP Notebook Stream 11 - Intel Ce...,"₦155,000","₦170,000",- 9%,No reviews yet,https://www.konga.com/product/hp-notebook-stre...,,Konga Site
997,HP Elitebook 840 G5 - 16GB RAM -...,"₦475,000","₦535,000",- 11%,No reviews yet,https://www.konga.com/product/hp-elitebook-840...,,Konga Site
998,HP 250 - Intel Celeron - Pentium...,"₦348,000",,,No reviews yet,https://www.konga.com/product/hp-250-intel-cel...,,Konga Site


## Save in CSV

In [25]:
# Write the scraped frame to konga_laptop.csv; index=False leaves the row index out of the file.
konga_laptop_df.to_csv("konga_laptop.csv", index=False)

## Data Cleaning on the Konga Data

In [28]:
# Reload the scraped data from konga_laptop.csv and display it.
# Fields that were saved as empty strings come back as NaN (pandas' default
# when reading empty CSV fields).
konga_df = pd.read_csv("konga_laptop.csv")
konga_df

Unnamed: 0,Product Name,Current Price,Old Price,Discount,Rating,URL,Photo,Vendor
0,HP Laptop 15s-fq5268nia 7c7p4ea ...,"₦753,480",,,No reviews yet,https://www.konga.com/product/hp-laptop-15s-fq...,https://www-konga-com-res.cloudinary.com/w_aut...,Konga Site
1,HP Eb650g9 I7-1255u 15 - 512GB S...,"₦1,678,508",,,No reviews yet,https://www.konga.com/product/hp-eb650g9-i7-12...,https://www-konga-com-res.cloudinary.com/w_aut...,Konga Site
2,HP Spectre X360 Laptop 14-ef2017...,"₦1,771,432",,,No reviews yet,https://www.konga.com/product/hp-spectre-x360-...,https://www-konga-com-res.cloudinary.com/w_aut...,Konga Site
3,HP Victus Gaming Laptop 15-fa114...,"₦1,140,163","₦1,213,527",- 6%,No reviews yet,https://www.konga.com/product/hp-victus-gaming...,https://www-konga-com-res.cloudinary.com/w_aut...,Konga Site
4,HP Victus Gaming Laptop 15-fa115...,"₦1,291,067","₦1,374,130",- 6%,No reviews yet,https://www.konga.com/product/hp-victus-gaming...,,Konga Site
...,...,...,...,...,...,...,...,...
995,HP Elitebook 840 G5 - 12GB RAM -...,"₦485,000","₦555,000",- 13%,No reviews yet,https://www.konga.com/product/hp-elitebook-840...,,Konga Site
996,HP Notebook Stream 11 - Intel Ce...,"₦155,000","₦170,000",- 9%,No reviews yet,https://www.konga.com/product/hp-notebook-stre...,,Konga Site
997,HP Elitebook 840 G5 - 16GB RAM -...,"₦475,000","₦535,000",- 11%,No reviews yet,https://www.konga.com/product/hp-elitebook-840...,,Konga Site
998,HP 250 - Intel Celeron - Pentium...,"₦348,000",,,No reviews yet,https://www.konga.com/product/hp-250-intel-cel...,,Konga Site


## Inspecting the Data

In [31]:
# (number of rows, number of columns) of the loaded frame
konga_df.shape

(1000, 8)

In [33]:
# Per-column dtype and non-null count before any cleaning
konga_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Product Name   1000 non-null   object
 1   Current Price  1000 non-null   object
 2   Old Price      318 non-null    object
 3   Discount       314 non-null    object
 4   Rating         1000 non-null   object
 5   URL            1000 non-null   object
 6   Photo          740 non-null    object
 7   Vendor         1000 non-null   object
dtypes: object(8)
memory usage: 62.6+ KB


In [35]:
# Distinct raw values of the "Current Price" column
konga_df["Current Price"].unique()

array(['₦753,480', '₦1,678,508', '₦1,771,432', '₦1,140,163', '₦1,291,067',
       '₦1,075,582', '₦1,225,347', '₦1,753,662', '₦1,037,918',
       '₦1,369,372', '₦1,378,095', '₦1,497,645', '₦806,260', '₦1,418,412',
       '₦692,068', '₦586,768', '₦834,964', '₦832,100', '₦864,100',
       '₦973,200', '₦1,348,300', '₦1,081,400', '₦1,487,700', '₦263,500',
       '₦660,000', '₦1,020,000', '₦850,000', '₦776,056', '₦1,806,896',
       '₦571,067', '₦786,240', '₦1,171,570', '₦1,190,465', '₦1,640,490',
       '₦801,750', '₦2,001,408', '₦1,307,023', '₦796,483', '₦1,358,136',
       '₦1,048,352', '₦796,475', '₦2,450,847', '₦744,888', '₦1,114,450',
       '₦1,082,389', '₦988,330', '₦1,014,314', '₦852,748', '₦1,669,370',
       '₦1,729,100', '₦1,272,998', '₦2,975,365', '₦1,295,081', '₦612,673',
       '₦1,089,204', '₦428,871', '₦1,089,196', '₦680,748', '₦1,034,738',
       '₦524,176', '₦984,516', '₦980,277', '₦2,212,098', '₦1,425,405',
       '₦989,093', '₦1,453,296', '₦1,174,189', '₦1,349,816', '₦92

In [36]:
# Current price: drop the naira sign and the thousands separators, parse the
# remaining digits as numbers, and use 0 where no price is present.
# The result is assigned back to the column: calling fillna(inplace=True) on
# konga_df['Current Price'] is a chained in-place call that may act on a
# temporary copy and leave the frame unchanged.
current_price_digits = (
    konga_df['Current Price']
    .str.replace("₦", "", regex=False)
    .str.replace(",", "", regex=False)
)
konga_df['Current Price'] = pd.to_numeric(current_price_digits).fillna(0)

In [39]:
# Cast the cleaned "Current Price" column to 64-bit floats.
konga_df = konga_df.astype({'Current Price': "float64"})

In [41]:
# Distinct raw values of the "Old Price" column (NaN where no old price was scraped)
konga_df["Old Price"].unique()

array([nan, '₦1,213,527', '₦1,374,130', '₦924,600', '₦960,200',
       '₦1,081,400', '₦1,348,300', '₦1,487,700', '₦292,800', '₦825,000',
       '₦1,133,400', '₦944,500', '₦1,267,054', '₦1,445,512', '₦3,000,000',
       '₦1,339,500', '₦356,250', '₦240,000', '₦150,000', '₦525,000',
       '₦155,000', '₦190,000', '₦350,000', '₦170,000', '₦750,000',
       '₦700,000', '₦1,000,000', '₦800,000', '₦900,000', '₦380,000',
       '₦600,000', '₦260,000', '₦400,000', '₦270,000', '₦300,000',
       '₦200,000', '₦280,000', '₦500,000', '₦1,265,000', '₦3,200,000',
       '₦2,300,000', '₦4,000,000', '₦798,000', '₦2,500,000', '₦4,700,000',
       '₦1,980,000', '₦2,230,000', '₦480,000', '₦535,000', '₦555,000',
       '₦490,000', '₦510,000', '₦762,000', '₦990,000', '₦1,795,000',
       '₦331,000', '₦5,200,000', '₦1,168,000', '₦1,500,000', '₦470,000',
       '₦450,000', '₦2,670,000', '₦2,000,000', '₦1,200,000', '₦868,000',
       '₦878,000', '₦640,000', '₦295,000', '₦1,030,000', '₦230,000',
       '₦267,00

In [43]:
# Old price: drop the naira sign and the thousands separators, parse the
# remaining digits as numbers, and use 0 where no old price is present.
# The result is assigned back to the column: calling fillna(inplace=True) on
# konga_df['Old Price'] is a chained in-place call that may act on a
# temporary copy and leave the frame unchanged.
old_price_digits = (
    konga_df['Old Price']
    .str.replace("₦", "", regex=False)
    .str.replace(",", "", regex=False)
)
konga_df['Old Price'] = pd.to_numeric(old_price_digits).fillna(0)

In [45]:
# Cast the cleaned "Old Price" column to 64-bit floats.
konga_df = konga_df.astype({'Old Price': "float64"})

In [47]:
# Discount: the raw labels look like "- 6%". Remove the leading "- " and the
# trailing "%" so only the number is left.
discount_digits = (
    konga_df['Discount']
    .str.replace("- ", "", regex=False)
    .str.replace("%", "", regex=False)
)

# Products without a discount label get 0. The result is assigned back to the
# column instead of calling fillna(inplace=True) on konga_df["Discount"],
# which is a chained in-place call that may leave the frame unchanged.
discount_percent = pd.to_numeric(discount_digits).fillna(0).astype("int")

# Store the discount as a fraction of the old price (6% -> 0.06).
konga_df["Discount"] = discount_percent / 100

In [49]:
# Rating: distinct raw labels in the column before they are converted to numbers
konga_df["Rating"].unique()

array(['No reviews yet', '1 Review', '4 Reviews'], dtype=object)

In [51]:
# The "Rating" column holds review-count labels such as 'No reviews yet',
# '1 Review' and '4 Reviews'. Read the number out of the label instead of
# looking it up in a fixed 0-5 table, so a label with a larger count keeps
# its value rather than turning into NaN.
review_counts = (
    konga_df["Rating"]
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d+)", expand=False)
)

# Labels without a number ('No reviews yet') and missing labels count as 0 reviews.
konga_df["Rating"] = pd.to_numeric(review_counts).fillna(0).astype("int64")

In [53]:
# Distinct values of "Rating" after the conversion to numbers
konga_df["Rating"].unique()

array([0, 1, 4])

In [55]:
# Trim leading and trailing whitespace from each free-text column in turn.
for text_column in ("Product Name", "URL", "Photo"):
    konga_df[text_column] = konga_df[text_column].str.strip()

In [57]:
# Count rows that repeat an earlier row in every column
konga_df.duplicated().sum()

0

In [59]:
# Per-column dtype and non-null count after cleaning
konga_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Product Name   1000 non-null   object 
 1   Current Price  1000 non-null   float64
 2   Old Price      1000 non-null   float64
 3   Discount       1000 non-null   float64
 4   Rating         1000 non-null   int64  
 5   URL            1000 non-null   object 
 6   Photo          740 non-null    object 
 7   Vendor         1000 non-null   object 
dtypes: float64(3), int64(1), object(4)
memory usage: 62.6+ KB


## Save Clean data to CSV

In [62]:
# Write the cleaned frame to konga_clean_laptop.csv; index=False leaves the row index out of the file.
konga_df.to_csv("konga_clean_laptop.csv", index=False)