In [1]:
import config
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from time import sleep
import pymysql
import os
import re
from math import ceil

In [2]:
# Connect to the MySQL server described in config (no database is selected yet)
# and open the cursor that the rest of the notebook uses to run statements.
conn = pymysql.connect(
    host=config.database_host,
    port=3306,
    user=config.database_user,
    password=config.database_pass,
)
cur = conn.cursor()

In [3]:
# Create the Jumia database on first run and make it the active database for this connection.
cur.execute('CREATE DATABASE IF NOT EXISTS Jumia;')
cur.execute('use Jumia;')
# Create the products table if it is missing: two text columns (title, link) and four int
# columns (current price, old price, discount percentage, discount amount).
# NOTE: the percentage column is spelled "discount_pircentage"; the crawler code in this
# file uses the same spelling, so the two must be renamed together if ever corrected.
cur.execute('CREATE TABLE IF NOT EXISTS Jumia_Products(\
             product_title       text,\
             product_link        text,\
             current_price       int,\
             old_price           int,\
             discount_pircentage int,\
             discount_quantity   int\
             ) CHARACTER SET utf8 COLLATE utf8_general_ci;')

0

In [4]:
# Directory containing this script; init_driver resolves the geckodriver path against it.
try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined in an interactive session (e.g. a notebook kernel),
    # so fall back to the current working directory. Any other error is left to surface.
    current_path = '.'

In [5]:
def init_driver(gecko_driver='', user_agent='', load_images=True, is_headless=False):
    '''
        Create and return a Firefox WebDriver configured for crawling.

        Parameters:
            gecko_driver: path of the geckodriver executable, resolved against the module-level
                          current_path (an absolute path is used as it is). When empty, no
                          executable_path is passed, so Selenium uses its own default
                          (normally a "geckodriver" found on PATH).
            user_agent:   when non-empty, overrides the browser's user agent string.
            load_images:  when False, sets the profile preference that stops images loading.
            is_headless:  value assigned to the Firefox options' headless flag.

        Returns:
            the webdriver.Firefox instance that was started.
    '''
    firefox_profile = webdriver.FirefoxProfile()

    # quiet profile: no flash plugin, zero media volume, no web notifications.
    firefox_profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', False)
    firefox_profile.set_preference("media.volume_scale", "0.0")
    firefox_profile.set_preference("dom.webnotifications.enabled", False)
    if user_agent:
        firefox_profile.set_preference("general.useragent.override", user_agent)
    if not load_images:
        firefox_profile.set_preference('permissions.default.image', 2)

    options = Options()
    options.headless = is_headless

    driver_kwargs = {'options': options, 'firefox_profile': firefox_profile}
    if gecko_driver:
        # Only point Selenium at an explicit executable when one was given; with an empty
        # name the joined path would be the directory itself, which cannot be launched.
        driver_kwargs['executable_path'] = os.path.join(current_path, gecko_driver)

    return webdriver.Firefox(**driver_kwargs)
    

In [6]:
def get_url(driver, page_url):
    '''
        Load page_url in the given driver, pause for config.page_load_timeout seconds so the
        page can finish loading, then dismiss the advertisement popup when one is present.
    '''
    driver.get(page_url)
    sleep(config.page_load_timeout)
    # an advertisement popup, when shown, is closed through its first '.close_popup' element.
    popup_close_buttons = driver.find_elements_by_css_selector('.close_popup')
    if popup_close_buttons:
        popup_close_buttons[0].click()

def _first_match(element, css_selector):
    '''Return the first descendant of element matching css_selector, or None when there is none.'''
    matches = element.find_elements_by_css_selector(css_selector)
    return matches[0] if matches else None


def _parse_price(price_text):
    '''Return the digits of price_text as an int, or 0 when the text contains no digit.'''
    # every non-digit character (currency code, separators, spaces) is dropped.
    # NOTE(review): a price range such as "100 - 200" therefore collapses into 100200;
    # confirm whether the crawled pages ever show ranges.
    digits = re.sub(r'\D', '', price_text)
    return int(digits) if digits else 0


def get_products_info_and_add_them_to_database(driver):
    '''
        Scrape every product card on the page currently loaded in driver, store each product
        in the Jumia_Products table and return the scraped data.

        For each product the following fields are collected:
        (product_title, product_link, current_price, old_price, discount_pircentage, discount_quantity).
        A missing title or link is stored as '' and a missing or digit-less price as 0. The two
        discount fields are non-zero only when both prices are known and old_price > current_price.

        Database side effects (through the module-level cur and conn):
            - a product whose title is not stored yet is inserted;
            - a stored product is updated when its current price or its old price changed;
            - every insert/update is committed immediately.
        All values are passed as query parameters, so titles containing quotes are stored safely.

        Returns:
            a list with one dictionary per product, keyed by the field names listed above.
    '''
    insert_sql = ('INSERT INTO Jumia_Products '
                  '(product_title, product_link, current_price, old_price, '
                  'discount_pircentage, discount_quantity) '
                  'VALUES (%s, %s, %s, %s, %s, %s)')
    select_sql = 'SELECT current_price, old_price FROM Jumia_Products WHERE product_title = %s'
    update_sql = ('UPDATE Jumia_Products SET current_price = %s, old_price = %s, '
                  'discount_pircentage = %s, discount_quantity = %s '
                  'WHERE product_title = %s')

    products_info = []
    for product in driver.find_elements_by_css_selector('div._4cl-3cm-shs article.c-prd'):

        title_element = _first_match(product, 'div.info h3.name')
        product_title = title_element.text if title_element is not None else ''

        link_element = _first_match(product, 'a.core')
        product_link = link_element.get_attribute('href') if link_element is not None else ''

        price_element = _first_match(product, 'div.info div.prc')
        current_price = _parse_price(price_element.text) if price_element is not None else 0

        old_price_element = _first_match(product, 'div.s-prc-w div.old')
        old_price = _parse_price(old_price_element.text) if old_price_element is not None else 0

        discount_pircentage = 0
        discount_quantity = 0
        if current_price != 0 and old_price != 0 and old_price > current_price:
            discount_quantity = old_price - current_price
            discount_pircentage = round((discount_quantity / old_price) * 100)

        # save the product data in a dictionary and append it to the products_info list.
        products_info.append({'product_title': product_title,
                              'product_link': product_link,
                              'current_price': current_price,
                              'old_price': old_price,
                              'discount_pircentage': discount_pircentage,
                              'discount_quantity': discount_quantity})

        # one lookup by title tells both whether the product is stored and what its prices are.
        # The match is done by the database, so it follows the column's collation rules.
        cur.execute(select_sql, (product_title,))
        stored_prices = cur.fetchone()

        if stored_prices is None:
            # unknown product: insert it.
            cur.execute(insert_sql, (product_title, product_link, current_price, old_price,
                                     discount_pircentage, discount_quantity))
            conn.commit()
        elif current_price != stored_prices[0] or old_price != stored_prices[1]:
            # known product whose current or old price changed: refresh prices and discounts.
            cur.execute(update_sql, (current_price, old_price, discount_pircentage,
                                     discount_quantity, product_title))
            conn.commit()

    return products_info

In [7]:
# Start Firefox with the configured geckodriver and user agent, then load the Jumia base URL.
driver = init_driver(gecko_driver=config.gecko_driver, user_agent=config.user_agent)
driver.get(config.jumia_base_url)

In [8]:
# For every configured category, visit result pages 2 through 5, store the products found
# on each page in the database and print how many products that page yielded.
for category in config.categories:
    category_url = '{}/{}'.format(config.jumia_base_url, category)
    for page in range(2, 6):
        page_url = '{}/?page={}'.format(category_url, page)
        get_url(driver, page_url)

        pruducts_info = get_products_info_and_add_them_to_database(driver)
        print(len(pruducts_info))


48
