## Imports & functions

In [19]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [20]:
import logging
import os
import re
from datetime import date, datetime
from itertools import product

import numpy as np
import pandas as pd
import yaml

from scraping_class import ScrapingClass
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

In [9]:
# HERE = os.path.realpath(os.path.dirname(__file__)) # python script
HERE = os.path.realpath(os.getcwd()) # notebook

class ScrapingEnergieInfo:
    def __init__(self):

        with open(
            os.path.join(
                HERE,
                f"config_files/settings_mediateurenergie_dev.yaml",
            ),
            "r",
        ) as ymlfile:
            self.settings = yaml.safe_load(ymlfile)

        self.today = str(datetime.today().date())
        # try:
        #     self.params = pd.read_csv(f"all_params_{self.today}.csv")
        # except:
        #     print("no params found, starts from scratch")
        #     self.params = pd.DataFrame(columns=['today', 'profile', 'energy_type', 'options'])

        self.sc = ScrapingClass()  # config["driver_path"]
        self.url = self.settings["url"]

        self.options_profile = self.settings["options"]["profile"]
        self.options_energy_type = self.settings["options"]["energy type"]
        self.options_elec_consumption_type = self.settings["options"][
            "elec consumption type"
        ]
        self.options_elec_zip_code = self.settings["options"]["zip code"]["elec"]
        self.options_gas_zip_code = self.settings["options"]["zip code"]["gas"]
        self.options_counter_power = self.settings["options"]["counter power"]
        self.options_gas_consumption = self.settings["options"]["gas consumption"]

        self.all_elec_offers_list = []
        self.all_gas_offers_list = []
        self.all_elec_offers_df = pd.DataFrame()
        self.all_gas_offers_df = pd.DataFrame()
        
        self.current_option = None

    def scrap_comparateur_offre(self, profile, energy_type):
        """
        Running function
        """
        print(f'Scraping profile : {profile} - energy_type : {energy_type}')
        self.profile = profile
        self.energy_type = energy_type
        self.sc.get(self.url)
        self.choose_profile()

        if self.energy_type == "elec":
            options = list(
                product(
                    self.options_elec_consumption_type,
                    self.options_counter_power,
                    self.options_elec_zip_code,
                )
            )
            for ind, option in enumerate(options, start = 1):
                self.current_option = option
                print(f"{profile} - {energy_type} : {ind:02d}/{len(options)} - {option}")
                self.elec_consumption_type = option[0]
                self.counter_power = option[1]
                self.zip_code = option[2]

                self.main_run()
                break

        elif energy_type == "gas":
            options = list(
                product(self.options_gas_consumption, self.options_gas_zip_code)
            )
            for ind, option in enumerate(options, start = 1):
                self.current_option = option
                print(f"{profile} - {energy_type} : {ind:02d}/{len(options)} - {option}")
                self.gas_consumption = option[0]
                self.zip_code = option[1]

                self.main_run()
                break

        # self.cleaning_result()

        #  try:
        #    self.all_offers.to_gbq("team_data.mediateur_national_energie_offers", project_id=settings.GCLOUD_PROJECT, if_exists='append')
        #  except:
        #    pass

        return self.all_offers

    def choose_profile(self):
        """
        Change the profile of the user (private/professional)
        """
        if self.profile == "professional":
            try:
                self.sc.click_xpath(self.settings["xpath"]["cookie banner"])
            except Exception as e:
                log.info(f"Exception clicking cookies : {e}")
                pass
            sliders = self.sc.find_elements_by_xpath(self.settings["xpath"]["profile slider"])
            for el in sliders:
                try:
                    el.click()
                    break
                except:
                    continue
            
            self.sc.click_css_selector(
                self.settings["selector"]["profile reset button"]
            )
        else:
            pass

    def first_page(self):
        """
        Goes through the first page of the website selecting options from settings file
        """

        try:
            self.sc.click_xpath(self.settings["xpath"]["cookie banner"])
        except Exception as e:
                print(f"Exception clicking cookies : {e}")
                pass
        self.sc.send_xpath(self.settings["xpath"]["zip code"], self.zip_code)
        self.sc.wait(1, 2)
        self.sc.click_xpath(self.settings["xpath"]["city"])
        self.sc.click_xpath(self.settings["xpath"]["energy type"][self.energy_type])

        self.sc.click_xpath(self.settings["xpath"]["next button"])

    def second_page(self):
        """
        Goes through the second page of the website selecting options from settings file
        """
        if self.energy_type == "elec":
            self.sc.click_xpath(self.settings["xpath"]["linky counter"])
            for elmt in ['has contract', 'know prm']:
                try:
                    self.sc.click_xpath(self.settings["xpath"][elmt])
                except:
                    print(f'element not found page 2: {elmt}')

            if self.profile == "private":
                self.sc.click_xpath(
                    self.settings["xpath"]["elec consumption knowPower"]
                )
            self.sc.click_xpath(
                "//select[@id='elec_consumption_power']//option[normalize-space()='"
                + str(self.counter_power)
                + " kVA']"
            )
            self.sc.click_xpath(
                self.settings["xpath"]["elec consumption type"][
                    self.elec_consumption_type
                ]
            )
            self.sc.click_xpath(self.settings["xpath"]["elec consumption knowConso"])
        elif self.energy_type == "gas":
            self.sc.click_xpath(self.settings["xpath"]["gas consumption knowPower"])
            self.sc.send_xpath(
                self.settings["xpath"]["gas consumption"], self.gas_consumption
            )
            self.sc.click_keyboard_enter()

        self.sc.click_css_selector(self.settings["selector"]["submit button"])

    def third_page(self):
        """
        Goes through the third page of the website selecting options from settings file
        """
        self.sc.click_xpath(self.settings["xpath"]["sorted by provider"])
        if self.profile == "professional":
            self.sc.click_xpath(self.settings["xpath"]["displayed price"])
        self.sc.click_css_selector(self.settings["selector"]["submit button"])

    def main_run(self):
        """
        Main running function for a unique combination of options
        """
        # param_checked, params = self.check_params()
        # if param_checked:
        #     self.new_simulation()
        #     print(f"params already used ({' - '.join([param for param in params])})")
        #     return
        
        try:
            self.first_page()
            self.second_page()
            self.third_page()
        except Exception as e:
            print(f"Exception in main_run : {e}")
            pass

## Init scraper

In [11]:
scraper = ScrapingEnergieInfo()

In [13]:
scraper.scrap_comparateur_offre(profile='private',energy_type='elec')

Scraping profile : private - energy_type : elec
private - elec : 01/2 - ('base', 9, 13008)


AttributeError: 'ScrapingEnergieInfo' object has no attribute 'all_offers'

## Tests

In [37]:
import requests
import re

In [28]:
# Collect cookies
cookies = {}
for cookie in scraper.sc.driver.get_cookies():
    cookies.update({cookie['name']:cookie['value']})

# Retrieve token
soup = BeautifulSoup(scraper.sc.driver.page_source, 'html.parser')
token_elmt = soup.find(id='offer_filters__token')
token_filter = token_elmt.get('value')

In [35]:
session = requests.session()
results_list_html = []
last = False
ind = 1
headers = {
    'accept': '*/*',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'priority': 'u=1, i',
    'referer': 'https://comparateur-offres.energie-info.fr/',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

params = {
    's': '',
    'offer_filters[fullGreen]': '0',
    'offer_filters[fullOnLine]': '0',
    'offer_filters[period]': '12',
    'offer_filters[linky]': '0',
    'offer_filters[sortBy]': '3',
    'offer_filters[sortDirection]': 'ASC',
    'offer_filters[_token]': token_filter,
}

while last is False:
    url_filtered_results = f'https://comparateur-offres.energie-info.fr/results/{ind}'
    print(url_filtered_results)
    response = session.get(url_filtered_results, params=params, cookies=cookies, headers=headers)
    
    if response.json().get('lastPage'):
        last = True

    results_list_html.append(response.json().get('html'))
    ind+=1

https://comparateur-offres.energie-info.fr/results/1
https://comparateur-offres.energie-info.fr/results/2
https://comparateur-offres.energie-info.fr/results/3
https://comparateur-offres.energie-info.fr/results/4


In [43]:
soup_test = BeautifulSoup(results_list_html[0], 'html.parser')
offers = soup_test.find_all('div', class_ ='offre offer')
offers_ids = [offer.get('data-id') for offer in offers]
offers_ids

['12419',
 '13676',
 '12984',
 '13903',
 '13303',
 '13883',
 '13688',
 '13690',
 '14081',
 '13681',
 '6887',
 '13689',
 '13684',
 '13686',
 '6534']

In [45]:
soup_test

 <div class="anchor" id="offer-12419"></div>
<div class="offre offer" data-id="12419">
<header class="offre-header">
<div class="offre-price">
<span class="total">
                        Total :
                        <span>
                            1 081
                            
                            €
                        </span>
</span>
</div>
</header>
<main>
<article class="line">
<div class="type">
<i class="icon-electricite">
<svg id="prefix__icone-electricite" viewbox="0 0 40.901 75" xmlns="http://www.w3.org/2000/svg">
<path d="M41.519 29.388a1.7 1.7 0 00-1.7-1.7H26.257l8.357-25.033-.019-.008a1.651 1.651 0 00.107-.531A1.7 1.7 0 0033 .411H15.957a1.688 1.688 0 00-1.6 1.173l-.021-.008L.705 42.486l.019.008a1.651 1.651 0 00-.107.531 1.7 1.7 0 001.7 1.7h15.144l-3.2 28.79h.026a1.792 1.792 0 00-.036.186 1.7 1.7 0 001.7 1.7 1.657 1.657 0 001.514-1l.032.015L41.362 30.1h-.008a1.655 1.655 0 00.165-.712z" data-name="Tracé 5" id="prefix__Tracé_5" transform="translate(-.618 

In [57]:
# Update cookies
cookies = {}
for cookie in scraper.sc.driver.get_cookies():
    cookies.update({cookie['name']:cookie['value']})

url_detail = 'https://comparateur-offres.energie-info.fr/detail/'
offer = offers_ids[0]

res = requests.get(url = url_detail+offer, cookies=cookies, headers=headers)

In [60]:
soup_offer = BeautifulSoup(res.content, 'html.parser')

In [None]:
result_row_selector = "//*[@id='electricite']//table/tbody/tr"

In [69]:
row.name

'tr'

In [73]:
result_rows = soup_offer.find(id = 'electricite').find_all('tr')

for row in result_rows:
    children = row.children
    if any([child.name == "td" for child in children]):
        print('found')
        break

found


In [79]:
offer = {}
result_rows = soup_offer.find(id = 'electricite').find_all('tr')

for row in result_rows:
## Add additionnal keys from rows in result based on yaml settings for results columns
    children = row.find_elements(By.XPATH, ("*"))

    if any([child.name == "td" for child in children]): # Filter rows that have no values
        
        key = row.find('th').text
        value = row.find('td').text

        if key in scraper.settings['results']['columns']: # Only keeps desired columns
            offer[key] = value
offer


TypeError: 'NoneType' object is not callable

---

## Restart

In [10]:
scraper.sc.quit()