Exemples d'URL à tester :
- https://www.zoom.us/fr/signup?ampDeviceId=59043737-0e3c-412d-af6b-91f85e29b81f&ampSessionId=1766618944644&_ics=1766618944230&irclickid=%7E16Z5-c5Z4-2451W023S0QHIypgmnkmdiea723SUSICxqpnea841R&_gl=1*tveo27*_gcl_au*MTc4NDg4MjQ0Ny4xNzY2NjE4OTQz#/signup
- https://maratp.ratp.fr/newsletter/
- https://www.facebook.com/r.php?entry_point=login&locale=fr_FR
- https://jobs.smartrecruiters.com/oneclick-ui/company/Devoteam/publication/08190f81-022c-4a32-94c9-b496b29d05a7?dcr_ci=Devoteam&sid=306448a8-a471-41bb-b86a-218ae712f877 (CAPTCHA - Impossible de récupérer le formulaire)

In [2]:
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


## Test de récupération de la page HTML brute.

In [3]:
# Timeout, in seconds, passed to the plain HTTP request.
TIMEOUT = 15

# Browser User-Agent strings; one of them is picked at random per request.
USER_AGENTS = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/33.0.1750.517 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/35.0.1916.47 Safari/537.36"
    ),
]

In [4]:
# Helpers used to retrieve the raw HTML of a page.

def create_driver() -> webdriver.Chrome:
    """Start a headless Chrome session driven by Selenium.

    The chromedriver path comes from ``ChromeDriverManager().install()``.

    Returns:
        A new ``webdriver.Chrome`` instance. This function does not close it;
        the caller has to call ``quit()``.
    """
    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(options=chrome_options, service=chrome_service)


# Case of pages built dynamically with JavaScript (form not reachable from the static source).
def load_main_page(driver: webdriver.Chrome, url: str, wait_seconds: int) -> str:
    """Open ``url`` and return the page source once a ``<body>`` element is present.

    Args:
        driver: Selenium Chrome driver used for the navigation.
        url: Address of the page to load.
        wait_seconds: Maximum number of seconds to wait for ``<body>``.

    Returns:
        The value of ``driver.page_source`` after the wait.
    """
    driver.get(url)

    body_is_present = ec.presence_of_element_located((By.TAG_NAME, "body"))
    WebDriverWait(driver, wait_seconds).until(body_is_present)

    return driver.page_source


# Iframe case.
def get_iframes(driver: webdriver.Chrome) -> list:
    """Return the ``<iframe>`` elements located by the driver (possibly an empty list)."""
    iframe_elements = driver.find_elements(By.TAG_NAME, "iframe")
    return iframe_elements


def load_iframe_html(driver: webdriver.Chrome, iframe) -> str:
    """Return the page source of ``iframe`` and switch back to the main document.

    Args:
        driver: Selenium Chrome driver currently showing the parent page.
        iframe: The iframe element to switch into.

    Returns:
        The value of ``driver.page_source`` read while inside the frame.
    """
    driver.switch_to.frame(iframe)
    try:
        return driver.page_source
    finally:
        # Always leave the frame, even when reading the source fails, so the
        # driver is not left stuck inside the iframe for the next call.
        driver.switch_to.default_content()


def fetch_html_with_selenium(url: str, wait_seconds: int = 10) -> str:
    """Render ``url`` in headless Chrome and return the HTML most likely to hold a form.

    The main document is returned when it contains ``<form``. Otherwise each
    iframe is inspected in turn and the first one containing ``<form`` is
    returned. When no form is found anywhere, the main document is returned.

    Args:
        url: Address of the page to render.
        wait_seconds: Maximum number of seconds to wait for the page body.

    Returns:
        The HTML source of the main document or of one of its iframes.
    """
    driver = create_driver()

    try:
        main_html = load_main_page(driver, url, wait_seconds)

        # Simple case: the form is in the main DOM.
        if "<form" in main_html.lower():
            return main_html

        # Otherwise look for a form inside the iframes.
        for iframe in get_iframes(driver):
            try:
                iframe_html = load_iframe_html(driver, iframe)
            except WebDriverException:
                # One unreadable frame must not discard the main HTML already
                # loaded: go back to the top-level document and try the next one.
                driver.switch_to.default_content()
                continue

            if "<form" in iframe_html.lower():
                return iframe_html

        return main_html

    finally:
        driver.quit()


In [5]:
# Smoke test: fetch the httpbin sample form through Selenium and print the HTML it returns.
url = "https://httpbin.org/forms/post"
print(fetch_html_with_selenium(url))

<html><head>
  </head>
  <body>
  <!-- Example form from HTML5 spec http://www.w3.org/TR/html5/forms.html#writing-a-form's-user-interface -->
  <form method="post" action="/post">
   <p><label>Customer name: <input name="custname"></label></p>
   <p><label>Telephone: <input type="tel" name="custtel"></label></p>
   <p><label>E-mail address: <input type="email" name="custemail"></label></p>
   <fieldset>
    <legend> Pizza Size </legend>
    <p><label> <input type="radio" name="size" value="small"> Small </label></p>
    <p><label> <input type="radio" name="size" value="medium"> Medium </label></p>
    <p><label> <input type="radio" name="size" value="large"> Large </label></p>
   </fieldset>
   <fieldset>
    <legend> Pizza Toppings </legend>
    <p><label> <input type="checkbox" name="topping" value="bacon"> Bacon </label></p>
    <p><label> <input type="checkbox" name="topping" value="cheese"> Extra Cheese </label></p>
    <p><label> <input type="checkbox" name="topping" value="onion

In [14]:
url = "https://httpbin.org/forms/post"

# Browser-like request headers; the User-Agent is drawn at random from USER_AGENTS.
headers = {
    "User-Agent": random.choice(USER_AGENTS),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
    "Connection": "close",
}


try:
    response = requests.get(url, headers=headers, timeout=TIMEOUT)
    response.raise_for_status()
    html = response.text
    html_lower = html.lower()
    # A response shorter than this many characters, or one without a <form>
    # tag, is fetched again through Selenium.
    limit_size = 1000
    if len(html) < limit_size or "<form" not in html_lower:
        html = fetch_html_with_selenium(url)
    print("Pas d'erreur : ", response.status_code, html)

except requests.RequestException as error:
    # The plain HTTP request failed: fall back to Selenium and report the real
    # error instead of a hard-coded 200 status that never came from a response.
    html = fetch_html_with_selenium(url)
    print("Avec erreur: ", error, html)


Pas d'erreur :  200 <!DOCTYPE html>
<html>
  <head>
  </head>
  <body>
  <!-- Example form from HTML5 spec http://www.w3.org/TR/html5/forms.html#writing-a-form's-user-interface -->
  <form method="post" action="/post">
   <p><label>Customer name: <input name="custname"></label></p>
   <p><label>Telephone: <input type=tel name="custtel"></label></p>
   <p><label>E-mail address: <input type=email name="custemail"></label></p>
   <fieldset>
    <legend> Pizza Size </legend>
    <p><label> <input type=radio name=size value="small"> Small </label></p>
    <p><label> <input type=radio name=size value="medium"> Medium </label></p>
    <p><label> <input type=radio name=size value="large"> Large </label></p>
   </fieldset>
   <fieldset>
    <legend> Pizza Toppings </legend>
    <p><label> <input type=checkbox name="topping" value="bacon"> Bacon </label></p>
    <p><label> <input type=checkbox name="topping" value="cheese"> Extra Cheese </label></p>
    <p><label> <input type=checkbox name="topp

## Test de la détection des formulaires

In [7]:
def has_form_tag(soup: BeautifulSoup) -> bool:
    """Tell whether the parsed document contains at least one ``<form>`` tag."""
    form_tags = soup.find_all("form")
    return bool(form_tags)


def count_form_tags(soup: BeautifulSoup) -> int:
    """Count the ``<form>`` tags found in the parsed document."""
    form_tags = soup.find_all("form")
    return len(form_tags)


def has_input_fields(soup: BeautifulSoup) -> bool:
    """Tell whether the document holds any ``input``, ``select`` or ``textarea`` tag."""
    field_tag_names = ["input", "select", "textarea"]
    return bool(soup.find_all(field_tag_names))


def has_action_button(soup: BeautifulSoup) -> bool:
    """Tell whether the document contains a control that looks like a submit button.

    A control counts as an action button when it is:
      * an ``<input>`` whose type is ``submit`` or ``image``;
      * a ``<button>`` explicitly typed ``submit``;
      * a ``<button>`` or an ``<input type="button">`` whose caption (text
        content, or ``value`` attribute for inputs) contains one of the
        submit/registration keywords.

    Args:
        soup: Parsed HTML document.

    Returns:
        True when at least one such control exists, False otherwise.
    """
    keywords = [
        "submit", "envoyer", "valider", "sign up", "register",
        "s'inscrire", "enregistrer", "s'enregistrer",
    ]
    submit_input_types = {"submit", "image"}

    for control in soup.find_all(["button", "input"]):
        control_type = (control.get("type") or "").strip().lower()

        if control.name == "input":
            if control_type in submit_input_types:
                return True
            if control_type != "button":
                # Text boxes, checkboxes, etc. are data fields, not buttons.
                continue
            caption = control.get("value") or ""
        else:
            if control_type == "submit":
                return True
            caption = control.get_text(" ")

        # Pages often use the typographic apostrophe ("s’inscrire"); align it
        # with the ASCII apostrophe used in the keyword list.
        caption = caption.lower().replace("\u2019", "'")
        if any(keyword in caption for keyword in keywords):
            return True

    return False


def has_placeholder_or_label(soup: BeautifulSoup) -> bool:
    """Tell whether any ``<input>`` is described by a placeholder or a ``<label for=...>``.

    Args:
        soup: Parsed HTML document.

    Returns:
        True as soon as one input has a non-empty ``placeholder`` attribute, or
        an ``id`` referenced by the ``for`` attribute of a ``<label>``.
    """

    def is_described(input_tag) -> bool:
        if input_tag.get("placeholder"):
            return True
        input_id = input_tag.get("id")
        # The label lookup only happens when the input actually has an id.
        return bool(input_id) and bool(soup.find("label", attrs={"for": input_id}))

    return any(is_described(input_tag) for input_tag in soup.find_all("input"))


def is_probable_form(soup: BeautifulSoup) -> bool:
    """Heuristic form detection for documents that may lack a ``<form>`` tag.

    The document is considered a probable form when it has input fields and,
    in addition, either an action button or a placeholder/label on an input.
    """
    if not has_input_fields(soup):
        return False
    return has_action_button(soup) or has_placeholder_or_label(soup)

In [15]:
soup = BeautifulSoup(html, "lxml")

form_present = has_form_tag(soup)
input_present = has_input_fields(soup)

# The heuristic is only consulted when no <form> tag was found.
probable_form = bool(not form_present and is_probable_form(soup))

# Keep, in this fixed order, the message of every check that succeeded.
reasons: list[str] = [
    message
    for is_met, message in (
        (form_present, "Balise HTML <form> detectée."),
        (input_present, "Champs input/select/textarea détecté."),
        (probable_form, "Structure de type formulaire détectée à l'aide d'heuristiques HTML."),
    )
    if is_met
]

dict_response = dict(
    has_form=form_present,
    forms_count=count_form_tags(soup),
    has_inputs=input_present,
    probable_form=probable_form,
    reasons=reasons,
)

print(dict_response)

{'has_form': True, 'forms_count': 1, 'has_inputs': True, 'probable_form': False, 'reasons': ['Balise HTML <form> detectée.', 'Champs input/select/textarea détecté.']}


## Extraction des noms de champs dans le HTML

In [16]:
# Field filtering rules. For now only textual fields are handled.

# Tag names treated as text-entry controls.
TEXTUAL_TAGS = {"input", "textarea"}

# Accepted values of the ``type`` attribute; None stands for a missing attribute.
ALLOWED_INPUT_TYPES = {None, "text", "email", "number", "tel", "date"}

# Values of the ``type`` attribute that are always rejected.
EXCLUDED_INPUT_TYPES = {"hidden", "password", "submit", "button", "reset", "image", "file"}

SYSTEM_TOKENS = {
    "captcha",
    "token",
    "csrf",
    "gtm",
    "tracking",
    "session",
    "analytics",
    "pixel",
    "fb",
    "ga",
}


def _contains_system_token(text: str | None) -> bool:
    if not text:
        return False
    text = text.lower()
    return any(token in text for token in SYSTEM_TOKENS)


def is_user_fillable_field(element, label: str | None) -> bool:
    """Tell whether ``element`` is a textual field a user is expected to fill in.

    The element is accepted when all of the following hold:
      * its tag is listed in ``TEXTUAL_TAGS``;
      * its ``type`` (case-insensitive, missing allowed) is not in
        ``EXCLUDED_INPUT_TYPES`` and is in ``ALLOWED_INPUT_TYPES``;
      * neither its ``name`` nor its ``id`` contains a system token;
      * it can be identified by a ``name``, a label or a ``placeholder``.

    Args:
        element: Parsed tag exposing ``.name`` and ``.get(attribute)``.
        label: Label already extracted for the element, if any.

    Returns:
        A real boolean. The original code returned the name/label/placeholder
        string itself, or None.
    """
    if element.name not in TEXTUAL_TAGS:
        return False

    raw_type = element.get("type")
    field_type = raw_type.strip().lower() if raw_type else None

    if field_type in EXCLUDED_INPUT_TYPES:
        return False

    if field_type not in ALLOWED_INPUT_TYPES:
        return False

    if _contains_system_token(element.get("name")):
        return False
    if _contains_system_token(element.get("id")):
        return False

    # Without any identifying text the field cannot be matched later on.
    return bool(element.get("name") or label or element.get("placeholder"))


# Simple case: <label for='id'>
# <label for="email">Adresse e-mail</label>
# <input type="email" id="email" name="user_email" placeholder="email@email.com">
def label_from_for_attribute(field, soup: BeautifulSoup) -> str | None:
    """Return the text of the ``<label>`` whose ``for`` attribute equals the field id.

    Returns None when the field has no id or when no such label exists.
    """
    target_id = field.get("id")
    matching_label = soup.find("label", attrs={"for": target_id}) if target_id else None
    return matching_label.get_text(strip=True) if matching_label else None


# Cas lorsque l'input est imbriqué dans le <label>
# <label>
#   Adresse e-mail
#   <input type="email">
# </label>
def label_from_parent_label(field) -> str | None:
    parent_label = field.find_parent("label")
    if parent_label:
        return parent_label.get_text(strip=True)
    return None


# Cas avec un attribut plpaceholder.
# <input placeholder="Votre adresse email">
def label_from_placeholder(field) -> str | None:
    return field.get("placeholder")


# Cas difficile, où le nom du champ est dans le "parent", quelque soit la balise.
def label_from_nearby_text(field) -> str | None:
    parent = field.parent
    if not parent:
        return None

    limit_text = 60
    text = parent.get_text(" ", strip=True)
    if text and len(text) < limit_text:
        return text
    return None


# Try to extract a label for a form field.
def extract_label_for_field(field, soup: BeautifulSoup) -> str | None:
    """Return the first non-empty label found for ``field``.

    Strategies are tried in this order, each one only if the previous ones
    gave nothing: ``<label for=...>``, enclosing ``<label>``, ``placeholder``
    attribute, short text of the parent element.

    Returns:
        The label text, or None when every strategy fails.
    """
    found_label = (
        label_from_for_attribute(field, soup)
        or label_from_parent_label(field)
        or label_from_placeholder(field)
        or label_from_nearby_text(field)
    )
    # An empty result from the last strategy is reported as None.
    return found_label or None


def clean_label(label: str | None) -> str | None:
    if not label:
        return None

    label = " ".join(label.split())
    limit_text = 80
    if len(label) > limit_text:
        return None

    return label


In [17]:
soup = BeautifulSoup(html, "lxml")
fields = []

elements = soup.find_all(["input", "select", "textarea"])

for element in elements:
    raw_label = extract_label_for_field(element, soup)
    label = clean_label(raw_label)
    if not is_user_fillable_field(element, label):
        continue
    fields.append(
        dict(
            tag=element.name,
            type=element.get("type"),
            name=element.get("name"),
            id=element.get("id"),
            placeholder=element.get("placeholder"),
            # Store the cleaned label already computed above instead of
            # extracting the raw one a second time.
            label=label,
            aria_label=element.get("aria-label"),
        )
    )
print(fields)

[{'tag': 'input', 'type': None, 'name': 'custname', 'id': None, 'placeholder': None, 'label': 'Customer name:', 'aria_label': None}, {'tag': 'input', 'type': 'tel', 'name': 'custtel', 'id': None, 'placeholder': None, 'label': 'Telephone:', 'aria_label': None}, {'tag': 'input', 'type': 'email', 'name': 'custemail', 'id': None, 'placeholder': None, 'label': 'E-mail address:', 'aria_label': None}, {'tag': 'textarea', 'type': None, 'name': 'comments', 'id': None, 'placeholder': None, 'label': 'Delivery instructions:', 'aria_label': None}]


## Faire le matching champs HTML <-> données de l'utilisateur

In [18]:

# Synonym dictionary: each user_data key is mapped to the tokens that identify it.
# Order matters: match_field_to_user_key walks the keys, then the tokens, in the
# order written here and returns on the first token found in the field text.
SYNONYMS = {
    "email": ["email", "e mail", "mail", "courriel", "emailaddress", "email_address", "adresse_mail"],
    "phone": ["phone", "tel", "telephone", "téléphone", "mobile", "gsm", "cell", "cellphone", "phonenumber", "phone_number"],
    "first_name": ["first_name", "firstname", "fname", "prenom", "prénom", "givenname", "given_name", "forename"],
    "last_name": ["last_name", "lastname", "lname", "nom", "surname", "familyname", "family_name", "nom de famille"],
    "full_name": ["name", "fullname", "full_name", "nomcomplet", "nom_complet"],
    "street": ["street", "rue", "road", "voie", "address1", "address_1"],
    "street_number": ["street_number", "number", "numero", "num"],
    "postal_code": ["zip", "zipcode", "zip_code", "postal", "postalcode", "postcode", "codepostal", "code_postal"],
    "city": ["city", "ville", "town", "commune"],
    "country": ["country", "pays", "nation"],
    "address": ["address", "adresse", "fulladdress", "full_address", "billingaddress", "shippingaddress"],
    "company": ["company", "societe", "société", "enterprise", "organisation", "organization"],
    "birth_date": ["birthdate", "birth_date", "dob", "dateofbirth", "date_naissance", "datedenaissance"],
    "gender": ["gender", "sexe", "sex", "civility", "civilite", "title"],
    "birth_day": ["day", "jour", "birthday_day", "birth_day", "dayofbirth", "jour_naissance"],
    "birth_month": ["month", "mois", "birthday_month", "birth_month", "monthofbirth", "mois_naissance"],
    "birth_year": ["year", "annee", "année", "birthday_year", "birth_year", "yearofbirth", "annee_naissance"],
    "age": ["age", "âge", "years", "ans", "birthday_age"],

}


# Nettoyage du texte pour comparaison
def _normalize(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-z0-9_àâçéèêëîïôûùüÿñæœ\s-]+", " ", text)
    text = text.replace("-", " ")
    return " ".join(text.split())


def _field_text(field) -> str:
    """Build the normalized text used to match a field against the synonyms.

    Only descriptive attributes are used: name, id, placeholder, label and,
    when present, aria_label. The tag and the input type are left out on
    purpose: with them, every ``type="number"`` input contained the word
    "number" and was matched to the street-number synonyms.

    Args:
        field: Mapping describing an extracted field; missing or None values
            are treated as empty strings.

    Returns:
        The normalized, space-separated text.
    """
    descriptive_keys = ("name", "id", "placeholder", "label", "aria_label")
    return _normalize(" ".join(field.get(key) or "" for key in descriptive_keys))


In [19]:

# Fonction principale de correspondance
def match_field_to_user_key(field) -> tuple[str | None, float, str]:

    # Priorité au tyepe du champ
    field_type = field.get("type")
    if field_type:
        field_type = field_type.lower()

        if field_type == "email":
            return "email", 1.0, "Matched by input type=email"

        if field_type in {"tel", "phone"}:
            return "phone", 0.95, f"Matched by input type={field_type}"

        if field_type == "password":
            return None, 0.0, "Password field ignored"

        if field_type in {"date"}:
            return "birth_date", 0.9, "Matched by input type=date"

        if field_type in {"number"}:
            # peut être âge ou code postal → on laisse aux tokens
            pass

    # match par tokens
    blob = _field_text(field)
    if "email" in blob and "mobile" in blob:
        return "email", 0.95, "Matched by combined email/phone label"

    for key, tokens in SYNONYMS.items():
        for token in tokens:
            token_norm = _normalize(token)
            if token_norm and token_norm in blob:
                return (
                    key,
                    0.9,
                    f"Matched by token '{token}' in field attributes",
                )

    return None, 0.0, "No match found"


In [20]:
# Print the (key, confidence, reason) tuple obtained for each extracted field.
for field in fields:
    print(match_field_to_user_key(field))

('full_name', 0.9, "Matched by token 'name' in field attributes")
('phone', 0.95, 'Matched by input type=tel')
('email', 1.0, 'Matched by input type=email')
(None, 0.0, 'No match found')


## Auto-complétion

In [28]:
# Create a sample user; keys follow the user-data keys used for matching.
USER_DATA = dict(
    first_name="Amel",
    last_name="Cherbi",
    full_name="Amel Cherbi",
    email="amel@mail.com",
    phone="0612345678",
    city="Paris",
    address="10 rue de Rivoli",
    postal_code="75001",
    country="France",
)


In [29]:
# Helpers required for form auto-completion.

def make_driver():
    """Start the headless Chrome session used to fill forms.

    The browser is started with a 1280x900 window and a page-load timeout of
    25 seconds. The chromedriver path comes from
    ``ChromeDriverManager().install()``.

    Returns:
        The configured ``webdriver.Chrome`` instance; this function does not
        close it.
    """
    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1280,900",
    ):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
    driver.set_page_load_timeout(25)
    return driver


def get_user_value(user_data: dict, key: str) -> str | None:
    value = user_data.get(key)
    if value is None:
        return None
    return str(value)


def find_element(driver, field: dict):
    """Locate the live element matching an extracted field description.

    The lookup is attempted by ``name`` first, then by ``id``. An attribute
    that is missing or empty is skipped, and any error raised by a lookup
    moves on to the next attribute.

    Args:
        driver: Selenium driver showing the page.
        field: Mapping holding optional "name" and "id" entries.

    Returns:
        The element found, or None when both lookups fail.
    """
    for attribute in ("name", "id"):
        locator_value = field.get(attribute)
        if not locator_value:
            continue
        try:
            strategy = By.NAME if attribute == "name" else By.ID
            return driver.find_element(strategy, locator_value)
        except Exception:
            continue

    return None


def fill_fields(driver, fields: list[dict], user_data: dict) -> dict:
    """Type the user's data into every field that can be matched and located.

    A field is skipped, with a reason, when no user-data key matches it, when
    the user has no value for that key, when the element cannot be found in
    the page, or when typing into it fails. A failure on one field no longer
    aborts the run: the remaining fields are still processed.

    Args:
        driver: Selenium driver showing the form.
        fields: Extracted field descriptions.
        user_data: Values to type, indexed by user-data key.

    Returns:
        ``{"filled": [...], "skipped": [...]}``. Each filled entry holds
        "key", "value", "confidence" and "field"; each skipped entry holds
        "field" and "reason".
    """
    filled, skipped = [], []

    for field in fields:
        key, conf, reason = match_field_to_user_key(field)

        if not key:
            skipped.append({"field": field, "reason": reason})
            continue

        value = get_user_value(user_data, key)
        if not value:
            skipped.append({"field": field, "reason": f"Aucune donnée utilisateur pour {key}"})
            continue

        el = find_element(driver, field)
        if el is None:
            skipped.append({"field": field, "reason": "Element non trouvé dans le DOM"})
            continue

        try:
            try:
                el.clear()
            except WebDriverException:
                # Best effort: typing is still attempted when clearing fails.
                pass

            el.send_keys(value)

            # Mimic a real user: notify the page scripts that the value changed.
            driver.execute_script(
                "arguments[0].dispatchEvent(new Event('input', {bubbles:true}));"
                "arguments[0].dispatchEvent(new Event('change', {bubbles:true}));",
                el,
            )
        except WebDriverException as error:
            # Hidden, disabled or stale element: record it and keep going.
            skipped.append({"field": field, "reason": f"Saisie impossible : {error}"})
            continue

        filled.append({
            "key": key,
            "value": value,
            "confidence": conf,
            "field": field
        })

    return {"filled": filled, "skipped": skipped}


In [None]:
# End-to-end test: form field extraction followed by form filling.

TEST_URL = "https://httpbin.org/forms/post"

# The driver is kept open on purpose: the following cells fill the form and
# read the values back in the same browser session.
driver = make_driver()
driver.get(TEST_URL)

html = driver.page_source

# Same extraction logic as the one already implemented earlier in this notebook.
soup = BeautifulSoup(html, "lxml")
fields = []

elements = soup.find_all(["input", "select", "textarea"])

for element in elements:
    raw_label = extract_label_for_field(element, soup)
    label = clean_label(raw_label)

    # Keep only the textual fields a user is expected to fill in.
    if not is_user_fillable_field(element, label):
        continue

    fields.append({
        "tag": element.name,
        "type": element.get("type"),
        "name": element.get("name"),
        "id": element.get("id"),
        "placeholder": element.get("placeholder"),
        "label": label,
    })

print(f"{len(fields)} champs détectés")


4 champs détectés


In [31]:
# Fill the form in the open browser session, then summarize what happened.
result = fill_fields(driver, fields, USER_DATA)

# One line per filled field: matched user-data key and typed value.
print("✅ Champs remplis :")
for f in result["filled"]:
    print(f["key"], "→", f["value"])

# One line per skipped field: only the reason is shown.
print("\n⏭️ Champs ignorés :")
for s in result["skipped"]:
    print("-", s["reason"])

✅ Champs remplis :
full_name → Amel Cherbi
phone → 0612345678
email → amel@mail.com

⏭️ Champs ignorés :
- No match found


In [32]:
# Read back the current value of every named field to check what was typed.
for field in fields:
    name = field.get("name")
    if not name:
        continue
    try:
        el = driver.find_element(By.NAME, name)
        print(name, "=", el.get_attribute("value"))
    except WebDriverException as error:
        # This is a verification step: report the failure instead of hiding it.
        print(name, "-> lecture impossible :", type(error).__name__)


custname = Amel Cherbi
custtel = 0612345678
custemail = amel@mail.com
comments = 


In [33]:
# Close the browser session opened for the end-to-end test.
driver.quit()
