## LEKOUNDA NGOLO Mardochet Gédéon 
### COOP Msc |DATA SCIENCE
### FINAL EXAMEN : DATA COLLECTION 
### Date : 09-12-2025

#### DATABASE CONFIGURATION

In [1]:
import sqlite3
import pandas as pd
from requests import get
from bs4 import BeautifulSoup as bs

In [18]:
DB_PATH = "dog_annonces.db"

def setup_database(db_name="dog_annonces.db"):
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    
    # Création de la table
    c.execute('''
        CREATE TABLE IF NOT EXISTS announces_dogs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            price INTEGER,
            location TEXT,
            image_url TEXT,
            product_url TEXT UNIQUE
        )
    ''')
    
    conn.commit()
    return conn

### SCRAPPING WEB PAGES WITH BeautifulSoup

In [None]:

def scrape_page(page_number):
    url = f'https://sn.coinafrique.com/categorie/chiens?page={page_number}'
    res = get(url)
    soup = bs(res.content, "html.parser")

    containers = soup.select("div.card.ad__card") 

    data = []

    for container in containers:
        try:
            # extract listing link
            a_tag = container.find("a", href=True)
            if not a_tag:
                continue

            product_url = "https://sn.coinafrique.com" + a_tag["href"]

            # open detail page
            detail_res = get(product_url)
            soup_detail = bs(detail_res.content, 'html.parser')

            # title
            title_tag = soup_detail.find("h1")
            title = title_tag.get_text(strip=True) if title_tag else None

            # price
            price_tag = container.select_one("p.ad__card-price a")
            if price_tag:
                # Supprime 'CFA' et les espaces
                price_raw = price_tag.get_text(strip=True)
                price = price_raw.replace("CFA", "").replace(" ", "")
            else:
                price = None

            # location
            loc_span = container.select_one("p.ad__card-location span")
            location = loc_span.get_text(strip=True) if loc_span else None

            # images
            img_tag = soup_detail.find("img", class_="ad__card-img")
            if not img_tag:
                img_tag = soup_detail.find("img")
            image_link = img_tag["src"] if img_tag else None

            data.append({
                "title": title,
                "price": price,
                "location": location,
                "image": image_link,
                "url": product_url
            })

        except Exception as e:
            print("Error:", e)
            continue

    return data


In [13]:
df = pd.DataFrame()
for page in range(1, 3):
    print(f"Scraping page {page}...")
    page_data = scrape_page(page)
    if len(page_data) == 0:
        print("Probably empty page.")
        break
    df_page = pd.DataFrame(page_data)
    df = pd.concat([df, df_page], axis=0).reset_index(drop=True)
print("Scraping completed!") # For test need I scrapping only 4 pages#df.to_csv("coinafrique_apartments_raw.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping completed!


In [14]:
df 

Unnamed: 0,title,price,location,image,url
0,Berger malinois charbonner,250,"Thies, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/berg...
1,Chiot Bichon Maltais,100000,"Almadies, Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
2,Chiots Labradors Retriever,300000,"Guediawaye, Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
3,Chiot Malinois,200000,"Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
4,Chiot Berger Malinois,150000,"Almadies 2, Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
...,...,...,...,...,...
163,Chiot Berger Allemand,130000,"Niaga, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
164,Chiot Rottweiler,250,"Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
165,Chiots labrador pure race,Prixsurdemande,"Mbao, Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
166,Chiots Bichon Maltais,Prixsurdemande,"Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...


### SCRAPPING WEB PAGES WITH BeautifulSoup (optimized version with sqlite)

In [None]:

def scrape_page_optimized(page_number):
    """
    Scrape la page de liste pour les annonces de chiens, en extrayant les données
    directement des cartes pour optimiser les performances.
    """
    base_url = "https://sn.coinafrique.com"
    list_url = f'{base_url}/categorie/chiens?page={page_number}'
    
    print(f"Scrapping de la page : {list_url}")
    
    try:
        res = get(list_url)
        res.raise_for_status() # raise bad requests HTTP code
    except Exception as e:
        print(f"Erreur lors de la requête de la page {page_number}: {e}")
        return []

    soup = bs(res.content, "html.parser")
    
    # REAL CSS CLASS FOR EACH AD PREVIEW
    containers = soup.select("div.card.ad__card") 
    print(f"Trouvé {len(containers)} annonces sur la page.")
    
    data = []

    for container in containers:
        try:
            # 1. Product Link & URL
            a_tag = container.find("a", href=True)
            if not a_tag:
                continue

            product_url = base_url + a_tag["href"]

            # 2. Title / Description
            # Found in <p class="ad__card-description">
            title_tag = container.select_one("p.ad__card-description a")
            title = title_tag.get_text(strip=True) if title_tag else None
            
            # 3. Price
            # Found in <p class="ad__card-price">
            price_tag = container.select_one("p.ad__card-price a")
            if price_tag:
                # delete 'CFA' and spaces
                price_raw = price_tag.get_text(strip=True)
                price = price_raw.replace("CFA", "").replace(" ", "")
            else:
                price = None

            # 4. Location (Adresse complète)
            # Found in <p class="ad__card-location">
            loc_span = container.select_one("p.ad__card-location span")
            location = loc_span.get_text(strip=True) if loc_span else None

            # 5. First Main Image
            # Found in <img class="ad__card-img">
            img_tag = container.find("img", class_="ad__card-img")
            image_link = img_tag["src"] if img_tag and "src" in img_tag.attrs else None

            data.append({
                "title": title,
                "price": price,
                "location": location,
                "image": image_link,
                "url": product_url
            })

        except Exception as e:
            # Affiche l'erreur mais continue le scrapping
            print(f"Erreur lors de l'extraction d'un élément: {e}")
            continue
            
    return data

In [7]:
data_page_1 = scrape_page_optimized(1)
print(data_page_1) # To verify the scraped data


Scrapping de la page : https://sn.coinafrique.com/categorie/chiens?page=1
Trouvé 84 annonces sur la page.
[{'title': 'Berger malinois charbonner', 'price': '250', 'location': 'Thies, Sénégal', 'image': 'https://images.coinafrique.com/thumb_5649558_uploaded_image1_1764572984.jpg', 'url': 'https://sn.coinafrique.com/annonce/chiens/berger-malinois-charbonner-5649558'}, {'title': 'Chiot Bichon Maltais', 'price': '100000', 'location': 'Almadies, Dakar, Sénégal', 'image': 'https://images.coinafrique.com/thumb_5644579_uploaded_image1_1764272051.jpg', 'url': 'https://sn.coinafrique.com/annonce/chiens/chiot-bichon-maltais-5644579'}, {'title': 'Chiots Labradors Retriever', 'price': '300000', 'location': 'Guediawaye, Dakar, Sénégal', 'image': 'https://images.coinafrique.com/thumb_5640905_uploaded_image1_1764096718.jpg', 'url': 'https://sn.coinafrique.com/annonce/chiens/chiots-labradors-retriever-5640905'}, {'title': 'Chiot Malinois', 'price': '200000', 'location': 'Dakar, Sénégal', 'image': 'http

In [None]:
# --- 3. Main Function
def main_scraper(start_page=1, end_page=5):
    """
    Gère le flux de scraping, l'insertion dans la base de données et la fermeture.
    """
    # 1. DB configuration
    conn = setup_database()
    c = conn.cursor()
    
    total_inserted = 0
    total_skipped = 0
    
    for page in range(start_page, end_page + 1):
        # 2. Pages scraping
        ads_data = scrape_page_optimized(page)
        
        # 3. Data Insertion
        for ad in ads_data:
            # Using the IGNORE clause to avoid duplicates
            # (based on the 'product_url' column, which is UNIQUE)
            try:
                c.execute(
                    "INSERT OR IGNORE INTO announces_dogs (title, price, location, image_url, product_url) VALUES (:title, :price, :location, :image, :url)",
                    ad
                )
                if c.rowcount > 0:
                    total_inserted += 1
                else:
                    total_skipped += 1
                    
            except sqlite3.Error as e:
                print(f"Erreur SQLite lors de l'insertion: {e}")
        
        # saves changes for each pages
        conn.commit()
        
    print("\n" + "="*50)
    print(f"Scraping finished on {end_page - start_page + 1} pages.")
    print(f"Total inserted announces : {total_inserted}")
    print(f"Total ignored announces : {total_skipped}")
    
    # 4. close connexion
    conn.close()

In [25]:
# --- Execution ---
if __name__ == "__main__":
    main_scraper(start_page=1, end_page=5)

Scrapping de la page : https://sn.coinafrique.com/categorie/chiens?page=1
Trouvé 84 annonces sur la page.
Scrapping de la page : https://sn.coinafrique.com/categorie/chiens?page=2
Trouvé 84 annonces sur la page.
Scrapping de la page : https://sn.coinafrique.com/categorie/chiens?page=3
Trouvé 84 annonces sur la page.
Scrapping de la page : https://sn.coinafrique.com/categorie/chiens?page=4
Trouvé 84 annonces sur la page.
Scrapping de la page : https://sn.coinafrique.com/categorie/chiens?page=5
Trouvé 84 annonces sur la page.

Scraping finished on 5 pages.
Total inserted announces : 420
Total ignored announces : 0


### Scrapes data v2 for moutons_annonces

In [26]:
# --- 1. FONCTION DE CRÉATION DE LA BASE DE DONNÉES ---
def setup_database2(db_name="moutons_annonces.db"):
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    
    # Création de la table (maintenant pour les moutons)
    c.execute('''
        CREATE TABLE IF NOT EXISTS announces_moutons (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,
            price INTEGER,        
            address TEXT,         
            image_url TEXT,       
            product_url TEXT UNIQUE
        )
    ''')
    
    conn.commit()
    return conn

In [None]:

# --- 2. FONCTION DE SCRAPING OPTIMISÉE POUR LES MOUTONS ---
def scrape_moutons_page(page_number):
    """
    Scrape une page de liste Coinafrique pour les annonces de MOUTONS.
    """
    base_url = "https://sn.coinafrique.com"
    list_url = f'{base_url}/categorie/moutons?page={page_number}' # Changement ici
    
    print(f"Scrapping de la page : {list_url}")
    
    try:
        res = get(list_url)
        res.raise_for_status() 
    except Exception as e:
        print(f"Erreur lors de la requête de la page {page_number}: {e}")
        return []

    soup = bs(res.content, "html.parser")
    containers = soup.select("div.card.ad__card") 
    
    data = []

    for container in containers:
        try:
            # get URL of product
            a_tag = container.find("a", href=True)
            if not a_tag:
                continue
                
            product_url = base_url + a_tag["href"]
            
            # Name (Titre / Description)
            title_tag = container.select_one("p.ad__card-description a")
            name = title_tag.get_text(strip=True) if title_tag else None
            
            # Price (Prix)
            price_tag = container.select_one("p.ad__card-price a")
            price = None
            if price_tag:
                price_raw = price_tag.get_text(strip=True)
                # Nettoyage et conversion du prix
                price_str = price_raw.replace("CFA", "").replace(" ", "").strip()
                price = int(price_str) if price_str.isdigit() else None

            # Address (Localisation)
            loc_span = container.select_one("p.ad__card-location span")
            address = loc_span.get_text(strip=True) if loc_span else None

            # Image Link (Lien de l'image)
            img_tag = container.find("img", class_="ad__card-img")
            image_link = img_tag["src"] if img_tag and "src" in img_tag.attrs else None

            # Ajout des données sous forme de tuple pour l'insertion SQLite
            data.append((name, price, address, image_link, product_url))

        except Exception as e:
            print(f"Erreur lors de l'extraction d'un élément: {e}")
            continue
            
    return data

In [28]:
data_page_1 = scrape_moutons_page(1)
print(data_page_1) # To verify the scraped data

Scrapping de la page : https://sn.coinafrique.com/categorie/moutons?page=1
[('Fauteuil pliable', 50000, 'Parcelle Assainies, Dakar, Sénégal', 'https://images.coinafrique.com/thumb_5641122_uploaded_image2_1764104086.jpeg', 'https://sn.coinafrique.com/annonce/moutons/fauteuil-pliable-5641122'), ('Agneau pur sang', None, 'Mbao, Dakar, Sénégal', 'https://images.coinafrique.com/thumb_5631456_uploaded_image1_1763631774.jpg', 'https://sn.coinafrique.com/annonce/moutons/agneau-pur-sang-5631456'), ('Mouton', 125000, 'Guediawaye, Dakar, Sénégal', 'https://images.coinafrique.com/thumb_5630109_uploaded_image1_1763563794.jpg', 'https://sn.coinafrique.com/annonce/moutons/mouton-5630109'), ('Mouton Ladoum', 800000, 'Guediawaye, Dakar, Sénégal', 'https://images.coinafrique.com/thumb_5630102_uploaded_image1_1763563551.jpg', 'https://sn.coinafrique.com/annonce/moutons/mouton-ladoum-5630102'), ('Moutons Ladoum', 350000, 'Thies, Sénégal', 'https://images.coinafrique.com/thumb_5613540_uploaded_image1_17627

In [29]:
pd.DataFrame(data_page_1)

Unnamed: 0,0,1,2,3,4
0,Fauteuil pliable,50000.0,"Parcelle Assainies, Dakar, Sénégal",https://images.coinafrique.com/thumb_5641122_u...,https://sn.coinafrique.com/annonce/moutons/fau...
1,Agneau pur sang,,"Mbao, Dakar, Sénégal",https://images.coinafrique.com/thumb_5631456_u...,https://sn.coinafrique.com/annonce/moutons/agn...
2,Mouton,125000.0,"Guediawaye, Dakar, Sénégal",https://images.coinafrique.com/thumb_5630109_u...,https://sn.coinafrique.com/annonce/moutons/mou...
3,Mouton Ladoum,800000.0,"Guediawaye, Dakar, Sénégal",https://images.coinafrique.com/thumb_5630102_u...,https://sn.coinafrique.com/annonce/moutons/mou...
4,Moutons Ladoum,350000.0,"Thies, Sénégal",https://images.coinafrique.com/thumb_5613540_u...,https://sn.coinafrique.com/annonce/moutons/mou...
...,...,...,...,...,...
79,Landoum pur,1200000.0,"Dakar, Sénégal",https://images.coinafrique.com/thumb_4948067_u...,https://sn.coinafrique.com/annonce/moutons/lan...
80,Mouton Ladoum,800000.0,"Dieuppeul-Derklé, Dakar, Sénégal",https://images.coinafrique.com/thumb_4931367_u...,https://sn.coinafrique.com/annonce/moutons/mou...
81,Mouton Ladoum,350000.0,"Mbour, Sénégal",https://images.coinafrique.com/thumb_4905476_u...,https://sn.coinafrique.com/annonce/moutons/mou...
82,Ladoum Mix,250000.0,"Ouakam, Dakar, Sénégal",https://images.coinafrique.com/thumb_4885680_u...,https://sn.coinafrique.com/annonce/moutons/lad...


In [32]:

# --- 3. Principal fonction of main scrape
def main_scraper2(start_page=1, end_page=3):
    """
    Gère le flux de scraping, l'insertion dans la base de données et la fermeture.
    """
    conn = setup_database2()
    c = conn.cursor()
    total_inserted = 0
    total_skipped = 0
    
    for page in range(start_page, end_page + 1):
        # 2. Scraping of page
        ads_data = scrape_moutons_page(page) #
        
        # 3.Data insertion
        for ad in ads_data:
            # IGNORE sql key word for avoid duplicated values
            try:
                # (name, price, address, image_url, product_url)
                c.execute('''
                    INSERT OR IGNORE INTO announces_moutons (name, price, address, image_url, product_url) 
                    VALUES (?, ?, ?, ?, ?)
                ''', ad)
                
                if c.rowcount > 0:
                    total_inserted += 1
                else:
                    total_skipped += 1
                    
            except sqlite3.Error as e:
                print(f"Erreur SQLite lors de l'insertion: {e}")
        
        conn.commit()
        
    print("\n" + "="*50)
    print(f"Scraping des moutons finished on {end_page - start_page + 1} pages.")
    print(f"Total inserted annonces : {total_inserted}")
    print(f"Total ignored announces(doublicated) : {total_skipped}")
    
    # 4. Closed connexion
    conn.close()

In [33]:
if __name__ == "__main__":
    main_scraper2(start_page=1, end_page=5)

Scrapping de la page : https://sn.coinafrique.com/categorie/moutons?page=1
Scrapping de la page : https://sn.coinafrique.com/categorie/moutons?page=2
Scrapping de la page : https://sn.coinafrique.com/categorie/moutons?page=3
Scrapping de la page : https://sn.coinafrique.com/categorie/moutons?page=4
Scrapping de la page : https://sn.coinafrique.com/categorie/moutons?page=5

Scraping des moutons finished on 5 pages.
Total inserted annonces : 420
Total ignored announces(doublicated) : 0


### Scrapping data (autres-animaux)

In [34]:
# --- 1. FONCTION DE CRÉATION DE LA BASE DE DONNÉES ---
def setup_database3(db_name="autres_animaux_annonces.db"):
    """
    Crée la base de données SQLite et la table 'annonces' si elles n'existent pas.
    """
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    
    c.execute('''
        CREATE TABLE IF NOT EXISTS other_announces (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,            
            price INTEGER,        
            address TEXT,         
            image_url TEXT,       
            product_url TEXT UNIQUE
        )
    ''')
    conn.commit()
    return conn

In [None]:
# ---  AGAIN SAME THINGS FOR THIS ONE 
def scrape_autres_animaux_page(page_number):
    """
    Scrape une page de liste pour les annonces d'AUTRES ANIMAUX.
    """
    base_url = "https://sn.coinafrique.com"
    list_url = f'{base_url}/categorie/autres-animaux?page={page_number}' 
    
    print(f"Scrapping de la page : {list_url}")
    
    try:
        res = get(list_url)
        res.raise_for_status() 
    except Exception as e:
        print(f"Erreur lors de la requête de la page {page_number}: {e}")
        return []

    soup = bs(res.content, "html.parser")
    containers = soup.select("div.card.ad__card") 
    
    data = []

    for container in containers:
        try:
            #Get URL of product
            a_tag = container.find("a", href=True)
            if not a_tag:
                continue
                
            product_url = base_url + a_tag["href"]
            
            # Name (Titre / Description dans le site)
            title_tag = container.select_one("p.ad__card-description a")
            name = title_tag.get_text(strip=True) if title_tag else None
            
            # Price (Prix)
            price_tag = container.select_one("p.ad__card-price a")
            price = None
            if price_tag:
                price_raw = price_tag.get_text(strip=True)
                # Nettoyage et conversion du prix en entier
                price_str = price_raw.replace("CFA", "").replace(" ", "").strip()
                price = int(price_str) if price_str.isdigit() else None

            # Address (Localisation)
            loc_span = container.select_one("p.ad__card-location span")
            address = loc_span.get_text(strip=True) if loc_span else None

            # image Link 
            img_tag = container.find("img", class_="ad__card-img")
            image_link = img_tag["src"] if img_tag and "src" in img_tag.attrs else None
            data.append((name, price, address, image_link, product_url))

        except Exception as e:
            print(f"Erreur lors de l'extraction d'un élément: {e}")
            continue
            
    return data

In [None]:

# --- 3. FONCTION PRINCIPALE D'EXÉCUTION ET D'INSERTION ---
def main_scraper3(start_page=1, end_page=3):
    """
    Gère le flux de scraping pour les Autres Animaux, l'insertion dans la base de données et la fermeture.
    """
    # 1. Configuration de la DB (fichier 'autres_animaux_annonces.db')
    conn = setup_database3()
    c = conn.cursor()
    
    total_inserted = 0
    total_skipped = 0
    
    for page in range(start_page, end_page + 1):
        # 2. Scraping pages
        ads_data = scrape_autres_animaux_page(page) 
        
        # 3. Insertion data
        for ad in ads_data:
            try:
                c.execute('''
                    INSERT OR IGNORE INTO other_announces (name, price, address, image_url, product_url) 
                    VALUES (?, ?, ?, ?, ?)
                ''', ad)
                
                if c.rowcount > 0:
                    total_inserted += 1
                else:
                    total_skipped += 1
                    
            except sqlite3.Error as e:
                print(f"Erreur SQLite lors de l'insertion: {e}")
        
        # SChange saves
        conn.commit()
        
    print("\n" + "="*50)
    print(f"Scraping des Autres Animaux terminé sur {end_page - start_page + 1} pages.")
    print(f"Total d'annonces insérées : {total_inserted}")
    print(f"Total d'annonces ignorées (doublons) : {total_skipped}")
    
    # 4. Fermeture de la connexion
    conn.close()

In [37]:
# --- Exécution ---
if __name__ == "__main__":
    main_scraper3(start_page=1, end_page=5)

Scrapping de la page : https://sn.coinafrique.com/categorie/autres-animaux?page=1
Scrapping de la page : https://sn.coinafrique.com/categorie/autres-animaux?page=2
Scrapping de la page : https://sn.coinafrique.com/categorie/autres-animaux?page=3
Scrapping de la page : https://sn.coinafrique.com/categorie/autres-animaux?page=4
Scrapping de la page : https://sn.coinafrique.com/categorie/autres-animaux?page=5

Scraping des Autres Animaux terminé sur 5 pages.
Total d'annonces insérées : 420
Total d'annonces ignorées (doublons) : 0


### Scrappe data (pigeons et lapins)

In [None]:
# --- 1. FONCTION DE CRÉATION DE LA BASE DE DONNÉES ---
def setup_database4(db_name="ppige_annonces.db"):
    """
    Crée la base de données SQLite et la table 'annonces' si elles n'existent pas.
    """
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    
    c.execute('''
        CREATE TABLE IF NOT EXISTS pp_annonces (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,            
            price INTEGER,        
            address TEXT,         
            image_url TEXT,       
            product_url TEXT UNIQUE
        )
    ''')
    conn.commit()
    return conn

In [39]:
# --- 2. FONCTION DE SCRAPING OPTIMISÉE POUR LA VOLAILLE ---
def scrape_volaille_page(page_number):
    """
    Scrape une page de liste Coinafrique pour les annonces de Poules, Lapins et Pigeons.
    """
    base_url = "https://sn.coinafrique.com"
    list_url = f'{base_url}/categorie/poules-lapins-et-pigeons?page={page_number}' 
    
    print(f"Scrapping de la page : {list_url}")
    
    try:
        res = get(list_url)
        res.raise_for_status() 
    except Exception as e:
        print(f"Erreur lors de la requête de la page {page_number}: {e}")
        return []

    soup = bs(res.content, "html.parser")
    containers = soup.select("div.card.ad__card") 
    
    data = []

    for container in containers:
        try:
            a_tag = container.find("a", href=True)
            if not a_tag:
                continue
                
            product_url = base_url + a_tag["href"]
            
            # Name (Titre / Description)
            title_tag = container.select_one("p.ad__card-description a")
            name = title_tag.get_text(strip=True) if title_tag else None
            
            # Price 
            price_tag = container.select_one("p.ad__card-price a")
            price = None
            if price_tag:
                price_raw = price_tag.get_text(strip=True)
                price_str = price_raw.replace("CFA", "").replace(" ", "").strip()
                price = int(price_str) if price_str.isdigit() else None

            # Address (Localisation)
            loc_span = container.select_one("p.ad__card-location span")
            address = loc_span.get_text(strip=True) if loc_span else None

            # Image Link (Lien de l'image)
            img_tag = container.find("img", class_="ad__card-img")
            image_link = img_tag["src"] if img_tag and "src" in img_tag.attrs else None

            data.append((name, price, address, image_link, product_url))

        except Exception as e:
            print(f"Erreur lors de l'extraction d'un élément: {e}")
            continue
            
    return data

In [40]:

# --- 3. FONCTION PRINCIPALE D'EXÉCUTION ET D'INSERTION ---
def main_scraper4(start_page=1, end_page=3):
    """
    Gère le flux de scraping, l'insertion dans la base de données et la fermeture.
    """
    # 1. Configuration de la DB (fichier 'poules_lapins_pigeons_annonces.db')
    conn = setup_database4()
    c = conn.cursor()
    
    total_inserted = 0
    total_skipped = 0
    
    for page in range(start_page, end_page + 1):
        # 2. Scraping de la page
        ads_data = scrape_volaille_page(page) 
        
        # 3. Insertion des données
        for ad in ads_data:
            # Utilisation de INSERT OR IGNORE pour éviter les doublons
            try:
                c.execute('''
                    INSERT OR IGNORE INTO pp_annonces (name, price, address, image_url, product_url) 
                    VALUES (?, ?, ?, ?, ?)
                ''', ad)
                
                if c.rowcount > 0:
                    total_inserted += 1
                else:
                    total_skipped += 1
                    
            except sqlite3.Error as e:
                print(f"Erreur SQLite lors de l'insertion: {e}")
        
        # Sauvegarde les changements après chaque page
        conn.commit()
        
    print("\n" + "="*50)
    print(f"Scraping des Poules, Lapins et Pigeons terminé sur {end_page - start_page + 1} pages.")
    print(f"Total d'annonces insérées : {total_inserted}")
    print(f"Total d'annonces ignorées (doublons) : {total_skipped}")
    
    # 4. close connexion
    conn.close()

In [42]:
main_scraper4(start_page=1, end_page=5) 

Scrapping de la page : https://sn.coinafrique.com/categorie/poules-lapins-et-pigeons?page=1
Scrapping de la page : https://sn.coinafrique.com/categorie/poules-lapins-et-pigeons?page=2
Scrapping de la page : https://sn.coinafrique.com/categorie/poules-lapins-et-pigeons?page=3
Scrapping de la page : https://sn.coinafrique.com/categorie/poules-lapins-et-pigeons?page=4
Scrapping de la page : https://sn.coinafrique.com/categorie/poules-lapins-et-pigeons?page=5

Scraping des Poules, Lapins et Pigeons terminé sur 5 pages.
Total d'annonces insérées : 168
Total d'annonces ignorées (doublons) : 252


## Cleaning data steps (for all of dataframes)

In [45]:
db_name = "ppige_annonces.db"
conn = sqlite3.connect(db_name)

# 2. Load data in DataFrame
df_panounces= pd.read_sql_query("SELECT * FROM pp_annonces", conn)

# 3. close connexion
conn.close()

print(f"DataFrame load {len(df_panounces)} rows.")

DataFrame load 420 rows.


In [66]:
df_panounces.drop_duplicates(inplace=True) 

print(f"Size of dataframe after suppression : {len(df_panounces )}")

Size of dataframe after suppression : 420


In [62]:
def extract_city(address):
    if pd.isna(address):
        return None
    parts = address.split(',')
    if len(parts) >= 2:
        return parts[1].strip()

    return address.strip() 

# Apply for new column'city'
df_panounces['city'] = df_panounces['address'].apply(extract_city)

In [63]:
print(df_panounces['city'].value_counts().head(10))

city
Sénégal    234
Dakar      186
Name: count, dtype: int64


In [None]:
# Conversion finale en numérique (gère les cas où la DB aurait stocké des NaN comme chaînes)
# 'coerce' va remplacer les valeurs non numériques restantes par NaN
df_panounces['price'] = pd.to_numeric(df_panounces ['price'], errors='coerce') 

# Suppression des prix irréalistes (par exemple, prix égal à 0 ou négatif)
df.dropna(subset=['price'], inplace=True)

In [71]:
df_panounces['price'].fillna(df_panounces['price'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_panounces['price'].fillna(df_panounces['price'].median(), inplace=True)


In [75]:
df_panounces.dtypes

id               int64
name            object
price          float64
address         object
image_url       object
product_url     object
city            object
dtype: object

In [76]:
df.isnull().sum()

title       0
price       0
location    0
image       0
url         0
dtype: int64

In [74]:
print("\n" + "="*50)
print("Cleaning has done .")
print(f"final size cleaned : {len(df)}")
df_panounces.sample(20)


Cleaning has done .
final size cleaned : 168


Unnamed: 0,id,name,price,address,image_url,product_url,city
331,584,Poulets,3500.0,"Dakar, Sénégal",https://images.coinafrique.com/thumb_3809817_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Sénégal
142,143,Poussin Marans,10000.0,"Dakar, Sénégal",https://images.coinafrique.com/thumb_4603800_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Sénégal
21,22,Pigeon Beauté Allemande,25000.0,"Thies, Sénégal",https://images.coinafrique.com/thumb_5486666_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Sénégal
9,10,Couple Mondain,130000.0,"HLM, Dakar, Sénégal",https://images.coinafrique.com/thumb_5541428_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Dakar
239,240,Pigeon Padda de java,20000.0,"Point E, Dakar, Sénégal",https://images.coinafrique.com/thumb_4191356_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Dakar
289,542,Poulets de chair,3000.0,"Dakar, Sénégal",https://images.coinafrique.com/thumb_4021629_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Sénégal
238,239,Pigeon Texan,70000.0,"Mbour, Sénégal",https://images.coinafrique.com/thumb_4193114_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Sénégal
365,618,Pigeon dragon,20000.0,"Mariste, Dakar, Sénégal",https://images.coinafrique.com/thumb_3671452_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Dakar
404,657,Pigeons,25000.0,"Guediawaye, Dakar, Sénégal",https://images.coinafrique.com/thumb_3521128_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Dakar
60,61,Lapin Californien,120000.0,"Almadies 2, Dakar, Sénégal",https://images.coinafrique.com/thumb_5189123_u...,https://sn.coinafrique.com/annonce/poules-lapi...,Dakar


In [79]:
df.to_csv('ppigeons_clean.csv', index=False)

#### cleaning dog_annonces

In [80]:
db_name = "dog_annonces.db"
conn = sqlite3.connect(db_name)

# 2. Load data in DataFrame
df = pd.read_sql_query("SELECT * FROM announces_dogs", conn)

# 3. close connexion
conn.close()

print(f"DataFrame load {len(df_panounces )} rows.")

DataFrame load 420 rows.


In [81]:
df.drop_duplicates(inplace=True) 

print(f"Size of dataframe after suppression : {len(df)}")

Size of dataframe after suppression : 420


In [82]:
df['price'] = pd.to_numeric(df['price'], errors='coerce') 

df.dropna(subset=['price'], inplace=True)

df = df[df['price'] > 0]

df['price'] = df['price'].astype(float)

print("Colonne 'price' nettoyée et convertie en float.")

Colonne 'price' nettoyée et convertie en float.


In [83]:
df['price'].fillna(df['price'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price'].fillna(df['price'].median(), inplace=True)


In [84]:
df.isnull().sum()

id             0
title          0
price          0
location       0
image_url      0
product_url    0
dtype: int64

In [86]:
print("\n" + "="*50)
print("Cleaning has done !.")
print(f"Final size of DataFrame cleaned : {len(df)}")
df.head()


Cleaning has done !.
Final size of DataFrame cleaned : 348


Unnamed: 0,id,title,price,location,image_url,product_url
0,1,Berger malinois charbonner,250.0,"Thies, Sénégal",https://images.coinafrique.com/thumb_5649558_u...,https://sn.coinafrique.com/annonce/chiens/berg...
1,2,Chiot Bichon Maltais,100000.0,"Almadies, Dakar, Sénégal",https://images.coinafrique.com/thumb_5644579_u...,https://sn.coinafrique.com/annonce/chiens/chio...
2,3,Chiots Labradors Retriever,300000.0,"Guediawaye, Dakar, Sénégal",https://images.coinafrique.com/thumb_5640905_u...,https://sn.coinafrique.com/annonce/chiens/chio...
3,4,Chiot Malinois,200000.0,"Dakar, Sénégal",https://images.coinafrique.com/thumb_5628892_u...,https://sn.coinafrique.com/annonce/chiens/chio...
4,5,Chiot Berger Malinois,150000.0,"Almadies 2, Dakar, Sénégal",https://images.coinafrique.com/thumb_5628715_u...,https://sn.coinafrique.com/annonce/chiens/chio...


In [None]:
# Sauvegarder les données nettoyées dans un nouveau fichier CSV
df.to_csv('dog_announces.csv', index=False)

# Sauvegarder les données nettoyées dans une nouvelle table
conn_cleaned = sqlite3.connect(db_name) 

#df.to_sql('annonces_cleaned', conn_cleaned, if_exists='replace', index=False)
#conn_cleaned.close()


### cleaning moutons_annonces

In [102]:
db_name = "moutons_annonces.db"
conn = sqlite3.connect(db_name)

# 2. Load data in DataFrame
df = pd.read_sql_query("SELECT * FROM announces_moutons", conn)

# 3. close connexion
conn.close()

print(f"DataFrame load {len(df_panounces )} rows.")

DataFrame load 420 rows.


In [103]:
df.drop_duplicates(inplace=True) 

print(f"Size of dataframe after suppression : {len(df)}")

Size of dataframe after suppression : 420


In [104]:
df['price'] = pd.to_numeric(df['price'], errors='coerce') 

df.dropna(subset=['price'], inplace=True)

df = df[df['price'] > 0]

df['price'] = df['price'].astype(int)

print("Colonne 'price' nettoyée et convertie en entier.")

Colonne 'price' nettoyée et convertie en entier.


In [92]:
df.isnull().sum()

id             0
name           0
price          0
address        0
image_url      0
product_url    0
dtype: int64

In [105]:
print("\n" + "="*50)
print("Nettoyage terminé.")
print(f"Taille finale du DataFrame nettoyé : {len(df)}")
df.sample(10)



Nettoyage terminé.
Taille finale du DataFrame nettoyé : 351


Unnamed: 0,id,name,price,address,image_url,product_url
23,24,Mouton de tabaski,200000,"Ouakam, Dakar, Sénégal",https://images.coinafrique.com/thumb_5351264_u...,https://sn.coinafrique.com/annonce/moutons/mou...
405,406,Femelle ladoum,1000000,"Guediawaye, Dakar, Sénégal",https://images.coinafrique.com/thumb_3695058_u...,https://sn.coinafrique.com/annonce/moutons/fem...
30,31,Moutons,225000,"Ouakam, Dakar, Sénégal",https://images.coinafrique.com/thumb_5342599_u...,https://sn.coinafrique.com/annonce/moutons/mou...
93,94,Mouton,170000,"Mbao, Dakar, Sénégal",https://images.coinafrique.com/thumb_4785428_u...,https://sn.coinafrique.com/annonce/moutons/mou...
351,352,Mouton,100000,"Pikine, Sénégal",https://images.coinafrique.com/thumb_3747590_u...,https://sn.coinafrique.com/annonce/moutons/mou...
65,66,Bélier,225000,"Guediawaye, Dakar, Sénégal",https://images.coinafrique.com/thumb_5179070_u...,https://sn.coinafrique.com/annonce/moutons/bel...
378,379,Mouton azawat,850000,"Thies, Sénégal",https://images.coinafrique.com/thumb_3721884_u...,https://sn.coinafrique.com/annonce/moutons/mou...
73,74,Mouton Ladoum,1000000,"Rufisque, Sénégal",https://images.coinafrique.com/thumb_5026478_u...,https://sn.coinafrique.com/annonce/moutons/mou...
294,295,Moutons tabaski,175000,"Keur Massar, Sénégal",https://images.coinafrique.com/thumb_3770483_u...,https://sn.coinafrique.com/annonce/moutons/mou...
277,278,Moutons Jumeaux ladoums,600000,"Niaga, Sénégal",https://images.coinafrique.com/thumb_3911094_u...,https://sn.coinafrique.com/annonce/moutons/mou...


In [None]:
df.to_csv('mouton_cleaned.csv', index=False)

### cleaning autres_animaux

In [96]:
db_name = "autres_animaux_annonces.db"
conn = sqlite3.connect(db_name)

# 2. Load data in DataFrame
df = pd.read_sql_query("SELECT * FROM other_announces", conn)

# 3. close connexion
conn.close()

print(f"DataFrame load {len(df_panounces )} rows.")

DataFrame load 420 rows.


In [97]:
df.drop_duplicates(inplace=True) 

print(f"Size of dataframe after suppression : {len(df)}")

Size of dataframe after suppression : 420


In [None]:
df['price'] = pd.to_numeric(df['price'], errors='coerce') 

df.dropna(subset=['price'], inplace=True)

df = df[df['price'] > 0]

df['price'] = df['price'].astype(float)

print("Colonne 'price' nettoyée et convertie en float.")

Colonne 'price' nettoyée et convertie en entier.


In [99]:
df.isnull().sum()

id             0
name           0
price          0
address        0
image_url      0
product_url    0
dtype: int64

In [100]:
print("\n" + "="*50)
print("cleaning has done !.")
print(f"finale size of DataFrame nettoyé : {len(df)}")
df.head()


cleaning has done !.
finale size of DataFrame nettoyé : 329


Unnamed: 0,id,name,price,address,image_url,product_url
1,2,Taureau,2500000.0,"Fatick, Sénégal",https://images.coinafrique.com/thumb_5634272_u...,https://sn.coinafrique.com/annonce/autres-anim...
2,3,Bœuf,650.0,"Bambey, Sénégal",https://images.coinafrique.com/thumb_5632019_u...,https://sn.coinafrique.com/annonce/autres-anim...
3,4,Bœuf,725.0,"Bambey, Sénégal",https://images.coinafrique.com/thumb_5632012_u...,https://sn.coinafrique.com/annonce/autres-anim...
4,5,Vache,350000.0,"Patte d'oie, Dakar, Sénégal",https://images.coinafrique.com/thumb_5587581_u...,https://sn.coinafrique.com/annonce/autres-anim...
7,8,Perroquets perruches ondulées,25000.0,"Point E, Dakar, Sénégal",https://images.coinafrique.com/thumb_5560070_u...,https://sn.coinafrique.com/annonce/autres-anim...


In [101]:
df.to_csv('autres_animaus_cleaned.csv', index=False)