## LEKOUNDA NGOLO Mardochet Gédéon 
### COOP Msc |DATA SCIENCE
### FINAL EXAMEN : DATA COLLECTION 
### Date : 09-12-2025

#### DATABASE CONFIGURATION

In [1]:
import sqlite3
import pandas as pd
from requests import get
from bs4 import BeautifulSoup as bs

In [None]:
DB_PATH = "dog_scraped_data.db"

def setup_database(db_name="dog_scraped_data.db"):
    conn = sqlite3.connect(db_name)
    c = conn.cursor()
    
    # Création de la table
    c.execute('''
        CREATE TABLE IF NOT EXISTS annonces (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT,
            price INTEGER,
            location TEXT,
            image_url TEXT,
            product_url TEXT UNIQUE
        )
    ''')
    
    conn.commit()
    return conn

### SCRAPPING WEB PAGES WITH BeautifulSoup

In [12]:
from bs4 import BeautifulSoup as bs
from requests import get

def scrape_page(page_number):
    url = f'https://sn.coinafrique.com/categorie/chiens?page={page_number}'
    res = get(url)
    soup = bs(res.content, "html.parser")

    containers = soup.select("div.card.ad__card") 

    data = []

    for container in containers:
        try:
            # extract listing link
            a_tag = container.find("a", href=True)
            if not a_tag:
                continue

            product_url = "https://sn.coinafrique.com" + a_tag["href"]

            # open detail page
            detail_res = get(product_url)
            soup_detail = bs(detail_res.content, 'html.parser')

            # title
            title_tag = soup_detail.find("h1")
            title = title_tag.get_text(strip=True) if title_tag else None

            # price
            price_tag = container.select_one("p.ad__card-price a")
            if price_tag:
                # Supprime 'CFA' et les espaces
                price_raw = price_tag.get_text(strip=True)
                price = price_raw.replace("CFA", "").replace(" ", "")
            else:
                price = None

            # location
            loc_span = container.select_one("p.ad__card-location span")
            location = loc_span.get_text(strip=True) if loc_span else None

            # images
            img_tag = soup_detail.find("img", class_="ad__card-img")
            if not img_tag:
                img_tag = soup_detail.find("img")
            image_link = img_tag["src"] if img_tag else None

            data.append({
                "title": title,
                "price": price,
                "location": location,
                "image": image_link,
                "url": product_url
            })

        except Exception as e:
            print("Error:", e)
            continue

    return data


In [13]:
df = pd.DataFrame()
for page in range(1, 3):
    print(f"Scraping page {page}...")
    page_data = scrape_page(page)
    if len(page_data) == 0:
        print("Probably empty page.")
        break
    df_page = pd.DataFrame(page_data)
    df = pd.concat([df, df_page], axis=0).reset_index(drop=True)
print("Scraping completed!") # For test need I scrapping only 4 pages#df.to_csv("coinafrique_apartments_raw.csv", index=False)

Scraping page 1...
Scraping page 2...
Scraping completed!


In [14]:
df 

Unnamed: 0,title,price,location,image,url
0,Berger malinois charbonner,250,"Thies, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/berg...
1,Chiot Bichon Maltais,100000,"Almadies, Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
2,Chiots Labradors Retriever,300000,"Guediawaye, Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
3,Chiot Malinois,200000,"Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
4,Chiot Berger Malinois,150000,"Almadies 2, Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
...,...,...,...,...,...
163,Chiot Berger Allemand,130000,"Niaga, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
164,Chiot Rottweiler,250,"Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
165,Chiots labrador pure race,Prixsurdemande,"Mbao, Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...
166,Chiots Bichon Maltais,Prixsurdemande,"Dakar, Sénégal",https://static.coinafrique.com/static/images/p...,https://sn.coinafrique.com/annonce/chiens/chio...


### SCRAPPING WEB PAGES WITH BeautifulSoup (optimized version with sqlite)

In [None]:
from requests import get
from bs4 import BeautifulSoup as bs

def scrape_page_optimized(page_number):
    """
    Scrape la page de liste pour les annonces de chiens, en extrayant les données
    directement des cartes pour optimiser les performances.
    """
    base_url = "https://sn.coinafrique.com"
    list_url = f'{base_url}/categorie/chiens?page={page_number}'
    
    print(f"Scrapping de la page : {list_url}")
    
    try:
        res = get(list_url)
        res.raise_for_status() # raise bad requests HTTP code
    except Exception as e:
        print(f"Erreur lors de la requête de la page {page_number}: {e}")
        return []

    soup = bs(res.content, "html.parser")
    
    # REAL CSS CLASS FOR EACH AD PREVIEW
    containers = soup.select("div.card.ad__card") 
    print(f"Trouvé {len(containers)} annonces sur la page.")
    
    data = []

    for container in containers:
        try:
            # 1. Product Link & URL
            a_tag = container.find("a", href=True)
            if not a_tag:
                continue

            product_url = base_url + a_tag["href"]

            # 2. Title / Description
            # Found in <p class="ad__card-description">
            title_tag = container.select_one("p.ad__card-description a")
            title = title_tag.get_text(strip=True) if title_tag else None
            
            # 3. Price
            # Found in <p class="ad__card-price">
            price_tag = container.select_one("p.ad__card-price a")
            if price_tag:
                # delete 'CFA' and spaces
                price_raw = price_tag.get_text(strip=True)
                price = price_raw.replace("CFA", "").replace(" ", "")
            else:
                price = None

            # 4. Location (Adresse complète)
            # Found in <p class="ad__card-location">
            loc_span = container.select_one("p.ad__card-location span")
            location = loc_span.get_text(strip=True) if loc_span else None

            # 5. First Main Image
            # Found in <img class="ad__card-img">
            img_tag = container.find("img", class_="ad__card-img")
            image_link = img_tag["src"] if img_tag and "src" in img_tag.attrs else None

            data.append({
                "title": title,
                "price": price,
                "location": location,
                "image": image_link,
                "url": product_url
            })

        except Exception as e:
            # Affiche l'erreur mais continue le scrapping
            print(f"Erreur lors de l'extraction d'un élément: {e}")
            continue
            
    return data

In [None]:
data_page_1 = scrape_page_optimized(1)
print(data_page_1) # To verify the scraped data


Scrapping de la page : https://sn.coinafrique.com/categorie/chiens?page=1
Trouvé 84 annonces sur la page.
[{'title': 'Berger malinois charbonner', 'price': '250', 'location': 'Thies, Sénégal', 'image': 'https://images.coinafrique.com/thumb_5649558_uploaded_image1_1764572984.jpg', 'url': 'https://sn.coinafrique.com/annonce/chiens/berger-malinois-charbonner-5649558'}, {'title': 'Chiot Bichon Maltais', 'price': '100000', 'location': 'Almadies, Dakar, Sénégal', 'image': 'https://images.coinafrique.com/thumb_5644579_uploaded_image1_1764272051.jpg', 'url': 'https://sn.coinafrique.com/annonce/chiens/chiot-bichon-maltais-5644579'}, {'title': 'Chiots Labradors Retriever', 'price': '300000', 'location': 'Guediawaye, Dakar, Sénégal', 'image': 'https://images.coinafrique.com/thumb_5640905_uploaded_image1_1764096718.jpg', 'url': 'https://sn.coinafrique.com/annonce/chiens/chiots-labradors-retriever-5640905'}, {'title': 'Chiot Malinois', 'price': '200000', 'location': 'Dakar, Sénégal', 'image': 'http

In [None]:
# --- 3. Main Function
def main_scraper(start_page=1, end_page=5):
    """
    Gère le flux de scraping, l'insertion dans la base de données et la fermeture.
    """
    # 1. DB configuration
    conn = setup_database()
    c = conn.cursor()
    
    total_inserted = 0
    total_skipped = 0
    
    for page in range(start_page, end_page + 1):
        # 2. Pages scraping
        ads_data = scrape_page_optimized(page)
        
        # 3. Data Insertion
        for ad in ads_data:
            # Using the IGNORE clause to avoid duplicates
            # (based on the 'product_url' column, which is UNIQUE)
            try:
                c.execute('''
                    INSERT OR IGNORE INTO annonces (title, price, location, image_url, product_url) 
                    VALUES (?, ?, ?, ?, ?)
                ''', ad)
                
                if c.rowcount > 0:
                    total_inserted += 1
                else:
                    total_skipped += 1
                    
            except sqlite3.Error as e:
                print(f"Erreur SQLite lors de l'insertion: {e}")
        
        # Commiter (saves) changes for each pages
        conn.commit()
        
    print("\n" + "="*50)
    print(f"Scraping finished on {end_page - start_page + 1} pages.")
    print(f"Total inserted announces : {total_inserted}")
    print(f"Total ignored announces : {total_skipped}")
    
    # 4. close connexion
    conn.close()

Unnamed: 0,title,price,location,image,url
0,Berger malinois charbonner,250,"Thies, Sénégal",https://images.coinafrique.com/thumb_5649558_u...,https://sn.coinafrique.com/annonce/chiens/berg...
1,Chiot Bichon Maltais,100000,"Almadies, Dakar, Sénégal",https://images.coinafrique.com/thumb_5644579_u...,https://sn.coinafrique.com/annonce/chiens/chio...
2,Chiots Labradors Retriever,300000,"Guediawaye, Dakar, Sénégal",https://images.coinafrique.com/thumb_5640905_u...,https://sn.coinafrique.com/annonce/chiens/chio...
3,Chiot Malinois,200000,"Dakar, Sénégal",https://images.coinafrique.com/thumb_5628892_u...,https://sn.coinafrique.com/annonce/chiens/chio...
4,Chiot Berger Malinois,150000,"Almadies 2, Dakar, Sénégal",https://images.coinafrique.com/thumb_5628715_u...,https://sn.coinafrique.com/annonce/chiens/chio...
...,...,...,...,...,...
79,Chiot Berger Allemand,160000,"Dakar, Sénégal",https://images.coinafrique.com/thumb_5379204_u...,https://sn.coinafrique.com/annonce/chiens/chio...
80,Chiots Bichons maltais,150000,"Dakar, Sénégal",https://images.coinafrique.com/thumb_5377145_u...,https://sn.coinafrique.com/annonce/chiens/chio...
81,Chiot Berger allemand,150000,"Pikine, Sénégal",https://images.coinafrique.com/thumb_5376933_u...,https://sn.coinafrique.com/annonce/chiens/chio...
82,Chiots malamute pure,Prixsurdemande,"Almadies 2, Dakar, Sénégal",https://images.coinafrique.com/thumb_5368072_u...,https://sn.coinafrique.com/annonce/chiens/chio...


In [None]:
# --- Exécution ---
# Vous pouvez ajuster les pages de début et de fin ici
if __name__ == "__main__":
    main_scraper(start_page=1, end_page=3)