## LEKOUNDA NGOLO Mardochet Gédéon 
### COOP MSc | DATA SCIENCE
### FINAL EXAM: DATA COLLECTION
### Date : 09-12-2025

#### DATABASE CONFIGURATION

In [None]:
import sqlite3
import pandas as pd
from requests import get
from bs4 import BeautifulSoup as bs

In [None]:
# Path of the SQLite database file used by every helper in this section.
DB_PATH = "scraped_data.db"

def get_connection() -> sqlite3.Connection:
    """Open a new SQLite connection to the file at ``DB_PATH`` and return it.

    The connection is returned open; this function never closes it.
    """
    connection = sqlite3.connect(DB_PATH)
    return connection

def init_db() -> None:
    """Create the ``items`` table if it does not already exist.

    The table has an auto-incremented integer ``id`` plus ``title`` (TEXT),
    ``price`` (REAL) and ``url`` (TEXT) columns. The connection is always
    closed, including when the statement or the commit fails.
    """
    # NOTE(review): scrape_page() builds records that also carry
    # location/rooms/bathrooms/image keys, which have no column here --
    # confirm whether those records are meant to be stored in this table.
    conn = get_connection()
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS items (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                price REAL,
                url TEXT
            )
        """)
        conn.commit()
    finally:
        # Release the database handle even if the DDL or commit raised.
        conn.close()

def insert_data(df: pd.DataFrame) -> None:
    """Append the rows of *df* to the ``items`` table.

    Args:
        df: Frame whose columns are written to ``items``; the index is not
            written (``index=False``).

    Any exception raised by ``DataFrame.to_sql`` propagates to the caller,
    but the connection is closed first.
    """
    conn = get_connection()
    try:
        df.to_sql("items", conn, if_exists="append", index=False)
    finally:
        # Previously a failing insert left the connection open.
        conn.close()

def load_data() -> pd.DataFrame:
    """Return every row of the ``items`` table as a DataFrame.

    Any exception raised by the query propagates to the caller, but the
    connection is closed first.
    """
    conn = get_connection()
    try:
        return pd.read_sql_query("SELECT * FROM items", conn)
    finally:
        # Runs after the frame is built (or the query failed), so the
        # connection is never left open.
        conn.close()

### SCRAPING WEB PAGES WITH BeautifulSoup

In [None]:
import time

def _text_or_none(tag):
    """Return the whitespace-stripped text of *tag*, or ``None`` if it is missing."""
    return tag.get_text(strip=True) if tag is not None else None


def scrape_page(page_number):
    """Scrape one listing page of dog ads plus the detail page of each ad.

    Args:
        page_number: Value substituted into the ``page`` query parameter of
            the listing URL.

    Returns:
        A list of dicts, one per ad processed without error, with the keys
        ``title``, ``price``, ``location``, ``rooms``, ``bathrooms``,
        ``image`` and ``url``. Every value except ``url`` is ``None`` when
        the corresponding element was not found on the detail page.

    An exception raised while processing a single ad is printed and that ad
    is skipped. An exception raised while fetching the listing page itself
    propagates to the caller.
    """
    # Imported locally so this function does not depend on edits elsewhere.
    from urllib.parse import urljoin

    # Without a timeout a stalled server would block the scraper forever.
    timeout_seconds = 30

    url = f'https://sn.coinafrique.com/categorie/chiens?page={page_number}'
    res = get(url, timeout=timeout_seconds)
    time.sleep(2)  # pause between consecutive requests

    soup = bs(res.content, "html.parser")

    # One element per ad preview card on the listing page.
    containers = soup.select("div.card.ad__card")

    data = []

    for container in containers:
        try:
            # Extract the link to the ad's detail page.
            a_tag = container.find("a", href=True)
            if not a_tag:
                continue

            # Resolve the link against the listing URL so the detail page is
            # requested from the same host as the listing (the previous code
            # hard-coded a different country sub-domain). urljoin also copes
            # with hrefs that are already absolute.
            product_url = urljoin(url, a_tag["href"])

            # Open the detail page.
            sub_container = get(product_url, timeout=timeout_seconds)
            time.sleep(2)
            soup_detail = bs(sub_container.content, 'html.parser')

            # Title: None is the default when the title cannot be retrieved.
            title = _text_or_none(soup_detail.find("h1"))

            # Price: primary selector first, then a fallback selector.
            price_tag = (
                soup_detail.find("p", class_="ad__price")
                or soup_detail.find("span", class_="price")
            )
            if price_tag:
                price_raw = price_tag.get_text(strip=True)
                # Drop the currency label and the spaces inside the number.
                price = price_raw.replace("CFA", "").replace(" ", "")
            else:
                price = None

            # Location: primary selector first, then a fallback selector.
            loc_tag = (
                soup_detail.find("p", class_="ad__card-location")
                or soup_detail.find("span", class_="location")
            )
            location = _text_or_none(loc_tag)

            # Rooms / bathrooms: read from <li class="center"> entries that
            # hold a label span followed by a value span.
            rooms = None
            bathrooms = None

            for li in soup_detail.find_all("li", class_="center"):
                label = li.find_all("span")
                if len(label) >= 2:
                    name = label[0].get_text(strip=True).lower()
                    qt = label[1].get_text(strip=True)

                    if "pièces" in name:
                        rooms = qt
                    elif "bain" in name:
                        bathrooms = qt

            # First <img> on the detail page. .get() avoids a KeyError (and
            # therefore losing the whole ad) when the tag has no src.
            img_tag = soup_detail.find("img")
            image_link = img_tag.get("src") if img_tag else None

            data.append({
                "title": title,
                "price": price,
                "location": location,
                "rooms": rooms,
                "bathrooms": bathrooms,
                "image": image_link,
                "url": product_url,
            })

        except Exception as e:
            # Best effort: report the failure and move on to the next ad.
            print("Error on item:", e)
    return data
