In [None]:
# import libraries
import requests
import re
import pandas as pd

In [11]:
# Pattern pieces, in order:
#   make  - first run of letters/hyphens starting at a word boundary
#   model - optional one or two alphanumeric/hyphen tokens after the make
#   year  - the first standalone 4-digit number after that; the look-behind
#           and look-ahead stop it from matching a slice of a longer digit
#           run (e.g. the "0000" at the end of "120000km").
_MAKE_MODEL_YEAR_RE = re.compile(
    r"(?P<make>\b[a-zA-Z\-]+)\s+"
    r"(?P<model>[A-Za-z0-9\-]+(?:\s[A-Za-z0-9\-]+)?)?"
    r".*?(?<!\d)(?P<year>\d{4})(?!\d)"
)


def extract_make_model_year(title):
    """Split an advert title into ``(make, model, year)``.

    Args:
        title: Advert title, e.g. ``"Toyota Camry 2015 Black"``.

    Returns:
        A 3-tuple of strings ``(make, model, year)``. ``model`` is ``None``
        when the title has no token between the make and the year. All three
        are ``None`` when the title is not a string or contains no make
        followed by a standalone 4-digit number.
    """
    # Titles come from an external JSON payload and may be null.
    if not isinstance(title, str):
        return None, None, None

    match = _MAKE_MODEL_YEAR_RE.search(title.strip())
    if match is None:
        return None, None, None
    return match.group("make"), match.group("model"), match.group("year")

In [12]:
def extract_condition(condition):
    """Normalise a free-text condition value to a canonical label.

    Args:
        condition: Raw condition text, or ``None`` when the advert has no
            condition attribute.

    Returns:
        ``"foreign used"``, ``"local used"`` or ``"new"`` when that phrase
        occurs in the text (case-insensitive, checked in that order);
        otherwise ``None``. Non-string input also yields ``None``.
    """
    # The attribute lookup returns None for missing attributes; treat that
    # as "unknown" instead of crashing on .lower().
    if not isinstance(condition, str):
        return None

    condition_lower = condition.lower()
    for label in ("foreign used", "local used", "new"):
        if label in condition_lower:
            return label
    return None

In [13]:
def extract_transmission(transmission):
    """Normalise a free-text transmission value to a canonical label.

    Args:
        transmission: Raw transmission text, or ``None`` when the advert has
            no transmission attribute.

    Returns:
        ``"automatic"`` or ``"manual"`` when that word occurs in the text
        (case-insensitive, checked in that order); otherwise ``None``.
        Non-string input also yields ``None``.
    """
    # The attribute lookup returns None for missing attributes; treat that
    # as "unknown" instead of crashing on .lower().
    if not isinstance(transmission, str):
        return None

    transmission_lower = transmission.lower()
    for label in ("automatic", "manual"):
        if label in transmission_lower:
            return label
    return None

In [14]:
def fetch_json(page):
    """Fetch one page of car adverts from the Jiji listing endpoint.

    Args:
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The list found under ``adverts_list -> adverts`` in the JSON
        response. An empty list is returned (after printing a short message)
        when the request fails, the body is not valid JSON, or the payload
        does not have the expected shape.
    """
    url = "https://jiji.ng/api_web/v1/listing"
    params = {
        "slug": "cars",
        "page": page,
        "webp": True,
    }
    headers = {
        "User-Agent": "Mozilla/5.0",
    }

    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        print(f"[Page] {page} . Request error")
        return []

    # Decode separately so a bad body is reported as a decode error and the
    # function never falls through with ``data`` unbound.
    try:
        data = response.json()
    except ValueError:
        print(f"[Page] {page} . Decode error")
        return []

    # Walk the payload defensively: either level may be missing or null.
    adverts = None
    if isinstance(data, dict):
        listing = data.get("adverts_list", {})
        if isinstance(listing, dict):
            adverts = listing.get("adverts", [])

    if not isinstance(adverts, list):
        print(f"An error occured. Expected list but got {type(adverts)}")
        return []
    return adverts

In [15]:
def get_attrs_value(attrs, key_name):
    """Look up an attribute value by name in a list of attribute dicts.

    Args:
        attrs: Iterable of dicts carrying ``"name"`` and ``"value"`` keys.
            ``None`` is treated as an empty list.
        key_name: Attribute name to find; compared case-insensitively.

    Returns:
        The value of the first matching attribute, converted to ``str`` and
        stripped of surrounding whitespace (``""`` when the match has no
        ``"value"`` key). ``None`` when nothing matches or the matching
        value is null.
    """
    wanted = key_name.lower()
    for attr in attrs or ():
        # Entries come from external JSON; skip anything that is not a dict.
        if not isinstance(attr, dict):
            continue
        name = attr.get("name", "")
        if not isinstance(name, str) or name.lower() != wanted:
            continue
        value = attr.get("value", "")
        if value is None:
            return None
        # Values may arrive as numbers; normalise to text before stripping.
        return str(value).strip()
    return None

In [16]:
def main(max_pages=100, output_path="jiji_car_dataset.csv"):
    """Scrape car adverts page by page and write them to a CSV file.

    Args:
        max_pages: Number of listing pages to request, starting at page 1.
        output_path: Destination of the CSV file.

    For each advert that has a price, the title, the make/model/year parsed
    from the title, the normalised condition and transmission, and the price
    text are collected. Nothing is written when no advert qualifies.
    """
    all_ads = []
    for page in range(1, max_pages + 1):
        ads = fetch_json(page)
        print(f"page {page}: {len(ads)} Found")

        for ad in ads:
            if not isinstance(ad, dict):
                continue

            # Adverts without a price are dropped, so check it first.
            price = ad.get("price_title", "")
            if not price:
                continue

            # Fields may be present but null in the payload; fall back to
            # empty values so the helpers always receive usable input.
            attrs = ad.get("attrs") or []
            title = ad.get("title") or ""
            condition_ = get_attrs_value(attrs, "condition") or ""
            transmission_ = get_attrs_value(attrs, "transmission") or ""

            make, model, year = extract_make_model_year(title)
            all_ads.append({
                "title": title,
                "make": make,
                "model": model,
                "year": year,
                "condition": extract_condition(condition_),
                "transmission": extract_transmission(transmission_),
                "price": price,
            })

    if not all_ads:
        print("Nothing to extract")
        return

    df = pd.DataFrame(all_ads)
    df.to_csv(output_path, index=False)
    print("successfully extract data from jiji")
        

In [17]:
# Run the scraper only when this module is executed as the main program.
if __name__ == "__main__":
    main()

page 1: 20 Found
page 2: 20 Found
page 3: 20 Found
page 4: 20 Found
page 5: 20 Found
page 6: 20 Found
page 7: 20 Found
[Page] 8 . Request error
page 8: 0 Found
page 9: 20 Found
page 10: 20 Found
page 11: 20 Found
page 12: 20 Found
page 13: 20 Found
page 14: 20 Found
page 15: 20 Found
page 16: 20 Found
page 17: 20 Found
page 18: 20 Found
[Page] 19 . Request error
page 19: 0 Found
page 20: 20 Found
page 21: 20 Found
page 22: 20 Found
page 23: 20 Found
page 24: 20 Found
page 25: 20 Found
page 26: 20 Found
page 27: 20 Found
page 28: 20 Found
page 29: 20 Found
page 30: 20 Found
page 31: 20 Found
page 32: 20 Found
page 33: 20 Found
page 34: 20 Found
page 35: 20 Found
page 36: 20 Found
page 37: 20 Found
page 38: 20 Found
page 39: 20 Found
page 40: 20 Found
page 41: 20 Found
page 42: 20 Found
page 43: 20 Found
page 44: 20 Found
page 45: 20 Found
page 46: 20 Found
page 47: 20 Found
page 48: 20 Found
page 49: 20 Found
page 50: 20 Found
page 51: 20 Found
page 52: 20 Found
page 53: 20 Found
page 5