In [None]:
import os
import gzip
import json
import requests
from typing import List, Dict, Any

# AGMARKNET

In [2]:
def list_json_gz_files(root_dir):
    """Recursively list every ``.jsonl.gz`` file under ``root_dir``.

    The extension match is case-insensitive. The result is sorted because
    ``os.walk`` yields entries in arbitrary, filesystem-dependent order;
    without sorting, anything derived from this list (for example the order
    of de-duplicated records) could differ between runs or machines.

    Args:
        root_dir: Directory to search. A directory that does not exist
            produces an empty list, since ``os.walk`` ignores listing errors
            by default.

    Returns:
        Sorted list of matching file paths, each prefixed with ``root_dir``.
    """
    files = [
        os.path.join(dirpath, fname)
        for dirpath, _, filenames in os.walk(root_dir)
        for fname in filenames
        if fname.lower().endswith(".jsonl.gz")
    ]
    files.sort()
    return files

In [3]:
def read_jsonl_gz(filepath: str) -> List[Dict[str, Any]]:
    """Load a gzip-compressed JSON Lines file fully into memory.

    The file is decoded as UTF-8 text. Each line is stripped of surrounding
    whitespace; lines that are empty after stripping are skipped, and every
    remaining line is parsed with ``json.loads``.

    Args:
        filepath: Path to the ``.jsonl.gz`` file.

    Returns:
        The parsed records, in file order.
    """
    with gzip.open(filepath, "rt", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

In [4]:
def collect_unique_locations(files: List[str]) -> List[Dict[str, str]]:
    """Gather the distinct (state, district, market) combinations found in ``files``.

    Each file is loaded with ``read_jsonl_gz`` and a progress line is printed
    per file. A record contributes only when its ``state_name``,
    ``district_name`` and ``market_name`` values are all truthy; the first
    occurrence of each combination wins and later duplicates are ignored.

    Args:
        files: Paths of the ``.jsonl.gz`` files to scan, processed in order.

    Returns:
        One ``{"state_name", "district_name", "market_name"}`` dict per
        distinct combination, in order of first appearance.
    """
    # Dicts preserve insertion order, so this doubles as the "seen" set and
    # the ordered result.
    by_key: Dict[tuple, Dict[str, str]] = {}

    for file in files:
        print(f"Processing file {file}")
        for entry in read_jsonl_gz(file):
            state = entry.get("state_name")
            district = entry.get("district_name")
            market = entry.get("market_name")

            # Skip records with any missing/empty location component.
            if not (state and district and market):
                continue

            key = (state, district, market)
            if key in by_key:
                continue
            by_key[key] = {
                "state_name": state,
                "district_name": district,
                "market_name": market,
            }

    return list(by_key.values())

In [5]:
def save_as_jsonl(data, filepath):
    """Write ``data`` to ``filepath`` as JSON Lines, one record per line.

    The file is opened in write mode (replacing any existing content) with
    UTF-8 encoding. Records are serialised with ``ensure_ascii=False`` so
    non-ASCII text is written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: Iterable of JSON-serialisable records.
        filepath: Destination path.
    """
    with open(filepath, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in data
        )

In [6]:
# Root directory of the AGMARKNET data, given relative to the notebook's
# working directory; it is searched recursively for .jsonl.gz files.
data_path = "../data/agmarknet"

In [None]:
# Build the de-duplicated (state, district, market) list from every
# .jsonl.gz file under data_path, print a short preview, and write the
# result to mandies.jsonl in the current working directory.
files = list_json_gz_files(data_path)
unique_locations = collect_unique_locations(files)
print(f"Total unique locations: {len(unique_locations)}")
print(unique_locations[:5])  # preview only the first 5 entries
save_as_jsonl(unique_locations, "mandies.jsonl")

# ENAM

In [11]:
def get_states(timeout=30):
    """Fetch the eNAM state list and return the decoded JSON response.

    Args:
        timeout: Seconds passed to ``requests.get`` as its timeout. Requests
            applies no timeout by default, so without one an unresponsive
            server would stall the notebook indefinitely.

    Returns:
        The parsed JSON body of the response. Callers in this notebook read
        its ``'data'`` entry.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    url = "https://enam.gov.in/web/ajax_ctrl/states_name"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
    response.raise_for_status()
    return response.json()

In [31]:
def get_apmcs(timeout=30):
    """Fetch the APMC list of every eNAM state, tagging each entry with its state.

    The states come from ``get_states()`` (called with its own defaults); one
    POST request is then issued per state, and every returned item gets the
    state's ``state_id`` and ``state_name`` added to it.

    Args:
        timeout: Seconds passed to each ``requests.post`` call as its timeout.
            Requests applies no timeout by default, so without one a single
            unresponsive request would stall the whole loop indefinitely.

    Returns:
        A flat list of the APMC items of all states, in state order.

    Raises:
        requests.HTTPError: If a per-state request answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        KeyError: If a decoded response has no ``'data'`` entry.
    """
    url = "https://enam.gov.in/web/Ajax_ctrl/apmc_list"
    apmcs = []
    for state in get_states()['data']:
        payload = {'state_id': state['state_id']}
        response = requests.post(url, data=payload, timeout=timeout)
        # Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
        response.raise_for_status()
        items = response.json()['data']
        for item in items:
            # The items are annotated in place with the state they belong to.
            item['state_id'] = state['state_id']
            item['state_name'] = state['state_name']
        apmcs.extend(items)
    return apmcs

In [34]:
# Fetch the APMC list for every eNAM state (one POST request per state) and
# write it to apmcs.jsonl in the current working directory.
apmcs = get_apmcs()
save_as_jsonl(apmcs, "apmcs.jsonl")