In [1]:
import asyncio
import aiohttp
import pandas as pd
import numpy as np
import sys
import time
pd.set_option('display.max_columns', None)

In [2]:
BASE_URL = "https://gateway.chotot.com/v1/public/ad-listing"
FILE_PATH = "../data/houses-for-sale.csv"
LIMIT = 100
REGIONS = [13000]  # Ho Chi Minh , Ha Noi
AREAS = [
    [
        13096,  # Q1
        13098,  # Q3
        13099,  # Q4
        13100,  # Q5
        13101,  # Q6
        13102,  # Q7
        13103,  # Q8
        13105,  # Q10
        13106,  # Q11
        13107,  # Q12
        13108,  # Binh Tan
        13109,  # Binh Thanh
        13110,  # Go Vap
        13111,  # Phu Nhuan
        13112,  # Tan Binh
        13113,  # Tan Phu
        13115,  # Binh Chanh
        13116,  # Cu Chi
        13117,  # Hoc Mon
        13118,  # Nha Be
        13119,  # TP Thu Duc
        13120,  # Can Gio
    ],
]
REGION_MAPPING = {
    13000: "Ho Chi Minh City",
}
AREA_MAPPING = {
    # Ho Chi Minh
    13000: {
        13096: "1st District",
        13098: "3rd District",
        13099: "4th District",
        13100: "5th District",
        13101: "6th District",
        13102: "7th District",
        13103: "8th District",
        13105: "10th District",
        13106: "11th District",
        13107: "12th District",
        13108: "Binh Tan District",
        13109: "Binh Thanh District",
        13110: "Go Vap District",
        13111: "Phu Nhuan District",
        13112: "Tan Binh District",
        13113: "Tan Phu District",
        13115: "Binh Chanh District",
        13116: "Cu Chi District",
        13117: "Hoc Mon District",
        13118: "Nha Be District",
        13119: "Thu Duc City",
        13120: "Can Gio District",
    },
}
CG_APARTMENT = "1010,1020"  # houses/apartments
STEP_SIZE = 10
DETAIL_STEP_SIZE = 100
MAX_RETRIES = 5
RETRY_INTERVAL = 2

In [3]:
def ordinal(number):
    if 10 <= number % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(number % 10, "th")
    return str(number) + suffix

In [4]:
async def get_data_for_page(session, page, region, area, fetch_houses=True):
    url = f"{BASE_URL}?region_v2={region}&area_v2={area}&cg={CG_APARTMENT}&limit={LIMIT}&o={LIMIT * page}&st=s&page={page}&w=1&"
    for num_retrial in range(MAX_RETRIES):
        try:
            async with session.get((url)) as resp:
                data = await resp.json()
                return data["ads"] if fetch_houses else int(data["total"] / LIMIT) + 1
        except Exception as e:
            print(
                f"\t\tFETCH ERROR AT PAGE {page} ({ordinal(num_retrial+1)} time): {e}"
            )
        await asyncio.sleep(RETRY_INTERVAL)

    return None if fetch_houses else 0

In [5]:
async def get_data_in_batches():
    async with aiohttp.ClientSession() as session:
        houses = []
        for region, areas in zip(REGIONS, AREAS):
            print(f"Fetch houses from {REGION_MAPPING[region]}")
            for area in areas:
                total_pages = await get_data_for_page(
                    session, 0, region, area, fetch_houses=False
                )
                print(
                    f"\tFetch houses from {AREA_MAPPING[region][area]} ({total_pages} pages)"
                )
                for i in range(0, total_pages, STEP_SIZE):
                    print(
                        f"\t\t...fetching page {i} to {i + STEP_SIZE if i + STEP_SIZE < total_pages else total_pages}"
                    )
                    batch_tasks = [
                        get_data_for_page(session, page, region, area)
                        for page in range(i, i + STEP_SIZE)
                    ]
                    data_batch = await asyncio.gather(*batch_tasks)
                    houses.extend(data_batch)
        return houses

In [6]:
def flatten_recursive_np(lst):
    result = []
    for el in lst:
        if isinstance(el, list):
            result.extend(flatten_recursive_np(el))
        else:
            result.append(el)
    return np.array(result)

In [7]:
async def get_detail(session, house):
    url = f"{BASE_URL}/{house['list_id']}"
    for num_retrial in range(MAX_RETRIES):
        try:
            async with session.get((url)) as resp:
                if resp.status == 404: # not found
                    return house
                data = await resp.json()
                return data["ad"]
        except Exception as e:
            print(
                f"\t\tFETCH ERROR AT FOR HOUSE ID {id} ({ordinal(num_retrial+1)} time): {e}"
            )
        await asyncio.sleep(RETRY_INTERVAL)
    return None

In [8]:
async def get_detail_all_houses(houses):
    async with aiohttp.ClientSession() as session:
        detailed_houses = []
        for i in range(0, len(houses), DETAIL_STEP_SIZE):
            batch_tasks = [
                get_detail(session, house) for house in houses[i : i + DETAIL_STEP_SIZE]
            ]
            data_batch = await asyncio.gather(*batch_tasks)
            detailed_houses.extend(data_batch)

            # Print progress on the same line
            progress_percentage = (i + DETAIL_STEP_SIZE) / len(houses) * 100
            sys.stdout.write(f"\rProgress: {progress_percentage:.0f}%")
            sys.stdout.flush()
        print()
        return detailed_houses

In [9]:
houses = flatten_recursive_np(await get_data_in_batches())
houses = houses[houses != None]

Fetch houses from Ho Chi Minh City
	Fetch houses from 1st District (12 pages)
		...fetching page 0 to 10
		...fetching page 10 to 12
	Fetch houses from 3rd District (16 pages)
		...fetching page 0 to 10
		...fetching page 10 to 16
	Fetch houses from 4th District (8 pages)
		...fetching page 0 to 8
	Fetch houses from 5th District (8 pages)
		...fetching page 0 to 8
	Fetch houses from 6th District (12 pages)
		...fetching page 0 to 10
		...fetching page 10 to 12
	Fetch houses from 7th District (35 pages)
		...fetching page 0 to 10
		...fetching page 10 to 20
		FETCH ERROR AT PAGE 18 (1st time): 0, message='Attempt to decode JSON with unexpected mimetype: text/html; charset=utf-8', url=URL('https://gateway.chotot.com/v1/public/ad-listing?region_v2=13000&area_v2=13102&cg=1010,1020&limit=100&o=1800&st=s&page=18&w=1&')
		FETCH ERROR AT PAGE 10 (1st time): 0, message='Attempt to decode JSON with unexpected mimetype: text/html; charset=utf-8', url=URL('https://gateway.chotot.com/v1/public/ad-l

In [10]:
start_time = time.time()
detailed_houses = flatten_recursive_np(await get_detail_all_houses(houses))
elapsed_time = time.time() - start_time
print(f"Time consumed to fetch details of all houses: {elapsed_time} seconds")

Progress: 100%
Time consumed to fetch details of all houses: 303.92775416374207 seconds


In [19]:
detailed_houses = detailed_houses[detailed_houses != None]
all_keys = set().union(*(d.keys() for d in detailed_houses))
# Create the DataFrame
df = pd.DataFrame.from_records(detailed_houses, columns=all_keys)
df = df[df['list_id'].duplicated() == False]

In [20]:
df.head()

Unnamed: 0,phone_hidden,apartment_type,escrow_can_deposit,ward,region,length,condition_ad,reviewer_image,phone,account_oid,number_of_images,floors,label_campaigns,has_video,address,payment_delivery,type,size,zero_deposit,project_oid,webp_image,contain_videos,living_size,avatar,property_status,area_name,videos,region_v2,house_type,owner,rooms,property_back_condition,images,shop_alias,unitnumber_display,image,company_logo,state,block,pty_map_modifier,body,toilets,street_number,floornumber,region_name,pty_map,shop,property_road_condition,ad_labels,account_id,property_legal_document,date,location,type_name,ward_name,subject,direction,streetnumber_display,pty_jupiter,company_ad,price_million_per_m2,reviewer_nickname,detail_address,width,list_id,area_v2,apartment_feature,landed_type,street_id,thumbnail_image,ad_id,street_name,account_name,unitnumber,projectid,pty_characteristics,furnishing_sell,list_time,price,category,price_string,latitude,protection_entitlement,category_name,balconydirection,area,longitude
0,,,2.0,9217.0,13,10.0,,https://static.chotot.com/thumbs/admin/9999999...,092766****,401d3a2722b3029a60f2d06321874778,9.0,3.0,,,,,s,32.0,False,,https://cdn.chotot.com/Vdo81ZRCsR_OctWMywFZr3h...,2.0,90.0,https://cdn.chotot.com/uac2/15097873,,Quận 1,[],13000,3.0,False,3,,[https://cdn.chotot.com/e2KeIxUm3uXx4-PhCOl-o1...,,,,,accepted,,0.0008,Chính chủ cần bán gấp nhà đường Thạch Thị Than...,4.0,,,Tp Hồ Chí Minh,https://cdn.chotot.com/admincentre/location/10...,,,,15097873,1.0,19 phút trước,"10.7901,106.693",Cần bán,Phường Tân Định,"Nhà Q1, Thạch Thị Thanh, 3L, full Nội Thất đẹp...",,,0,True,184.375,Trần Thanh,,3.2,112718747,13096,,,,https://cdn.chotot.com/kKpv7Li0ECMHvkXpoD2gihz...,153261936,Đường Thạch Thị Thanh,Trần Ngọc,,,,1.0,1703499530998,5900000000,1020,"5,9 tỷ",10.7901,False,Nhà ở,,96,106.693
1,,,2.0,9226.0,13,10.0,,https://static.chotot.com/thumbs/admin/9999999...,093366****,2ba33655e496616002ec75c20ddaf5e8,6.0,2.0,,,,,s,30.0,False,,https://cdn.chotot.com/lwpW1gYbubNF-m7uz7RkWpS...,2.0,70.0,https://cdn.chotot.com/uac2/1264592,,Quận 1,[],13000,3.0,False,2,,[https://cdn.chotot.com/luF_zbn5o6NS1iX0Fhrqe3...,,,,,accepted,,0.0008,"📌 BÁN NHÀ Quận 1- Nguyễn Văn Cừ, P.Cầu Kho, Q1...",1.0,,,Tp Hồ Chí Minh,https://cdn.chotot.com/admincentre/location/10...,,,,1264592,1.0,20 phút trước,"10.7538,106.6872",Cần bán,Phường Cầu Kho,bán nhà nhìn ra cầu nguyễn văn cừ,6.0,,0,True,183.333333,An Nhiên,Trần Hưng Ðạo,3.0,112642233,13096,,,,https://cdn.chotot.com/7TQhv1Jsfx9ym50yVnUg4xw...,153172868,Đường Võ Văn Kiệt,Phương,,,[2],,1703499439901,5500000000,1020,"5,5 tỷ",10.7538,False,Nhà ở,,96,106.6872
2,,,2.0,9217.0,13,,,https://static.chotot.com/thumbs/admin/9999999...,058518****,1ac98311a851b61e093d13cdf1f2de33,4.0,,,,,,s,10.0,False,,https://cdn.chotot.com/Euh85xletv0yyh10Oj003uI...,2.0,,https://cdn.chotot.com/uac2/26372279,,Quận 1,[],13000,3.0,False,4,,[https://cdn.chotot.com/bioAQPE4HDyOf9AOXkwkuS...,,,,,accepted,,0.0008,Diện tích: 40m2\nGiá bán: 1 tỷ 468\nKết cấu: 1...,,,,Tp Hồ Chí Minh,https://cdn.chotot.com/admincentre/location/10...,,,,26372279,,35 phút trước,"10.7917,106.6931",Cần bán,Phường Tân Định,ĐÁY BẮT GIÁ bán nhà Nguyễn Phi Khanh Q.1 1T2L ...,,,0,True,146.8,Bình An,,,112824656,13096,,,,https://cdn.chotot.com/Diwx3L28MI5zFdsAEvYfoyi...,153385154,Nguyễn Phi Khanh,Ngọc Ánh,,,,,1703498571000,1468000000,1020,"1,468 tỷ",10.7917,False,Nhà ở,,96,106.6931
3,,,2.0,9223.0,13,9.5,,https://static.chotot.com/thumbs/admin/9999999...,090187****,0ab288b7de0724988a0631cf314d5d28,6.0,,,,,,s,33.0,False,,https://cdn.chotot.com/dgff7XRz_GSDb-BbIsRg8ey...,2.0,,https://cdn.chotot.com/uac2/26182525,,Quận 1,[],13000,3.0,False,3,,[https://cdn.chotot.com/QM1co-cGTvQn8iS7h4RQtp...,,,,,accepted,,0.0008,"Giá cực tốt, cần tiền bán gấp nhà Đề Thám, Phư...",4.0,,,Tp Hồ Chí Minh,https://cdn.chotot.com/admincentre/location/10...,,,,26182525,1.0,55 phút trước,"10.763212,106.695915",Cần bán,Phường Cầu Ông Lãnh,"Chính chủ cần bán gấp nhà Đề Thám, Quận 1. Giá...",,,0,True,103.030303,Tuấn Anh,,3.5,111661713,13096,,,,https://cdn.chotot.com/_1N7QNHhsWtUlfVm3Yv7cnE...,152029634,Đề Thám,Võ Minh Thư,,,[2],1.0,1703497352000,3400000000,1020,"3,4 tỷ",10.763212,False,Nhà ở,,96,106.695915
4,,,2.0,9218.0,13,,,https://static.chotot.com/thumbs/admin/9999999...,090135****,b85c01229386d78863ff719ae27e0018,3.0,,,,,,s,17.0,False,,https://cdn.chotot.com/WqEmMlKkfx2YWphvkPfBdVC...,2.0,103.0,https://cdn.chotot.com/uac2/17667771,,Quận 1,[],13000,1.0,False,4,,[https://cdn.chotot.com/Wwav0GSz0thp1zhcP56Ftr...,,,,,accepted,,0.0008,"chính chủ Bán nhà MT Nguyễn Đình Chiểu, Q.1\n3...",5.0,,,Tp Hồ Chí Minh,https://cdn.chotot.com/admincentre/location/10...,,,,17667771,,59 phút trước,"10.7854,106.6968",Cần bán,Phường Đa Kao,Chính chủ bán nhà Ngang 3m MT Nguyễn Đình Chiể...,1.0,,0,True,529.411765,Minh Tri,,,112824022,13096,,,,https://cdn.chotot.com/MSZ08bYPeOOJqDfCJBzVvja...,153384434,Nguyễn Đình Chiểu,Đỗ Kim Sơn,,,,,1703497092000,9000000000,1020,9 tỷ,10.7854,False,Nhà ở,,96,106.6968


In [22]:
df.to_csv(FILE_PATH, index=False)