In [17]:
import random
import time
from datetime import datetime
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

expected data:
- job url
- job title
- location
- salary
- experience
- tags
- jd
- company info (scale, field, company name, ...)
- categories (related jobs, skills)

### utils ###

In [None]:
def crawl_urls(max_page):
    """Collect job-detail URLs from the TopCV IT job listing.

    Listing pages 1..max_page (inclusive) are requested one after another.
    A page that cannot be downloaded or does not answer with HTTP 200 is
    reported and skipped instead of aborting the crawl.

    Args:
        max_page: Number of listing pages to read, starting at page 1.

    Returns:
        list[str]: The ``href`` of every job title link found, in page order.
    """
    job_detail_urls = []

    for page in range(1, max_page + 1):
        url = f"https://www.topcv.vn/tim-viec-lam-cong-nghe-thong-tin-cr257?sort=new&type_keyword=1&page={page}&category_family=r257"

        try:
            # Without a timeout a stalled connection would block the loop forever.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f'failed to retrieve listing page {page} ({exc}): ' + url)
            continue

        if response.status_code != 200:
            print(f'failed to retrieve listing page {page} (status {response.status_code}): ' + url)
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # One result card per job; the author observed at most 50 cards per page.
        jobs = soup.find_all('div', attrs={"data-box": "BoxSearchResult"})

        for job in jobs:
            # Skip cards that lack the title link instead of raising on None.
            heading = job.find('h3', class_='title')
            link = heading.find('a') if heading is not None else None
            href = link.get('href') if link is not None else None
            if href:
                job_detail_urls.append(href)

    return job_detail_urls

In [18]:
def get_page_content(url, max_retries=5, retry_delay=60, timeout=30):
    """Download a page and return its raw body, retrying on failure.

    Every attempt (the first one and each retry) is sent with a User-Agent
    picked at random from a small fixed list. An attempt counts as failed when
    the request raises a ``requests`` exception or the status code is not 200.

    Args:
        url: Address of the page to download.
        max_retries: Number of additional attempts after the first failure.
        retry_delay: Seconds to wait before each retry.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        bytes | None: ``response.content`` of the first successful attempt, or
        ``None`` when the first attempt and all retries failed.
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    ]

    def _attempt():
        """Run one GET; return the response, or None if the request itself failed."""
        try:
            return requests.get(
                url,
                headers={'User-Agent': random.choice(user_agents)},
                timeout=timeout,
            )
        except requests.RequestException as exc:
            print(f'request error ({exc}): ' + url)
            return None

    def _succeeded(response):
        return response is not None and response.status_code == 200

    response = _attempt()

    # handle request failure
    retries_left = max_retries
    while not _succeeded(response) and retries_left > 0:
        print('failed to retrieve: ' + url)
        print(f'will retry after {retry_delay}s...')
        time.sleep(retry_delay)
        # Retries go through the same helper so they keep the User-Agent header.
        response = _attempt()
        retries_left -= 1

    if not _succeeded(response):
        print(f'failed to retrieve after {max_retries} retries: ' + url)
        return None

    return response.content

From my exploration, IT jobs on topcv.vn currently fall into three types, depending on the contract between TopCV and the company posting the job:
1. Normal jobs (https://www.topcv.vn/viec-lam/...)
2. Brand jobs (https://www.topcv.vn/brand/...), which come in two variants:
    - Premium Brand
    - Brand

Each type has a different HTML structure, so each one needs its own parsing code.

### 1. Normal jobs ###

In [None]:
def extract_jd_normal_job(job):
    """Extract the job-description sections of a normal job page.

    Args:
        job: Parsed element (BeautifulSoup tag) holding the job detail body.

    Returns:
        dict[str, str]: Section heading -> section text. A section that
        contains ``<li>`` items yields one line per item; otherwise the text
        of its first ``<div>`` is used, one line per text fragment. Entries
        from the optional custom-form blocks are added afterwards under their
        own headings. Sections lacking a heading or a body are skipped, and a
        page without a description container yields only the custom-form
        entries (possibly an empty dict).
    """
    jd = {}

    # sections in the jd part of the html
    description = job.find('div', class_='job-description')
    sections = [] if description is None else description.find_all('div', class_='job-description__item')

    for section in sections:
        heading = section.find('h3')
        if heading is None:
            continue

        bullet_items = section.find_all('li')
        if bullet_items:
            content = "\n".join(item.get_text(strip=True) for item in bullet_items)
        else:
            body = section.find('div')
            if body is None:
                continue
            content = body.get_text(strip=True, separator='\n')

        jd[heading.get_text(strip=True)] = content

    # custom form job (some jobs have it, some do not)
    for form in job.find_all('div', class_='custom-form-job__item'):
        heading = form.find('h3')
        body = form.find('div', class_='custom-form-job__item--content')
        if heading is None or body is None:
            continue
        jd[heading.get_text(strip=True)] = body.get_text(strip=True)

    return jd


def extract_general_info_normal_job(job):
    """Extract the "general information" box of a normal job page.

    Args:
        job: Parsed element (BeautifulSoup tag) holding the job detail body.

    Returns:
        dict[str, str]: Title text -> value text for every
        ``box-general-group-info`` entry that has both parts. Empty when the
        box is not present on the page.
    """
    # general information
    general_inf = {}

    box = job.find('div', class_='job-detail__box--right job-detail__body-right--item job-detail__body-right--box-general')
    if box is None:
        return general_inf

    for inf in box.find_all('div', class_='box-general-group-info'):
        title_node = inf.find('div', class_='box-general-group-info-title')
        value_node = inf.find('div', class_='box-general-group-info-value')
        if title_node is None or value_node is None:
            # Incomplete entry: nothing meaningful to record.
            continue
        general_inf[title_node.get_text(strip=True)] = value_node.get_text(strip=True)

    return general_inf


def extract_categories_box_normal_job(job):
    """Extract the category box (titled groups of tags) of a normal job page.

    Args:
        job: Parsed element (BeautifulSoup tag) holding the job detail body.

    Returns:
        dict[str, list[str]]: Group title -> tag texts. Tags are read from
        ``<a>`` elements when the group has any, otherwise from ``<span>``
        elements. Groups lacking a title or a tags container are skipped;
        the dict is empty when the category box is not present.
    """
    categories = {}

    container = job.find('div', class_="job-detail__box--right job-detail__body-right--item job-detail__body-right--box-category")
    if container is None:
        return categories

    for category in container.find_all('div', class_=['box-category', 'box-category-collapsed']):
        title_node = category.find('div', class_='box-title')
        tags_node = category.find('div', class_='box-category-tags')
        if title_node is None or tags_node is None:
            continue

        # Look the tag elements up once; fall back to <span> when there are no links.
        tag_nodes = tags_node.find_all('a') or tags_node.find_all('span')
        categories[title_node.get_text(strip=True)] = [node.get_text(strip=True) for node in tag_nodes]

    return categories


def scrape_normal_job(url):
    """Scrape one "normal" job posting (``/viec-lam/...`` page).

    Args:
        url: Address of the job detail page.

    Returns:
        dict | None: ``None`` when the page cannot be downloaded or the
        ``job-detail__body`` container is absent. Otherwise a dict with the
        keys ``url``, ``title``, ``location``, ``salary``, ``exp``,
        ``deadline``, ``jd_tags``, ``jd``, ``company_info``, ``general_inf``
        and ``categories``. A scalar field whose element is missing from the
        page is stored as ``None`` rather than raising.
    """
    response_content = get_page_content(url)
    if response_content is None:
        print('failed to get page content: ' + url)
        return None

    soup = BeautifulSoup(response_content, "html.parser")

    # parse html
    job = soup.find('div', class_='job-detail__body')

    if job is None:
        print('failed to parse: ' + url)
        return None

    def text_of(node, **kwargs):
        """Stripped text of `node`, or None when the node was not found."""
        return node.get_text(strip=True, **kwargs) if node is not None else None

    def company_value(item_class):
        """Text of the `company-value` div inside the given company info item."""
        item = job.find('div', class_=item_class)
        return text_of(item.find('div', class_='company-value')) if item is not None else None

    # extract data
    job_title = text_of(job.find('h1', class_='job-detail__info--title'), separator=' ')

    # The first three values are read positionally as salary, location, experience.
    infs = [text_of(node) for node in job.find_all('div', class_='job-detail__info--section-content-value')][:3]
    infs += [None] * (3 - len(infs))  # pad so a shorter page does not raise IndexError
    salary, location, exp = infs

    # The slice drops the first 15 characters - presumably the fixed label that
    # precedes the date (the sample output shows a bare dd/mm/yyyy); verify if the page changes.
    deadline_text = text_of(job.find('div', class_='job-detail__info--deadline'))
    deadline = deadline_text[15:] if deadline_text is not None else None

    jd_tags = [a.get_text(strip=True) for a in job.find_all('a', class_='item search-from-tag link')]

    jd = extract_jd_normal_job(job)

    # company info
    company_label = job.find('div', class_='company-name-label')
    company_name = text_of(company_label.find('a', class_='name')) if company_label is not None else None
    company_scale = company_value('job-detail__company--information-item company-scale')
    company_address = company_value('job-detail__company--information-item company-address')
    company_field = company_value('job-detail__company--information-item company-field')

    general_inf = extract_general_info_normal_job(job)

    categories = extract_categories_box_normal_job(job)

    # Empty the parsed element now that every value has been copied out as
    # plain strings/lists/dicts (presumably to release the tree early).
    job.clear()

    return {
        'url': url,
        'title': job_title,
        'location': location,
        'salary': salary,
        'exp': exp,
        'deadline': deadline,
        'jd_tags': jd_tags,
        'jd': jd,
        'company_info': {
            'company_name': company_name,
            'company_scale': company_scale,
            'company_address': company_address,
            'company_field': company_field
        },
        'general_inf': general_inf,
        'categories': categories
    }

### 2. Brand ###

In [20]:
def scrape_brand_job(url):
    """Scrape one (non-premium) Brand job posting (``/brand/...`` page).

    Args:
        url: Address of the job detail page.

    Returns:
        dict | None: ``None`` when the page cannot be downloaded or the
        ``block-left`` / ``box-job-info`` containers are absent. Otherwise a
        dict with the keys ``url``, ``title``, ``address``, ``general_info``,
        ``job_tags``, ``jd`` and ``company``. ``title`` and ``address`` are
        ``None`` when their elements are missing.
    """
    response_content = get_page_content(url)
    if response_content is None:
        print('failed to get page content: ' + url)
        return None

    soup = BeautifulSoup(response_content, "html.parser")

    job = soup.find('div', class_='block-left')
    job_info = job.find('div', class_='box-job-info') if job is not None else None
    if job is None or job_info is None:
        # Same failure convention as scrape_normal_job.
        print('failed to parse: ' + url)
        return None

    title_node = job.find('h2', class_='title')
    job_title = title_node.get_text(strip=True) if title_node is not None else None

    # According to the author's exploration, the 'box-info' blocks in the job
    # details part are: general info, job tags and job description.
    details = job_info.find_all('div', class_='box-info')

    jd = {}
    general_info = {}
    job_tags = []

    for section in details:
        # general info
        main_box = section.find('div', class_='box-main')
        if main_box is not None:
            for item in main_box.find_all('div', class_='box-item'):
                label_node = item.find('strong')
                value_node = item.find('span')
                if label_node is None or value_node is None:
                    continue
                general_info[label_node.get_text(strip=True)] = value_node.get_text(strip=True)
            continue

        # jd
        heading = section.find('h2')
        content_tab = section.find('div', class_='content-tab')
        if heading is not None and content_tab is not None:
            lines = []
            # NOTE(review): find_all() walks every descendant, so text inside
            # nested div/p/ul elements may be emitted more than once - confirm
            # against real pages before restricting it to direct children.
            for part in content_tab.find_all():
                if part.name == 'ul':
                    lines.extend("- " + li.get_text(strip=True) for li in part.find_all('li'))
                elif part.name in ('div', 'p'):
                    lines.append(part.get_text(strip=True))
            jd[heading.get_text(strip=True)] = "".join(line + "\n" for line in lines)

        # custom form job (some jobs have it, some do not)
        for form in section.find_all('div', class_='custom-form-job__item'):
            form_heading = form.find('h3')
            form_body = form.find('div', class_='custom-form-job__item--content')
            if form_heading is None or form_body is None:
                continue
            jd[form_heading.get_text(strip=True)] = form_body.get_text(strip=True)

        # job tags (this part sits inside the job description section)
        tags_box = section.find('div', class_='job-tags')
        if tags_box is not None:
            job_tags.extend(tag.get_text(strip=True) for tag in tags_box.find_all('a'))

    # address
    address_div = job_info.find('div', class_='box-address')
    if address_div is not None:
        address = "\n".join(a.get_text(strip=True) for a in address_div.find_all())
    else:
        address = None

    # company (this part is in the footer)
    company = {}
    footer = soup.find('div', class_='footer-info')
    if footer is not None:
        name_node = footer.find('div', class_='footer-info-content footer-info-company-name')
        if name_node is not None:
            company['name'] = name_node.get_text(strip=True)

        for title_div in footer.find_all('div', class_='footer-info-title'):
            value_div = title_div.find_next('div')
            if value_div is None:
                continue
            # A div without a class attribute returns None from .get('class').
            classes = value_div.get('class') or []
            if classes and classes[0] == 'footer-info-content':
                company[title_div.get_text(strip=True)] = value_div.get_text(strip=True)

    return {
        'url': url,
        'title': job_title,
        'address': address,
        'general_info': general_info,
        'job_tags': job_tags,
        'jd': jd,
        'company': company
    }

### 3. Premium ###

In [21]:
# jd
def extract_jd_premium(job):
    """Build a heading -> text mapping from the description boxes of a premium job.

    Each ``premium-job-description__box`` contributes one entry keyed by the
    text of its ``<h2>``. The value is the box's ``<li>`` texts joined by
    newlines when it has list items, otherwise the newline-separated text of
    its first ``<div>``.
    """
    def box_content(box):
        bullets = box.find_all('li')
        if bullets:
            return "\n".join(bullet.get_text(strip=True) for bullet in bullets)
        return box.find('div').get_text(strip=True, separator='\n')

    return {
        box.find('h2').get_text(strip=True): box_content(box)
        for box in job.find_all('div', class_='premium-job-description__box')
    }


def extract_general_info_premium(job):
    """Map each general-information label to its value on a premium job page.

    Every ``general-information-data`` div contributes one entry: the stripped
    text of its ``__label`` child as key and of its ``__value`` child as value.
    """
    return {
        row.find('div', class_='general-information-data__label').get_text(strip=True):
            row.find('div', class_='general-information-data__value').get_text(strip=True)
        for row in job.find_all('div', class_='general-information-data')
    }


def extract_related_tags_premium(job):
    """Collect the related-tag groups of a premium job page.

    Returns a dict keyed by each section's ``premium-job-box__title`` heading
    text; the value is the list of stripped texts of every ``tag-item``
    element inside that section.
    """
    section_classes = [
        "premium-job-related-tags__section",
        "premium-job-related-tags__section box-category collapsed",
    ]

    def section_entry(section):
        heading = section.find('h2', class_='premium-job-box__title').get_text(strip=True)
        items = [item.get_text(strip=True) for item in section.find_all(class_='tag-item')]
        return heading, items

    return dict(section_entry(section) for section in job.find_all('div', class_=section_classes))


def scrape_premium_brand_job(url):
    """Scrape one Premium Brand job posting (``/brand/...`` page with a ``premium-job`` block).

    Args:
        url: Address of the job detail page.

    Returns:
        dict | None: ``None`` when the page cannot be downloaded or has no
        ``premium-job`` container. Otherwise a dict with the keys ``url``,
        ``title``, ``location``, ``salary``, ``exp``, ``job_tags``, ``jd``,
        ``general_info`` and ``related_tags``. A scalar field whose element is
        missing is stored as ``None``; ``job_tags`` is empty when the tag box
        is missing.
    """
    response_content = get_page_content(url)
    if response_content is None:
        print('failed to get page content: ' + url)
        return None

    soup = BeautifulSoup(response_content, "html.parser")

    job = soup.find('div', class_='premium-job')
    if job is None:
        # Same failure convention as scrape_normal_job.
        print('failed to parse: ' + url)
        return None

    def text_of(node):
        """Stripped text of `node`, or None when the node was not found."""
        return node.get_text(strip=True) if node is not None else None

    job_title = text_of(job.find('h2', class_='premium-job-basic-information__content--title'))

    sections_box = job.find('div', class_='premium-job-basic-information__content--sections')
    info_sections = [] if sections_box is None else sections_box.find_all('div', class_="basic-information-item")

    # The first three items are read positionally as salary, location, experience.
    values = [
        text_of(section.find('div', class_='basic-information-item__data--value'))
        for section in info_sections[:3]
    ]
    values += [None] * (3 - len(values))  # pad so a shorter page does not raise IndexError
    salary, location, exp = values

    tags_box = job.find('div', class_='job-tags')
    job_tags = [tag.get_text(strip=True) for tag in tags_box.find_all('a')] if tags_box is not None else []

    jd = extract_jd_premium(job)

    general_info = extract_general_info_premium(job)

    related_tags = extract_related_tags_premium(job)

    return {
        'url': url,
        'title': job_title,
        'location': location,
        'salary': salary,
        'exp': exp,
        'job_tags': job_tags,
        'jd': jd,
        'general_info': general_info,
        'related_tags': related_tags
    }

### main ###

In [23]:
# Collect the job-detail URLs from the first listing page only (max_page=1).
job_detail_urls = crawl_urls(max_page=1)

In [24]:
# Scrape the first 20 collected URLs, dispatching each one to the scraper that
# matches its page type, and keep only the successful results in `data`.
data = []

for url in job_detail_urls[:20]:
    print('scraping  ' + url)

    # Route on the URL path instead of fixed character offsets, which only
    # work for one exact scheme + host prefix.
    path = urlparse(url).path

    if path.startswith('/viec-lam/'):
        res = scrape_normal_job(url)

    elif path.startswith('/brand/'):
        # detecting normal brand job or premium brand job
        # NOTE(review): the page is downloaded here and again inside the chosen
        # scraper; the scrapers only accept a URL, so the parsed page cannot be reused.
        page_content = get_page_content(url)
        if page_content is None:
            print('skipped ' + url)
            continue

        premium = BeautifulSoup(page_content, "html.parser").find('div', class_='premium-job')

        if premium is None:
            res = scrape_brand_job(url)
        else:
            res = scrape_premium_brand_job(url)

    else:
        print('skipped ' + url)
        continue

    # Test the scraper result (not the `data` list, which is never None) so
    # that failed scrapes are not stored as null entries.
    if res is not None:
        data.append(res)
        print('Done')

    # Random pause between postings, whether or not the scrape succeeded.
    time.sleep(random.uniform(1, 5))


scraping  https://www.topcv.vn/viec-lam/data-engineer/1927366.html?ta_source=JobSearchList_LinkDetail&u_sr_id=aOxDEuuHPibDfkrFSMzdYmj5Yu0msCLHhNEqy3DM_1761447361
Done
scraping  https://www.topcv.vn/viec-lam/junior-security-engineer-upto-35tr-thang/1926793.html?ta_source=JobSearchList_LinkDetail&u_sr_id=aOxDEuuHPibDfkrFSMzdYmj5Yu0msCLHhNEqy3DM_1761447361
Done
scraping  https://www.topcv.vn/viec-lam/chuyen-vien-phan-tich-nghiep-vu-business-analyst/1926866.html?ta_source=JobSearchList_LinkDetail&u_sr_id=aOxDEuuHPibDfkrFSMzdYmj5Yu0msCLHhNEqy3DM_1761447361
Done
scraping  https://www.topcv.vn/viec-lam/business-analyst/1927367.html?ta_source=JobSearchList_LinkDetail&u_sr_id=aOxDEuuHPibDfkrFSMzdYmj5Yu0msCLHhNEqy3DM_1761447361
Done
scraping  https://www.topcv.vn/viec-lam/technical-leader-net-tu-2-nam-kinh-nghiem/1909066.html?ta_source=JobSearchList_LinkDetail&u_sr_id=aOxDEuuHPibDfkrFSMzdYmj5Yu0msCLHhNEqy3DM_1761447361
Done
scraping  https://www.topcv.vn/viec-lam/chuyen-vien-tu-van-giai-phap-pre

In [28]:
import json

# Persist the scraped records as pretty-printed UTF-8 JSON; ensure_ascii=False
# keeps non-ASCII characters readable instead of \u-escaping them.
with open(
    'D:/SelfLearning/crawlingdata/topcv/data/topcv_it_jobs_sample.json',
    mode='w',
    encoding='utf-8',
) as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=4)

In [32]:
# Spot-check the normal-job scraper on a single posting; the returned dict is
# displayed as this cell's output.
scrape_normal_job("https://www.topcv.vn/viec-lam/nhan-vien-2d-artist-game/1926955.html?ta_source=JobSearchList_LinkDetail&u_sr_id=aOxDEuuHPibDfkrFSMzdYmj5Yu0msCLHhNEqy3DM_1761447361")

{'url': 'https://www.topcv.vn/viec-lam/nhan-vien-2d-artist-game/1926955.html?ta_source=JobSearchList_LinkDetail&u_sr_id=aOxDEuuHPibDfkrFSMzdYmj5Yu0msCLHhNEqy3DM_1761447361',
 'title': 'Nhân Viên 2D Artist Game',
 'location': 'Hà Nội',
 'salary': 'Thoả thuận',
 'exp': '2 năm',
 'deadline': '24/11/2025',
 'jd_tags': ['Chuyên môn Game Design', 'IT - Phần mềm', 'Nghỉ thứ 7'],
 'jd': {'Mô tả công việc': 'Tham gia thiết kế cho các dự án\nTham gia nghiên cứu định hướng đồ hoạ Game\nTham gia giải quyết vấn đề dự án, quy trình\nHọc hỏi phát triển kỹ năng chuyên môn\nPhát triển kỹ năng làm việc nhóm',
  'Yêu cầu ứng viên': 'Đam mê với Game và đồ họa Game\nKỹ năng Digital drawing tốt\nCó tư duy hội họa, vẽ tốt, biết làm effect\nCó kinh nghiệm về thiết kế nhân vật, môi trường, UI\nAm hiểu về Photoshop hay các công cụ art digital khác\nTính cam kết và tinh thần trách nhiệm cao\nHam học hỏi, chủ động nâng cao kiến thức và kỹ năng trong công việc\nHiểu về 3D modeling, Game design, Animation là một lợ