In [1]:
import pandas as pd
import numpy as np
import os
import json
import requests
from bs4 import BeautifulSoup
import concurrent.futures
import time
from concurrent.futures import ProcessPoolExecutor 
from multiprocessing import cpu_count 
import asyncio 
import aiohttp 

In [3]:
def save_json(file, filename):
    SAVE_DIR = "data/topcv/json"
    with open(f'{SAVE_DIR}/{filename}.json', 'w') as openfile:
        json.dump(file, openfile)

In [4]:
def read_json(filename):
    OPEN_DIR = "data/topcv/json"
    with open(f'{OPEN_DIR}/{filename}.json', 'r') as openfile:
        file = json.load(openfile)
    return file

In [6]:
def append_text(content, filename):
    OPEN_DIR = 'data/topcv/text'
    with open(f'{OPEN_DIR}/{filename}.txt', 'a') as openfile:
        openfile.write(f'{content}\n')

In [20]:
with open("data/topcv/json/industries_raw.json", 'r') as openfile:
    industries = json.load(openfile)

# Collect industries

### Sample record

In [4]:
resp = requests.get("https://www.topcv.vn/categories/list-top")
inds_js = json.loads(resp.text)

In [5]:
industries_js = inds_js['data']['categories']

In [6]:
industries_js[0]

{'id': 10001,
 'name': 'Kinh doanh / Bán hàng',
 'alias': 'kinh-doanh-ban-hang',
 'access_url': '/tim-viec-lam-kinh-doanh-ban-hang-c10001',
 'job_category_count': '11.119'}

In [10]:
# with open("data/topcv/json/industries_raw.json", 'w') as openfile:
#     json.dump(industries_js, openfile)

In [28]:
resp = requests.get("https://www.topcv.vn/tim-viec-lam-kinh-doanh-ban-hang-c10001")
pg_bs = BeautifulSoup(resp.text, 'html.parser')

In [29]:
job_body = pg_bs.find("div",{'class':'job-body'})
job_list = job_body.find('div', {'class':'job-list-default'})
job_items = job_list.find_all('div',{'class':'job-item-default'})
job_item_ = job_items[0].find('div',{'class':'body'})
job_url = job_item_.find('a').get('href')
job_url

'https://www.topcv.vn/viec-lam/nhan-vien-kinh-doanh-telesales-tu-van-luong-cung-tu-7-trieu-den-12-trieu-hoa-hong/1021535.html?ta_source=JobSearchList'

In [30]:
resp = requests.get(job_url)
jb_bs = BeautifulSoup(resp.text, 'html.parser')

In [31]:
job_header = jb_bs.find('div',{'class':'box-detail-job'}).find('div',{'class':'box-header'})
company_name = job_header.find('a',{'class':'company-logo'}).get('title')
job_title = job_header.find('div',{'class':'box-info-job'}).find('h1',{'class':'job-title'}).getText().strip()
job_deadline = job_header.find('div',{'class':'job-deadline'}).getText().strip()

In [32]:
jb_bs.find('div',{'class':'box-header'})

<div class="box-header">
<a class="company-logo" href="https://www.topcv.vn/cong-ty/cong-ty-tnhh-giai-phap-mui-huong-eco-life-viet-nam/18567.html" title="Công ty TNHH Giải pháp mùi hương Eco Life Việt Nam">
<div class="box-company-logo">
<img alt="Công ty TNHH Giải pháp mùi hương Eco Life Việt Nam" class="img-responsive" src="https://cdn.topcv.vn/80/company_logos/cong-ty-tnhh-giai-phap-mui-huong-eco-life-viet-nam-5c88cfef106fa_rs.jpg"/>
</div>
</a>
<div class="box-info-job">
<h1 class="job-title text-highlight bold" style="overflow-wrap:break-word;">
<a class="text-highlight" href="https://www.topcv.vn/tim-viec-lam-nhan-vien-kinh-doanh" target="_blank">Nhân Viên Kinh Doanh</a> / Telesales, Tư Vấn Lương Cứng Từ 7 Triệu Đến 12 Triệu + Hoa Hồng
</h1>
<div class="company-title">
<a class="text-dark-blue" href="https://www.topcv.vn/cong-ty/cong-ty-tnhh-giai-phap-mui-huong-eco-life-viet-nam/18567.html">Công ty TNHH Giải pháp mùi hương Eco Life Việt Nam</a>
</div>
<div class="job-deadline">
<

In [33]:
job_info = jb_bs.find('div',{'id':'tab-info'})
general_info = job_info.find_all('div',{'class':'box-info'})[-1].find_all('div',{'class':'box-item'})
job_salary = general_info[0].find('span').getText().strip()
job_type = general_info[2].find('span').getText().strip()
job_level = general_info[3].find('span').getText().strip()
job_yoe = general_info[-1].find('span').getText().strip()

In [34]:
job_addresses = [item.text.strip() for item in job_info.find('div',{'class':'box-address'}).find_all('div')[1:]]

In [35]:
job_addresses

['- Hà Nội: Tầng 04, tòa nhà Sapphire Building số 163 Bà Triệu, P. Lê Đại Hành, Q. Hai Bà Trưng, TP. Hà Nội, Hai Bà Trưng',
 '- Hồ Chí Minh: 29/23A Đoàn Thị Điểm, Phường 1, Phú Nhuận']

In [41]:
job_data = job_info.find('div',{'class':'job-data'})
job_details = job_data.find_all('div',{'class':'content-tab'})
job_desc = str(job_details[0])
job_req = str(job_details[1])

In [16]:
job_dict_arr = []
job_dict = {}
job_dict['job_desc'] = job_desc
job_dict['job_req'] = job_req
job_dict_arr.append(job_dict)
with open("data/topcv/json/topcv_industry_1", 'w') as outputfile:
    json.dump(job_dict_arr, outputfile)

In [42]:
max_page = int(pg_bs.find('ul',{'class':'pagination'}).find_all('li')[-2].text.strip())
max_page

400

### Loop through each industries and get all jobs of that industries

In [4]:
def get_jobs_by_industry(industry_name, industry_url, max_count=1000, save_name=None):
    except_urls = []
    if save_name is None:
        save_name = industry_name
    SAVE_DIR = "data/topcv/json"
    count = 0
    jobs_by_industries = {}
    base_url = "https://www.topcv.vn"
    print("Collecting industry: ", industry_name)
    jobs_by_industries[industry_name] = []
    ind_url = industry_url
    
    resp = requests.get(f'{base_url}{ind_url}')
    pg_bs = BeautifulSoup(resp.text, 'html.parser')
    try:
        max_page = int(pg_bs.find('ul',{'class':'pagination'}).find_all('li')[-2].text.strip())
    except Exception as e:
        max_page = 1
    print("Total page: ", max_page)
    for i in range(1,max_page+1):
        resp = requests.get(f'{base_url}{ind_url}?page={i}')
        print(f'{base_url}{ind_url}?page={i}')
        pg_bs = BeautifulSoup(resp.text, 'html.parser')
        job_body = pg_bs.find("div",{'class':'job-body'})
        job_list = job_body.find('div', {'class':'job-list-default'})
        job_items = job_list.find_all('div',{'class':'job-item-default'})
        for job_item in job_items:
            job_dict = {}
            job_item_ = job_item.find('div',{'class':'body'})
            job_url = job_item_.find('a').get('href')
            try:
                job_dict['url'] = job_url
                resp1 = requests.get(job_url)
                jb_bs = BeautifulSoup(resp1.text, 'html.parser')
                job_header = jb_bs.find('div',{'class':'box-detail-job'}).find('div',{'class':'box-header'})
                company_name = job_header.find('a',{'class':'company-logo'}).get('title')
                job_title = job_header.find('div',{'class':'box-info-job'}).find('h1',{'class':'job-title'}).getText().strip()
                job_deadline = job_header.find('div',{'class':'job-deadline'}).getText().strip()
                job_info = jb_bs.find('div',{'id':'tab-info'})

                general_info = job_info.find_all('div',{'class':'box-info'})[-1].find_all('div',{'class':'box-item'})
                job_salary = general_info[0].find('span').getText().strip()
                job_type = general_info[2].find('span').getText().strip()
                job_level = general_info[3].find('span').getText().strip()
                job_yoe = general_info[-1].find('span').getText().strip()
                job_addresses = [item.text.strip() for item in job_info.find('div',{'class':'box-address'}).find_all('div')[1:]]

                job_data = job_info.find('div',{'class':'job-data'})
                job_details = job_data.find_all('div',{'class':'content-tab'})
                job_desc = str(job_details[0])
                job_req = str(job_details[1])

                job_dict['job_title'] = job_title
                job_dict['job_deadline'] = job_deadline
                job_dict['company_name'] = company_name
                job_dict['job_salary'] = job_salary
                job_dict['job_type'] = job_type
                job_dict['job_yoe'] = job_yoe
                job_dict['job_addresses'] = job_addresses
                job_dict['job_desc'] = job_desc
                job_dict['job_req'] = job_req
                jobs_by_industries[industry_name].append(job_dict)
                count += 1
                if count % max_count == 0:
                    with open(f'{SAVE_DIR}/topcv_{save_name}_raw.json', 'w') as openfile:
                        json.dump(jobs_by_industries, openfile)
                    print(f"----------------Saved {count} records.----------------")
#                 print(f'{len(job_title)} - {len(job_salary)} - {len(job_type)} - {len(job_addresses)} - {len(job_desc)} - {len(job_req)}')
            except Exception as e:
                except_urls.append(job_url)
                
        print(f"Finished page {i} / {max_page}")
        
    with open(f'{SAVE_DIR}/topcv_{save_name}_raw.json', 'w') as openfile:
                        json.dump(jobs_by_industries, openfile)
    print(f"----------------Saved all {industry_name} records.----------------")
    return jobs_by_industries, except_urls

In [2]:
# time_start = time.time()
# total_lengths = []
# for industry in industries_js:
#     output = get_jobs_by_industry(industry['name'], industry['access_url'])
#     total_lengths.append(len(output))
# time_end = time.time()
# print("Finished collecting all jobs")
# print("Total time: ", (time_end - time_start))

In [22]:
exe_urls_dict = {}

Collecting Hàng cao cấp

In [44]:
luxury_goods_raw, exc_urls = get_jobs_by_industry('Hàng cao cấp', '/tim-viec-lam-hang-cao-cap-c10113')

Collecting industry:  Hàng cao cấp
Total page:  3
https://www.topcv.vn/tim-viec-lam-hang-cao-cap-c10113?page=1
Finished page 1 / 3
https://www.topcv.vn/tim-viec-lam-hang-cao-cap-c10113?page=2
Finished page 2 / 3
https://www.topcv.vn/tim-viec-lam-hang-cao-cap-c10113?page=3
Finished page 3 / 3
----------------Saved all Hàng cao cấp records.----------------


In [46]:
exe_urls_dict['Hàng cao cấp'] = exc_urls

Collecting Sản phẩm công nghiệp

In [9]:
industrial_prods_raw, exc_urls_prod = get_jobs_by_industry('Sản phẩm công nghiệp', '/tim-viec-lam-san-pham-cong-nghiep-c10125')
save_json(exc_urls_prod, 'exc_urls_prod')

Collecting industry:  Sản phẩm công nghiệp
Total page:  8
https://www.topcv.vn/tim-viec-lam-san-pham-cong-nghiep-c10125?page=1
Finished page 1 / 8
https://www.topcv.vn/tim-viec-lam-san-pham-cong-nghiep-c10125?page=2
Finished page 2 / 8
https://www.topcv.vn/tim-viec-lam-san-pham-cong-nghiep-c10125?page=3
Finished page 3 / 8
https://www.topcv.vn/tim-viec-lam-san-pham-cong-nghiep-c10125?page=4
Finished page 4 / 8
https://www.topcv.vn/tim-viec-lam-san-pham-cong-nghiep-c10125?page=5
Finished page 5 / 8
https://www.topcv.vn/tim-viec-lam-san-pham-cong-nghiep-c10125?page=6
Finished page 6 / 8
https://www.topcv.vn/tim-viec-lam-san-pham-cong-nghiep-c10125?page=7
Finished page 7 / 8
https://www.topcv.vn/tim-viec-lam-san-pham-cong-nghiep-c10125?page=8
Finished page 8 / 8
----------------Saved all Sản phẩm công nghiệp records.----------------


In [12]:
testjs = read_json('topcv_Sản phẩm công nghiệp_raw')
testjs

{'Sản phẩm công nghiệp': [{'url': 'https://www.topcv.vn/viec-lam/nhan-vien-theo-doi-don-hang-biet-tieng-anh-hoac-tieng-hoa/1018936.html?ta_source=JobSearchList',
   'job_title': 'Nhân Viên Theo Dõi Đơn Hàng - Biết Tiếng Anh Hoặc Tiếng Hoa',
   'job_deadline': 'Hạn nộp hồ sơ: 15/06/2023',
   'company_name': 'CÔNG TY TNHH JUWON VIỆT NAM',
   'job_salary': '10 - 13 triệu',
   'job_type': 'Toàn thời gian',
   'job_yoe': '3 năm',
   'job_addresses': ['- Bình Dương: Lô C5-C6 (khu A2), đường D9, khu công nghiệp Rạch Bắp, Xã An Tây, Bến Cát'],
   'job_desc': '<div class="content-tab"><ul><li>Tiếp nhận đơn hàng từ Khách hàng tìm kiếm, đặt mua và đăm vào về nguyên liệu</li><li>Quản lý PO và lịch giao hàng cho nhà cung cấp</li><li>Theo dõi sản phẩm trong từng giai đoạn</li><li>Làm việc với bộ phận sản xuất để xác nhận tiến độ sản xuất của lô hàng</li><li>Gửi chứng từ cho bộ phận xuất nhập khẩu để xuất hàng</li></ul></div>',
   'job_req': '<div class="content-tab"><ul><li>Thành thạo Tiếng Anh hoặc

Collecting 'Phi chính phủ / Phi lợi nhuận'

In [26]:
ngo_raw, exc_urls_ngo = get_jobs_by_industry('Phi chính phủ / Phi lợi nhuận', '/tim-viec-lam-phi-chinh-phu-phi-loi-nhuan-c10124',
                                            save_name='ngo')
save_json(exc_urls_ngo, 'exc_urls_ngo')

Collecting industry:  Phi chính phủ / Phi lợi nhuận
Total page:  1
https://www.topcv.vn/tim-viec-lam-phi-chinh-phu-phi-loi-nhuan-c10124?page=1
Finished page 1 / 1
----------------Saved all Phi chính phủ / Phi lợi nhuận records.----------------


Collecting 'Địa chất / Khoáng sản'

In [27]:
dcks_raw, exc_urls_dcks = get_jobs_by_industry('Địa chất / Khoáng sản', '/tim-viec-lam-dia-chat-khoang-san-c10111',
                                            save_name='dcks')
save_json(exc_urls_dcks, 'exc_urls_dcks')

Collecting industry:  Địa chất / Khoáng sản
Total page:  2
https://www.topcv.vn/tim-viec-lam-dia-chat-khoang-san-c10111?page=1
Finished page 1 / 2
https://www.topcv.vn/tim-viec-lam-dia-chat-khoang-san-c10111?page=2
Finished page 2 / 2
----------------Saved all Địa chất / Khoáng sản records.----------------


Collecting 'NGO / Phi chính phủ / Phi lợi nhuận'

In [28]:
ngo2_raw, exc_urls_ngo2 = get_jobs_by_industry('NGO / Phi chính phủ / Phi lợi nhuận', '/tim-viec-lam-ngo-phi-chinh-phu-phi-loi-nhuan-c10132',
                                            save_name='ngo2')
save_json(exc_urls_ngo2, 'exc_urls_ngo2')

Collecting industry:  NGO / Phi chính phủ / Phi lợi nhuận
Total page:  1
https://www.topcv.vn/tim-viec-lam-ngo-phi-chinh-phu-phi-loi-nhuan-c10132?page=1
Finished page 1 / 1
----------------Saved all NGO / Phi chính phủ / Phi lợi nhuận records.----------------


Collecting 'Hàng không'

In [29]:
hangkhong_raw, exc_urls_hangkhong = get_jobs_by_industry('Hàng không', '/tim-viec-lam-hang-khong-c10022',
                                            save_name=None)
save_json(exc_urls_hangkhong, 'exc_urls_hangkhong')

Collecting industry:  Hàng không
Total page:  3
https://www.topcv.vn/tim-viec-lam-hang-khong-c10022?page=1
Finished page 1 / 3
https://www.topcv.vn/tim-viec-lam-hang-khong-c10022?page=2
Finished page 2 / 3
https://www.topcv.vn/tim-viec-lam-hang-khong-c10022?page=3
Finished page 3 / 3
----------------Saved all Hàng không records.----------------


Collecting 'In ấn / Xuất bản'

In [30]:
inanxb_raw, exc_urls_inanxb = get_jobs_by_industry('In ấn / Xuất bản', '/tim-viec-lam-in-an-xuat-ban-c10024',
                                            save_name='printing')
save_json(exc_urls_inanxb, 'exc_urls_printing')

Collecting industry:  In ấn / Xuất bản
Total page:  6
https://www.topcv.vn/tim-viec-lam-in-an-xuat-ban-c10024?page=1
Finished page 1 / 6
https://www.topcv.vn/tim-viec-lam-in-an-xuat-ban-c10024?page=2
Finished page 2 / 6
https://www.topcv.vn/tim-viec-lam-in-an-xuat-ban-c10024?page=3
Finished page 3 / 6
https://www.topcv.vn/tim-viec-lam-in-an-xuat-ban-c10024?page=4
Finished page 4 / 6
https://www.topcv.vn/tim-viec-lam-in-an-xuat-ban-c10024?page=5
Finished page 5 / 6
https://www.topcv.vn/tim-viec-lam-in-an-xuat-ban-c10024?page=6
Finished page 6 / 6
----------------Saved all In ấn / Xuất bản records.----------------


In [35]:
missed_industries = []
for i in range(0, 60):
    industry = industries[i]
    job_count = int(industry['job_category_count'].replace('.',''))
    if job_count > 1000:
        continue
    try:
        js_raw, exc_urls_i = get_jobs_by_industry(industry_name=industry['name'],
                                                industry_url=industry['access_url'],
                                                 save_name=str(i))
        save_json(exc_urls_i, f'exc_urls_{i}')
    except Exception as e:
        print(e, '-', i)
        missed_industries.append(i)

Collecting industry:  Vận tải / Kho vận
Total page:  37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=1
Finished page 1 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=2
Finished page 2 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=3
Finished page 3 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=4
Finished page 4 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=5
Finished page 5 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=6
Finished page 6 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=7
Finished page 7 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=8
Finished page 8 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=9
Finished page 9 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=10
Finished page 10 / 37
https://www.topcv.vn/tim-viec-lam-van-tai-kho-van-c10047?page=11
Finished page 11 / 37
https

Total page:  35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=1
Finished page 1 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=2
Finished page 2 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=3
Finished page 3 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=4
Finished page 4 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=5
Finished page 5 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=6
Finished page 6 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=7
Finished page 7 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=8
Finished page 8 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=9
Finished page 9 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=10
Finished page 10 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=11
Finished page 11 / 35
https://www.topcv.vn/tim-viec-lam-thoi-trang-c10042?page=12
Finished page 12 / 35
https://www.topcv.

KeyboardInterrupt: 

Saved at Tổ chức sự kiện / Quà tặng records 

Index 14

In [7]:
missed_industries = []
for i in range(15, 60):
    industry = industries[i]
    job_count = int(industry['job_category_count'].replace('.',''))
    if job_count > 1000:
        continue
    try:
        js_raw, exc_urls_i = get_jobs_by_industry(industry_name=industry['name'],
                                                industry_url=industry['access_url'],
                                                 save_name=str(i))
        save_json(exc_urls_i, f'exc_urls_{i}')
    except Exception as e:
        print(e, '-', i)
        missed_industries.append(i)

Collecting industry:  Thực phẩm / Đồ uống
Total page:  38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=1
Finished page 1 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=2
Finished page 2 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=3
Finished page 3 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=4
Finished page 4 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=5
Finished page 5 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=6
Finished page 6 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=7
Finished page 7 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=8
Finished page 8 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=9
Finished page 9 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=10
Finished page 10 / 38
https://www.topcv.vn/tim-viec-lam-thuc-pham-do-uong-c10043?page=11
Fin

Finished page 10 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=11
Finished page 11 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=12
Finished page 12 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=13
Finished page 13 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=14
Finished page 14 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=15
Finished page 15 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=16
Finished page 16 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=17
Finished page 17 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=18
Finished page 18 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=19
Finished page 19 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=20
Finished page 20 / 37
https://www.topcv.vn/tim-viec-lam-xuat-nhap-khau-c10049?page=21
Finished page 21 / 37
https://www.topcv.vn/tim-viec-la

Finished page 9 / 13
https://www.topcv.vn/tim-viec-lam-cong-nghe-o-to-c10052?page=10
Finished page 10 / 13
https://www.topcv.vn/tim-viec-lam-cong-nghe-o-to-c10052?page=11
Finished page 11 / 13
https://www.topcv.vn/tim-viec-lam-cong-nghe-o-to-c10052?page=12
Finished page 12 / 13
https://www.topcv.vn/tim-viec-lam-cong-nghe-o-to-c10052?page=13
Finished page 13 / 13
----------------Saved all Công nghệ Ô tô records.----------------
Collecting industry:  Mỹ phẩm / Trang sức
Total page:  21
https://www.topcv.vn/tim-viec-lam-my-pham-trang-suc-c10031?page=1
Finished page 1 / 21
https://www.topcv.vn/tim-viec-lam-my-pham-trang-suc-c10031?page=2
Finished page 2 / 21
https://www.topcv.vn/tim-viec-lam-my-pham-trang-suc-c10031?page=3
Finished page 3 / 21
https://www.topcv.vn/tim-viec-lam-my-pham-trang-suc-c10031?page=4
Finished page 4 / 21
https://www.topcv.vn/tim-viec-lam-my-pham-trang-suc-c10031?page=5
Finished page 5 / 21
https://www.topcv.vn/tim-viec-lam-my-pham-trang-suc-c10031?page=6
Finished p

Finished page 25 / 33
https://www.topcv.vn/tim-viec-lam-san-xuat-c10126?page=26
Finished page 26 / 33
https://www.topcv.vn/tim-viec-lam-san-xuat-c10126?page=27
Finished page 27 / 33
https://www.topcv.vn/tim-viec-lam-san-xuat-c10126?page=28
Finished page 28 / 33
https://www.topcv.vn/tim-viec-lam-san-xuat-c10126?page=29
Finished page 29 / 33
https://www.topcv.vn/tim-viec-lam-san-xuat-c10126?page=30
Finished page 30 / 33
https://www.topcv.vn/tim-viec-lam-san-xuat-c10126?page=31
Finished page 31 / 33
https://www.topcv.vn/tim-viec-lam-san-xuat-c10126?page=32
Finished page 32 / 33
https://www.topcv.vn/tim-viec-lam-san-xuat-c10126?page=33
Finished page 33 / 33
----------------Saved all Sản xuất records.----------------
Collecting industry:  Mỹ thuật / Nghệ thuật / Điện ảnh
Total page:  21
https://www.topcv.vn/tim-viec-lam-my-thuat-nghe-thuat-dien-anh-c10032?page=1
Finished page 1 / 21
https://www.topcv.vn/tim-viec-lam-my-thuat-nghe-thuat-dien-anh-c10032?page=2
Finished page 2 / 21
https://www

Finished page 2 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=3
Finished page 3 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=4
Finished page 4 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=5
Finished page 5 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=6
Finished page 6 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=7
Finished page 7 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=8
Finished page 8 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=9
Finished page 9 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=10
Finished page 10 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=11
Finished page 11 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=12
Finished page 12 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=13
Finished page 13 / 27
https://www.topcv.vn/tim-viec-lam-spa-lam-dep-c10130?page=14
Finished page 14 / 

https://www.topcv.vn/tim-viec-lam-moi-truong-xu-ly-chat-thai-c10030?page=6
Finished page 6 / 7
https://www.topcv.vn/tim-viec-lam-moi-truong-xu-ly-chat-thai-c10030?page=7
Finished page 7 / 7
----------------Saved all Môi trường / Xử lý chất thải records.----------------
Collecting industry:  Bưu chính - Viễn thông
Total page:  15
https://www.topcv.vn/tim-viec-lam-buu-chinh-vien-thong-c10005?page=1
Finished page 1 / 15
https://www.topcv.vn/tim-viec-lam-buu-chinh-vien-thong-c10005?page=2
Finished page 2 / 15
https://www.topcv.vn/tim-viec-lam-buu-chinh-vien-thong-c10005?page=3
Finished page 3 / 15
https://www.topcv.vn/tim-viec-lam-buu-chinh-vien-thong-c10005?page=4
Finished page 4 / 15
https://www.topcv.vn/tim-viec-lam-buu-chinh-vien-thong-c10005?page=5
Finished page 5 / 15
https://www.topcv.vn/tim-viec-lam-buu-chinh-vien-thong-c10005?page=6
Finished page 6 / 15
https://www.topcv.vn/tim-viec-lam-buu-chinh-vien-thong-c10005?page=7
Finished page 7 / 15
https://www.topcv.vn/tim-viec-lam-buu-c

Finished page 2 / 4
https://www.topcv.vn/tim-viec-lam-dau-khi-hoa-chat-c10012?page=3
Finished page 3 / 4
https://www.topcv.vn/tim-viec-lam-dau-khi-hoa-chat-c10012?page=4
Finished page 4 / 4
----------------Saved all Dầu khí/Hóa chất records.----------------


In [18]:
for ind,industry in enumerate(industries):
    print(ind, '-', industry)

0 - {'id': 10001, 'name': 'Kinh doanh / Bán hàng', 'alias': 'kinh-doanh-ban-hang', 'access_url': '/tim-viec-lam-kinh-doanh-ban-hang-c10001', 'job_category_count': '11.119'}
1 - {'id': 10026, 'name': 'IT phần mềm', 'alias': 'it-phan-mem', 'access_url': '/tim-viec-lam-it-phan-mem-c10026', 'job_category_count': '3.248'}
2 - {'id': 10023, 'name': 'Hành chính / Văn phòng', 'alias': 'hanh-chinh-van-phong', 'access_url': '/tim-viec-lam-hanh-chinh-van-phong-c10023', 'job_category_count': '3.996'}
3 - {'id': 10017, 'name': 'Giáo dục / Đào tạo', 'alias': 'giao-duc-dao-tao', 'access_url': '/tim-viec-lam-giao-duc-dao-tao-c10017', 'job_category_count': '2.856'}
4 - {'id': 10045, 'name': 'Tư vấn', 'alias': 'tu-van', 'access_url': '/tim-viec-lam-tu-van-c10045', 'job_category_count': '4.092'}
5 - {'id': 10029, 'name': 'Marketing / Truyền thông / Quảng cáo', 'alias': 'marketing-truyen-thong-quang-cao', 'access_url': '/tim-viec-lam-marketing-truyen-thong-quang-cao-c10029', 'job_category_count': '5.453'}

Collecting industries with > 1000 jobs

In [23]:
num_cores = cpu_count() # number of CPU cores 

async def extract_details(page, session, save_name_exc_urls): 
    # concatenate page number to base URL 
    async with session.get(f"{page}") as resp:
        jobs_by_industries = []
        except_urls = []
        print(f'{page}')
        pg_bs = BeautifulSoup(resp.text, 'html.parser')
        job_body = pg_bs.find("div",{'class':'job-body'})
        job_list = job_body.find('div', {'class':'job-list-default'})
        job_items = job_list.find_all('div',{'class':'job-item-default'})
        for job_item in job_items:
            job_dict = {}
            job_item_ = job_item.find('div',{'class':'body'})
            job_url = job_item_.find('a').get('href')
            try:
                job_dict['url'] = job_url
                resp1 = requests.get(job_url)
                jb_bs = BeautifulSoup(resp1.text, 'html.parser')
                job_header = jb_bs.find('div',{'class':'box-detail-job'}).find('div',{'class':'box-header'})
                company_name = job_header.find('a',{'class':'company-logo'}).get('title')
                job_title = job_header.find('div',{'class':'box-info-job'}).find('h1',{'class':'job-title'}).getText().strip()
                job_deadline = job_header.find('div',{'class':'job-deadline'}).getText().strip()
                job_info = jb_bs.find('div',{'id':'tab-info'})

                general_info = job_info.find_all('div',{'class':'box-info'})[-1].find_all('div',{'class':'box-item'})
                job_salary = general_info[0].find('span').getText().strip()
                job_type = general_info[2].find('span').getText().strip()
                job_level = general_info[3].find('span').getText().strip()
                job_yoe = general_info[-1].find('span').getText().strip()
                job_addresses = [item.text.strip() for item in job_info.find('div',{'class':'box-address'}).find_all('div')[1:]]

                job_data = job_info.find('div',{'class':'job-data'})
                job_details = job_data.find_all('div',{'class':'content-tab'})
                job_desc = str(job_details[0])
                job_req = str(job_details[1])

                job_dict['job_title'] = job_title
                job_dict['job_deadline'] = job_deadline
                job_dict['company_name'] = company_name
                job_dict['job_salary'] = job_salary
                job_dict['job_type'] = job_type
                job_dict['job_yoe'] = job_yoe
                job_dict['job_addresses'] = job_addresses
                job_dict['job_desc'] = job_desc
                job_dict['job_req'] = job_req
                jobs_by_industries.append(job_dict)
#                 print(f'{len(job_title)} - {len(job_salary)} - {len(job_type)} - {len(job_addresses)} - {len(job_desc)} - {len(job_req)}')
            except Exception as e:
                except_urls.append(job_url)
        append_text(except_urls, save_name_exc_urls)
        return jobs_by_industries

async def extract_details_task(pages_for_task, save_name_exc_urls): 
    async with aiohttp.ClientSession() as session: 
        tasks = [ 
            extract_details(page, session, save_name_exc_urls) 
            for page in pages_for_task 
        ] 
        list_of_lists = await asyncio.gather(*tasks) 
        return sum(list_of_lists, []) 
 
 
def asyncio_wrapper(pages_for_task, save_name_exc_urls): 
    return asyncio.run(extract_details_task(pages_for_task, save_name_exc_urls))

def execute_parallel_scrape(pages, save_name, save_name_exc_urls): 
    executor = ProcessPoolExecutor(max_workers=num_cores) 
    tasks = [ 
        executor.submit(asyncio_wrapper, pages_for_task, save_name_exc_urls) 
        for pages_for_task in np.array_split(pages, num_cores) 
    ] 
    doneTasks, _ = concurrent.futures.wait(tasks) 
 
    results = [ 
        item.result() 
        for item in doneTasks 
    ] 
    save_json(results, save_name) 
    
def parallel_get_jobs_by_industry(industry_name, industry_url, save_name=None):
    base_url = "https://www.topcv.vn"
    ind_url = industry_url
    if save_name is None:
        save_name = industry_name
    save_name_exc_urls = f'exc_urls_{save_name}' 
    print("Collecting industry: ", industry_name)
    
    resp = requests.get(f'{base_url}{ind_url}')
    pg_bs = BeautifulSoup(resp.text, 'html.parser')
    try:
        max_page = int(pg_bs.find('ul',{'class':'pagination'}).find_all('li')[-2].text.strip())
    except Exception as e:
        max_page = 1
    print("Total page: ", max_page)
    pages_to_scrape = [f'{base_url}{ind_url}?page={i}' for i in range(1,max_page+1)]
    
    execute_parallel_scrape(pages_to_scrape, save_name, save_name_exc_urls)

Collecting IT Phần mềm

In [21]:
industries

[{'id': 10001,
  'name': 'Kinh doanh / Bán hàng',
  'alias': 'kinh-doanh-ban-hang',
  'access_url': '/tim-viec-lam-kinh-doanh-ban-hang-c10001',
  'job_category_count': '11.119'},
 {'id': 10026,
  'name': 'IT phần mềm',
  'alias': 'it-phan-mem',
  'access_url': '/tim-viec-lam-it-phan-mem-c10026',
  'job_category_count': '3.248'},
 {'id': 10023,
  'name': 'Hành chính / Văn phòng',
  'alias': 'hanh-chinh-van-phong',
  'access_url': '/tim-viec-lam-hanh-chinh-van-phong-c10023',
  'job_category_count': '3.996'},
 {'id': 10017,
  'name': 'Giáo dục / Đào tạo',
  'alias': 'giao-duc-dao-tao',
  'access_url': '/tim-viec-lam-giao-duc-dao-tao-c10017',
  'job_category_count': '2.856'},
 {'id': 10045,
  'name': 'Tư vấn',
  'alias': 'tu-van',
  'access_url': '/tim-viec-lam-tu-van-c10045',
  'job_category_count': '4.092'},
 {'id': 10029,
  'name': 'Marketing / Truyền thông / Quảng cáo',
  'alias': 'marketing-truyen-thong-quang-cao',
  'access_url': '/tim-viec-lam-marketing-truyen-thong-quang-cao-c10029

In [10]:
itpm_js = read_json('itphanmem')

In [17]:
sum([len(item) for item in itpm_js])

3075

In [22]:
itpm_js[0]

[{'url': 'https://www.topcv.vn/viec-lam/nhan-vien-seo-web/1011826.html?ta_source=JobSearchList',
  'job_title': 'Nhân Viên SEO Web',
  'job_deadline': 'Hạn nộp hồ sơ: 07/06/2023',
  'company_name': 'CÔNG TY TNHH XUẤT NHẬP KHẨU THƯƠNG MẠI YÊN PHÁT',
  'job_salary': '10 - 15 triệu',
  'job_type': 'Toàn thời gian',
  'job_yoe': '1 năm',
  'job_addresses': ['- Hà Nội: Nhà A14 ngõ 20 Phố Huy Du, Phường Cầu Diễn, Quận Nam Từ Liêm, Nam Từ Liêm'],
  'job_desc': '<div class="content-tab"><p>- Tối ưu website với các công cụ tìm kiếm Google\xa0<br>- Seo từ khóa website trên các công cụ tìm kiếm….\xa0<br>- Seo mạng xã hội, thiết lập và quản trị các trang mạng xã hội của công ty…<br>- Quảng bá dịch vụ sản phẩm của công ty trên các diễn đàn, blog, mạng xã hội,...\xa0<br>- Lập kế hoạch phân tích đánh giá từ khóa, đánh giá thị trường thông qua mức độ tìm kiếm từ khóa\xa0<br>- Thực hiện chiến dịch xây dựng link liên kết, hệ thống website của công ty\xa0<br>- Sử dụng các công cụ hỗ trợ để đẩy mạnh kết

In [25]:
for index,industry in enumerate(industries):
    print(industry)

{'id': 10001, 'name': 'Kinh doanh / Bán hàng', 'alias': 'kinh-doanh-ban-hang', 'access_url': '/tim-viec-lam-kinh-doanh-ban-hang-c10001', 'job_category_count': '11.119'}
{'id': 10026, 'name': 'IT phần mềm', 'alias': 'it-phan-mem', 'access_url': '/tim-viec-lam-it-phan-mem-c10026', 'job_category_count': '3.248'}
{'id': 10023, 'name': 'Hành chính / Văn phòng', 'alias': 'hanh-chinh-van-phong', 'access_url': '/tim-viec-lam-hanh-chinh-van-phong-c10023', 'job_category_count': '3.996'}
{'id': 10017, 'name': 'Giáo dục / Đào tạo', 'alias': 'giao-duc-dao-tao', 'access_url': '/tim-viec-lam-giao-duc-dao-tao-c10017', 'job_category_count': '2.856'}
{'id': 10045, 'name': 'Tư vấn', 'alias': 'tu-van', 'access_url': '/tim-viec-lam-tu-van-c10045', 'job_category_count': '4.092'}
{'id': 10029, 'name': 'Marketing / Truyền thông / Quảng cáo', 'alias': 'marketing-truyen-thong-quang-cao', 'access_url': '/tim-viec-lam-marketing-truyen-thong-quang-cao-c10029', 'job_category_count': '5.453'}
{'id': 10047, 'name': '