In [1]:
# Import essential modules

import os            # miscellaneous operating system interfaces
import requests      # send HTTP requests using Python
import time
import random

from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [4]:
# Configure a headless Chrome session (no visible browser window).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless=new')
chrome_options.add_argument('--no-sandbox')

# The chromedriver binary is supplied through Service. Do not also pass a
# positional 'chromedriver': on Selenium >= 4.10 the first positional
# parameter is `options`, so it would clash with the `options=` keyword
# (TypeError); on earlier 4.x releases it is the deprecated executable_path.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=chrome_options,
)


In [6]:
# Output folders and crawl settings shared by the cells below.
#
# The base directory can be overridden with the VN_NEWS_DATA_DIR environment
# variable so the notebook runs on other machines; when the variable is unset
# (or empty) the original location is used unchanged.
base_dir = (
    os.environ.get('VN_NEWS_DATA_DIR')
    or 'C://Users/NGUYEN DIEU HUONG/OneDrive/Desktop/AIO2023'
)

root_dir = f'{base_dir}/vn_news_corpus'          # one .txt file per article
root_dir_img = f'{base_dir}/vn_news_thumbnail'   # one .png file per thumbnail
os.makedirs(root_dir, exist_ok=True)
os.makedirs(root_dir_img, exist_ok=True)

n_pages = 2      # number of listing pages to crawl
article_id = 0   # running counter used to name saved articles
img_id = 0       # running counter used to name saved thumbnails

#### Text Crawling

In [7]:
# Crawl the text of every article linked from the listing pages and save each
# one as article_XXXXX.txt inside root_dir.
# Uses `driver`, `n_pages`, `root_dir` and `article_id` from the cells above.

for page_idx in range(n_pages):

    # Open the listing page
    main_url = f'https://vietnamnet.vn/giai-tri-page{page_idx}'
    driver.get(main_url)

    # Read every article URL before navigating away: once the driver loads
    # another page the elements found here can no longer be queried.
    news_ls_xpath = '//div[@class="topStory-15nd"]/div/div[1]/a'
    news_tags = driver.find_elements(By.XPATH, news_ls_xpath)
    news_url_ls = [news_tag.get_attribute('href') for news_tag in news_tags]
    # An anchor without an href yields None, which driver.get() cannot open
    news_url_ls = [news_url for news_url in news_url_ls if news_url]

    for news_url in news_url_ls:

        # Open the article page and give it a moment to render
        driver.get(news_url)
        time.sleep(1)

        # find_elements returns an empty list when nothing matches, so no
        # exception handling is needed to detect a missing container.
        main_content_xpath = '//div[@class="content-detail"]'
        main_content_matches = driver.find_elements(By.XPATH, main_content_xpath)
        if not main_content_matches:
            continue
        main_content_tags = main_content_matches[0]

        # Ignore video articles
        video_content_xpath = '//div[@class="video-detail"]'
        if driver.find_elements(By.XPATH, video_content_xpath):
            continue

        # Title (h1), abstract (h2) and author (span): take the first match of
        # each tag inside the container; a missing tag becomes an empty string
        # instead of aborting the whole crawl.
        header_texts = []
        for tag_name in ('h1', 'h2', 'span'):
            tag_matches = main_content_tags.find_elements(By.TAG_NAME, tag_name)
            header_texts.append(tag_matches[0].text.strip() if tag_matches else '')
        title, abstract, author = header_texts

        # Paragraphs: the leading '.' keeps the XPath relative to the
        # container; a bare '//' would search the entire document even when
        # called on an element.
        paragraph_xpath = './/div[@class="maincontent main-content"]/p'
        paragraph_tags = main_content_tags.find_elements(By.XPATH, paragraph_xpath)
        paragraph_ls = [paragraph_tag.text.strip() for paragraph_tag in paragraph_tags]
        paragraphs = ' '.join(paragraph_ls)

        # Combine title, abstract, paragraphs, author
        final_content_ls = [title, abstract, paragraphs, author]
        final_content = '\n\n'.join(final_content_ls)

        # Save final content to file
        filename = f'article_{article_id:05d}.txt'
        savepath = os.path.join(root_dir, filename)

        with open(savepath, 'w', encoding='utf-8') as file:
            file.write(final_content)

        # Advance the counter only after the file was written successfully
        article_id += 1
            
        

#### Image Crawling

In [None]:
from PIL import Image
import io

# Download the thumbnail shown for each story on the listing pages and save it
# as IMG_XXXXX.png inside root_dir_img.
# Uses `driver`, `n_pages`, `root_dir_img` and `img_id` from the cells above.

for page_idx in tqdm(range(n_pages)):

    # Open the listing page
    main_url = f'https://vietnamnet.vn/giai-tri-page{page_idx}'
    driver.get(main_url)

    # Thumbnails are the <img> children of the story links
    img_xpath = '//div[@class="topStory-15nd"]/div/div[1]/a/img'
    img_tags = driver.find_elements(By.XPATH, img_xpath)

    # Get list of images (list of URLs)
    img_url_ls = [img_tag.get_attribute('src') for img_tag in img_tags]

    # Process for thumbnails
    for img_url in img_url_ls:
        # An <img> without a src attribute yields None; nothing to download
        if not img_url:
            continue

        try:
            # The timeout keeps one unresponsive server from stalling the crawl
            img_url_response = requests.get(img_url, timeout=10)
            img_url_response.raise_for_status()
            img = Image.open(io.BytesIO(img_url_response.content))
            # Image.open is lazy; decode now so corrupt data is detected here
            img.load()
        except (requests.RequestException, OSError):
            # Network failure, HTTP error status, or bytes that are not a
            # readable image: skip this thumbnail and keep crawling.
            continue

        # Palette images are converted to RGB before saving
        if img.mode == 'P':
            img = img.convert('RGB')

        img_name = f'IMG_{img_id:05}.png'
        img_savepath = os.path.join(root_dir_img, img_name)

        img.save(img_savepath)

        img_id += 1