In [100]:
!pip install chromedriver-py
!pip install pandas

In [120]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeWebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, date
import time
import math
import os
import csv
import re
import requests
from typing import List, Dict, Union, Optional

In [121]:
# CSV file, relative to the working directory, that month_search passes to
# append_article_data as the destination for the relevant-article rows.
relevant_titles_output_file = "global_times_relevant_articles.csv"

In [122]:
# Top-level output directory, created in the current working directory.
all_write_dir = "data"

# exist_ok=True keeps this cell safe to re-run and avoids the
# check-then-create pattern against os.listdir().
os.makedirs(all_write_dir, exist_ok=True)

In [123]:
# Sub-directory of all_write_dir for this news source.
sub_dir = "global_times"

# makedirs also creates the parent if it is missing and is a no-op when the
# directory already exists, so the cell can be re-run freely.
os.makedirs(os.path.join(all_write_dir, sub_dir), exist_ok=True)

In [124]:
def create_chrome_driver(headless: bool=False) -> ChromeWebDriver:
    """Build a Selenium Chrome driver with the notebook's standard flags.

    Args:
        headless: When true, ``--headless`` is passed to Chrome in addition
            to the flags that are always set.

    Returns:
        The driver object returned by ``webdriver.Chrome``.
    """
    # Flags are listed in the order they are handed to Chrome.
    flags = ["--headless"] if headless else []
    flags += ["--no-sandbox", "--disable-dev-shm-usage"]

    opts = Options()
    for flag in flags:
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

In [125]:
# Using advanced search, we can see that articles from the Global Times start at 2008/09/28.
# Let's search month by month, setting begin_date and end_date to the first and last day of each month respectively,
# for every month from 2008/09 to 2025/08.

def get_first_last_days_of_months(
    start_date: Union[str, date, datetime],
    end_date: Union[str, date, datetime]
) -> List[Dict[str, Union[str, date]]]:
    """Return the first and last calendar day of every month in a date range.

    The month containing ``start_date`` is included even when ``start_date``
    is not the 1st of that month (e.g. a range starting 2008-09-28 yields
    2008-09 as its first month).

    Args:
        start_date: Start of the range; anything ``pd.Timestamp`` accepts.
        end_date: End of the range; anything ``pd.Timestamp`` accepts.

    Returns:
        One dict per month, in chronological order, with keys ``'month'``
        (``'YYYY-MM'`` string), ``'first_day'`` and ``'last_day'``
        (``datetime.date`` objects). Empty when ``start_date`` is after
        ``end_date``.
    """
    range_start = pd.Timestamp(start_date)
    range_end = pd.Timestamp(end_date)
    if range_start > range_end:
        return []

    # freq='MS' only emits month starts that fall inside [start, end], so a
    # mid-month start would silently drop its own month. Snap the start back
    # to midnight on the 1st of its month first.
    first_month_start = range_start.normalize().replace(day=1)
    month_starts = pd.date_range(start=first_month_start, end=range_end, freq='MS')

    return [
        {
            'month': month_start.strftime('%Y-%m'),
            'first_day': month_start.date(),
            # MonthEnd(0) rolls forward to the end of the same month.
            'last_day': (month_start + pd.offsets.MonthEnd(0)).date(),
        }
        for month_start in month_starts
    ]

In [131]:
def query_model(prompt: str, model="llama2:7b", timeout: Optional[float] = None) -> str:
    """Send a prompt to the local generate endpoint and return the model's text.

    POSTs ``{"model", "prompt", "stream": False}`` as JSON to
    ``http://localhost:11434/api/generate`` (presumably a local Ollama
    server -- confirm).

    Args:
        prompt: Prompt text to send.
        model: Model name placed in the request body.
        timeout: Seconds passed to ``requests.post``; ``None`` (the default)
            waits indefinitely, as before.

    Returns:
        The stripped ``"response"`` field of the JSON reply. On a non-200
        status, a string starting with ``"Error: "`` is returned instead of
        raising; get_relevant_articles checks for that prefix.

    Raises:
        Whatever ``requests.post`` raises on connection failures; those are
        not converted to an ``"Error: "`` string.
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    response = requests.post(url, json=payload, timeout=timeout)

    if response.status_code != 200:
        # An error body is not guaranteed to be JSON; fall back to the raw
        # text so reporting the failure cannot itself raise.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        return f"Error: {detail}"

    return response.json()["response"].strip()

"""
each returned row is formatted like [month, date (of article), article title, article url]
"""
def get_article_data(month: str, page_source: BeautifulSoup) -> List[List[str]]:
    """Extract one row per article from a parsed search-results page.

    Every ``<blockquote>`` in ``page_source`` is treated as one article: the
    date is the leading ``YY(YY)/MM/DD`` of its ``<small>`` text, the title
    is the text of its ``<a>`` and the link is that ``<a>``'s ``href``.

    Args:
        month: Label copied into the first field of every returned row.
        page_source: Parsed page to read the blockquotes from.

    Returns:
        Rows of ``[month, article_date, article_title, article_link]``.
        Blockquotes missing any of these parts are skipped after printing a
        message.
    """
    article_data = []

    # Read from the page_source argument; the previous body referenced an
    # undefined name `soup` and only worked if one leaked from the kernel.
    for bc in page_source.find_all("blockquote"):
        try:
            article_date = re.findall(r"^\d{2,4}/\d{2}/\d{2}", bc.small.text.strip())[0]
            article_title = bc.a.text.strip()
            article_link = bc.a.get("href")
            article_data.append([month, article_date, article_title, article_link])
        except Exception as e:
            # Best effort: a malformed entry must not abort the whole page.
            print(f"Insufficient article data - {e}")

    return article_data


def get_relevant_articles(article_titles: List[str]) -> List[str]:
    """Ask the language model which article titles relate to China.

    Titles are sent pipe-separated, with any pipes inside a title removed,
    and the model is asked to answer with a pipe-separated list of the
    relevant ones.

    Args:
        article_titles: Titles to classify.

    Returns:
        The titles from ``article_titles``, unmodified and in input order,
        whose pipe-free form appears in the model's answer. Empty when
        ``article_titles`` is empty or query_model returned an ``"Error"``
        string.
    """
    # Nothing to classify: do not spend a model call on an empty list.
    if not article_titles:
        return []

    # remove pipes if they exist in article titles
    sent_titles = [article_title.replace("|", "") for article_title in article_titles]
    prompt = """Hey Gemma, here are some article titles separated by a pipe. 
    Please return a pipe-separated list of which ones correspond to China. 
    Only return the pipe-separated list
    and nothing else. It should be formatted like <article_title>|<article_title>|<article_title>.
    If an article is not related to China, don't include it in the
    list or in the output at all.
    Here are the article titles: {}""".format("|".join(sent_titles))
    print(prompt)
    result = query_model(prompt, model="gemma3:12b")
    print(result)

    if result.startswith("Error"):
        return []

    # Models often pad the separators with spaces or newlines; compare on
    # stripped text and ignore empty pieces.
    returned_titles = {piece.strip() for piece in result.split("|")}
    returned_titles.discard("")

    # Hand back the caller's own titles (pipes and all) so that a membership
    # test against the originals still matches.
    return [
        article_title
        for article_title in article_titles
        if article_title.replace("|", "").strip() in returned_titles
    ]

In [132]:
"""
each row in data should be formatted like 
[month, date (of article), article title, article url]
"""
def append_article_data(
                        output_file: str,
                        data: List[List[str]],
                        encoding: str = "cp1252"
                       ) -> None:
    """Append article rows to a CSV file, writing the header row when needed.

    Args:
        output_file: Path of the CSV file; it may include directories.
        data: Rows formatted like
            ``[month, date (of article), article title, article url]``.
        encoding: Text encoding used for the file. Defaults to ``"cp1252"``;
            characters outside that code page raise ``UnicodeEncodeError``,
            so pass e.g. ``"utf-8"`` for titles that need it.

    The header ``month, date, article_title, article_link`` is written first
    whenever the file does not exist yet or is empty.
    """
    headers = ["month", "date", "article_title", "article_link"]

    # Check the path itself rather than os.listdir(): that only sees bare
    # names in the working directory. An empty file also needs a header.
    needs_header = (
        not os.path.isfile(output_file)
        or os.path.getsize(output_file) == 0
    )

    with open(output_file, "a", encoding=encoding, newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        if needs_header:
            # Must go through the csv writer; file objects have no writerow().
            csv_writer.writerow(headers)
        csv_writer.writerows(data)


def _advance_to_next_page(driver) -> bool:
    """Trigger the results pager's 'Next' control; return False if there is none.

    ``driver`` is the Selenium driver showing the results page.
    """
    for button in driver.find_elements(By.CLASS_NAME, "btn"):
        if 'Next' in (button.get_attribute("innerHTML") or ""):
            next_page_script = button.get_attribute("onclick")
            if next_page_script:
                driver.execute_script(next_page_script)
            else:
                # No inline handler to run; fall back to a normal click.
                button.click()
            return True
    return False


def month_search(month: str, begin_date: str, end_date: str) -> None:
    """Search for all Global Times articles within a date range and save the relevant ones.

    Opens the search page in a new Chrome driver, fills in the begin/end
    date fields, submits the search, then walks the result pages. On each
    page the articles are extracted with get_article_data, filtered with
    get_relevant_articles, and the remaining rows are appended to
    ``relevant_titles_output_file``.

    Args:
        month: Label stored in the first column of every saved row.
        begin_date: Value written into the ``begin_date`` input.
        end_date: Value written into the ``end_date`` input.

    Errors raised while driving the browser are printed, not re-raised. The
    browser is always closed before the function returns.
    """
    driver = create_chrome_driver()
    wait = WebDriverWait(driver, 10)
    try:
        # Inner try/finally: the browser is quit on success and on failure,
        # so repeated calls do not pile up Chrome processes.
        try:
            driver.get("https://search.globaltimes.cn/SearchCtrl")

            begin_date_input = wait.until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="begin_date"]'))
            )
            end_date_input = wait.until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="end_date"]'))
            )
            driver.execute_script(f"arguments[0].setAttribute('value', '{begin_date}')", begin_date_input)
            driver.execute_script(f"arguments[0].setAttribute('value', '{end_date}')", end_date_input)

            search_button = wait.until(
                EC.presence_of_element_located((By.XPATH, '/html/body/div[3]/div/div/form/div[8]'))
            )
            search_button.click()

            # Fixed pause, presumably to let the results render -- confirm.
            time.sleep(10)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            num_articles_element = wait.until(
                EC.presence_of_element_located((By.XPATH, '/html/body/div[4]/div[1]/div[12]/font'))
            )
            count_match = re.search(r'\d+', num_articles_element.text)
            num_articles = int(count_match.group()) if count_match else 0

            # num_articles only bounds the number of pages visited; the loop
            # ends earlier as soon as there is no 'Next' control left.
            for _ in range(num_articles):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                page_soup = BeautifulSoup(driver.page_source, "html.parser")
                page_article_data = get_article_data(month, page_soup)

                # Skip the model call and the write when the page has no rows.
                if page_article_data:
                    page_article_titles = [row[2] for row in page_article_data]
                    page_relevant_article_titles = get_relevant_articles(page_article_titles)
                    relevant_article_data = [
                        row for row in page_article_data
                        if row[2] in page_relevant_article_titles
                    ]
                    append_article_data(relevant_titles_output_file, relevant_article_data)

                # Without a 'Next' control the same page would be scraped and
                # appended again on every remaining iteration.
                if not _advance_to_next_page(driver):
                    break
        finally:
            driver.quit()

    except TimeoutException:
        print(f"Timeout waiting for elements for date range {begin_date} to {end_date}")
    except NoSuchElementException as e:
        print(f"Element not found: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")