In [100]:
!pip install chromedriver-py
!pip install pandas

In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeWebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
from datetime import datetime, date
import time
import math
import os
import re
import requests
from typing import List, Dict, Union, Optional

In [2]:
# Root directory for everything this notebook writes to disk.
all_write_dir = "./data"

# os.listdir() returns bare entry names ("data"), so testing "./data" against it
# never matched and os.mkdir() raised FileExistsError on every re-run.
# makedirs(exist_ok=True) creates the directory only when it is missing.
os.makedirs(all_write_dir, exist_ok=True)

In [4]:
# Output directory for the Global Times data, nested under the data root.
sub_dir = f"{all_write_dir}/global_times"

# A nested path is never an entry of os.listdir(), so the old membership test
# was always true and os.mkdir() failed once the directory existed.
# makedirs(exist_ok=True) is safe to re-run and also creates missing parents.
os.makedirs(sub_dir, exist_ok=True)

In [5]:
def create_chrome_driver(headless: bool=False) -> ChromeWebDriver:
    """Start a Chrome WebDriver session.

    Args:
        headless: When True, "--headless" is added to the Chrome arguments.

    Returns:
        The WebDriver created by ``webdriver.Chrome``. "--no-sandbox" and
        "--disable-dev-shm-usage" are always passed to Chrome.
    """
    # Argument order matches what Chrome receives: optional headless flag first.
    flags = ["--no-sandbox", "--disable-dev-shm-usage"]
    if headless:
        flags.insert(0, "--headless")

    opts = Options()
    for flag in flags:
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

In [6]:
# Using advanced search, we can see that Global Times articles start at 2008/09/28.
# We search month by month, setting begin_date and end_date to the first and last
# day of each month, for every month from 2008/09 to 2025/08.

def get_first_last_days_of_months(
    start_date: Union[str, date, datetime],
    end_date: Union[str, date, datetime]
) -> List[Dict[str, Union[str, date]]]:
    """List the first and last calendar day of every month that starts in a range.

    Args:
        start_date: Lower bound of the range (inclusive).
        end_date: Upper bound of the range (inclusive).

    Returns:
        One dict per month, in chronological order, with keys:
        'month' ('YYYY-MM' string), 'first_day' and 'last_day' (``date`` objects).

    Note:
        Only months whose first day lies inside [start_date, end_date] are
        produced, so a start_date that falls mid-month skips that month
        (e.g. '2023-03-15' starts at 2023-04). Pass the first of the month
        to include it.
    """
    # 'MS' = month-start frequency: one timestamp per month, on day 1.
    month_starts = pd.date_range(start=start_date, end=end_date, freq='MS')

    return [
        {
            'month': ts.strftime('%Y-%m'),
            'first_day': ts.date(),
            # MonthEnd(0) rolls a timestamp forward to the end of its own month.
            'last_day': (ts + pd.offsets.MonthEnd(0)).date(),
        }
        for ts in month_starts
    ]

In [9]:
def query_ollama(prompt: str, model="llama2:7b", timeout: Optional[float] = 300.0) -> str:  # Changed from gpt-oss:20b
    """Send a single non-streaming generation request to the local Ollama server.

    Args:
        prompt: Text sent as the "prompt" field of the request.
        model: Model name sent as the "model" field.
        timeout: Seconds to wait for the HTTP request before giving up;
            None waits indefinitely.

    Returns:
        The "response" field of the JSON reply on HTTP 200. On any failure
        (request error, timeout, non-200 status) a string starting with
        "Error: " is returned instead of raising, so callers can test the prefix.
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    try:
        # Without a timeout a stalled server would block the kernel forever.
        response = requests.post(url, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Connection refused, timeout, etc.: report via the same "Error:" convention.
        return f"Error: {exc}"

    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON; fall back to the raw text
        # rather than raising from inside the error path.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        return f"Error: {detail}"

    return response.json()["response"]


def get_relevant_articles(article_titles: List[str]) -> List[str]:
    """Ask the LLM which of the given article titles are about China / China-related geopolitics.

    Args:
        article_titles: Candidate titles. Pipe characters are removed from each
            title because the pipe is used as the list separator in the prompt.

    Returns:
        The titles parsed from the model's pipe-separated reply, with
        surrounding whitespace removed and empty segments dropped. Returns an
        empty list when there are no input titles or when ``query_ollama``
        reports an error (the error text is printed).
    """
    # Nothing to classify: skip the model call entirely.
    if not article_titles:
        return []

    # remove pipes if they exist in article titles
    cleaned_titles = [title.replace("|", "") for title in article_titles]
    prompt = """Hey LLama, here are some article titles separated by a pipe. Please return a pipe-separated list of which ones correspond to China
    and China-related geopolitics. Here are the article titles: {}""".format("|".join(cleaned_titles))
    result = query_ollama(prompt)

    # query_ollama signals failure with a string starting with "Error".
    if result.startswith("Error"):
        print(result)
        return []

    # Model replies carry leading newlines / padding around separators; strip
    # each segment and discard the empty ones left by trailing pipes.
    segments = (segment.strip() for segment in result.split("|"))
    return [segment for segment in segments if segment]

In [7]:
# search for all Global Times articles within a month range
def month_search(begin_date: str, end_date: str) -> None:
    """Run a Global Times search for a date range and page through the results.

    Opens the search page in a new Chrome session, fills the begin/end date
    inputs, submits the form, reads the result count and then clicks
    "Next >>" until no further page is offered. The browser is always closed
    before returning.

    Args:
        begin_date: Value written to the "begin_date" input (e.g. "2011-05-01").
        end_date: Value written to the "end_date" input (e.g. "2011-05-31").

    Returns:
        None. Failures are printed rather than raised.

    NOTE(review): the function only navigates; it does not extract or store
    any article data yet.
    """
    driver = create_chrome_driver()
    wait = WebDriverWait(driver, 10)
    try:
        driver.get("https://search.globaltimes.cn/SearchCtrl")

        begin_date_input = wait.until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="begin_date"]'))
        )
        end_date_input = wait.until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="end_date"]'))
        )
        # Pass the dates as script arguments instead of splicing them into the
        # JavaScript source, so quotes in the value cannot break the script.
        set_value_js = "arguments[0].setAttribute('value', arguments[1])"
        driver.execute_script(set_value_js, begin_date_input, begin_date)
        driver.execute_script(set_value_js, end_date_input, end_date)

        search_button = wait.until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div[3]/div/div/form/div[8]'))
        )
        search_button.click()

        # Fixed pause to let the results page load before scrolling.
        time.sleep(10)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        num_articles_element = wait.until(
            EC.presence_of_element_located((By.XPATH, '/html/body/div[4]/div[1]/div[12]/font'))
        )
        count_match = re.search(r'\d+', num_articles_element.text)
        num_articles = int(count_match.group()) if count_match else 0

        # The number of pages can never exceed the number of articles, so
        # num_articles is only an upper bound on the clicks. The real stop
        # condition is the "Next >>" button no longer appearing.
        for _ in range(num_articles):
            try:
                next_button = wait.until(
                    EC.presence_of_element_located((By.XPATH, "//div[text()=' Next >> ']"))
                )
            except TimeoutException:
                # No further page: normal end of pagination, not an error.
                break
            next_button.click()
            time.sleep(1)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    except TimeoutException:
        print(f"Timeout waiting for elements for date range {begin_date} to {end_date}")
    except NoSuchElementException as e:
        print(f"Element not found: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    finally:
        # Always release the browser; otherwise every call leaks a Chrome process.
        driver.quit()

In [99]:
# Try the scraper on a single month (May 2011).
month_search(begin_date="2011-05-01", end_date="2011-05-31")

Timeout waiting for elements for date range 2011-05-01 to 2011-05-31


In [10]:
# Sanity check: send a trivial prompt to the default model and show the reply.
query_ollama(prompt="Hi, how are you?")

"\nI'm just an AI assistant and do not have feelings or emotions, so I cannot experience emotions or respond in the way a human would. However, I'm here to help you with any questions or tasks you may have! How can I assist you today?"

In [22]:
import pandas as pd
from datetime import datetime

def get_month_boundaries_pandas(start_date, end_date):
    """Return the first and last day of every month that starts within the range.

    Args:
        start_date: Lower bound of the range (inclusive).
        end_date: Upper bound of the range (inclusive).

    Returns:
        A list of dicts with keys 'month' ('YYYY-MM'), 'first_day' and
        'last_day' (``date`` objects), in chronological order. A month is
        listed only if its first day lies inside the range, so a mid-month
        start_date skips that month.
    """
    boundaries_found = []
    # 'MS' = month-start frequency: one timestamp per month, on day 1.
    for month_start in pd.date_range(start=start_date, end=end_date, freq='MS'):
        boundaries_found.append({
            'month': month_start.strftime('%Y-%m'),
            'first_day': month_start.date(),
            # MonthEnd(0) rolls the month start forward to that month's last day.
            'last_day': (month_start + pd.offsets.MonthEnd(0)).date(),
        })
    return boundaries_found


# Usage
start_date = '2023-03-15'
end_date = '2024-02-10'

boundaries = get_month_boundaries_pandas(start_date, end_date)
for boundary in boundaries:
    print(f"{boundary['month']}: {boundary['first_day']} to {boundary['last_day']}")

2023-04: 2023-04-01 to 2023-04-30
2023-05: 2023-05-01 to 2023-05-31
2023-06: 2023-06-01 to 2023-06-30
2023-07: 2023-07-01 to 2023-07-31
2023-08: 2023-08-01 to 2023-08-31
2023-09: 2023-09-01 to 2023-09-30
2023-10: 2023-10-01 to 2023-10-31
2023-11: 2023-11-01 to 2023-11-30
2023-12: 2023-12-01 to 2023-12-31
2024-01: 2024-01-01 to 2024-01-31
2024-02: 2024-02-01 to 2024-02-29
