In [1]:
import json

def extract_course_prefixes(file_path):
    """
    Read a JSON file holding a list of course codes and return their prefixes.

    The prefix of a code is its first four characters. Duplicates are
    removed and the result is sorted.

    Args:
        file_path: Path to a JSON file whose top-level value is a list of
            course-code strings.

    Returns:
        A sorted list of unique prefixes. An empty list is returned (after
        printing an error message) when the file is missing, cannot be
        parsed as JSON, or does not hold a list. Entries that are not
        strings are skipped instead of raising ``TypeError``.
    """
    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            course_codes = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return []
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"Error: The file '{file_path}' does not contain valid JSON.")
        return []

    if not isinstance(course_codes, list):
        print(f"Error: The file '{file_path}' does not contain a list of course codes.")
        return []

    # A set comprehension de-duplicates; sorted() already returns a list.
    return sorted({code[:4] for code in course_codes if isinstance(code, str)})

# Usage example:
# prefixes = extract_course_prefixes('data/course_codes_only.json')
# print(prefixes)

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os
import concurrent.futures  # <--- Library used to run the searches in multiple threads
import time

# Upper bound on the number of threads in the pool that runs the searches.
MAX_WORKER = 5


# Course-code prefixes to search for, loaded from the course-code JSON file.
# NOTE(review): the name suggests EAIT-only subject areas — confirm against the data file.
EAIT_PREFIXES = extract_course_prefixes('../data/course_codes_only.json')

def fetch_course_codes_via_search(prefix):
    """
    Search the UQ programs-and-courses site for course codes with a prefix.

    The course search page is requested with ``prefix`` as the keyword, and
    every link whose ``href`` carries a ``course_code=`` query value is
    inspected. A value is kept only when it starts with ``prefix`` and is
    exactly 8 characters long.

    Args:
        prefix: Course-code prefix to search for (for example ``"CSSE"``).

    Returns:
        A list of the unique matching course codes, in no particular order.
        An empty list is returned when the response status is not 200 or
        when any exception is raised while fetching or parsing.
    """
    # Use the SEARCH page instead of the Browse page. The query is passed via
    # ``params`` so that requests URL-encodes the prefix.
    url = "https://my.uq.edu.au/programs-courses/search.html"
    params = {"keywords": prefix, "searchType": "course"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    codes = set()

    print(f"   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: {prefix}...")
    try:
        # Timeout so a slow network cannot hang the worker thread.
        response = requests.get(url, params=params, headers=headers, timeout=10)

        if response.status_code != 200:
            print(f"      ‚ö†Ô∏è L·ªói k·∫øt n·ªëi {prefix}: {response.status_code}")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')

        for link in soup.find_all('a', href=True):
            href = link['href']
            if "course_code=" not in href:
                continue
            # Value after "course_code=" up to the next "&". Neither split
            # can fail here, so no inner try/except is needed.
            code = href.split("course_code=")[1].split("&")[0]
            if code.startswith(prefix) and len(code) == 8:
                codes.add(code)

        print(f"      ‚úÖ [Xong] {prefix}: T√¨m th·∫•y {len(codes)} m√¥n.")
        return list(codes)

    except Exception as e:
        # Best-effort worker: report the failure and let the other prefixes continue.
        print(f"‚ùå L·ªói khi qu√©t {prefix}: {e}")
        return []

# --- RE-RUN STEP 1 USING CONCURRENT.FUTURES ---
def main():
    """
    Search every prefix concurrently and save the combined course codes.

    One search task per entry of ``EAIT_PREFIXES`` is submitted to a thread
    pool of ``MAX_WORKER`` threads. The results are merged, de-duplicated,
    sorted and written as JSON to ``../data/all_course_codes.json``; the
    directory is created when it does not exist. A summary with the number
    of codes and the elapsed time is printed at the end.
    """
    all_codes = []
    print("üöÄ B·∫ÆT ƒê·∫¶U QU√âT M√É M√îN (ƒêA LU·ªíNG)...")

    start_time = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKER) as executor:
        # Submit every task up front, remembering which prefix each future belongs to.
        future_to_prefix = {
            executor.submit(fetch_course_codes_via_search, prefix): prefix
            for prefix in EAIT_PREFIXES
        }

        # Collect results as each task finishes, in completion order.
        for future in concurrent.futures.as_completed(future_to_prefix):
            prefix = future_to_prefix[future]
            try:
                all_codes.extend(future.result())
            except Exception as exc:
                print(f'{prefix} generated an exception: {exc}')

    # De-duplicate and sort; sorted() accepts the set directly.
    all_codes = sorted(set(all_codes))

    end_time = time.time()

    # Make sure the output directory exists.
    output_dir = os.path.join('..', 'data')
    os.makedirs(output_dir, exist_ok=True)
    # The file name must be a single-line literal; splitting it across lines
    # is an unterminated-string syntax error.
    output_path = os.path.join(output_dir, 'all_course_codes.json')

    # Save the file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_codes, f, indent=4)

    print(f"\nüéâ T·ªîNG K·∫æT: ƒê√£ t√¨m th·∫•y {len(all_codes)} m√£ m√¥n ƒë·ªôc nh·∫•t.")
    print(f"‚è±Ô∏è Th·ªùi gian ch·∫°y: {end_time - start_time:.2f} gi√¢y")
    print("File ƒë√£ l∆∞u: ", output_path)

# Start the scrape only when this code runs as the main module, not on import.
if __name__ == "__main__":
    main()

üöÄ B·∫ÆT ƒê·∫¶U QU√âT M√É M√îN (ƒêA LU·ªíNG)...
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: ACCT...
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: ADPS...
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: AGRC...
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: ANAT...
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: ANIM...
      ‚úÖ [Xong] ADPS: T√¨m th·∫•y 11 m√¥n.
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: AUDL...
      ‚úÖ [Xong] ANAT: T√¨m th·∫•y 11 m√¥n.
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: AUIL...
      ‚úÖ [Xong] ACCT: T√¨m th·∫•y 30 m√¥n.
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: BINF...
      ‚úÖ [Xong] ANIM: T√¨m th·∫•y 40 m√¥n.
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: BIOC...
      ‚úÖ [Xong] AGRC: T√¨m th·∫•y 56 m√¥n.
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: BIOL...
      ‚úÖ [Xong] AUDL: T√¨m th·∫•y 25 m√¥n.
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: BIOM...
      ‚úÖ [Xong] AUIL: T√¨m th·∫•y 4 m√¥n.
   -> [B·∫Øt ƒë·∫ßu] ƒêang t√¨m ki·∫øm: BIOT...
      ‚úÖ [Xong] BINF: T√¨m th·∫•y 3 m√¥n.
   -> [B·∫Øt ƒë·∫ßu] ƒêa