# 🎓 UQ Course Scraper & Data Engineer
**Project:** UQ Course Navigator & Grade Tracker  
**Phase:** 1 - Data Engineering  
**Status:** ✅ Ready for Phase 2 (Database)

---
**Mục tiêu:**
1. Thu thập thông tin chi tiết các môn học (Description, Units, Contact Hours).
2. Truy cập Electronic Course Profile (ECP) để lấy bảng điểm (Assessments).
3. Xử lý dữ liệu thô: Gắn cờ (Hurdle, Team-based), tính trọng số %.

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd # D√πng pandas ƒë·ªÉ xem b·∫£ng cho ƒë·∫πp trong Notebook
from tqdm import tqdm # <--- TH√äM D√íNG N√ÄY
import time # <--- TH√äM D√íNG N√ÄY (ƒë·ªÉ d√πng sleep tr√°nh b·ªã ch·∫∑n IP)
import concurrent.futures
import os


## 🛠️ 1. Core Scraper Functions
Phần này định nghĩa các hàm xử lý chính:
* **`scrape_uq_course`**: Lấy thông tin tổng quan từ trang chủ môn học.
* **`scrape_assessment_table`**: Đi sâu vào link ECP để bóc tách bảng điểm.
* **`clean_assessment_task`**: Dùng Regex để làm sạch tên bài tập và gắn cờ (`is_hurdle`, `is_identity_verified`, `is_in_person`, `is_team_based`).
* **`get_full_course_data`**: Hàm tổng hợp, gọi `scrape_uq_course` rồi `scrape_assessment_table` cho một mã môn.

In [2]:
def extract_course_codes(text):
    """Return every course code found in *text*, in order of appearance.

    A course code is four uppercase ASCII letters immediately followed by
    four digits (for example ``CSSE1001``). Matching is case-sensitive and
    duplicates are kept.
    """
    code_pattern = re.compile(r'[A-Z]{4}\d{4}')
    return [match.group(0) for match in code_pattern.finditer(text)]

def scrape_uq_course(course_code, timeout=30):
    """Scrape the summary page of one UQ course.

    Args:
        course_code: Course code placed in the ``course_code`` query
            parameter of the course page URL.
        timeout: Seconds to wait for the HTTP response before giving up, so
            a stalled connection cannot block a worker forever.

    Returns:
        A dict with the keys ``code``, ``title``, ``units``, ``level``,
        ``faculty``, ``school``, ``description``, ``contact_hours``,
        ``assessment_summary``, ``prerequisites_text``,
        ``prerequisites_list``, ``incompatible_list``, ``coordinator``,
        ``ecp_link`` and ``url``. Text fields whose element is missing are
        ``"N/A"``; ``units`` is an ``int`` or ``None`` when it cannot be
        parsed; ``ecp_link`` is ``""`` when no profile link is present.
        Returns ``None`` when the response status is not 200, when the page
        has no ``course-title`` element, or when any error occurs.
    """
    url = f"https://my.uq.edu.au/programs-courses/course.html?course_code={course_code}"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Helper that reads an element's text safely, falling back to "N/A".
        def get_text(selector_id, separator=''):
            element = soup.find(id=selector_id)
            return element.get_text(separator=separator, strip=True) if element else "N/A"

        # A page without a title element is not a course page. Treat it as
        # "not found" explicitly instead of relying on a later parse error.
        if soup.find(id='course-title') is None:
            return None

        # 1. Identifying information
        full_title = get_text('course-title')
        # Keep only the course name (drop the course code in parentheses).
        course_name = re.sub(r'\s\([A-Z]{4}\d{4}\)', '', full_title)

        # 2. Detailed information (summary panel)
        level = get_text('course-level')
        faculty = get_text('course-faculty')
        school = get_text('course-school')
        contact_hours = get_text('course-contact', separator=' ')

        # A missing or non-integer units value must not discard the whole
        # record; store None and keep the rest of the data.
        try:
            units = int(get_text('course-units'))
        except ValueError:
            units = None

        # 3. Prerequisites and constraints
        prereq_raw = get_text('course-prerequisite')
        incomp_raw = get_text('course-incompatible')

        # 4. Content summary and preliminary assessment information
        description = get_text('course-summary')
        assessment_summary = get_text('course-assessment-methods')
        coordinator = get_text('course-coordinator')

        # 5. Important link: the course profile (ECP)
        ecp_link = ""
        ecp_tag = soup.find('a', class_='profile-available')
        if ecp_tag:
            # .get() avoids a KeyError on an anchor without an href attribute.
            ecp_link = ecp_tag.get('href', '')
            # If the link is relative, prepend the domain.
            if ecp_link.startswith('/'):
                ecp_link = "https://programs-courses.uq.edu.au" + ecp_link

        return {
            "code": course_code,
            "title": course_name,
            "units": units,
            "level": level,
            "faculty": faculty,
            "school": school,
            "description": description,
            "contact_hours": contact_hours,
            "assessment_summary": assessment_summary,
            "prerequisites_text": prereq_raw,
            "prerequisites_list": extract_course_codes(prereq_raw),
            "incompatible_list": extract_course_codes(incomp_raw),
            "coordinator": coordinator,
            "ecp_link": ecp_link,
            "url": url
        }
    except Exception as e:
        # Best-effort boundary: one bad page must not abort the whole crawl.
        print(f"Error scraping {course_code}: {e}")
        return None

def _trim_dangling_brackets(text):
    """Drop unmatched brackets at either end of *text*; keep balanced ones."""
    while True:
        opens, closes = text.count('('), text.count(')')
        if closes > opens and text[:1] == ')':
            text = text[1:]
        elif closes > opens and text[-1:] == ')':
            text = text[:-1]
        elif opens > closes and text[-1:] == '(':
            text = text[:-1]
        elif opens > closes and text[:1] == '(':
            text = text[1:]
        else:
            return text
        text = text.strip(' ,')


def clean_assessment_task(raw_name):
    """Split a raw assessment task label into a clean name and tag flags.

    Args:
        raw_name: Task text, possibly containing the tags "Hurdle",
            "Identity Verified", "In-person" or "Team or group-based".

    Returns:
        A ``(clean_name, flags)`` tuple. ``flags`` maps ``is_hurdle``,
        ``is_identity_verified``, ``is_in_person`` and ``is_team_based`` to
        booleans. ``clean_name`` is the text with the tag phrases removed,
        whitespace collapsed and leftover separators trimmed. Brackets that
        belong to the name itself, such as "Quiz (Part A)", are preserved.
    """
    # (flag key, detection pattern, removal pattern). Detection is a plain
    # case-insensitive substring search. "team" is deliberately left broad
    # and unanchored because tags can be glued straight onto the task name.
    tag_rules = (
        ("is_hurdle", r'hurdle', r'\(?Hurdle\)?'),
        ("is_identity_verified", r'identity verified', r'\(?Identity Verified\)?'),
        ("is_in_person", r'in-person', r'\(?In-person\)?'),
        ("is_team_based", r'team', r'\(?Team or group-based\)?'),
    )

    flags = {}
    clean_name = raw_name
    for key, detect_pattern, remove_pattern in tag_rules:
        # 1. Record whether the keyword occurs (case-insensitive).
        flags[key] = re.search(detect_pattern, raw_name, re.IGNORECASE) is not None
        # 2. Remove the tag phrase, together with a bracket directly around it.
        clean_name = re.sub(remove_pattern, '', clean_name, flags=re.IGNORECASE)

    # 3. Tidy leftovers: doubled commas, emptied brackets, extra whitespace.
    clean_name = clean_name.replace(', ,', ',')
    clean_name = re.sub(r'\(\s*\)', '', clean_name)
    clean_name = re.sub(r'\s+', ' ', clean_name).strip(' ,')

    # Only brackets orphaned by tag removal are dropped. Blindly stripping
    # "()" from both ends would corrupt names that end in a bracketed part.
    clean_name = _trim_dangling_brackets(clean_name)

    return clean_name, flags

def scrape_assessment_table(ecp_url, timeout=30):
    """Scrape the assessment rows from a course profile (ECP) page.

    Args:
        ecp_url: URL of the course profile. Empty values and ``"N/A"``
            short-circuit to an empty list.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``category``, ``assesment_task``
        (cleaned task name; the key spelling is kept for compatibility),
        ``weight`` (fraction, e.g. 0.2 for "20%", or 0 when no number is
        present), ``due_date`` (``"N/A"`` when the row has no fourth cell)
        and ``flags`` (see ``clean_assessment_task``). Returns ``[]`` when no
        table is found or when any error occurs.
    """
    if not ecp_url or ecp_url == "N/A":
        return []

    # NOTE: V1 assumes the link already leads to a page that contains the
    # assessment table; some links may need an extra path segment to reach
    # the full assessment section.

    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(ecp_url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # First try the course-profile section container.
        table = soup.find('section', class_='section section--course-profile section--in-view')

        if not table:
            # Plan B: take the first table whose text mentions "Weight".
            table = next((t for t in soup.find_all('table') if "Weight" in t.text), None)

        if not table:
            return []

        assessments = []
        for row in table.find_all('tr')[1:]:  # skip the header row
            cols = row.find_all('td')
            # Category, task and weight are all required. Rows with fewer
            # than three cells are skipped, because indexing cols[2] on them
            # would raise and discard every assessment collected so far.
            if len(cols) < 3:
                continue

            category = cols[0].get_text(strip=True)
            task_name_raw = cols[1].get_text(strip=True)
            weight_raw = cols[2].get_text(strip=True)
            due_date = cols[3].get_text(separator=' ', strip=True) if len(cols) > 3 else "N/A"

            # Clean the weight: "20%" -> 0.2. Decimals such as "12.5%" are kept.
            weight_match = re.search(r'\d+(?:\.\d+)?', weight_raw)
            weight_value = float(weight_match.group()) / 100 if weight_match else 0

            clean_name, flags = clean_assessment_task(task_name_raw)

            assessments.append({
                "category": category,
                "assesment_task": clean_name,
                "weight": weight_value,
                "due_date": due_date,
                "flags": flags
            })

        return assessments
    except Exception as e:
        # Best-effort boundary: a broken profile page yields no assessments.
        print(f"L·ªói khi c√†o b·∫£ng ƒëi·ªÉm t·∫°i {ecp_url}: {e}")
        return []

# --- AGGREGATE FUNCTION ---
def get_full_course_data(course_code):
    """Scrape one course and attach its assessment list.

    Args:
        course_code: Course code in any letter case; it is upper-cased
            before the lookup.

    Returns:
        The dict produced by ``scrape_uq_course`` with an added
        ``assessments`` list, or ``None`` when the course could not be
        scraped. ``assessments`` is always present on a returned record; it
        is ``[]`` when the course has no ECP link, so every exported record
        shares the same keys.
    """
    course_code = course_code.upper()
    # Step 1: fetch the general course information.
    course_data = scrape_uq_course(course_code)

    if course_data:
        # Step 2: fetch the assessment table when a profile link exists.
        ecp_link = course_data['ecp_link']
        course_data['assessments'] = scrape_assessment_table(ecp_link) if ecp_link else []

    return course_data

## 📥 2. Load Input Data
Đọc danh sách mã môn học (Course Codes) từ file `data/all_course_codes.json` đã được quét ở bước trước (Scan Phase).

In [3]:
import os 
import json

# --- PATH CONFIGURATION ---
# From the 'scraper' folder, step up one level (..) and into the 'data' folder.
input_path = os.path.join('..', 'data', 'all_course_codes.json')

# Read the list of course codes; fall back to an empty list when the file
# does not exist so the later cells can still run.
try:
    with open(input_path, 'r', encoding='utf-8') as f:
        course_list = json.load(f)
except FileNotFoundError:
    print(f"‚ùå L·ªói: Kh√¥ng t√¨m th·∫•y file t·∫°i '{input_path}'")
    print("üëâ H√£y ki·ªÉm tra l·∫°i xem file json ƒë√£ n·∫±m trong folder 'data' ch∆∞a.")
    course_list = []
else:
    print(f"‚úÖ ƒê√£ t√¨m th·∫•y file t·∫°i: {input_path}")
    print(f"‚úÖ ƒê√£ load th√†nh c√¥ng {len(course_list)} m√£ m√¥n.")

✅ Đã tìm thấy file tại: ../data/all_course_codes.json
✅ Đã load thành công 3860 mã môn.


## 🚀 3. Main Execution Loop
Bắt đầu quá trình cào dữ liệu chi tiết.
> **Lưu ý:**
> * Sử dụng `tqdm` để hiển thị thanh tiến độ.
> * Script chạy song song bằng `ThreadPoolExecutor` với `MAX_WORKERS` luồng và không tạm nghỉ (`time.sleep`) giữa các request; hãy giữ `MAX_WORKERS` ở mức vừa phải để tuân thủ quy tắc Rate Limiting của UQ.

In [4]:
# --- SPEED CONFIGURATION ---
# Number of threads running in parallel. Do not set this too high or UQ will
# block the client; 5-10 is considered safe.
# NOTE(review): the value below is 20, above the range called safe here.
MAX_WORKERS = 20

# Accumulators filled by the main loop: scraped records and failed codes.
results, failed_courses = [], []

print(f"üöÄ B·∫Øt ƒë·∫ßu c√†o d·ªØ li·ªáu v·ªõi {MAX_WORKERS} lu·ªìng song song...")


# Wrapper that contains exceptions raised inside a worker thread.
def process_course(code):
    """Scrape one course inside a worker thread without ever raising.

    Args:
        code: Course code passed on to ``get_full_course_data``.

    Returns:
        Whatever ``get_full_course_data`` returns, or ``None`` when it
        raises. The failure is printed rather than silently swallowed, so a
        failed course can be diagnosed from the run output.
    """
    try:
        # No time.sleep here: the original author relies on network latency
        # as natural pacing. Add time.sleep(0.1) for extra safety if needed.
        return get_full_course_data(code)
    except Exception as e:
        print(f"Error processing {code}: {e}")
        return None

# RUN THE SCRAPE ON A THREAD POOL
with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit every course code up front, remembering which future belongs
    # to which code.
    future_to_code = {}
    for code in course_list:
        future_to_code[executor.submit(process_course, code)] = code

    # tqdm advances as each task finishes (completion order).
    completed = concurrent.futures.as_completed(future_to_code)
    for future in tqdm(completed, total=len(course_list), desc="Downloading"):
        code = future_to_code[future]
        try:
            data = future.result()
        except Exception as exc:
            print(f"‚ö†Ô∏è {code} sinh ra l·ªói: {exc}")
            failed_courses.append(code)
        else:
            # A falsy result means the course could not be scraped.
            if data:
                results.append(data)
            else:
                failed_courses.append(code)


🚀 Bắt đầu cào dữ liệu với 20 luồng song song...


Downloading: 100%|██████████| 3860/3860 [11:52<00:00,  5.42it/s]


## 📊 4. Data Verification & Export
* Kiểm tra nhanh dữ liệu bằng `pandas`.
* Xuất toàn bộ dữ liệu sạch ra file `data/master_courses.json` để chuẩn bị import vào Supabase.

In [6]:
# Tabular view of the scraped records for quick inspection in the notebook.
df = pd.DataFrame(results)

# Write every scraped record to the master JSON file, keeping non-ASCII
# characters readable instead of escaping them.
output_path = os.path.join('..', 'data', 'master_courses.json')
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(json.dumps(results, ensure_ascii=False, indent=4))

scraped_count = len(results)
failed_count = len(failed_courses)
print(f"‚úÖ Ho√†n t·∫•t! ƒê√£ c√†o {scraped_count} m√¥n. (Th·∫•t b·∫°i: {failed_count})")
print(f"‚úÖ ƒê√£ l∆∞u file th√†nh c√¥ng t·∫°i: {output_path}")

✅ Hoàn tất! Đã cào 3860 môn. (Thất bại: 0)
✅ Đã lưu file thành công tại: ../data/master_courses.json
