# Scrape Unit of Competency
---
### ▶▶ Run All
---

In [46]:
! python -m pip install -r ../requirements.txt

Defaulting to user installation because normal site-packages is not writeable


In [47]:
# Make the project's src directory importable

import sys

src_path = "../src/"

# Guard against piling up duplicate entries when this cell is re-run
# in the same kernel session.
if src_path not in sys.path:
    sys.path.append(src_path)
print(f"📘 Added src directory to the path: {src_path}")

📘 Added src directory to the path: ../src/


In [48]:
# Get the unit of competency code from the user.
# Keeps prompting until a valid code is entered; an empty answer aborts the run.

import re
from utils import StopExecution

while True:
    unit_code = input("Enter the unit of competency code: ")

    # Pressing Enter without typing anything cancels the notebook run
    if unit_code == "":
        print("🟤 Input aborted by user")
        raise StopExecution

    # Tolerate stray whitespace around the code (e.g. from copy-paste)
    unit_code = unit_code.strip()

    # fullmatch requires the WHOLE string to fit the pattern; re.match only
    # anchors at the start and would accept trailing junk such as "CUADIG304xyz",
    # which would then end up in the unit path and download URL.
    if re.fullmatch(r'[A-Z]{3}[A-Z]{3}\d{3}', unit_code):
        print(f"🔹 Unit of Competency Code: {unit_code}")
        break

    print("🟠 Invalid unit code. Please try again.")



🔹 Unit of Competency Code: CUADIG304


In [49]:
# Derive the unit's directory path from its code

from utils import unit_path_from_code

folder_path = unit_path_from_code(unit_code)
print(f"🔹 Unit Directory Path: {folder_path}")

🔹 Unit Directory Path: ../Units/CUADIG304


In [50]:
# Download the unit's XML file from the training.gov.au website

import requests
from utils import get_unit_xml_url, StopExecution

# Seconds to wait for the server. Without a timeout, requests.get can
# block the notebook indefinitely if the server never answers.
REQUEST_TIMEOUT = 30

url = get_unit_xml_url(unit_code)

try:
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
except requests.RequestException as error:
    # Connection failures and timeouts never yield a status code to inspect,
    # so report them here and stop the run the same way as the HTTP errors below.
    print(f"🔴 Failed to retrieve XML file. Error: {error}")
    raise StopExecution from None

if response.status_code == 200:
    print("🟢 Successfully retrieved XML file.")

elif response.status_code == 404:
    # Reported separately so the user gets a hint to check the unit code
    print(f"🔴 XML file not found. Status code: {response.status_code}")
    print("🔵 Please check the unit code and try again.")
    raise StopExecution

else:
    print(f"🔴 Failed to retrieve XML file. Status code: {response.status_code}")
    raise StopExecution

🟢 Successfully retrieved XML file.


In [51]:
# Turn the downloaded XML into an element tree

from lxml import etree

# Raw body of the HTTP response fetched earlier
xml_payload = response.content
root = etree.fromstring(xml_payload)

print("🟢 Successfully parsed the XML file")

🟢 Successfully parsed the XML file


In [52]:
# Read the unit's title out of the parsed XML

from utils import namespaces

# Find the Book whose Description mentions "Release", then take the value
# of its variable assignment named "Title".
title_xpath = (
    './/a:Book[.//a:Description[contains(text(), "Release")]]'
    '//a:VariableAssignments'
    '/a:VariableAssignment[./a:Name/text()="Title"]'
    '/a:Value/text()'
)
title_matches = root.xpath(title_xpath, namespaces=namespaces)
unit_title = title_matches[0]  # first match wins

print("🟢 Successfully extracted Unit Title.")

🟢 Successfully extracted Unit Title.


In [53]:
# Extract Elements and Performance Criteria

def split_pc(x):
    """Split a performance criterion of the form '<index> <description>'.

    The text is divided at the first space: the leading token becomes
    'index' and the remainder becomes 'description'.
    Raises ValueError if the text contains no space.
    """
    index, description = x.split(' ', 1)
    return {'index': index, 'description': description}


xpath = './/a:Topic[.//a:Description[text()=$topic_title]]'
topic_title = "Elements and Performance Criteria"
topic = root.xpath(xpath, namespaces=namespaces, topic_title=topic_title)[0]
# The first two table rows are skipped (presumably heading rows — confirm against the XML)
rows = topic.xpath('./a:Text/a:table/a:tr[position() > 2]', namespaces=namespaces)

elements = []
for row in rows:
    # First cell: the element text, with index and title separated by ". "
    element = row.xpath('./a:td[1]/a:p/text()', namespaces=namespaces)[0]
    element_index, element_title = element.split('. ', 1)

    # Second cell: one text entry per performance criterion
    performance_criteria = [
        split_pc(criterion_text)
        for criterion_text in row.xpath('./a:td[2]/a:p/text()', namespaces=namespaces)
    ]

    elements.append({
        'index': element_index,
        'title': element_title,
        'performance_criteria': performance_criteria,
    })

print("🟢 Successfully extracted Elements and Performance criteria.")

🟢 Successfully extracted Elements and Performance criteria.


In [54]:
# Collect the rows of the "Foundation Skills" table

section = root.xpath(
    './/a:Topic[.//a:Description[text()=$topic_title]]',
    namespaces=namespaces,
    topic_title="Foundation Skills",
)[0]
# Every table row after the first one
skill_rows = section.xpath('./a:Text/a:table/a:tr[position() > 1]', namespaces=namespaces)

foundational_skills = []
for skill_row in skill_rows:
    skill = skill_row.xpath('./a:td[1]/a:p/text()', namespaces=namespaces)[0]
    descriptions = skill_row.xpath('./a:td[last()]/a:p/text()', namespaces=namespaces)

    # If the second cell is also the last cell, the row has no separate
    # performance-criteria column; otherwise the second cell holds a
    # ", "-separated list of criteria.
    second_cell = skill_row.xpath('./a:td[2]', namespaces=namespaces)
    last_cell = skill_row.xpath('./a:td[last()]', namespaces=namespaces)
    if second_cell == last_cell:
        _performance_criteria = []
    else:
        criteria_text = skill_row.xpath('./a:td[2]/a:p/text()', namespaces=namespaces)[0]
        _performance_criteria = criteria_text.split(', ')

    foundational_skills.append(dict(
        skill=skill,
        performance_criteria=_performance_criteria,
        descriptions=descriptions,
    ))

print("🟢 Successfully extracted Foundation Skills.")

🟢 Successfully extracted Foundation Skills.


In [55]:
# Collect the paragraph texts of the "Performance Evidence" topic

performance_topic = root.xpath(
    './/a:Topic[.//a:Description[text()=$topic_title]]',
    namespaces=namespaces,
    topic_title="Performance Evidence",
)[0]
performance_evidence = performance_topic.xpath('./a:Text/a:p/text()', namespaces=namespaces)

print("🟢 Successfully extracted Performance Evidence.")

🟢 Successfully extracted Performance Evidence.


In [56]:
# Collect the paragraph texts of the "Knowledge Evidence" topic

knowledge_topic = root.xpath(
    './/a:Topic[.//a:Description[text()=$topic_title]]',
    namespaces=namespaces,
    topic_title="Knowledge Evidence",
)[0]
knowledge_evidence = knowledge_topic.xpath('./a:Text/a:p/text()', namespaces=namespaces)

print("🟢 Successfully extracted Knowledge Evidence.")

🟢 Successfully extracted Knowledge Evidence.


In [57]:
# Collect the paragraph texts of the "Assessment Conditions" topic

conditions_topic = root.xpath(
    './/a:Topic[.//a:Description[text()=$topic_title]]',
    namespaces=namespaces,
    topic_title="Assessment Conditions",
)[0]
assessment_conditions = conditions_topic.xpath('./a:Text/a:p/text()', namespaces=namespaces)

print("🟢 Successfully extracted Assessment Conditions.")

🟢 Successfully extracted Assessment Conditions.


In [58]:
# Persist the extracted unit details as a JSON file in the unit directory

import json
import os

# Bundle every extracted section into one record
uoc = dict(
    unit_code=unit_code,
    unit_title=unit_title,
    elements=elements,
    foundational_skills=foundational_skills,
    performance_evidence=performance_evidence,
    knowledge_evidence=knowledge_evidence,
    assessment_conditions=assessment_conditions,
)

# Make sure the target directory is present before writing into it
os.makedirs(folder_path, exist_ok=True)

# Build the output path, normalising backslashes to forward slashes
details_name = f"{unit_code}_details.json"
filename = os.path.join(folder_path, details_name).replace("\\", "/")

# Serialise with a two-space indent so the file is human-readable
with open(filename, "w") as file:
    file.write(json.dumps(uoc, indent=2))

print(f"🟢 Successfully saved unit details to: {filename}")

🟢 Successfully saved unit details to: ../Units/CUADIG304/CUADIG304_details.json
