In [27]:
# CSV file writing.
import csv

# Dates and times.
import datetime as dt

# Creating directories and handling file paths.
import os

# Regular expressions.
import re

# Convenient HTTP requests.
import requests as rq

In [28]:
# Fetch the CAO points URL. The timeout stops this cell from hanging forever
# if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Fail right here on an HTTP error status instead of carrying an error page
# into the cells that save and parse the response.
resp.raise_for_status()

# Have a quick peek. 200 means OK.
resp

<Response [200]>

In [35]:
# Take a single timestamp so that everything derived from it shares one value.
now = dt.datetime.now()

# Render it as year-month-day, underscore, hour-minute-second (all digits).
nowstr = f'{now:%Y%m%d_%H%M%S}'

In [36]:
# Where the untouched HTML goes: a timestamped name inside the datasets folder.
path = f'datasets/cao2021_{nowstr}.html'


In [37]:
# The server uses the wrong encoding, fix it. Keep a record of what requests
# picked before it is overridden.
original_encoding = resp.encoding

# Change to cp1252 so that resp.text is decoded with that codec.
resp.encoding = 'cp1252'

# Make sure the target folder exists, so this cell also works on a fresh
# checkout. A path with no folder part falls back to the current directory.
os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

# Save the original html file. The encoding is given explicitly so the saved
# copy does not depend on the platform's default text encoding.
with open(path, 'w', encoding='utf-8') as f:
    f.write(resp.text)

In [38]:
# Regular expression used to recognise the lines of interest. Python joins the
# adjacent raw strings below into one pattern before it is compiled.
re_course = re.compile(
    r'([A-Z]{2}[0-9]{3})'  # group 1: two capital letters, then three digits
    r'  '                  # exactly two spaces
    r'(.*)'                # group 2: any run of characters (greedy)
    r'([0-9]{3})'          # group 3: three digits
    r'(\*?)'               # group 4: an optional asterisk
    r' *'                  # any number of trailing spaces
)

In [43]:
# The file path for the csv file.
path = 'cao2021_csv_' + nowstr + '.csv'

# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing. newline='' hands control of line endings to
# the csv module, and the explicit encoding keeps the output independent of
# the platform default.
with open(path, 'w', newline='', encoding='utf-8') as f:
    # The csv writer ends every record with a line terminator and quotes any
    # field that contains a comma, so each match becomes exactly one row.
    writer = csv.writer(f)
    # Loop through lines of the response.
    for line in resp.iter_lines():
        # The lines arrive undecoded; decode them as cp1252, the same codec
        # that resp.encoding was overridden to before the page was saved.
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        match = re_course.fullmatch(dline)
        if match is None:
            continue
        # Add one to the lines counter.
        no_lines += 1
        # Write the four captured groups of the pattern as one csv row.
        writer.writerow(match.groups())

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 922.
