# CAO Points 2021/2020/2019 Level 8 Courses Points Comparison 
***




The first thing we need to do is load the libraries and modules that are required to run this notebook. This list will grow as we describe the dataset.

In [1]:
# Writing well-formed CSV rows.
import csv

# Dates and times.
import datetime as dt

# Regular expressions.
import re

# For downloading.
import urllib.request as urlrq

# Convenient HTTP requests.
import requests as rq

# Data frames.
import pandas as pd

#for plotting
import matplotlib.pyplot as plt
import seaborn as sns

import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly import subplots
from plotly.subplots import make_subplots

In [2]:
# Capture the moment this cell runs.
now = dt.datetime.now()

# Render it as YYYYMMDD_HHMMSS text.
nowstr = f"{now:%Y%m%d_%H%M%S}"

### 2021 Points
http://www.cao.ie/index.php?page=points&p=2021

In [3]:
# following code was devised using videos https://web.microsoftstream.com/video/6524792e-734f-4dc5-8dbd-31efccc3c0a9
# Fetch the CAO points URL. The timeout (in seconds) stops the notebook from
# hanging indefinitely if the server never answers.
resp = rq.get('http://www2.cao.ie/points/l8.php', timeout=30)

# Stop here with a clear HTTP error on a 4xx/5xx reply, rather than letting an
# error page be parsed as if it were the points listing.
resp.raise_for_status()

# Have a quick peek. 200 means OK. To see the entire download we could use resp.text
resp

<Response [200]>

The code below was initially used during the dataset preprocessing; however, it was not needed once the final code was written in the cell below. I want to keep it in the notebook in case I need it for future reference.
***


**Save the original data set** 

Create a file path for the original data:

pathhtml = 'data/cao2021_' + nowstr + '.html'

**The server uses the wrong encoding, fix it.**

original_encoding = resp.encoding

**Change to cp1252.**

resp.encoding = 'cp1252'

**Error on Server**

Technically, the server says we should decode as per:

Content-Type: text/html; charset=iso-8859-1

However, one line uses \x96 which isn't defined in iso-8859-1.

Therefore we use the decoding standard cp1252 instead, which is very similar to iso-8859-1 but does define \x96.

**Save the original html file**.

with open(pathhtml, 'w') as f:

    f.write(resp.text)

In [4]:
# Compile the regular expression for matching course lines. The {} are quantifiers giving how many characters to match: [A-Z]{2} is two capital letters and [0-9]{3} is three digits.
# The first group captures that five-character code. The second group, (.*), captures whatever follows on the line: '.' is the wildcard (any character) and '*' repeats it zero or more times.
#ref https://docs.python.org/3/library/re.html
re_course = re.compile(r'([A-Z]{2}[0-9]{3})(.*)')

In [5]:
# Timestamped location of the 2021 csv file.
path2021 = f'data/cao2021_csv_{nowstr}.csv'

In [6]:
# Keep track of how many courses we process.
no_lines = 0

# Open the csv file for writing. The encoding is given explicitly so the file
# is written as cp1252 on every platform (it is read back as cp1252), instead
# of depending on the platform's default encoding. newline='' leaves the line
# endings to the csv writer.
with open(path2021, 'w', encoding='cp1252', newline='') as f:
    # The csv writer quotes any field that contains a comma or a quote, so a
    # course title with a comma in it cannot shift the columns of its row.
    writer = csv.writer(f, lineterminator='\n')
    # Write a header row with the names of the columns that we need.
    writer.writerow(['code', 'title', 'pointsR1', 'pointsR2'])
    # Loop through lines of the response.
    for line in resp.iter_lines():
        # Decode the raw bytes as cp1252 rather than relying on the encoding
        # reported for the response.
        dline = line.decode('cp1252')
        # Match only the lines representing courses.
        if re_course.fullmatch(dline):
            # Add one to the lines counter.
            no_lines += 1
            # The course code: the first five characters.
            course_code = dline[:5]
            # The course title: a fixed-width column, stripped of padding.
            course_title = dline[7:57].strip()
            # The points fields: everything from column 60 on, split on runs of spaces.
            course_points = re.split(' +', dline[60:])
            # Keep exactly two fields (round one, round two). Extra fields are
            # dropped and missing ones are padded with empty strings, so a
            # short line cannot raise an IndexError.
            course_points = (course_points + ['', ''])[:2]
            # Write the row; the writer puts the commas between the fields.
            writer.writerow([course_code, course_title, course_points[0], course_points[1]])

# Print the total number of processed lines.
print(f"Total number of lines is {no_lines}.")

Total number of lines is 949.


In [7]:
# Read the 2021 csv file into a data frame, decoding it as cp1252 (the same encoding used to decode the lines of the response).
df2021 = pd.read_csv(path2021, encoding='cp1252')

In [8]:
# Show the 2021 data frame as the cell output.
df2021

Unnamed: 0,code,title,pointsR1,pointsR2
0,AL801,Software Design for Virtual Reality and Gaming,300,
1,AL802,Software Design in Artificial Intelligence for...,313,
2,AL803,Software Design for Mobile Apps and Connected ...,350,
3,AL805,Computer Engineering for Network Infrastructure,321,
4,AL810,Quantity Surveying,328,
...,...,...,...,...
944,WD211,Creative Computing,270,
945,WD212,Recreation and Sport Management,262,
946,WD230,Mechanical and Manufacturing Engineering,230,230
947,WD231,Early Childhood Care and Education,266,
