# Working Info Retriver for EmptyRooms at YorkU

Retrieves the schedule info of all courses from York courses website

In [9]:
import os
import re
import json
from bs4 import BeautifulSoup
import unicodedata
import chardet

from selenium import webdriver

from datetime import datetime

### Step 1: preperation

In [2]:
driver = webdriver.Firefox()

In [3]:
course_site_url = r"https://w2prod.sis.yorku.ca/Apps/WebObjects/cdm"
base_site_url = r"https://w2prod.sis.yorku.ca"

In [4]:
driver.get(course_site_url)

Follow the following instructions

- Login with your York account manually.
- Navigate to "View Active Course Timetables by Faculty".
- Download all Course Timetable pages of each faculty from the page (by clicking `Ctrl + S`), store them in a single directory.

### Step 2: Active Course Timetables html to per dept file

Change the following strings to directory names:

`input_dir` is where you downloaded the html table files in\
`output_dir` is where you want to output the `<DEPT>.json` files\
they will be in the CourseDelta data files format but without the descriptions and with schedule data\

`dept_list_file` should point to a .txt file that looks like:
```
<select size="10" class="bodytext" id="subjectSelect" name="subjectPopUp"><option selected="selected" value="0">ACTG - Accounting - ( SB, ED ) </option><option value="1">ADLW - Administrative Law - ( GS ) </option><option value="2">ADMS - Administrative Studies - ( AP ) </option><option value="3">ALDR - Dispute Resolution - ( GS ) </option><option value="4">ANTH - Anthropology - ( AP, GS ) </option><option value="5">ARB  - Arabic - ( AP ) </option><option value="6">ARTH - Art History - ( FA, GS ) </option><option value="7">ARTM - Arts and Media - ( SB ) </option>

...
```

which is the html elements plain text in the 'Subject' option list in 'Search Current Courses by Subject' page\
(tips: Inspect element -> `Ctrl + C` the whole `<select id="subjectSelect" ...>` tag)

In [13]:
input_dir = 'active schedules 2023_09_12'
output_dir = 'output_courses_json 2023_09_12 b'
dept_list_file = 'dept_list_f2023_w2024.txt'

this opens the .html files and read them to python code space

In [11]:
lis = os.listdir(input_dir)

faculties = {}

for fname in lis:
    if fname[-5:] != '.html':
        continue
    
    # detect encoding first
    f = open(os.path.join(input_dir,fname), 'rb')
    # f = open(os.path.join(input_dir,fname), 'r', encoding="utf-8")
    content = f.read()
    f.close()
    encoding = chardet.detect(content)['encoding']
    
    # then read as string
    f = open(os.path.join(input_dir,fname), 'r', encoding=encoding)
    content = f.read()
    f.close()
    
    faculty = fname[-7:-5]
    faculties[faculty] = content

In [12]:
#faculties['ED']

this parses the html files and read all schedule to python objects/dicts

In [14]:
depts = {}

In [15]:
cur_faculty = ''
cur_dept = ''
cur_term = ''
cur_course_title = ''

for f in faculties:
    soup = BeautifulSoup(faculties[f], 'html.parser')
    main_table = soup.find_all('tr', {"bgcolor": "#000000"})[0].parent
    main_table_rows = main_table.findChildren(recursive=False)
    
    for row in main_table_rows[1:]:
        if len(row.find_all('td', {"colspan": "8"})) > 0:
            texts = row.find_all('strong')
            cur_faculty = texts[0].text.strip()
            cur_dept = texts[1].text.strip()
            cur_term = texts[2].text.strip()
            cur_course_title = texts[3].text.strip()
            
            #print(cur_faculty_dept)
            
        else:
            tds = row.findChildren(recursive=False)
            
            if tds[0].get("colspan") == "5":
                mtype = tds[1].text.strip()
                mnumb = tds[2].text.strip()
                details = tds[4]
            else:
                codes = tds[1].text.strip()
                sp = re.sub('\s+', ' ', codes).split(' ')
                ccode = sp[0]
                cred = sp[1]
                sect = sp[2]
            
                mtype = tds[3].text.strip()
                mnumb = tds[4].text.strip()
                details = tds[6]
            
            
            
            ckey = {'faculty': cur_faculty, 'dept': cur_dept, 'code': ccode, 'credit': float(cred)}
            if cur_dept not in depts:
                depts[cur_dept] = {}
            if str(ckey) not in depts[cur_dept]:
                depts[cur_dept][str(ckey)] = {'key': ckey, 'name': cur_course_title, 'desc': '', 'prereq':[], 'schedule':{}}
            #print(depts[cur_dept])
            if cur_term+'-'+sect not in depts[cur_dept][str(ckey)]['schedule']:
                depts[cur_dept][str(ckey)]['schedule'][cur_term+'-'+sect] = {'term': cur_term, 'section': sect, 'classes':{}}
            if mtype+" "+mnumb not in depts[cur_dept][str(ckey)]['schedule'][cur_term+'-'+sect]['classes']:
                depts[cur_dept][str(ckey)]['schedule'][cur_term+'-'+sect]['classes'][mtype+" "+mnumb] = {'name':mtype+" "+mnumb, 'timeslot':[]}
            
            if len(details.findChildren(recursive=False)) > 0:
                details_rows = details.find_all('tr')
            
                for dr in details_rows:
                    ds = dr.find_all('td')
                    weekday = ds[0].text.strip()
                    time = ds[1].text.strip()
                    duration = ds[2].text.strip()
                    campus = ds[3].text.strip()
                    room = ds[4].text.strip()
                    depts[cur_dept][str(ckey)]['schedule'][cur_term+'-'+sect]['classes'][mtype+" "+mnumb]['timeslot'].append({'weekday': weekday, 'time': time, 'duration': duration, 'room': room, 'campus': campus})
            else:
                weekday = ''
                time = ''
                duration = ''
                campus = ''
                room = ''
                depts[cur_dept][str(ckey)]['schedule'][cur_term+'-'+sect]['classes'][mtype+" "+mnumb]['timeslot'].append({'weekday': weekday, 'time': time, 'duration': duration, 'room': room, 'campus': campus})
            
            #depts[cur_dept][str(ckey)]['schedule'][cur_term+'-'+sect]['classes'][mtype+" "+mnumb]['timeslot'].append({'weekday': weekday, 'time': time, 'duration': duration, 'room': room, 'campus': campus})
            
            #print(ccode, cred, sect, mtype, mnumb, weekday, time, duration, campus, room)
            # break

In [16]:
#depts['ADMS']["{'faculty': 'AP', 'dept': 'ADMS', 'code': '1000', 'credit': 3.0}"]

In [17]:
print('number of depts:', len(depts))

number of depts: 207


this stores all the transformed files to `output_dir`

In [18]:
depts_new = {}

In [19]:
for dep in depts:
    depts_new[dep] = []
    for clas in depts[dep]:
        src = depts[dep][clas]
        res = {'key': src['key'], 'name': src['name'], 'desc': '', 'prereq':[], 'schedule':[]}
        for sce in src['schedule']:
            src2 = src['schedule'][sce]
            res2 = {'term': src2['term'], 'section': src2['section'], 'classes':[]}
            for clas_type in src2['classes']:
                res2['classes'].append(src2['classes'][clas_type])
            res['schedule'].append(res2)
        depts_new[dep].append(res)

In [20]:
len(depts_new)

207

In [21]:
#depts_new['ADMS'][0]

In [22]:
def read_dept_list_from_file(file_text):
    soup = BeautifulSoup(file_text, 'html.parser')
    res = {}

    options = soup.find_all('option')
    for op in options:
        id = op['value']
        inner_html = op.string
        split = inner_html.split('-')
        depcode = split[0].strip()
        depname = "-".join(split[1:-1]).strip()
        faculties = split[-1].replace('(', '').replace(')', '').strip().split(',')
        faculties = [f.strip() for f in faculties]
        res[depcode] = {"dept_code": depcode, "dept_name": depname, "faculties": faculties}

    return res

In [23]:
dept_h_list = read_dept_list_from_file(open(dept_list_file, 'r').read())

In [24]:
for dep in depts_new:
    fname = os.path.join(output_dir, dep+'.json')
    
    obj = {"dept_code": dep, "dept_name": dept_h_list[dep]['dept_name'], "faculties": dept_h_list[dep]['faculties'], "courses": depts_new[dep]}
    
    f = open(fname, 'w')
    f.write(json.dumps(obj))
    f.close()

In [25]:
#dept_h_list['EECS']

after running the above codes you will have

```
<output_dir>
|- ACTG.json
|- ADLW.json
|- ADMS.json
...
(all the depts)
```

in the directory specified by `output_dir`

### Step 3: Transform to `all_rooms_09_12.json`

Transform the above json files to one `all_rooms.json` file to be put in EmptyRooms server

Change:\
`out_pth` to the directory to store the result file in\
`out_name` will be the name of the result 'all rooms' json file

In [26]:
in_pth = output_dir
out_pth = r'emptyrooms_fw23'
out_name = "all_rooms_09_12_b.json"

this grabs the above `<DEPT>.json` files and transform them to `all_rooms.json`

In [27]:
dir = os.listdir(in_pth)
rooms = {}
terms = {}

# for each of the files
for fil in dir:
    if fil[-5:] != '.json':
        continue

    f = open(os.path.join(in_pth, fil), "r")
    j = json.loads(f.read())

    courses = j["courses"]

    for course in courses:
        fullcode = course["key"]

        faculty = course["key"]["faculty"].upper()
        dept = course["key"]["dept"].upper()
        num_code = course["key"]["code"]
        credit = course["key"]["credit"]

        formcode = "{}/{} {} {}".format(faculty, dept, num_code, credit)
        formobj = {"faculty": faculty, "dept": dept, "code": num_code, "credit": credit}
        sections = course["schedule"]
        for sec in sections:
            classes = sec["classes"]
            term = sec["term"]
            if term not in terms:
                terms[term] = 0
            terms[term] += 1
            section = sec["section"]
            for cl in classes:
                sched = cl["timeslot"]
                title = cl["name"].replace("   ", " ").replace("  ", " ")
                for item in sched:
                    room = item["room"].replace("   ", " ").replace("  ", " ")
                    day = item["weekday"]
                    time = item["time"]
                    duration = item["duration"]

                    itemobj = {"course": formobj, "section": section, "term": term, "title": title, "day": day, "time": time, "duration": duration, }

                    if room in rooms:
                        rooms[room].append(itemobj)
                    else:
                        rooms[room] = [itemobj,]

# print terms
print(terms)

# write back to output
outp = json.dumps(rooms)
g = open(os.path.join(out_pth, out_name), "w")
g.write(outp)

{'F': 3615, 'W': 3680, 'F2': 167, 'W2': 161, 'A': 33, 'C': 21, 'N': 21, 'M': 8, 'F3': 18, 'W3': 18, 'Y': 2201, 'B2': 3, 'B4': 6, 'WS': 17, 'B1': 1, 'FA': 4, 'WA': 20, 'FB': 4, 'WL': 11, 'E1': 1, 'FE': 2, 'WB': 15, 'WP': 11, 'LB': 1, 'EF': 15, 'EW': 36, 'ER': 12, 'GH': 3, 'B3': 4, 'LC': 1, 'LA': 1, 'Z1': 6, 'F4': 1, 'W4': 1, 'FP': 3, 'WE': 2, 'FS': 3, 'LD': 2}


3191991

### Step 4: Term dates

generates the `term_dates.json` file used by EmptyRooms server

- Go to 'Search Course by Term', there wil be a 'Term Start and End Dates' table
- Save the table as a text file 
  - Inspect element -> `Ctrl + C` the whole `<table ...>` tag
- follow the next steps

Change:
`in_pth` to saved text file
`out_pth` to the path you want to store the `term_dates.json` file
`year` to current year (year of start of term, put 2023 for F23-W24)

In [29]:
in_pth = r'..\scraped backup\emptyrooms_fw23\term_dates_23_24_b.txt'
out_pth = r'..\scraped backup\emptyrooms_fw23\term_dates_23_24_b.json'
year = 2023

In [30]:
months = {"jan": 1, "feb":2, "mar":3, "apr":4, "may":5, "jun":6, "jul":7, "aug":8, "sep":9, "oct":10, "nov":11, "dec":12}

# process a date in the format of 'March 2', 'Sept. 16', etc. found on the page
def date_process(st, year):
    sp = st.split(' ')
    if len(sp) < 2: # for the "TBD" case
        return str(year) + "-04-31"
    month = months[sp[0].lower()[:3]]
    if month == None:
        return str(year) + "-04-31"
    day = sp[1]
    if month > 8 or (month == 8 and day > 15):
        return "{}-{:02d}-{:02d}".format(year, month, day)
    else:
        return "{}-{:02d}-{:02d}".format(year+1, month, day)

In [31]:
f = open(in_pth)
soup = BeautifulSoup(f.read(), 'html.parser')
trs = soup.find_all('tbody')[0].find_all('tr')

res = []
for tr in trs[1:]:
    tds = tr.find_all('td')
    term = tds[0].text
    faculty = tds[1].text
    # start = date_process(tds[2].text, year)
    # end = date_process(tds[-1].text, year)
    start = datetime.strptime(tds[2].text, "%b %d, %Y").strftime("%Y-%m-%d")
    end = datetime.strptime(tds[-1].text, "%b %d, %Y").strftime("%Y-%m-%d")
    res.append({"term": term, "faculty": faculty, "start": start, "end": end})

# write back to output
outp = json.dumps(res)
g = open(out_pth, "w")
g.write(outp)

6853