# Epitech modules information extractor

Cmd: /Applications/Arc.app/Contents/MacOS/Arc --remote-debugging-port=9222
I used Playwright for this: it is a browser automation library that allows reading page content directly.

In [1]:
import csv
import re
from playwright.async_api import async_playwright

## Starting Playwright in CDP mode

We use CDP mode because we need a browser that can get past Epitech's new verification guard. CDP (Chrome DevTools Protocol) lets us use the DevTools of a Chromium browser running in a normal window, so we can read directly from what the user sees and bypass the check.

In [2]:
# Start the Playwright driver and attach over CDP to the browser that is
# already listening for remote debugging on localhost:9222.
playwright = await async_playwright().start()
browser = await playwright.chromium.connect_over_cdp("http://localhost:9222")
# Work with the first browser context and the pages it currently holds.
default_context = browser.contexts[0]
pages = default_context.pages

# URL prefix of the module pages; also reused further down to build links.
base_module_url = "https://intra.epitech.eu/module"
# First open page whose URL starts with that prefix, or None when there is none.
epitechPage = next((page for page in pages if page.url.startswith(base_module_url)), None)

# Nothing can be scraped unless a module page is already open in the browser.
if epitechPage is None:
    print("Can't find epitech page")
    exit(1)

## Extract the modules list from the tab

In [3]:
# <dl> inside the third <div> of the sidebar form; module links are looked up
# relative to this node.
base_node = epitechPage.locator('//*[@id="sidebar"]/form/div[3]/dl')

# Disabled step: clicking the semester links is currently not performed.
#semesters = await base_node.locator('//dt/span/a').click()
# Second <a> of each nested <dt>; these are the links clicked to open a module.
moduleLocator = base_node.locator('//dd/dl/dt/a[2]')
# Number of matching links, used later as the bound of the scraping loop.
count = await moduleLocator.count()

## Creating a Module class that will store all the extracted values

In [4]:

class Module:
    """Flat record of the values scraped for one module.

    The constructor receives the raw scraped pieces and flattens them into
    plain attributes that are easy to write as one CSV row:

    * ``activities`` and ``appointements`` are lists of dicts with a
      ``'name'`` and a ``'location'`` key; their names are stored separately
      while their locations are merged into ``locations``.
    * ``project`` is either ``None`` or a dict with the keys ``'name'``,
      ``'link'``, ``'start_date'`` and ``'end_date'``; it is flattened into
      the four ``project_*`` attributes.
    """

    def __init__(
        self,
        title,
        description,
        code,
        instance,
        credits,
        skills,
        startDate,
        endDate,
        endRegistrationDate,
        project,
        activities,
        appointements,
        registered,
    ):
        # Merge location of activities and appointements
        locations = [activity['location'] for activity in activities]
        locations += [appointement['location'] for appointement in appointements]

        def project_field(key):
            # A project can be present while one of its fields is None (that
            # field failed to be extracted); use the same 'N/A' placeholder as
            # for a missing project so the output never mixes both markers.
            value = None if project is None else project.get(key)
            return 'N/A' if value is None else value

        self.title = title
        self.description = description
        self.code = code
        self.instance = instance
        self.credits = credits
        self.skills = skills
        self.startDate = startDate
        self.endDate = endDate
        self.endRegistrationDate = endRegistrationDate
        # True as soon as one location does not mention 'Visio'.
        # NOTE(review): the 'N/A' placeholder location therefore counts as
        # on-site, while a module without any location does not - confirm
        # that this is the intended meaning.
        self.isPresential = any('Visio' not in location for location in locations)
        self.project_name = project_field('name')
        self.project_link = project_field('link')
        self.project_start_date = project_field('start_date')
        self.project_end_date = project_field('end_date')
        self.activities = [activity['name'] for activity in activities]
        self.appointements = [appointement['name'] for appointement in appointements]
        self.locations = locations
        self.registered = registered

    def __str__(self):
        """Return the title followed by the module code in parentheses."""
        return f"{self.title} ({self.code})"

    def __repr__(self):
        """Use the same short form as ``__str__``."""
        return self.__str__()

    def get_url(self, year=2024):
        """Return the module page URL built from the year, code and instance.

        ``year`` defaults to 2024, the value that used to be hard-coded.
        """
        return f"{base_module_url}/{year}/{self.code}/{self.instance}"

## The functions that will extract the content

In [5]:
async def _extract_list_items(css_class):
    """Return a name/location dict for each module list item of one kind.

    Items are the ``<li>`` elements of the module page list whose ``class``
    attribute contains ``css_class`` (a substring match). Each returned dict
    has a ``'name'`` and a ``'location'`` key; the location is ``'N/A'`` when
    it cannot be read within one second.
    """
    items = epitechPage.locator(
        f'//*[@id="module"]/div[3]/div/ul/li[contains(@class, "{css_class}")]'
    )

    results = []
    for index in range(await items.count()):
        item = items.nth(index)

        name = await item.locator('//div[1]/h2/span/a').inner_text()
        try:
            location = await item.locator(
                '//div[2]/div[2]/div/div[2]/div/div[3]/span[2]'
            ).inner_text(timeout=1000)
        except Exception:
            # Best effort: the location is optional. Exception (instead of a
            # bare except) lets task cancellation and Ctrl-C propagate.
            location = None

        results.append({
            'name': name,
            'location': 'N/A' if location is None else location
        })

    return results


async def extract_appointements():
    """Return the list items of the current module whose class contains "rdv".

    Each entry is a dict with the keys ``'name'`` and ``'location'``.
    """
    return await _extract_list_items('rdv')


async def extract_activities():
    """Return the list items of the current module whose class contains "tp".

    Each entry is a dict with the keys ``'name'`` and ``'location'``.
    """
    return await _extract_list_items('tp')

async def _optional_inner_text(locator):
    """Return the locator's inner text, or None if it cannot be read within one second."""
    try:
        return await locator.inner_text(timeout=1000)
    except Exception:
        # Best effort: Exception (not a bare except) keeps task cancellation
        # and Ctrl-C propagating.
        return None


async def extract_project(code, instance):
    """Return the first project listed on the current module page, if any.

    Args:
        code: module code, used to build the project link.
        instance: module instance code, used to build the project link.

    Returns:
        ``None`` when the page lists no project; otherwise a dict with the
        keys ``'name'``, ``'link'``, ``'start_date'`` and ``'end_date'``.
        Every value except the name is ``None`` when it could not be read.
    """
    projects = epitechPage.locator('//*[@id="module"]/div[3]/div/ul/li[contains(@class, "proj")]')
    if await projects.count() == 0:
        return None

    # Only the first matching list item is considered.
    project = projects.nth(0)
    projectTitle = await project.locator('//div[1]/h2/span/a').inner_text()
    projectStartDate = await _optional_inner_text(
        project.locator('//div[2]/div[1]/div[1]/div[1]/span[2]')
    )
    projectEndDate = await _optional_inner_text(
        project.locator('//div[2]/div[1]/div[1]/div[2]/span[2]')
    )

    try:
        href = await project.locator(
            '//div[2]/div[1]/div[2]/div[2]/div[1]/div/ul/li[1]/a'
        ).get_attribute(name='href', timeout=1000)
    except Exception:
        href = None

    # get_attribute returns None when the attribute is absent; building the
    # URL anyway would produce a link ending in "/None".
    if href is None:
        projectLink = None
    else:
        projectLink = f'{base_module_url}/2024/{code}/{instance}/{href}'

    return {
        'name': projectTitle,
        'link': projectLink,
        'start_date': projectStartDate,
        'end_date': projectEndDate
    }
    
async def get_students():
    """Return the first-column text of every row of the registered students table.

    Clicks the first link of the module header block, waits for the page
    'load' state, then reads the rows of the "grid-note" table. Returns an
    empty list when any of these steps fails.
    """
    registered_btn = epitechPage.locator('//*[@id="module"]/div[1]/div[2]/a[1]')

    try:
        await registered_btn.click(timeout=1000)
        # Wait for the page to load
        await epitechPage.wait_for_load_state('load', timeout=2000)

        studentsList = epitechPage.locator('//*[@id="grid-note"]/div[2]/div/table/tbody/tr')
        row_count = await studentsList.count()

        students = []
        for index in range(row_count):
            row = studentsList.nth(index)
            students.append(await row.locator('//td[1]').inner_text())

        return students
    except Exception:
        # Best effort: a missing button or table yields an empty list.
        # Exception (instead of a bare except) lets task cancellation and
        # Ctrl-C propagate.
        return []
    
    
async def extract_module_data():
    """Scrape the module page currently displayed and return it as a Module.

    Reads the header (title, module code, instance code, credits), the
    description, the skills list and the timeline dates, then collects the
    appointments, activities, first project and registered students through
    the dedicated helpers. Progress is printed along the way.
    """
    metadata = epitechPage.locator('//*[@id="module"]/div[1]/div[2]/div[3]')

    title = await metadata.locator('//h1').inner_text()

    # Codes may be wrapped in parentheses on the page; drop them.
    codeModule = await metadata.locator('//div[1]/span').inner_text()
    codeModule = codeModule.lstrip('(').rstrip(')')

    codeInstance = await metadata.locator('//div[2]/span').inner_text()
    codeInstance = codeInstance.lstrip('(').rstrip(')')

    # Keep only the first space-separated token of the credits text.
    creditsNumber = await metadata.locator('//div[3]/span').inner_text()
    creditsNumber = creditsNumber.lstrip('(').split(' ')[0]

    timeline = epitechPage.locator('//*[@id="timeline"]/span/span')
    description = await epitechPage.locator('//*[@id="module"]/div[2]/div[2]/div[1]/div').inner_text(timeout=1000)

    raw_skills = await epitechPage.locator('//*[@id="module"]/div[2]/div[2]/div[2]/div').inner_text(timeout=1000)
    skills = []
    for line in raw_skills.split('\n'):
        # Strip AFTER removing the unwanted characters: removing a leading
        # bullet or dash would otherwise leave a stray leading space, and
        # lines that end up empty must not become '' entries.
        # NOTE(review): the pattern also drops digits and accented letters -
        # confirm that this is wanted for non-English skill names.
        cleaned = re.sub(r'[^A-Za-z ]+', '', line).strip()
        if cleaned:
            skills.append(cleaned)

    startModuleDate = await timeline.locator('span.date_start.bulle').inner_text(timeout=1000)
    endModuleDate = await timeline.locator('span.date_end.bulle').inner_text(timeout=1000)
    endRegistration = await timeline.locator('span.wrapper-timeleft > span.end_reg > span').inner_text(timeout=1000)

    print(f"Extracting data for {title} ({codeModule})")

    appointements = await extract_appointements()
    print(f"Appointements: {appointements}")
    activities = await extract_activities()
    print(f"Activities: {activities}")
    project = await extract_project(codeModule, codeInstance)
    print(f"Project: {project}")
    registered = await get_students()
    print(f"Registered: {registered}")
    # Only the part before the first ', ' of each date text is kept.
    return Module(
        title,
        description,
        codeModule,
        codeInstance,
        creditsNumber,
        skills,
        startModuleDate.split(', ')[0],
        endModuleDate.split(', ')[0],
        endRegistration.split(', ')[0],
        project,
        activities,
        appointements,
        registered
    )

## Run the process on each module we found: click on its entry, collect the needed data, then move on to the next one

In [6]:
# Module records, appended in the order of the sidebar links.
modules = []

# Open each module through its sidebar link, then scrape the page now displayed.
for i in range(count):
    await moduleLocator.nth(i).click()
    modules.append(await extract_module_data())

Extracting data for G9 - Certification Assessment (G-RCP-900)
Appointements: [{'name': "Bloc 7 - Piloter un projet de conception et développment d'une solution logicielle", 'location': 'N/A'}, {'name': "Bloc 6 - Piloter la mise en production d'un projet de développement logiciel", 'location': 'N/A'}, {'name': "Bloc 5 - Définir et piloter la politique d'assurance qualité d'un projet de développement logiciel", 'location': 'N/A'}, {'name': 'Bloc 4 - Créer une architecture logicielle pour des applications web - AREA', 'location': 'N/A'}, {'name': 'Bloc 3 - Créer une architecture logicielle pour des applications dites "lourdes" - GLaDOS', 'location': 'N/A'}, {'name': 'Bloc 3 - Créer une architecture logicielle pour des applications dites "lourdes" - R-Type', 'location': 'N/A'}, {'name': 'Bloc 2 - Concevoir une architecture logicielle - R-Type', 'location': 'N/A'}, {'name': 'Bloc 2 - Concevoir une architecture logicielle - AREA', 'location': 'N/A'}, {'name': 'Bloc 2 - Concevoir une architec

## Save the data to a csv that we will use in the frontend

In [7]:
def write_csv(modules, path='modules.csv'):
    """Write one CSV row per module, preceded by a header row.

    Args:
        modules: iterable of objects exposing the ``Module`` attributes and
            ``get_url()``.
        path: destination file; defaults to ``modules.csv`` in the current
            working directory.

    List-valued attributes (skills, activities, appointments, locations,
    registered students) are each flattened into one comma-joined cell.
    """
    header = [
        "module_title", "module_url", "module_code", "module_instance",
        "module_credits", "module_skills", "module_start_date",
        "module_end_date", "module_end_registration_date", "on_site",
        "project_name", "project_link", "project_start_date",
        "project_end_date", "module_activities", "module_appointements",
        "module_locations", "module_registered",
    ]
    # The encoding is explicit because the scraped text contains accented
    # characters and the platform default encoding is not always able to
    # store them. newline='' lets the csv module control line endings.
    with open(path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        for module in modules:
            writer.writerow([
                module.title,
                module.get_url(),
                module.code,
                module.instance,
                module.credits,
                ','.join(module.skills),
                module.startDate,
                module.endDate,
                module.endRegistrationDate,
                module.isPresential,
                module.project_name,
                module.project_link,
                module.project_start_date,
                module.project_end_date,
                ','.join(module.activities),
                ','.join(module.appointements),
                ','.join(module.locations),
                ','.join(module.registered),
            ])
# Export every scraped module to the CSV file.
write_csv(modules)