In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import csv
import time
import re

options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get('https://ocw.mit.edu/search/?s=department_course_numbers.sort_coursenum')
time.sleep(5)

all_cards = []
for page in range(1, 4):  # 3 pages pour ~30 cards
    print(f"Scraping page {page}...")
    for _ in range(3):  # Scroll 3 fois pour charger plus
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    cards = soup.find_all('div', class_='card learning-resource-card list-view')
    all_cards.extend(cards)
    print(f"  Trouvé {len(cards)} cards sur page {page} (total: {len(all_cards)})")
    
    # Clique Next si possible
    try:
        next_button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.next')))
        next_button.click()
        time.sleep(3)
    except:
        print(f"  Pas de page suivante après {page}")
        break

driver.quit()

print(f"Total cards: {len(all_cards)}")

courses = []
for i, card in enumerate(all_cards[:50]):  # Limite 50 total
    try:
        # Level + dept_num (fallback)
        level_elem = card.find('div', class_='resource-type')
        level_text = level_elem.text.strip() if level_elem else 'N/A'
        dept_num = re.match(r'^(\d+\.\d+)', level_text).group(1) if level_text != 'N/A' and re.match(r'^(\d+\.\d+)', level_text) else 'N/A'
        title_elem = card.find('div', class_='lr-row course-title')
        title = title_elem.find('a').text.strip() if title_elem and title_elem.find('a') else 'N/A'
        if dept_num == 'N/A' and title:
            dept_num = re.match(r'^(\d+\.\d+)', title).group(1) if re.match(r'^(\d+\.\d+)', title) else 'N/A'
        url = 'https://ocw.mit.edu' + title_elem.find('a')['href'] if title_elem and title_elem.find('a') else 'N/A'
        
        # Prof
        prof_text = 'N/A'
        subtitle_div = card.find('div', class_='lr-row subtitle')
        if subtitle_div:
            prof_elem = subtitle_div.find(string=re.compile(r'Dr\. |Prof\. ', re.I))
            prof_text = prof_elem.strip() if prof_elem else 'N/A'
        
        # Topics (tous a.topic-link dans card)
        topic_links = card.find_all('a', class_='topic-link')
        topics_str = '|'.join([link.text.strip() for link in topic_links]) if topic_links else 'N/A'
        if topics_str == 'N/A' and title:
            topics_str = '|'.join(title.lower().split()[:3])
        
        courses.append({
            'title': title,
            'dept_num': dept_num,
            'level': level_text,
            'prof': prof_text,
            'topics': topics_str,
            'url': url
        })
        print(f"{i+1}: {dept_num} - {title[:50]} (topics: {topics_str[:30]})")
        
    except Exception as e:
        print(f"Erreur {i+1}: {e}")
        continue

with open('courses_mit_full.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'dept_num', 'level', 'prof', 'topics', 'url'])
    writer.writeheader()
    writer.writerows(courses)

print(f"\n{len(courses)} cours complets dans courses_mit_full.csv !")

Scraping page 1...
  Trouvé 40 cards sur page 1 (total: 40)
  Pas de page suivante après 1
Total cards: 40
1: 1.00 - Introduction to Computers and Engineering Problem  (topics: Engineering|Computer Science|S)
2: 1.010 - Uncertainty in Engineering (topics: Engineering|Mathematics|Probab)
3: 1.011 - Project Evaluation (topics: Engineering|Economics|Social S)
4: 1.012 - Introduction to Civil Engineering Design (topics: Engineering|Civil Engineering|)
5: 1.017 - Computing and Data Analysis for Environmental Appl (topics: Engineering|Computer Science|M)
6: 1.018 - Ecology I: The Earth System (topics: Science|Earth Science|Biology)
7: 1.020 - Ecology II: Engineering for Sustainability (topics: Engineering|Science|Earth Scie)
8: 1.022 - Introduction to Network Models (topics: Engineering|Systems Engineerin)
9: 1.033 - Mechanics of Material Systems: An Energy Approach (topics: Engineering|Science|Physics)
10: 1.040 - Project Management (topics: Engineering|Business|Civil Eng)
11: 1.040 - Proje

In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import csv
import time
import re

options = Options()
options.add_argument('--headless')  # Sans fenêtre pour test rapide
driver = webdriver.Chrome(options=options)
driver.get('https://ocw.mit.edu/search/?s=department_course_numbers.sort_coursenum')
time.sleep(5)  # Initial load

all_cards = []
previous_len = 0
scroll_count = 0
max_scrolls = 10  # Max 10 scrolls pour ~50-100 cards

while len(all_cards) < 70 and scroll_count < max_scrolls:  # Stop à 50 ou max scrolls
    # Scroll en bas
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)  # Attends charge JS
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    cards = soup.find_all('div', class_='card learning-resource-card list-view')
    
    if len(cards) == previous_len:  # Plus rien ne charge
        print(f"  Plus de nouveaux cards après scroll {scroll_count} (total: {len(all_cards)})")
        break
    
    all_cards = cards  # Mise à jour (remplace pour éviter doublons)
    previous_len = len(cards)
    scroll_count += 1
    print(f"  Scroll {scroll_count}: {len(cards)} cards chargés (total: {len(all_cards)})")

driver.quit()
print(f"Total cards après scroll: {len(all_cards)}")

courses = []
for i, card in enumerate(all_cards[:70]):
    try:
        # Level + dept_num
        level_elem = card.find('div', class_='resource-type')
        level_text = level_elem.text.strip() if level_elem else 'N/A'
        dept_num = re.match(r'^(\d+\.\d+)', level_text).group(1) if level_text != 'N/A' and re.match(r'^(\d+\.\d+)', level_text) else 'N/A'
        title_elem = card.find('div', class_='lr-row course-title')
        title = title_elem.find('a').text.strip() if title_elem and title_elem.find('a') else 'N/A'
        if dept_num == 'N/A' and title:
            dept_num = re.match(r'^(\d+\.\d+)', title).group(1) if re.match(r'^(\d+\.\d+)', title) else 'N/A'
        url = 'https://ocw.mit.edu' + title_elem.find('a')['href'] if title_elem and title_elem.find('a') else 'N/A'
        
        # Prof
        prof_text = 'N/A'
        subtitle_div = card.find('div', class_='lr-row subtitle')
        if subtitle_div:
            prof_elem = subtitle_div.find(string=re.compile(r'Dr\. |Prof\. ', re.I))
            prof_text = prof_elem.strip() if prof_elem else 'N/A'
        
        # Topics
        topic_links = card.find_all('a', class_='topic-link')
        topics_str = '|'.join([link.text.strip() for link in topic_links]) if topic_links else 'N/A'
        if topics_str == 'N/A' and title:
            topics_str = '|'.join(title.lower().split()[:3])
        
        courses.append({
            'title': title,
            'dept_num': dept_num,
            'level': level_text,
            'prof': prof_text,
            'topics': topics_str,
            'url': url
        })
        print(f"{i+1}: {dept_num} - {title[:50]} (topics: {topics_str[:30]})")
        
    except Exception as e:
        print(f"Erreur {i+1}: {e}")
        continue

with open('courses_mit_scroll.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['title', 'dept_num', 'level', 'prof', 'topics', 'url'])
    writer.writeheader()
    writer.writerows(courses)

print(f"\n{len(courses)} cours scrollés dans courses_mit_scroll.csv !")

  Scroll 1: 20 cards chargés (total: 20)
  Scroll 2: 30 cards chargés (total: 30)
  Scroll 3: 40 cards chargés (total: 40)
  Scroll 4: 50 cards chargés (total: 50)
  Scroll 5: 60 cards chargés (total: 60)
  Scroll 6: 70 cards chargés (total: 70)
Total cards après scroll: 70
1: 1.00 - Introduction to Computers and Engineering Problem  (topics: Engineering|Computer Science|S)
2: 1.010 - Uncertainty in Engineering (topics: Engineering|Mathematics|Probab)
3: 1.011 - Project Evaluation (topics: Engineering|Economics|Social S)
4: 1.012 - Introduction to Civil Engineering Design (topics: Engineering|Civil Engineering|)
5: 1.017 - Computing and Data Analysis for Environmental Appl (topics: Engineering|Computer Science|M)
6: 1.018 - Ecology I: The Earth System (topics: Science|Earth Science|Biology)
7: 1.020 - Ecology II: Engineering for Sustainability (topics: Engineering|Science|Earth Scie)
8: 1.022 - Introduction to Network Models (topics: Engineering|Systems Engineerin)
9: 1.033 - Mechanics

In [21]:
from rdflib import Graph, Namespace, Literal, URIRef
from rdflib.namespace import RDF, RDFS
import csv
import re  # Pour nettoyer IDs

# Crée le graphe
g = Graph()
ex = Namespace('http://univ.example.org/')  # Ton namespace
schema = Namespace('http://schema.org/')
dc = Namespace('http://purl.org/dc/elements/1.1/')  # Pour description/identifier
g.bind('ex', ex)
g.bind('schema', schema)
g.bind('dc', dc)

# Charge ton ontology
g.parse('ontology.ttl', format='turtle')

# Fonction utilitaire pour IDs uniques (ex. "Introduction to Computers" → "introduction_to_computers")
def create_id(name):
    return re.sub(r'[^a-zA-Z0-9]', '_', name.lower()[:20])

print("Ontology chargée !")

Ontology chargée !


In [22]:
# Charge CSV
with open('courses_mit_scroll.csv', 'r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:  # Pour chaque ligne (cours)
        # 1. Crée l'instance Course
        cid = create_id(row['title'])
        course = ex[f'Course_{cid}']
        g.add((course, RDF.type, ex.Course))  # Type : Course
        g.add((course, schema.name, Literal(row['title'])))  # Nom : title
        g.add((course, ex.dept_num, Literal(row['dept_num'])))  # Dept_num
        g.add((course, schema.educationalLevel, Literal(row['level'])))  # Level
        g.add((course, dc.identifier, Literal(row['url'])))  # URL comme ID
        
        # 2. Mappe topics → Skills + requiresSkill
        for topic in row['topics'].split('|'):
            if topic.strip():
                sid = create_id(topic)
                skill = ex[f'Skill_{sid}']
                g.add((skill, RDF.type, ex.Skill))  # Instance Skill
                g.add((skill, schema.name, Literal(topic)))  # Nom du skill
                g.add((course, ex.requiresSkill, skill))  # Lien : course requires skill
        
        # 3. Mappe prof → Professor + teaches
        if row['prof'] != 'N/A':
            pid = create_id(row['prof'])
            prof = ex[f'Professor_{pid}']
            g.add((prof, RDF.type, ex.Professor))  # Instance Professor
            g.add((prof, schema.name, Literal(row['prof'])))  # Nom
            g.add((prof, ex.teaches, course))  # Lien : prof teaches course (inverse de ton ontology, mais adaptable)
        
        print(f"Ajouté cours : {row['title'][:50]} (skills: {row['topics'][:30]})")
        # Infère hasPrerequisite (ex. si 'Advanced' dans topics, prerequisite = cours basique)
        if 'advanced' in row['topics'].lower() or 'calculus' in row['topics'].lower():
            prereq_id = create_id(f'basic_{row["dept_num"]}')
            prereq_course = ex[f'Course_{prereq_id}']
            g.add((prereq_course, RDF.type, ex.Course))  # Crée cours fictif
            g.add((prereq_course, schema.name, Literal(f'Basic Prerequisite for {row["title"]}'))) 
            g.add((course, ex.hasPrerequisite, prereq_course))  # Lien
            print(f"Inféré prerequisite pour {row['title']}")
# Sauvegarde
g.serialize('kg_aligned.ttl', format='turtle')
print(f"Graphe sauvegardé avec {len(g)} triples ! Ouvre kg_aligned.ttl dans Protégé.")

Ajouté cours : Introduction to Computers and Engineering Problem  (skills: Engineering|Computer Science|S)
Ajouté cours : Uncertainty in Engineering (skills: Engineering|Mathematics|Probab)
Ajouté cours : Project Evaluation (skills: Engineering|Economics|Social S)
Ajouté cours : Introduction to Civil Engineering Design (skills: Engineering|Civil Engineering|)
Ajouté cours : Computing and Data Analysis for Environmental Appl (skills: Engineering|Computer Science|M)
Ajouté cours : Ecology I: The Earth System (skills: Science|Earth Science|Biology)
Ajouté cours : Ecology II: Engineering for Sustainability (skills: Engineering|Science|Earth Scie)
Ajouté cours : Introduction to Network Models (skills: Engineering|Systems Engineerin)
Ajouté cours : Mechanics of Material Systems: An Energy Approach (skills: Engineering|Science|Physics)
Ajouté cours : Project Management (skills: Engineering|Business|Civil Eng)
Ajouté cours : Project Management (skills: Engineering|Business|Civil Eng)
Ajouté co

In [28]:
# Dict plus précise (priorité par sous-skill, ajoute tes topics MIT)
skill_to_career = {
    'computer science': 'Software Engineer',
    'mathematics': 'Data Scientist',
    'economics': 'Economist',
    'biology': 'Biologist',
    'physics': 'Physicist',
    'earth science': 'Environmental Scientist',
    'civil engineering': 'Civil Engineer',
    'systems engineering': 'Systems Analyst',
    'social science': 'Policy Analyst',
    'business': 'Project Manager',
    'engineering': 'General Engineer'  # Fallback pour "engineering" seul
}

# Boucle pour recos (plus intelligente)
for course in g.subjects(RDF.type, ex.Course):
    course_name = g.value(course, schema.name)
    row_title = course_name.value if course_name else 'Unknown'
    added_careers = set()  # Évite dups
    for skill in g.objects(course, ex.requiresSkill):
        skill_name = g.value(skill, schema.name)
        if skill_name:
            skill_lower = skill_name.value.lower()
            matched = False
            # Priorité : cherche exact ou sous-string
            for key in skill_to_career:
                if key in skill_lower:
                    career_name = skill_to_career[key]
                    if career_name not in added_careers:  # Une fois par cours
                        cid = create_id(career_name)
                        career = ex[f'Career_{cid}']
                        g.add((career, RDF.type, ex.Career))
                        g.add((career, schema.name, Literal(career_name)))
                        g.add((career, dc.description, Literal(f"Career in {key} field.")))
                        g.add((skill, ex.leadsToCareer, career))
                        g.add((course, ex.recommends, career))
                        print(f"Reco précise: {row_title[:30]} → {career_name} (via {skill_name.value})")
                        added_careers.add(career_name)
                        matched = True
                        break
            if not matched:
                # Fallback
                default_career = ex['Career_general_engineer']
                g.add((default_career, RDF.type, ex.Career))
                g.add((default_career, schema.name, Literal('General Engineer')))
                g.add((course, ex.recommends, default_career))
                print(f"Fallback pour {row_title[:30]} → General Engineer")

g.serialize('kg_aligned.ttl', format='turtle')
print(f"KG avec recos précises sauvé: {len(g)} triples !")

Reco précise: Introduction to Computers and  → General Engineer (via Engineering)
Reco précise: Introduction to Computers and  → Software Engineer (via Computer Science)
Reco précise: Introduction to Computers and  → Systems Analyst (via Systems Engineering)
Reco précise: Uncertainty in Engineering → General Engineer (via Engineering)
Reco précise: Uncertainty in Engineering → Data Scientist (via Mathematics)
Fallback pour Uncertainty in Engineering → General Engineer
Reco précise: Project Evaluation → General Engineer (via Engineering)
Reco précise: Project Evaluation → Economist (via Economics)
Reco précise: Project Evaluation → Policy Analyst (via Social Science)
Reco précise: Introduction to Civil Engineer → General Engineer (via Engineering)
Reco précise: Introduction to Civil Engineer → Civil Engineer (via Civil Engineering)
Fallback pour Introduction to Civil Engineer → General Engineer
Fallback pour Introduction to Civil Engineer → General Engineer
Reco précise: Computing and D

In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd  # Pour exporter en CSV
import re
from urllib.parse import urljoin, urlparse

# Base URL for MIT Course Catalog
BASE_URL = 'https://catalog.mit.edu'

def fetch_page(url):
    """Fetch a webpage and return BeautifulSoup object."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')

def extract_prereqs(text):
    """Parse text to extract prerequisites."""
    # Pattern for Prereq
    prereq_pattern = r'Prereq:\s*(.*?)(?=\s*(U\s*\(|Coreq:|\d+-\d+-\d+\s+units|Can\.simul\.))'
    match = re.search(prereq_pattern, text, re.IGNORECASE | re.DOTALL)
    
    prereqs = []
    if match:
        prereq_str = match.group(1).strip()
        # Split on top-level commas (avoid inside parens)
        items = re.split(r',\s*(?![^(]*\))', prereq_str)
        prereqs = [item.strip() for item in items if item.strip()]
        if not prereqs:
            prereqs = [prereq_str]
    
    return [p for p in prereqs if p.lower() != 'none'] if prereqs else []

def scrape_subject_page(subject_url):
    """Scrape all courses from a single subject page."""
    soup = fetch_page(subject_url)
    courses = []
    
    # Find all course blocks
    course_blocks = soup.find_all('div', class_='courseblock')
    
    for block in course_blocks:
        course_data = {}
        
        # Extract course code and title from h4.courseblocktitle strong
        title_elem = block.find('h4', class_='courseblocktitle')
        if title_elem:
            strong = title_elem.find('strong')
            if strong:
                full_title = strong.get_text(strip=True)
                # Parse course code like '6.100A Introduction to ...'
                code_match = re.match(r'(\d+\.?\d*[A-Za-z]?)', full_title)
                course_data['code'] = code_match.group(1) if code_match else None
                course_data['title'] = full_title.replace(course_data['code'], '').strip(': ')
        
        # Get full block text for prereqs
        full_text = block.get_text(separator=' ', strip=True)
        
        # Extract prereqs from full text
        prereqs = extract_prereqs(full_text)
        course_data['prerequisites'] = prereqs
        
        # Extract instructors from p.courseblockinstructors span
        inst_elem = block.find('p', class_='courseblockinstructors')
        if inst_elem:
            spans = inst_elem.find_all('span')
            instructors = [span.get_text(strip=True) for span in spans if span.get_text(strip=True).strip()]
            course_data['instructors'] = instructors if instructors else None
        else:
            # Fallback: regex for names at end (e.g., "A. Bell, J. V. Guttag")
            name_pattern = r'([A-Z]\.\s[A-Z][a-z]+(?:,\s[A-Z]\.\s[A-Z][a-z]+)*)\s*$'
            name_match = re.search(name_pattern, full_text)
            course_data['instructors'] = [name.strip() for name in name_match.group(1).split(',')] if name_match else None
        
        # Skip if no code (invalid block)
        if course_data.get('code'):
            courses.append(course_data)
    
    return courses

def get_subject_links():
    """Get links to all subject pages from the main subjects page."""
    soup = fetch_page(f'{BASE_URL}/subjects/')
    subject_links = []
    
    # Links like <a href="/subjects/6/">Course 6</a>
    subject_as = soup.find_all('a', href=re.compile(r'^/subjects/\d+/'))
    for a in subject_as:
        href = a['href']
        full_url = urljoin(BASE_URL, href)
        subject_links.append({
            'name': a.get_text(strip=True),
            'url': full_url
        })
    
    return subject_links

def scrape_all_subjects(output_file='mit_courses_final.csv'):
    """Scrape all subjects and save to CSV (code, title, instructors, prerequisite)."""
    all_data = []  # Liste plate pour CSV
    subject_links = get_subject_links()
    
    for subject in subject_links:
        print(f"Scraping {subject['name']}...")
        try:
            courses = scrape_subject_page(subject['url'])
            for course in courses:
                # Ajoute le sujet à chaque cours
                row = course.copy()
                row['subject'] = subject['name']
                # Joint les listes pour CSV (séparées par ";")
                row['prerequisite'] = '; '.join(row['prerequisites']) if row['prerequisites'] else ''
                row['instructors'] = '; '.join(row['instructors']) if row['instructors'] else ''
                all_data.append(row)
        except Exception as e:
            print(f"Error scraping {subject['name']}: {e}")
    
    # Crée un DataFrame et exporte en CSV (colonnes demandées + subject)
    if all_data:
        df = pd.DataFrame(all_data)
        # Colonnes: subject, code, title, prerequisite, instructors
        df = df[['subject', 'code', 'title', 'prerequisite', 'instructors']]
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Scraping complete. Data saved to {output_file} ({len(all_data)} rows)")
        
        # Debug: Show examples with prereqs
        with_prereqs = df[df['prerequisite'] != '']
        if not with_prereqs.empty:
            print("Examples:")
            print(with_prereqs[['code', 'prerequisite', 'instructors']].head(3).to_string(index=False))
    else:
        print("No data scraped.")
    
    return all_data

# Test sur Course 6 pour vérifier

# Pour tout scraper en CSV:
scrape_all_subjects()

Scraping Aeronautics and Astronautics (Course 16)...
Scraping Architecture (Course 4)...
Scraping Biological Engineering (Course 20)...
Scraping Biology (Course 7)...
Scraping Brain and Cognitive Sciences (Course 9)...
Scraping Chemical Engineering (Course 10)...
Scraping Chemistry (Course 5)...
Scraping Civil and Environmental Engineering (Course 1)...
Scraping Earth, Atmospheric, and Planetary Sciences (Course 12)...
Scraping Economics (Course 14)...
Scraping Electrical Engineering and Computer Science (Course 6)...
Scraping Humanities (Course 21)...
Scraping Linguistics and Philosophy (Course 24)...
Scraping Management (Course 15)...
Scraping Materials Science and Engineering (Course 3)...
Scraping Mathematics (Course 18)...
Scraping Mechanical Engineering (Course 2)...
Scraping Nuclear Science and Engineering (Course 22)...
Scraping Physics (Course 8)...
Scraping Political Science (Course 17)...
Scraping Urban Studies and Planning (Course 11)...
Scraping Aeronautics and Astronautic

[{'code': '16.001',
  'title': 'Unified Engineering: Materials and Structures',
  'prerequisites': ['Calculus II (GIR) and Physics I (GIR) ;'],
  'instructors': 'R. Radovitzky, D. L. Darmofal',
  'subject': 'Aeronautics and Astronautics (Course 16)',
  'prerequisite': 'Calculus II (GIR) and Physics I (GIR) ;'},
 {'code': '16.002',
  'title': 'Unified Engineering: Signals and Systems',
  'prerequisites': ['Calculus II (GIR) ;'],
  'instructors': 'J. P. How',
  'subject': 'Aeronautics and Astronautics (Course 16)',
  'prerequisite': 'Calculus II (GIR) ;'},
 {'code': '16.003',
  'title': 'Unified Engineering: Fluid Dynamics',
  'prerequisites': ['Calculus II (GIR)',
   'Physics II (GIR)',
   'and ( 18.03 or 18.032 );'],
  'instructors': 'D. L. Darmofal',
  'subject': 'Aeronautics and Astronautics (Course 16)',
  'prerequisite': 'Calculus II (GIR); Physics II (GIR); and ( 18.03 or 18.032 );'},
 {'code': '16.004',
  'title': 'Unified Engineering: Thermodynamics and Propulsion',
  'prerequis

In [34]:
df=pd.read_csv('mit_courses_finalMITEDU.csv')
df

Unnamed: 0,subject,code,title,prerequisite,instructors
0,Aeronautics and Astronautics (Course 16),16.001,Unified Engineering: Materials and Structures,Calculus II (GIR) and Physics I (GIR) ;,"R. Radovitzky, D. L. Darmofal"
1,Aeronautics and Astronautics (Course 16),16.002,Unified Engineering: Signals and Systems,Calculus II (GIR) ;,J. P. How
2,Aeronautics and Astronautics (Course 16),16.003,Unified Engineering: Fluid Dynamics,Calculus II (GIR); Physics II (GIR); and ( 18....,D. L. Darmofal
3,Aeronautics and Astronautics (Course 16),16.004,Unified Engineering: Thermodynamics and Propul...,Calculus II (GIR); Physics II (GIR); and ( 18....,"Z. S. Spakovszky, D. L. Darmofal"
4,Aeronautics and Astronautics (Course 16),16.06,Principles of Automatic Control,16.002,S. R. Hall
...,...,...,...,...,...
12055,Course 24 Linguistics and Philosophy,24.T,HG Graduate Thesis,,Staff
12056,Course 24 Linguistics and Philosophy,24.S,93 Special Seminar: Linguistics,,Staff
12057,Course 24 Linguistics and Philosophy,24.S,94 Special Seminar: Linguistics,Permission of instructor Acad Year 2025-2026: ...,Staff
12058,Course 24 Linguistics and Philosophy,24.S,95 Special Seminar: Linguistics,Permission of instructor G (Spring),Staff


In [35]:
df.describe()

Unnamed: 0,subject,code,title,prerequisite,instructors
count,12060,12060,12060,8985,12060
unique,42,3487,3029,1283,1735
top,Management (Course 15),6.S,HG Graduate Thesis,Permission of instructor,Staff
freq,922,324,54,453,1152


In [38]:
df.dtypes
df.nunique()


subject           42
code            3487
title           3029
prerequisite    1283
instructors     1735
dtype: int64

In [39]:
df.isna().sum()


subject            0
code               0
title              0
prerequisite    3075
instructors        0
dtype: int64

In [43]:
df.duplicated().sum()


np.int64(0)

In [42]:
df = df.drop_duplicates()


In [44]:
df.describe(include='all')

Unnamed: 0,subject,code,title,prerequisite,instructors
count,8028,8028,8028,5988,8028
unique,42,3487,3029,1283,1735
top,Course 15 Management,6.S,HG Graduate Thesis,Permission of instructor,Staff
freq,461,216,34,302,760


In [50]:
df['instructors'].value_counts() 

instructors
Staff                                 760
Consult Department                    190
Consult Sloan Educational Services     74
Consult A. Aksamija                    66
Consult J. Hising DiFabio              56
                                     ... 
R. Williams, V. Williams                2
P. Indyk                                2
K. Daskalakis                           2
V. Williams                             2
E. Demaine                              2
Name: count, Length: 1735, dtype: int64

In [51]:
df.isna().sum()


subject            0
code               0
title              0
prerequisite    2040
instructors        0
dtype: int64

In [55]:
df.head()

Unnamed: 0,subject,code,title,prerequisite,instructors
0,Aeronautics and Astronautics (Course 16),16.001,Unified Engineering: Materials and Structures,Calculus II (GIR) and Physics I (GIR) ;,"R. Radovitzky, D. L. Darmofal"
1,Aeronautics and Astronautics (Course 16),16.002,Unified Engineering: Signals and Systems,Calculus II (GIR) ;,J. P. How
2,Aeronautics and Astronautics (Course 16),16.003,Unified Engineering: Fluid Dynamics,Calculus II (GIR); Physics II (GIR); and ( 18....,D. L. Darmofal
3,Aeronautics and Astronautics (Course 16),16.004,Unified Engineering: Thermodynamics and Propul...,Calculus II (GIR); Physics II (GIR); and ( 18....,"Z. S. Spakovszky, D. L. Darmofal"
4,Aeronautics and Astronautics (Course 16),16.06,Principles of Automatic Control,16.002,S. R. Hall
