In [19]:
import requests
from bs4 import BeautifulSoup as bs
import os

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
base_url = 'https://www.cs.stonybrook.edu'
faculty_url = f'{base_url}/people/faculty'

In [5]:
# Send an HTTP GET request to fetch the faculty webpage
response = requests.get(faculty_url)

In [8]:

bio_url = []


In [9]:
# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the faculty profile URLs
    faculty_links = soup.select('div.views-field-title a[href^="/people/faculty/"]')

    # Extract the URLs from the anchor tags
    faculty_urls = [f"{base_url}{faculty_link['href']}" for faculty_link in faculty_links]

    # Add the faculty profile URLs to the bio_urls list
    bio_url.extend(faculty_urls)

     # Save the faculty profile URLs to a text file in your Google Drive
    file_path = '/content/drive/My Drive/bio_url.txt'
    with open(file_path, 'w') as file:
        for url in bio_url:
            file.write(url + '\n')

    print(f"Faculty profile URLs have been saved to {file_path} in your Google Drive.")
else:
    print(f"Failed. Status code: {response.status_code}")

Faculty profile URLs have been saved to /content/drive/My Drive/bio_url.txt in your Google Drive.


In [11]:
def extract_teaching_summary(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract professor's name from the title element
        title_element = soup.select_one('title')
        if title_element:
            title_text = title_element.text.strip()
            professor_name = title_text.replace("Michael Bender | Department of Computer Science", "").strip()
        else:
            professor_name = 'Professor Name Not Found'

        # Extract teaching summary
        teaching_summary_section = soup.select_one('div.field-name-field-teachingsummary div.field-item')
        if teaching_summary_section:
            summary = teaching_summary_section.get_text(strip=True)
            return professor_name, summary
    return None, None

In [12]:
# Send an HTTP GET request to fetch the faculty webpage
response = requests.get(faculty_url)

In [14]:
courses = []

In [15]:
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the faculty profile URLs and names
    faculty_profiles = soup.select('div.views-field-title')
    for faculty_profile in faculty_profiles:
        professor_name = faculty_profile.text.strip()
        faculty_link = faculty_profile.select_one('a[href^="/people/faculty/"]')
        if faculty_link:
            faculty_url = f"{base_url}{faculty_link['href']}"
            teaching_summary = extract_teaching_summary(faculty_url)
            if teaching_summary:
                courses.append((professor_name, teaching_summary[1]))

    # Save courses list to a text file in Google Drive
    file_path = '/content/drive/My Drive/course_taught.txt'
    with open(file_path, 'w') as file:
        for professor, summary in courses:
            file.write(f"Professor: {professor}\n")
            file.write(f"Teaching Summary:\n{summary}\n\n")

    print("Teaching summaries have been saved to course_taught.txt in your Google Drive.")
else:
    print(f"Failed. Status code: {response.status_code}")


Teaching summaries have been saved to course_taught.txt in your Google Drive.


In [25]:
bios = []
bios = set()

In [26]:
# Function to extract the biography from a faculty profile page
def extract_biography(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = bs(response.content, 'html.parser')
        biography_section = soup.select_one('div.field-name-field-biography div.field-item')
        if biography_section:
            biography = biography_section.get_text(strip=True)
            return biography
    return None

In [28]:
if response.status_code == 200:
    soup = bs(response.content, 'html.parser')
    faculty_links = soup.select('a[href^="/people/faculty"]')

    for faculty_link in faculty_links:
        faculty_url = f"{base_url}{faculty_link['href']}"
        biography = extract_biography(faculty_url)
        if biography:
            bios.add(biography)

# Save the bios list to a file called bios.txt in Google Drive
file_path = "/content/drive/My Drive/bios.txt"
with open(file_path, 'w', encoding='utf-8') as file:
    for bio in bios:
        file.write(bio + "\n")

print("Biographies saved to bios.txt in Google Drive.")

Biographies saved to bios.txt in Google Drive.


In [29]:
bios

{"Adam Siepel is Professor in the Watson School of Biological Sciences and Chair of the Simons Center for Quantitative Biology at Cold Spring Harbor Laboratory. Siepel studied engineering at Cornell (1990-1994), then worked in software development for bioinformatics for several years in the late 1990s, before attending graduate school in Computer Science (M.S., University of New Mexico, 2001, and Ph.D., UC Santa Cruz, 2005). He was on the faculty of the Department of Biological Statistics and Computational Biology at Cornell from 2006 to 2014, where he taught courses in computational genomics and machine learning, directed Cornell's Ph.D. program in Computational Biology, and served as an associate director for the Cornell Center for Comparative and Population Genomics (3CPG). Siepel has also served on the editorial boards for Genome Research and PLoS Computational Biology, on review panels for the National Science Foundation and the National Institutes of Health, and on advisory commi

In [23]:
courses

[('Stanley Bak', None),
 ('Aruna Balasubramanian', 'CSE 692'),
 ('Niranjan Balasubramanian', 'CSE 628'),
 ('Ritwik Banerjee', None),
 ('Michael Bender',
  'CSE 150, CSE 303, CSE 350, CSE 373, CSE 385, CSE 495/496, CSE 548, CSE 638, CSE 642'),
 ('Xiaojun Bi',
  'CSE 518 Foundations of Human Computer Interaction, CSE/ISE/EST 323 Human Computer Interaction'),
 ('Barbara Chapman', None),
 ('Omar Chowdhury', None),
 ('Rezaul Chowdhury', 'CSE 548, CSE 590, CSE 613, CSE 638, CSE 642'),
 ('Samir R. Das',
  'CSE 570 (usually in Fall semesters), occasionally CSE 534, CSE 370 and CSE 310'),
 ('Ahmad Esmaili',
  'ISE 102, ISE 215, CSE 102, CSE 110, CSE 113, CSE 114, CSE 130, CSE 214, CSE 215, CSE 219, CSE 230, CSE 302, CSE 311, HON 111, ITS 102, ISE 302, ISE 305, ISE 311, ISE 321, CSE390 (Mobile App Development)'),
 ('Michael Ferdman', 'CSE 502, CSE 602, CSE 506'),
 ('Paul Fodor',
  'CSE 114, CSE 215, CSE 219, CSE 260, CSE 305, CSE 307, CSE 312, CSE 371, CSE 392, CSE 532, CSE 505, CSE 645, HON 111

In [24]:
bio_url

['https://www.cs.stonybrook.edu/people/faculty/StanleyBak',
 'https://www.cs.stonybrook.edu/people/faculty/ArunaBalasubramanian',
 'https://www.cs.stonybrook.edu/people/faculty/NiranjanBalasubramanian',
 'https://www.cs.stonybrook.edu/people/faculty/RitwikBanerjee',
 'https://www.cs.stonybrook.edu/people/faculty/MichaelBender',
 'https://www.cs.stonybrook.edu/people/faculty/XiaojunBi',
 'https://www.cs.stonybrook.edu/people/faculty/BarbaraChapman',
 'https://www.cs.stonybrook.edu/people/faculty/OmarChowdhury',
 'https://www.cs.stonybrook.edu/people/faculty/RezaulChowdhury',
 'https://www.cs.stonybrook.edu/people/faculty/SamirDas',
 'https://www.cs.stonybrook.edu/people/faculty/AhmadEsmaili',
 'https://www.cs.stonybrook.edu/people/faculty/MichaelFerdman',
 'https://www.cs.stonybrook.edu/people/faculty/PaulFodor',
 'https://www.cs.stonybrook.edu/people/faculty/PramodGanapathi',
 'https://www.cs.stonybrook.edu/people/faculty/AnshulGandhi',
 'https://www.cs.stonybrook.edu/people/faculty/Xi