In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import re 
import urllib
import time

In [2]:
#create a webdriver object and set options for headless browsing
options = Options()
options.headless = True
driver = webdriver.Chrome(ChromeDriverManager().install())

[WDM] - Current google-chrome version is 83.0.4103
[WDM] - Get LATEST driver version for 83.0.4103


 


[WDM] - Get LATEST driver version for 83.0.4103
[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/83.0.4103.39/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\knimashakavi\.wdm\drivers\chromedriver\win32\83.0.4103.39]


In [3]:
#uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url,driver):
    driver.get(url)
    res_html = driver.execute_script('return document.body.innerHTML')
    soup = BeautifulSoup(res_html,'html.parser') #beautiful soup object to be used for parsing html content
    return soup

#tidies extracted text 
def process_bio(bio):
    bio = bio.encode('ascii',errors='ignore').decode('utf-8')       #removes non-ascii characters
    bio = re.sub('\s+',' ',bio)       #repalces repeated whitespace characters with single space
    return bio
def remove_script(soup):
    for script in soup(["script", "style"]):
        script.decompose()
    return soup


#Checks if bio_url is a valid faculty homepage
def is_valid_homepage(bio_url,dir_url):
    if bio_url.endswith('.pdf'): #we're not parsing pdfs
        return False
    try:
        #sometimes the homepage url points to the same page as the faculty profile page
        #which should be treated differently from an actual homepage
        ret_url = urllib.request.urlopen(bio_url).geturl() 
    except:
        return False       #unable to access bio_url
    urls = [re.sub('((https?://)|(www.))','',url) for url in [ret_url,dir_url]] #removes url scheme (https,http or www) 
    return not(urls[0]== urls[1])

In [4]:
#extracts all Faculty Profile page urls from the Directory Listing Page
def scrape_dir_page(dir_url,driver):
    print ('-'*20,'Scraping directory page','-'*20)
    faculty_links = []
    faculty_base_url = 'https://cbn.rutgers.edu'
    #execute js on webpage to load faculty listings on webpage and get ready to parse the loaded HTML 
    soup = get_js_soup(dir_url,driver) 
    table = soup.find_all('table')[0] # Grab the first table
    #print (table)
    #print (soup)
    for link_holder in soup.find_all('div',class_='cbUserListFieldLine cbUserListFL_formatname'): #get list of all <div> of class 'name'
        rel_link = link_holder.find('a')['href'] #get url
       # print(rel_link)
        #url returned is relative, so we need to add base url
        faculty_links.append(rel_link)
    print ('-'*20,'Found {} faculty profile urls'.format(len(faculty_links)),'-'*20)
    return faculty_links

In [5]:
dir_url = 'https://cbn.rutgers.edu/people/faculty' #url of directory listings of CS faculty
faculty_links = scrape_dir_page(dir_url,driver)

-------------------- Scraping directory page --------------------
-------------------- Found 31 faculty profile urls --------------------


In [6]:
def scrape_faculty_page(fac_url,driver):
    soup = get_js_soup(fac_url,driver)
    homepage_found = False
    bio_url = ''
    bio = ''
    for link_holder in soup.find_all('div',class_='cb_field'):
        #print (link_holder)
        try:
            bio_url = link_holder.find('a')['href']
            if (bio_url.startswith('http')):
                #print (bio_url)
                homepage_found = True
                try:
                    bio_soup = remove_script(get_js_soup(bio_url,driver))
                    break
                except:
                    print ('Could not access {}'.format(bio_url))
                    bio_url=None
                    homepage_found = False
            else:
                homepage_found = False
            
        except:
            continue
    if homepage_found:
        #get all the text from homepage(bio)
        bio = process_bio(bio_soup.get_text(separator=' '))
        #print (bio_url)
        #print(bio)        
    print(bio_url)
    print(bio)
    return bio_url,bio

In [None]:
#Scrape homepages of all urls
bio_urls, bios = [],[]
tot_urls = len(faculty_links)
for i,link in enumerate(faculty_links):
    print ('-'*20,'Scraping faculty url {}/{}'.format(i+1,tot_urls),'-'*20)
    bio_url,bio = scrape_faculty_page(link,driver)
    #print (bio_url)
    if bio.strip()!= '' and bio_url.strip()!='':
        bio_urls.append(bio_url.strip())
        bios.append(bio)
driver.close()

-------------------- Scraping faculty url 1/31 --------------------
http://www.abrairalab.org
 ABRAIRA LAB Home Research Funding People Publications Contact Us Openings Useful Links News Member Access more... Central Support Team Post Docs Graduate Students Undergraduate Interns Publications Contact Us Openings Useful Links News Member Access WIRED FOR TOUCH what we study Our tactile world is rich, if not infinite . The flutter of an insects wings a warm breeze raindrops and a mothers gentle caress all impose mechanical forces upon our skin and yet we encounter no difficulty in telling them apart and react differently to each. How do we recognize and interpret the myriad of tactile stimuli to perceive the richness of the physical world? Our lab utilizes the power of mouse molecular genetics to understand our sense of touch , from pain to pleasure and everything in between. THE NEURONS AND CIRCUITS OF THE SOMATOSENSORY SYSTEM Research Learn More People Meet Us Publications Read Our Work

https://sites.rutgers.edu/firesteinlab/
 Skip to main content Rutgers.edu New Brunswick RBHS Newark Camden Online Rutgers Health Rutgers Search Firestein Laboratory's Facebook Page LinkedIn Page Rutgers University Logo Firestein Laboratory Menu Home About News Archived News Publications Bonfire Program Current Personnel Past laboratory members Contact Open Search Input Homepage Firestein Lab Our research focuses on the role of guanine metabolism in neuronal development and in recovery after injury. My lab identified the postsynaptic density protein-95 (PSD-95) interactor cypin (cytosolic PSD-95 interactor; aka guanine deaminase or GDA), a purine metabolic enzyme, as a core regulator of neuronal development that directly interacts with the cytoskeleton and alter its dynamics. My laboratory studies the role of cypin/GDA in the promotion of recovery after traumatic brain injury and glutamate-induced toxicity. We determined that overexpression of cypin is neuroprotective as is treatment wi

/research/faculty-publications

-------------------- Scraping faculty url 15/31 --------------------
https://hudalab.org/people
 Research People Techniques Two-photon imaging Quantitative mouse behavior Anatomy Publications News Opportunities HUDA LABORATORY Research People Techniques Two-photon imaging Quantitative mouse behavior Anatomy Publications News Opportunities lab members Rafiq Huda, PhD Principal Investigator Assistant Professor, W.M. Keck Center for Collaborative Neuroscience, Dept. of Cell Biology and Neuroscience, Rutgers University - New Brunswick I was born in the seaport city of Karachi in Pakistan. I immigrated to Chicago at the age of 13 and attended Senn High School. I was fortunate to win a transformative scholarship from the Posse foundation , allowing me to attend Carleton College . I majored in Biology and graduated magna cum laude with honors in 2008. That same year, I started my graduate studies with Prof. Marco Martina in the Dept. of Physiology at Northweste

https://sites.google.com/view/kwan-lab-rutgers/home
Search this site Kwan Lab @ Rutgers Home Research Publications Datasets Current Members & Alumni RU Forms Protocols Lab Jobs Lab Documents Kwan Lab @ Rutgers Home Research Publications Datasets Current Members & Alumni RU Forms Protocols Lab Jobs Lab Documents More Home Research Publications Datasets Current Members & Alumni RU Forms Protocols Lab Jobs Lab Documents Kwan Lab Our Interests The Kwan Laboratory at Rutgers is interested in regenerating spiral ganglion neurons of the vertebrate inner ear. We employ a variety of techniques including super-resolution imaging and massively parallelized sequencing methods. We are currently investigating the molecular role of chromodomain helicase DNA binding proteins during differentiation of spiral ganglion neurons. Contact Us Office (848) 445-1781 Email Kwan@dls.rutgers.edu Lab (848) 445-9552 Fax (732) 445-2063 FAQs Access Box for cloud storage Join the Kwan Lab Slack Workspace (kwanlabrutge

In [8]:
def write_lst(lst,file_):
    with open(file_,'w') as f:
        for l in lst:
            f.write(l)
            f.write('\n')

In [9]:
bio_urls_file = '../bio_urls.txt'
bios_file = '../bios.txt'
write_lst(bio_urls,bio_urls_file)
write_lst(bios,bios_file)