# Web scraping for People information

Using the KI People page, this code will scrape relevant information from Faculty.

In [64]:
import bs4
from bs4 import BeautifulSoup
from pathlib import Path
import glob
import json

def parse_contact(soup, verbose=False):
    _dcont = {}
    for _i in soup.select('div[class=person-section-contact] > div > div[class="person-contact-box"]'):
        try:
            _t = " ".join([_j.capitalize() for _j in _i.select('h3')[0].text.split()])
        except:
            # One case skips the h3 tags, no contact subsections
            _t = "Primary"

        _dcont[_t] = []
        for _j in _i.select('p'):
            try:
                _href = _j.select('a')[0]['href']
            except:
                _href = None

            try: 
                _dcont[_t].append({
                    'type': _j['class'][0],
                    'val': _j.text,
                    'href': _href
                })
            except:
                continue
    
    _contact = {}
    for _i in _dcont.keys():
        if verbose: print("\tContact:", _i, '\n')
        _contact[_i] = {}
        for _j in _dcont[_i]:
            _t = _j['type']
            _v = _j['val']
            _l = _j['href']
            if _l is not None and 'website' in _v:
                _v = _l
            if verbose: print(f"\t\t{_t.capitalize()}\t{_v}")
            _contact[_i][_t] = _v 
        if verbose: print()
            
    return _contact

def parse_paragraph_content(soup, verbose=False):
    _tagdict = {}
    try:
        _tagdict['Quote'] = soup.select('div[class=person-section-body]')[0].select('div > p[class=quote]')[0].text.strip()
        if verbose: print(f"\t{briefp}\n")
    except:
        if verbose: print("\tERROR: NO PARAGRAPH QUOTE FOUND\n")
    _stmp = soup.select('div[class=person-section-body]')[0].select('div[class=paragraph-content-textcolumn]')
    _last = ""
    for _s in _stmp[0]:
        if type(_s) is bs4.element.Tag:
            if _s.name == 'h3':
                _last = _s.text
                _tagdict[_s.text] = []
                if verbose: print(f"{_s.text}")
            elif _s.name == 'p':
                _tagdict[_last] = _s.text
                if verbose: print(f"\t{_s.text}")
    return _tagdict

def parse_publication_link(soup, verbose = False):
    _subs = soup.select('div[class=person-section-body]')[0]
    _publink = ""
    for _link in _subs.find_all('a', href=True):
        if 'list of publications' in _link.text:
            if verbose: print(_link['href'])
            _publink = _link['href']
    return _publink
        
def parse_faculty_page(fn = "people/faculty/matthew-vander-heiden.html", verbose=False):
    _entry = {}
    soup = BeautifulSoup(Path(fn).read_text(), 'html.parser')
    try:
        _entry['Name'] = soup.select('title')[0].text.split('|')[0].strip()
        _entry['Affiliation'] = soup.select('title')[0].text.split('|')[1].strip()
        if verbose: print(f"{_name}, {_inst}")
    except:
        if verbose: print("\tERROR: NO TITLE FOUND")
    _entry['Titles'] = []
    for _i in soup.select('div[class=person-titles] > p'):
        if verbose: print(f"\t- {_i.text}")
        _entry['Titles'].append(_i.text)
    if verbose: print()
    _entry['Info'] = parse_paragraph_content(soup, verbose=verbose)
    _entry['Contact'] = parse_contact(soup, verbose=verbose)
    _hp = 'ki.mit.edu'
    _entry['Profile'] = "https://" + _hp + str(fn).split(_hp)[1].split('.')[0]
    _entry['PubMed'] = parse_publication_link(soup, verbose=verbose)
    return soup, _entry
    
def parse_all_faculty_pages(fp='people/faculty', verbose=False):
    soups = {}
    cards = {}
    _p = Path(fp).absolute()
    for _i in _p.glob("*.html"):
        soups[_i.name], cards[_i.name] = parse_faculty_page(_i)
        if verbose: print()
    return soups, cards

def pull_faculty(fp='people/faculty', verbose=False):
    soups, cards = parse_all_faculty_pages(fp=fp, verbose=verbose)
    if verbose:
        print('Faculty','\n')
        for _k in soups.keys():
            print(f"\t- {_k}")
        print()
    faculty_info = {}
    for _k, _v in cards.items():
        faculty_info[_v['Name']] = _v
    return faculty_info

def pull_clinical_investigators_and_research_fellows(fp='people/clinical-investigators-research-fellows', 
                                                     verbose=False):
    soups, cards = parse_all_faculty_pages(fp=fp, verbose=verbose)
    if verbose:
        print('Clinical Investigators and Research Fellows','\n')
        for _k in soups.keys():
            print(f"\t- {_k}")
        print()
    faculty_info = {}
    for _k, _v in cards.items():
        faculty_info[_v['Name']] = _v
    return faculty_info

def pull_leadership(fp='people/leadership', verbose=False):
    soups, cards = parse_all_faculty_pages(fp=fp, verbose=verbose)
    if verbose:
        print('Leadership','\n')
        for _k in soups.keys():
            print(f"\t- {_k}")
        print()
    faculty_info = {}
    for _k, _v in cards.items():
        faculty_info[_v['Name']] = _v
    return faculty_info

def pull_all(verbose=False):
    faculty = pull_faculty()
    clinical_investigators_and_research_fellows = pull_clinical_investigators_and_research_fellows()
    leadership = pull_leadership()
    return leadership | faculty | clinical_investigators_and_research_fellows

def print_dict(d, indent=4):
    print(json.dumps(d, indent=indent))

## Pull All 

Now I can pull all of the Clinical Investigators, Research Fellows, Faculty, and Leadership people pages in one line.

In [68]:
ki_people = pull_all()

print("KI People\n")
for _p in ki_people.keys():
    print(f"\t- {_p}")
print()

KI People

	- Karen Sveda
	- Andreea O'Connell
	- Jane Wilkinson
	- Susan Hockfield
	- Angela Koehler
	- David Housman
	- Richard O. Hynes
	- Ömer Yilmaz
	- Frank Solomon
	- Daniel Anderson
	- Sangeeta Bhatia
	- Michael Hemann
	- Ram Sasisekharan
	- Jianzhu Chen
	- Forest White
	- Robert Langer
	- Frank B. Gertler
	- Yadira Soto-Feliciano
	- Matthew Vander Heiden
	- Angela Belcher
	- Nancy Hopkins
	- Stefani Spranger
	- Michael Yaffe
	- Darrell Irvine
	- Francisco J. Sánchez-Rivera
	- Phillip Sharp
	- Tyler Jacks
	- Paula Hammond
	- Dane Wittrup
	- Scott Manalis
	- Michael Cima
	- Michael Birnbaum
	- J. Christopher Love
	- Jacqueline Lees
	- Kristin Knouse
	- Joelle Straehla
	- Salil Garg
	- Hojun Li



In [67]:
print_dict(ki_people["Matthew Vander Heiden"])

{
    "Name": "Matthew Vander Heiden",
    "Affiliation": "Koch Institute",
    "Titles": [
        "Director, Koch Institute for Integrative Cancer Research",
        "Lester Wolfe (1919) Professor of Molecular Biology",
        "Professor of Biology",
        "Member, MIT Center for Precision Cancer Medicine",
        "Member, Ludwig Center at MIT",
        "Member, Broad Institute of Harvard and MIT"
    ],
    "Info": {
        "Quote": "Our improved understanding of cancer metabolism\u2014what nutrients cancer cells use and how those nutrients are used in different conditions\u2014has led to exciting new ways to think about drug development.",
        "Research Summary": "Through this work, we aim to advance understanding of metabolic pathway biochemistry and its relationship to cancer and mammalian physiology. Together, these studies will broaden our understanding of cancer cell metabolism and identify approaches to target metabolism for cancer therapy.",
        "Biography": "Ma