Goal: scrape NIH CSR site for study section members

data source: https://public.csr.nih.gov/StudySections/StandingStudySections

example site: https://public.era.nih.gov/pubroster/preRosIndex.era?AGENDA=438116&CID=102353

https://beautiful-soup-4.readthedocs.io/en/latest/#

In [None]:
# # OLD:
# # Nested for loop version, keeping original chunks separate below because I still need to figure out how to extract the section ID and date info
# from bs4 import BeautifulSoup
# from selenium import webdriver

# driver = webdriver.Firefox()

# target_url = 'https://public.csr.nih.gov/StudySections/StandingStudySections'

# driver.get(target_url)
# soup = BeautifulSoup(driver.page_source, 'html.parser')

# urls = [item.get("href") for item in soup.find_all("a")]
# urls_final = [x for x in urls if x.startswith('/StudySections')]

# full_urls = []
# for ending in urls_final[10:-8]:
#     full_urls.append('https://public.csr.nih.gov'+ending)

# member_list = list()
# for url in full_urls:
#     driver.get(url)
#     soup = BeautifulSoup(driver.page_source, 'html.parser')

#     roster_urls = [item.get("href") for item in soup.find_all("a")]
#     try:
#         roster_url_final = [x for x in roster_urls if x.startswith('https://public.era.nih.gov/pubroster/preRosIndex')]
#     except:
#         continue
    
#     rosters = str(roster_url_final)
#     rosters = rosters.split("'")
#     rosters = [x for x in rosters if x.startswith('https://public.era.nih.gov/pubroster/preRosIndex')]

#     for roster in rosters:
            
#         driver.get(roster)

#         html = driver.page_source
#         soup = BeautifulSoup(html)

#         title_info = []
#         for center_tag in soup.find_all('center'):
#             snippet = center_tag.text
#             snippet = snippet.split('\n')
#             snippet = [i.strip() for i in snippet if i]
#             title_info = title_info + snippet
            
#         blocks = soup.find_all('p')
#         try:
#             del blocks[0] # deletes block that doesn't contain member info
#         except:
#             continue

#         for block in blocks:
#             block = str(block).replace('\n',"")
#             block = block.replace('\t',"")
#             block = block.replace('</p>',"")
#             block = block.replace('</font>',"")
#             block = block.replace('\xa0'," ")
#             block = block.replace("    ", "")
#             block = block.replace('<p><font color="Navy" size="2"><font color="Navy" size="2">',"")
#             block = block.split('<br/>')
#             member_list.append(title_info[0:3] + block)

In [3]:
# June 2024:
# Extracts names of reviewers of standing study sections from rosters on CSR's website.
# Result: member_list, one row per roster paragraph, laid out as
# [study section title, abbreviation, start date, last token of the last line, *paragraph lines]

from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

# requests waits forever when no timeout is given; fail instead of hanging mid-scrape
REQUEST_TIMEOUT = 30  # seconds


def _fetch_soup(url):
    """Download *url* and return its HTML parsed with html.parser."""
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(page.content, 'html.parser')


def _matching_links(container, base_url, fragment):
    """Return absolute URLs for the anchors in *container* whose href contains *fragment*."""
    # href=True skips anchors that have no href attribute; testing `fragment in None`
    # would raise TypeError and (previously) caused whole study sections to be skipped.
    # urljoin leaves absolute hrefs alone and resolves site-relative ones against base_url.
    return [
        urljoin(base_url, anchor['href'])
        for anchor in container.find_all('a', href=True)
        if fragment in anchor['href']
    ]


def _roster_entries(roster_soup):
    """Return one row per member paragraph of a parsed roster page ([] if the layout is unexpected)."""
    heading = roster_soup.find('h2')
    roster_header = roster_soup.find('center')
    if heading is None or roster_header is None:
        return []

    # Meeting information
    SS = heading.get_text().replace('\n', '')
    abbrv_and_date = [x.get_text(separator="\n").strip() for x in roster_header.find_all('b')]
    if len(abbrv_and_date) < 2:
        return []
    abbrv = abbrv_and_date[0]  # 0 is SS abbreviation, 1 is start date (and maybe end date, text meeting roster)
    startdate = abbrv_and_date[1].replace('-', '').split('\n')[0].strip()

    # Roster information: the first and the last two <p> blocks are skipped
    entries = []
    for paragraph in roster_soup.find_all('p')[1:-2]:
        text = paragraph.get_text(separator="---").replace('\t', '').replace('\n', ' ').replace('*', '')
        lines = [x.strip() for x in text.split('---') if x]
        if not lines:  # paragraph without any text: nothing to record
            continue
        chunk_zip = lines[-1].split(' ')[-1]  # last whitespace-separated token of the last line
        entries.append([SS, abbrv, startdate, chunk_zip] + lines)
    return entries


source_url = 'https://public.csr.nih.gov/StudySections/StandingStudySections'
soup = _fetch_soup(source_url)
table = soup.find('table')
if table is None:
    raise RuntimeError('No <table> found on ' + source_url)
SS_urls = _matching_links(table, source_url, '/StudySections')

member_list = []

# scrape urls corresponding to each study section's meetings:
for SS_url in SS_urls:
    # some sections do not have rosters published (new and haven't met yet, etc);
    # for those the list is simply empty and the inner loop does nothing
    roster_urls = _matching_links(_fetch_soup(SS_url), SS_url, '/pubroster/preRosIndex')

    # scrape info from each meeting's published roster:
    for roster_url in roster_urls:
        entries = _roster_entries(_fetch_soup(roster_url))
        if not entries:
            print('No member entries parsed from', roster_url)
        member_list.extend(entries)

In [None]:
# save rosters for later re-processing:
# downloads every published roster page and stores its HTML as a .txt file under roster_path

import os.path
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

# requests waits forever when no timeout is given; fail instead of hanging mid-download
REQUEST_TIMEOUT = 30  # seconds

source_url = 'https://public.csr.nih.gov/StudySections/StandingStudySections'
response = requests.get(source_url, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table')
if table is None:
    raise RuntimeError('No <table> found on ' + source_url)
# href=True skips anchors without an href attribute (`'...' in None` raises TypeError);
# urljoin resolves site-relative hrefs and leaves absolute ones untouched
SS_urls = [
    urljoin(source_url, anchor['href'])
    for anchor in table.find_all('a', href=True)
    if '/StudySections' in anchor['href']
]

# raw string: same path as before, without the invalid "\O" escape sequence
roster_path = r'C:\Users\kjj326\OneDrive - The University of Texas at Austin\study_section_members\rosters'
os.makedirs(roster_path, exist_ok=True)

# scrape urls corresponding to each study section's meetings:
for SS_url in SS_urls:
    response = requests.get(SS_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, "html.parser")
    # some sections do not have rosters published (new and haven't met yet, etc);
    # for those the list is simply empty and nothing is saved
    roster_urls = [
        urljoin(SS_url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if '/pubroster/preRosIndex' in anchor['href']
    ]

    for roster_url in roster_urls:
        response = requests.get(roster_url, timeout=REQUEST_TIMEOUT)
        # The file is named after the last 24 characters of the URL (the query string in
        # the example roster URL); characters Windows forbids in file names are replaced.
        safe_name = re.sub(r'[<>:"/\\|?*]', '_', roster_url[-24:])
        filename = os.path.join(roster_path, safe_name + ".txt")
        # explicit encoding: the locale default (e.g. cp1252) cannot encode every page
        with open(filename, "w", encoding="utf-8") as file:
            file.write(response.text)

In [4]:
# Export member list to CSV: one row per entry of member_list, written to a
# date-stamped file in the working directory.
import pandas as pd  # more than this step strictly needs; used here on purpose for practice
from datetime import datetime

# NOTE(review): the file is written without a header row, while later cells read
# "Name" / "Zipcode" columns by label -- presumably a header is added separately; confirm.
today = format(datetime.today(), '%Y%m%d')  # date stamp, e.g. 20240621
output_name = today + '_study section members.csv'

my_df = pd.DataFrame(member_list)
my_df.to_csv(output_name, header=False, index=False)

In [None]:
# Consolidate all member list data: stack the listed per-date exports into one
# date-stamped CSV.
import pandas as pd
from datetime import datetime

# TODO: the input files are still listed by hand (original note: "need to revise to handle n files")
files = [
    '20220408_study section members.csv',
    '20220920_study section members.csv',
    '20230621_study section members.csv',
    '20240621_study section members.csv',
]

# NOTE(review): each file is read with pandas' default header handling (first row
# becomes the column labels) and the result is written without a header -- confirm
# this matches how the input files were produced.
frames = [pd.read_csv(path) for path in files]
df = pd.concat(frames)

today = format(datetime.today(), '%Y%m%d')
output_name = today + '_all reviewers.csv'
df.to_csv(output_name, header=False, index=False)

In [5]:
# clean up names in member list data (remove degrees)
import pandas as pd

file = '20240624_study section members.csv'

# Read every column as text: only "Name" is edited in this step, so the other columns
# should be written back exactly as they were read. With dtype inference a numeric-looking
# column can change on the round trip (leading zeros dropped, integers with blanks
# written back as floats such as 78712.0).
data = pd.read_csv(file, dtype=str)

# strategy: cut each name at its 2nd comma (the 1st comma separates lastname, firstname)
def find_nth(haystack: str, needle: str, n: int) -> int:
    """Return the index of the n-th occurrence of *needle* in *haystack*, or -1.

    Occurrences are counted without overlap, scanning left to right. For
    ``n <= 1`` the index of the first occurrence is returned.
    """
    position = haystack.find(needle)
    step = len(needle)
    for _ in range(n - 1):
        if position < 0:
            # fewer than n occurrences: stop searching and report -1
            break
        position = haystack.find(needle, position + step)
    return position

def _strip_degrees(name):
    """Return *name* cut just before its 2nd comma; names with fewer commas are returned unchanged."""
    if not isinstance(name, str):
        # missing values (NaN) cannot be sliced; pass them through untouched
        return name
    cut = find_nth(name, ',', 2)
    # find_nth returns -1 when there is no 2nd comma; slicing with name[:-1]
    # would silently drop the last character of the name
    return name if cut < 0 else name[:cut]


data["Name"] = data["Name"].map(_strip_degrees)

# write next to the input file: "<input name without .csv>_cleaned.csv"
data.to_csv(file[:-4]+'_cleaned.csv', index=False, header=True)


In [3]:
# filter reviewer list by UT affiliation, using list from NIH of active UT accounts (Kristen pulled from Commons)
import pandas as pd
from datetime import datetime


def _name_key(names):
    """Return the Series *names* upper-cased and stripped of surrounding whitespace, for matching."""
    return names.str.upper().str.strip()


# Only the Name column of the UT list is needed; it is stored already normalised for matching
UT_PIs = pd.DataFrame(
    _name_key(pd.read_csv('20240624_NIH Commons UT Austin PI list ACTIVE.csv', dtype={"Name": str})["Name"])
)
all_reviewers = pd.read_csv('20240621_study section members_cleaned.csv', dtype={"Name": str}) # all columns (meeting info)

# Output: the rows of all_reviewers whose Name appears in the UT list, with every
# meeting column kept. Both sides are normalised the same way, so differences in
# case or padding do not hide a match; missing names never match.
is_UT = _name_key(all_reviewers["Name"]).isin(UT_PIs["Name"].dropna())
UT_reviewers = all_reviewers[is_UT]

today = datetime.today().strftime('%Y%m%d')
UT_reviewers.to_csv(today+'_UT NIH reviewers.csv', index=False, header=True)



In [14]:
# filter reviewer list by location: keep reviewers whose Zipcode is one of the listed Austin ZIP codes
import pandas as pd
from datetime import datetime

# 5-digit ZIP codes 78701-78769 plus three 9-digit values listed explicitly
zip_list = [str(item) for item in range(78701, 78770)] + ['787121140', '787121229', '787120292']
Austin_Zipcodes = pd.DataFrame(zip_list, columns=['Zipcode'])
# Zipcode is read as text: the ZIP list holds strings, and an inferred numeric column
# would never compare equal to them
all_reviewers = pd.read_csv('20240624_study section members_cleaned.csv', dtype={'Zipcode': str}) # all columns (meeting info)

# Output: the rows of all_reviewers located in Austin, with every meeting column kept.
# A row matches when its Zipcode is in zip_list as written, or when it is a 5-digit ZIP /
# 9-digit ZIP+4 (with or without hyphen) whose first five digits are in zip_list.
zip_text = all_reviewers['Zipcode'].str.strip()
zip5 = zip_text.str.extract(r'^(\d{5})(?:-?\d{4})?$', expand=False)
is_austin = zip_text.isin(zip_list) | zip5.isin(zip_list)
Austin_reviewers = all_reviewers[is_austin]

today = datetime.today().strftime('%Y%m%d')
Austin_reviewers.to_csv(today+'_Austin NIH reviewers.csv', index=False, header=True)