In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import bleach
import warnings
warnings.filterwarnings('ignore')

In [2]:
def getPersonInfo(nid):
    """Look up one NetID in the ND directory search and return its details.

    Parameters
    ----------
    nid : str
        The NetID to search for (sent as the ``uid`` LDAP filter value).

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame with the columns ``netid``, ``email``, ``affil``,
        ``dept``, ``title`` and ``curric`` (in that order), or ``None`` when
        the request cannot be completed or the directory reports
        "Found 0 entries.".
    """
    url_template = "https://eds.nd.edu/cgi-bin/nd_ldap_search.pl?displayformat=/ndcso_tmp.html&ldapfilter=uid={who}"
    # Percent-encode the id so characters such as '&', '#' or spaces cannot
    # truncate or alter the query string. Plain alphanumeric ids are unchanged.
    url = url_template.format(who=requests.utils.quote(str(nid), safe=''))
    try:
        # Without a timeout a single stalled response would hang the whole batch.
        r = requests.get(url, timeout=30)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        return None

    # Name the parser explicitly: letting BeautifulSoup guess makes the parse
    # depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(r.text, "html.parser")

    if soup.find(text="Found 0 entries.") is not None:
        return None

    # (output column, label text shown on the result page), in output order.
    labels = [
        ('netid', "NetID:"),
        ('email', "Preferred email:"),
        ('affil', "Affiliation:"),
        ('dept', "Department:"),
        ('title', "Title:"),
        ('curric', "Curriculum:"),
    ]

    # cleanTagContents returns a one-element list per field, so the dict of
    # lists below becomes a single-row frame.
    record = {}
    for column, label in labels:
        record[column] = cleanTagContents(soup.find(text=label))

    return pd.DataFrame(record, columns=[column for column, _ in labels])


In [3]:
def cleanTagContents(what):
    """Return the tag-stripped value that follows a label on the result page.

    Starting from the label text node ``what``, this walks up two parents
    (the label's tag, then the tag enclosing it), moves to the second ``<td>``
    that follows, and strips all markup from that cell.

    Parameters
    ----------
    what : bs4 text node or None
        The label node as returned by ``soup.find(text=...)``.

    Returns
    -------
    list of str
        A one-element list holding the cleaned cell text. ``[""]`` is returned
        when the label is missing or is not followed by two ``<td>`` cells.
    """
    if what is None:
        # Label not present on the page: report an empty field instead of
        # failing with AttributeError on None.
        return [""]

    label_cell = what.parent.parent
    first_td = label_cell.findNext('td')
    value_td = first_td.findNext('td') if first_td is not None else None
    if value_td is None:
        return [""]

    # bleach.clean expects text; serialise the Tag to its HTML first, since
    # bleach 2.0+ raises TypeError for non-string input.
    cleaned = bleach.clean(str(value_td), tags=[], strip=True)
    return [cleaned]


In [4]:
# Start from an empty frame (no rows, no columns) to hold the looked-up persons.
persons_df = pd.DataFrame()

In [5]:
# Read in the list of NetIDs from an external file. The file needs a 'netid'
# header (that column is read later); presumably one id per line.
# The default comma separator parses such a single-column file. sep='\n' is
# rejected by current pandas because the line terminator cannot double as the
# field separator.
netidlist = pd.read_csv('netids.csv')

In [6]:
# Look up every NetID and keep the one-row frame for each person found
# (getPersonInfo returns None for unknown ids and failed requests).
found_frames = []
for who in netidlist['netid']:
    pers_df = getPersonInfo(who)
    if pers_df is not None:
        found_frames.append(pers_df)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop copies it on every iteration. Building the
# result from scratch also makes this cell safe to re-run.
persons_df = pd.concat(found_frames) if found_frames else pd.DataFrame()

In [7]:
# Display the collected records renumbered from 0. The re-indexed copy is only
# shown, not assigned, so persons_df itself keeps its existing index.
persons_df.reset_index(drop=True)

Unnamed: 0,netid,email,affil,dept,title,curric
0,jng2,james.ng@nd.edu,Faculty,Hesburgh Libraries,Assistant Librarian,
1,slee45,,Student,,,Graduate Business School


In [8]:
# Write the collected records to disk as CSV, leaving out the row index.
persons_df.to_csv(path_or_buf='ndpersonsinfo.csv', index=False)