In [1]:
import os
import sys
from os.path import join
import pandas as pd
import numpy as np
import time

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


---
# Create department/organization location table
Using tables provided by the Duke Facilities database, work out how to assign a single location to each unique department.

In [3]:
# Load the room table: one row per campus room, tagged with the DUKE_NUMBER
# (i.e. organization number) that the room is associated with
bl_df = pd.read_csv('data/raw/SQL_output/buildingByDepartment.tsv', sep='\t')

In [4]:
bl_df.head()

Unnamed: 0,BL_ID,FL_ID,RM_ID,DUKE_NUMBER
0,7593,6,6004,6860505000
1,7593,6,6302,6860505000
2,7593,6,6005,6860505000
3,7593,6,6013,6860505000
4,7593,6,6204,99999981


Note that a given DUKE_NUMBER can have rooms assigned in multiple buildings. Assign the DUKE_NUMBER to the building in which *most* of its rooms are concentrated

#### Test out with a single unique DUKE_NUMBER

In [5]:
tmp = bl_df.loc[bl_df.DUKE_NUMBER == 6860505000]
tmp.head()

Unnamed: 0,BL_ID,FL_ID,RM_ID,DUKE_NUMBER
0,7593,6,6004,6860505000
1,7593,6,6302,6860505000
2,7593,6,6005,6860505000
3,7593,6,6013,6860505000
6,7593,8,8500,6860505000


In [6]:
roomCount = tmp.groupby('BL_ID').count()
roomCount

Unnamed: 0_level_0,FL_ID,RM_ID,DUKE_NUMBER
BL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7504,75,75,75
7505,9,9,9
7593,333,333,333
8060,5,5,5
8138,306,306,306
8641,2,2,2


In [7]:
roomCount.idxmax()

FL_ID          7593
RM_ID          7593
DUKE_NUMBER    7593
dtype: object

#### Apply this approach to the entire table. 
Assign a building ID to each unique DUKE_NUMBER
based on the building that has the *most* rooms for that DUKE_NUMBER

In [8]:
# Two-level grouping: DUKE_NUMBER first, BL_ID second. Counting within each
# group gives the number of rooms every DUKE_NUMBER has in every building
roomGroups = bl_df.groupby(['DUKE_NUMBER', 'BL_ID'])
bl_by_dukeNumber = roomGroups.count()

In [9]:
bl_by_dukeNumber.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FL_ID,RM_ID
DUKE_NUMBER,BL_ID,Unnamed: 2_level_1,Unnamed: 3_level_1
16000001,7976,54,54
20111000,7502,2,2
20111000,7504,2,2
20111000,7507,8,8
20111000,7508,9,9


In [10]:
# For every DUKE_NUMBER, find the (DUKE_NUMBER, BL_ID) index label of the row
# with the largest room count, i.e. the building holding most of its rooms
countsByOrg = bl_by_dukeNumber.groupby(level='DUKE_NUMBER')
bl_id_max = countsByOrg.idxmax()
bl_id_max.head()

Unnamed: 0_level_0,FL_ID,RM_ID
DUKE_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1
16000001,"(16000001, 7976)","(16000001, 7976)"
20111000,"(20111000, 7508)","(20111000, 7508)"
20112000,"(20112000, 7508)","(20112000, 7508)"
20113000,"(20113000, 8238)","(20113000, 8238)"
20114000,"(20114000, 8160)","(20114000, 8160)"


In [11]:
# Each idxmax entry is a (DUKE_NUMBER, BL_ID) tuple: keep just the building ID
# in a new BL_ID column, then remove the tuple-valued columns it came from
bl_id_max['BL_ID'] = [label[1] for label in bl_id_max['FL_ID']]
bl_id_max.drop(labels=['FL_ID', 'RM_ID'], axis=1, inplace=True)
bl_id_max.head()

Unnamed: 0_level_0,BL_ID
DUKE_NUMBER,Unnamed: 1_level_1
16000001,7976
20111000,7508
20112000,7508
20113000,8238
20114000,8160


### Get the details about each building from the building info table
For each building ID, use the building info table (from duke facilities) to retrieve information about its address and location

In [5]:
blinfo = pd.read_table('data/raw/SQL_output/buildingInfo.tsv', sep='\t')

In [13]:
blinfo.head()

Unnamed: 0,BL_ID,ADDRESS1,CITY_ID,STATE_ID,ZIP,LON,LAT
0,7101,400 Gattis St,DURHAM,NC,27701,-78.91848,36.000104
1,7503,40 Duke Medicine Cir,DURHAM,NC,27705,-78.937025,36.004101
2,7505,40 Duke Medicine Cir,DURHAM,NC,27705,-78.937028,36.004417
3,7506,40 Duke Medicine Cir,DURHAM,NC,27705,-78.935105,36.003996
4,7507,40 Duke Medicine Cir,DURHAM,NC,27705,-78.93695,36.003063


In [6]:
#### Define functions to retrieve values from building info dataframe
def getBuildingInfo(this_bl_id, field, table=None):
    """
    Look up one attribute of a building in the building info table.

    Parameters:
        this_bl_id -- building ID to search for in the BL_ID column
        field      -- name of the column whose value is returned (e.g. 'ADDRESS1')
        table      -- optional dataframe to search; defaults to the
                      notebook-level blinfo dataframe

    Returns:
        The value of `field` in the first row whose BL_ID equals this_bl_id.

    Raises:
        IndexError if no row has that BL_ID. The message names the building.
    """
    if table is None:
        table = blinfo

    matches = table.loc[table.BL_ID == this_bl_id, field]
    if matches.empty:
        # same exception type that indexing the empty result would raise,
        # but the message says which building is missing
        raise IndexError('BL_ID %r not found in building info table' % (this_bl_id,))
    return matches.values[0]


In [15]:
# Add the location details of each organization's building.
# Pairs are (column added to bl_id_max, field requested from getBuildingInfo)
for newCol, infoField in [('ADDRESS', 'ADDRESS1'),
                          ('CITY', 'CITY_ID'),
                          ('STATE', 'STATE_ID'),
                          ('ZIP', 'ZIP'),
                          ('LON', 'LON'),
                          ('LAT', 'LAT')]:
    bl_id_max[newCol] = bl_id_max['BL_ID'].apply(getBuildingInfo, args=(infoField,))

In [16]:
bl_id_max.head()

Unnamed: 0_level_0,BL_ID,ADDRESS,CITY,STATE,ZIP,LON,LAT
DUKE_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16000001,7976,310 Blackwell St,DURHAM,NC,27701,-78.903707,35.993959
20111000,7508,40 Duke Medicine Cir,DURHAM,NC,27705,-78.935915,36.003337
20112000,7508,40 Duke Medicine Cir,DURHAM,NC,27705,-78.935915,36.003337
20113000,8238,3100 Tower Blvd,DURHAM,NC,27707,-78.959159,35.971664
20114000,8160,615 Douglas St,DURHAM,NC,27705,-78.941504,36.009265


This dataframe now has a row for each unique organization. The columns indicate the location of the building associated with each organization. As there can be multiple organizations to one building, certain buildings are repeated multiple times throughout this dataframe

In [17]:
# keep an independent copy under a more descriptive name
orgLoc_df = bl_id_max.copy(deep=True)

# save the organization location table as a tab-separated file
orgLoc_df.to_csv(path_or_buf='data/processed/organization_locations.tsv', sep='\t')

In [18]:
# number of unique buildings in the dataset
len(np.unique(bl_id_max.BL_ID))

173

---
# Make a table identifying each collaboration
A collaboration is defined as 2 people working together on the same project. The scholars_publications.csv lists one author per row. Thus, for publications with more than 1 author, the publication details are repeated once per author in the dataframe. The goal here is to reformat the data such that each row lists 2 authors who collaborated on a given publication, as well as the rest of the information for that publication (e.g. year, publication type).

In [31]:
# Load the scholars_publications table (tab-separated, despite the .csv extension)
pub_df = pd.read_csv('./data/raw/scholars_publications.csv', sep='\t', low_memory=False)

# the 'ABSTRACT' column is removed to free up memory
del pub_df['ABSTRACT']

# the table contains ~200 duplicate rows; remove those too
pub_df.drop_duplicates(inplace=True)

In [32]:
pub_df[:3]

Unnamed: 0,DUID,PRO_FIRST_NAME,PRO_MIDDLE_NAME,PRO_LAST_NAME,DISPLAY_NAME,TITLE,AUTHOR_URI,PUBLICATION_URI,PUBLISHED_DATE,PUBLICATION_TYPE,DOI,ISSN,EISSN,ISBN10,ISBN13,JOURNAL,Volume / Issue
0,623466,David,W,Jang,"Surgery, Head and Neck Surgery and Communicati...",Product comparison model in otolaryngology: Eq...,https://scholars.duke.edu/individual/per7361302,https://scholars.duke.edu/individual/pub1118321,1/1/2016 12:00:00 AM,Theses and Dissertations,,,,,,,
1,73333,Carol,Casper,Figuers,"Orthopaedics, Physical Therapy",Developing a Professional Embodiment of Moveme...,https://scholars.duke.edu/individual/per4051842,https://scholars.duke.edu/individual/pub1071600,3/3/2015 12:00:00 AM,Theses and Dissertations,,,,,,,
2,279169,Jeffrey,Kyle,Covington,"Orthopaedics, Physical Therapy",Developing a Professional Embodiment of Moveme...,https://scholars.duke.edu/individual/per0337862,https://scholars.duke.edu/individual/pub1071600,3/3/2015 12:00:00 AM,Theses and Dissertations,,,,,,,


In [33]:
# count the number of entries in the table by publication type
numByType = pub_df.groupby('PUBLICATION_TYPE').count()
numByType

Unnamed: 0_level_0,DUID,PRO_FIRST_NAME,PRO_MIDDLE_NAME,PRO_LAST_NAME,DISPLAY_NAME,TITLE,AUTHOR_URI,PUBLICATION_URI,PUBLISHED_DATE,DOI,ISSN,EISSN,ISBN10,ISBN13,JOURNAL,Volume / Issue
PUBLICATION_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Book,599,599,286,599,599,599,599,599,599,67,0,0,137,259,0,0
Book Review,115,115,50,115,115,115,115,115,115,30,65,9,0,0,20,0
Book Section,1994,1994,1292,1994,1994,1994,1994,1994,1994,698,0,0,167,1018,0,0
Book Series,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0
Conference Paper,4976,4976,3405,4976,4976,4976,4976,4976,4976,1279,4487,532,11,617,1359,0
Dataset,4,4,3,4,4,4,4,4,4,3,0,0,0,0,0,0
Digital Publication,84,84,47,84,84,84,84,84,84,2,2,0,0,0,0,0
Journal Article,67878,67876,51405,67876,67851,67876,67878,67878,67878,61075,43625,47900,0,0,27099,67794
Journal Issue,22,22,11,22,22,22,22,22,22,2,9,3,0,0,0,22
Other Article,1276,1276,830,1276,1276,1276,1276,1276,1276,522,491,412,1,10,242,0


In [34]:
# Count the entries recorded under each unique PUBLICATION_URI; each entry is
# one author, so the count is the number of collaborators on that publication
numByPubURI = pub_df.groupby('PUBLICATION_URI').count()

# keep the index labels (publication URIs) whose DUID count is greater than 1
multiauthorPubURIs = numByPubURI[numByPubURI['DUID'] > 1].index

In [35]:
# print the number of publications with more than 1 author
multiauthorPubURIs.shape

(13851,)

Test how to make a collaboration table out of all of the authors listed for a given multi-author publication

In [36]:
authors = pub_df.loc[pub_df.PUBLICATION_URI == multiauthorPubURIs[4], :]
authors

Unnamed: 0,DUID,PRO_FIRST_NAME,PRO_MIDDLE_NAME,PRO_LAST_NAME,DISPLAY_NAME,TITLE,AUTHOR_URI,PUBLICATION_URI,PUBLISHED_DATE,PUBLICATION_TYPE,DOI,ISSN,EISSN,ISBN10,ISBN13,JOURNAL,Volume / Issue
19332,373496,Junzo,Paul,Chino,Radiation Oncology,How much is another randomized trial of lymph ...,https://scholars.duke.edu/individual/per7661872,https://scholars.duke.edu/individual/pub1000158,1/1/2013 12:00:00 AM,Journal Article,10.1016/j.ygyno.2013.06.025,0090-8258,,,,,131 / 1
19338,135843,Evan,Robert,Myers,Obstetrics/Gynecology,How much is another randomized trial of lymph ...,https://scholars.duke.edu/individual/per4103452,https://scholars.duke.edu/individual/pub1000158,1/1/2013 12:00:00 AM,Journal Article,10.1016/j.ygyno.2013.06.025,0090-8258,,,,,131 / 1
19341,71139,Laura,Jean,Havrilesky,"Obstetrics and Gynecology, Gynecologic Oncology",How much is another randomized trial of lymph ...,https://scholars.duke.edu/individual/per0039842,https://scholars.duke.edu/individual/pub1000158,1/1/2013 12:00:00 AM,Journal Article,10.1016/j.ygyno.2013.06.025,0090-8258,,,,,131 / 1


In [37]:
import itertools

In [38]:
for firstIdx, secondIdx in itertools.combinations(authors.index, 2):

    # the first collaborator of each pair is called the src, the second the dest
    src, dest = authors.loc[firstIdx], authors.loc[secondIdx]
    print('%s %s' % (src.PRO_FIRST_NAME, dest.PRO_FIRST_NAME))


Junzo Evan
Junzo Laura
Evan Laura


Use itertools to build an iterator to loop through unique combinations of 2 authors from the full author list for each publication in the multiauthorPubURIs list

In [39]:
# Build one record per unique pair of co-authors on every multi-author publication
collabsList = []
pubID = 0

# Split the rows by publication in a single pass instead of re-scanning all of
# pub_df once per URI. groupby yields the URIs in sorted order, which is the
# order multiauthorPubURIs already has (it is the index of a groupby result),
# so every publication keeps the same PUBLICATION_ID as before.
multiauthorRows = pub_df.loc[pub_df.PUBLICATION_URI.isin(multiauthorPubURIs)]
for URI, authors in multiauthorRows.groupby('PUBLICATION_URI'):
    # `authors` holds all of the rows (one per author) for this publication
    totalAuthors = authors.shape[0]

    # build an iterator to loop through unique combinations of 2 authors
    for a1, a2 in itertools.combinations(authors.index, 2):

        # author 1 referred to as src; author 2 as dst
        src = authors.loc[a1]
        dst = authors.loc[a2]

        # extract relevant info for each author; publication-level fields are
        # taken from src (both rows belong to the same publication)
        thisCollab = {'src_DUID': src.DUID,
                      'src_FIRST_NAME': src.PRO_FIRST_NAME,
                      'src_MIDDLE_NAME': src.PRO_MIDDLE_NAME,
                      'src_LAST_NAME': src.PRO_LAST_NAME,
                      'dst_DUID': dst.DUID,
                      'dst_FIRST_NAME': dst.PRO_FIRST_NAME,
                      'dst_MIDDLE_NAME': dst.PRO_MIDDLE_NAME,
                      'dst_LAST_NAME': dst.PRO_LAST_NAME,
                      'PUBLICATION_TYPE': src.PUBLICATION_TYPE,
                      'PUBLISHED_DATE': src.PUBLISHED_DATE,
                      'PUBLICATION_URI': src.PUBLICATION_URI,
                      'PUBLICATION_ID': pubID,
                      'TOTAL_AUTHORS': totalAuthors}
        collabsList.append(thisCollab)

    # increment publication id
    pubID += 1

# write it all to a dataframe in one step
collab_df = pd.DataFrame(collabsList)
    

In [40]:
# print the shape; indicates how many collaborations total in this dataset
collab_df.shape

(50153, 13)

In [41]:
collab_df.head()

Unnamed: 0,PUBLICATION_ID,PUBLICATION_TYPE,PUBLICATION_URI,PUBLISHED_DATE,TOTAL_AUTHORS,dst_DUID,dst_FIRST_NAME,dst_LAST_NAME,dst_MIDDLE_NAME,src_DUID,src_FIRST_NAME,src_LAST_NAME,src_MIDDLE_NAME
0,0,Journal Article,https://scholars.duke.edu/individual/pub1000033,5/1/2013 12:00:00 AM,2,272070,Shelby,Reed,Derene,99016,Richard,Keefe,S.E.
1,1,Journal Article,https://scholars.duke.edu/individual/pub1000035,7/1/2013 12:00:00 AM,2,591120,John,Reynolds,Michael,119821,Laurie,Snyder,Dee
2,2,Journal Article,https://scholars.duke.edu/individual/pub1000091,10/3/2012 12:00:00 AM,2,98973,John,Alexander,Hunter Peel,314783,Pierluigi,Tricoci,
3,3,Journal Article,https://scholars.duke.edu/individual/pub1000099,11/1/2013 12:00:00 AM,2,449347,Endi,Wang,,435364,Maggie,Stoecker,Marie
4,4,Journal Article,https://scholars.duke.edu/individual/pub1000158,1/1/2013 12:00:00 AM,3,135843,Evan,Myers,Robert,373496,Junzo,Chino,Paul


In [42]:
# write this file to disk
collab_df.to_csv('data/processed/collaborations.tsv', sep='\t', index=False)

In [43]:
# number of unique publications (distinct PUBLICATION_URI values) in pub_df;
# this counts every publication in the table, not the collaborations
np.unique(pub_df.PUBLICATION_URI).size

51233

---
# Add information about organizations to each collaboration 
The collaboration dataframe currently has basic information about each individual in the collaboration. The next step is to figure out which department/organization/DUKE_NUMBER is best associated with each individual

#### Figure out which DUKE_NUMBER is associated with each individual

In [44]:
# Gather every distinct DUID that appears in the collaboration dataframe,
# whether the person is listed as the src or as the dst
srcDUIDs = np.unique(collab_df['src_DUID'])
dstDUIDs = np.unique(collab_df['dst_DUID'])

everyDUID = set()
everyDUID.update(srcDUIDs, dstDUIDs)
uniqueDUIDs = list(everyDUID)

In [45]:
# number of unique DUIDs in the collaboration table
len(uniqueDUIDs)

2461

In [46]:
uniqueDUIDs[:10]

[32770, 114691, 40968, 114697, 466957, 204819, 55982, 458782, 204832, 426023]

In [47]:
# Read the scholars_faculty table (tab-separated)
faculty_df = pd.read_csv('data/raw/scholars_faculty.csv', sep='\t')

# discard rows that exactly repeat an earlier row
faculty_df.drop_duplicates(keep='first', inplace=True)

faculty_df.head()

Unnamed: 0,DUID,PRO_FIRST_NAME,PRO_MIDDLE_NAME,PRO_LAST_NAME,APPOINTMENT_TYPE,TITLE,Appt Org BFR,Appt Org Unit,Appt Org Desc
0,1080,Daniel,James,George,Primary,"Professor of Medicine, with tenure",6860201545,50000856,Medicine - Oncology
1,1080,Daniel,James,George,Secondary,Professor in Surgery,6860207000,50000983,Surgery
2,1255,Suzanne,(null),Shanahan,Administrative,Co-Director of the Kenan Institute for Ethics,6008208000,50000373,Kenan Institute for Ethics
3,1255,Suzanne,(null),Shanahan,Primary,Associate Research Professor in the Department...,6064105610,50000532,Sociology
4,1438,John,J.,Glushik,Primary,Director of Translational Services in Innovati...,6008101000,50000280,PAS Adm - Provost Office


In [48]:
# print total number of entries in this table
faculty_df.shape

(8659, 9)

In [49]:
def _asOrgNum(value):
    """Convert an 'Appt Org BFR' value to int; missing or non-numeric values become 0."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 0


def findOrgBFR(duid, faculty=None, pubs=None):
    """
    Try to find an individual's Organization number (aka DUKE_NUMBER, Appt Org BFR)
    based on their DUID. Two lookups are attempted, in order.

    First:
       Look up the DUID in the faculty table. If found, take the 'Appt Org BFR' of the
       row where APPOINTMENT_TYPE is 'Primary'. If no row is 'Primary', take the first
       row listed.
    Second (only when the DUID is not in the faculty table at all):
       Take the 'DISPLAY_NAME' field of the person's first publication record and split
       it on commas into a list of appointments. Go through the appointments in order and
       look for a faculty row whose 'Appt Org Desc' equals the appointment. The first
       match supplies the 'Appt Org BFR'.

    Parameters:
        duid    -- DUID of the person to look up
        faculty -- optional faculty dataframe (columns DUID, APPOINTMENT_TYPE,
                   'Appt Org BFR', 'Appt Org Desc'); defaults to the notebook-level
                   faculty_df
        pubs    -- optional publications dataframe (columns DUID, DISPLAY_NAME);
                   defaults to the notebook-level pub_df

    Returns:
        The organization number as an int, or 0 if none could be determined.

    Only "not found" situations map to 0. Unexpected problems, such as a missing
    column, are raised instead of being silently reported as 0 for every person.
    """
    if faculty is None:
        faculty = faculty_df
    if pubs is None:
        pubs = pub_df

    # First attempt: is this DUID in the faculty table?
    person = faculty.loc[faculty.DUID == duid]
    if not person.empty:
        primary = person.loc[person.APPOINTMENT_TYPE == 'Primary', 'Appt Org BFR']
        # prefer the 'Primary' appointment; otherwise fall back to the first row listed
        chosen = primary if not primary.empty else person['Appt Org BFR']
        return _asOrgNum(chosen.iloc[0])

    # Second attempt: search via the appointments named on a publication record
    displayNames = pubs.loc[pubs.DUID == duid, 'DISPLAY_NAME']
    if displayNames.empty:
        # this DUID isn't found at all
        return 0
    try:
        # separate multiple appointments into a list, dropping the space after each comma
        appts = [x.lstrip() for x in displayNames.iloc[0].split(',')]
    except AttributeError:
        # DISPLAY_NAME is not text (e.g. a missing value), so there is nothing to search on
        return 0

    # loop through each appointment, try to find a record for someone else with the same one
    for appt in appts:
        otherRecords = faculty.loc[faculty['Appt Org Desc'] == appt, 'Appt Org BFR']
        if not otherRecords.empty:
            # cast so that both lookup paths return the same type
            return _asOrgNum(otherRecords.iloc[0])

    return 0


In [50]:
# find the Org BFR for each src and dst in the collaboration table
collab_df['src_OrgBFR'] = collab_df.src_DUID.apply(findOrgBFR)
collab_df['dst_OrgBFR'] = collab_df.dst_DUID.apply(findOrgBFR)

In [51]:
noOrg = collab_df.loc[(collab_df.src_OrgBFR == 0) | (collab_df.dst_OrgBFR == 0), :]

In [52]:
# report how many collaborations lack an Org BFR for the src or the dst
msg = """
There are %s entries out of %s where the orgBFR number could not be found 
for either the src OR the dst
""" % (len(noOrg), len(collab_df))

print(msg)


There are 32 entries out of 50153 where the orgBFR number could not be found 
for either the src OR the dst



#### Curses, but few enough that they can be corrected by hand

no Org BFR for the src in the collaboration

In [53]:
noSrc = collab_df.loc[collab_df.src_OrgBFR == 0, :]

In [54]:
noSrc.src_DUID.unique()

array([ 78958,  99154, 100772, 114379, 117254])

For every unique DUID with OrgBFR missing, find the correct one manually and add it to any entry where that individual is either the SRC or the DST

In [55]:
# Org BFR found by hand for each DUID that could not be resolved automatically:
# (DUID, Org BFR)
manualOrgBFR = [
    (78958, 6860205500),    # Obstetrics & Gynecology
    (99154, 6064105000),    # Religious Studies
    (100772, 6056103000),   # Environmental Sciences & Policy
    (114379, 6860205500),   # Obstetrics & Gynecology
    (117254, 6056500000),   # Marine Science & Conservation
]

# apply each correction wherever that person appears, as the src or as the dst
for duid, orgBFR in manualOrgBFR:
    for idCol, orgCol in (('src_DUID', 'src_OrgBFR'), ('dst_DUID', 'dst_OrgBFR')):
        collab_df.loc[collab_df[idCol] == duid, orgCol] = int(orgBFR)


Now, just go through the dst DUIDs with no Org BFR

In [56]:
noDst = collab_df.loc[collab_df.dst_OrgBFR == 0, :]

In [57]:
noDst.loc[:, ['dst_DUID', 'dst_FIRST_NAME', 'dst_LAST_NAME']]

Unnamed: 0,dst_DUID,dst_FIRST_NAME,dst_LAST_NAME
2620,215304,Patricia,Leighten
11760,96445,Eric,Meyers
35898,117255,Celia,Bonaventura
35899,117255,Celia,Bonaventura
36610,117031,Norman,Christensen


In [58]:
# Org BFR found by hand for the remaining dst DUIDs: (DUID, Org BFR)
manualDstOrgBFR = [
    (215304, 6064100200),   # Art, Art History, & Vis studies
    (96445, 6064105000),    # Religious Studies
    (117255, 6056500000),   # Marine Science & Conservation
    (117031, 6056103000),   # Environmental Sciences & Policy
]
for duid, orgBFR in manualDstOrgBFR:
    collab_df.loc[collab_df['dst_DUID'] == duid, 'dst_OrgBFR'] = int(orgBFR)


#### Make sure there are no more entries with unknown src or dst Org BFR

In [59]:
noOrg = collab_df.loc[(collab_df.src_OrgBFR == 0) | (collab_df.dst_OrgBFR == 0), :]

In [60]:
# re-run the report after the manual corrections
msg = """
There are %s entries out of %s where the orgBFR number could not be found 
for either the src OR the dst
""" % (len(noOrg), len(collab_df))

print(msg)


There are 0 entries out of 50153 where the orgBFR number could not be found 
for either the src OR the dst



#### Checkpoint the collaboration table at this point
Easier to just load in from this stage later on instead of having to go through all previous steps

In [61]:
# Checkpoint the collaboration table. index=False matches the other writes of
# this file, so the row index is not stored as an extra unnamed column that
# would come back as data when the checkpoint is read in again.
collab_df.to_csv('data/processed/collaborations.tsv', sep='\t', index=False)

# Find Building ID associated with each OrgBFR
Of course, the OrgBFR number is not a direct 1:1 mapping with the DUKE_NUMBER in the organization_locations dataframe. For some people it matches exactly, but for others only the first part of the OrgBFR number matches and the last few digits differ, suggesting that these last digits encode something else (a subdepartment?).

So, for each OrgBFR: 
First, try to look up that value in orgLoc_df that was created earlier (also saved as 'organization_locations.tsv'). If found, great, grab the Building ID (BL_ID). 

If not, try to look up the OrgBFR value in SUBDEPARTMENTSPLIT.tsv. If found, grab the ROOM_DEPARTMENT_SPLIT_ID, and then look up the ROOM_DEPARTMENT_SPLIT_ID in ROOMDEPARTMENTSPLIT.tsv. If found, grab the DUKE_NUMBER associated with that ROOM_DEPARTMENT_SPLIT_ID, and look up the DUKE_NUMBER in the orgLoc_df, and grab the associated BL_ID. 

In [122]:
# Read the ROOMDEPARTMENTSPLIT and SUBDEPARTMENTSPLIT tables (tab-separated)
roomDept_df = pd.read_csv('data/raw/SQL_output/ROOMDEPARTMENTSPLIT.tsv', sep='\t')
subDept_df = pd.read_csv('data/raw/SQL_output/SUBDEPARTMENTSPLIT.tsv', sep='\t')

# DUKE_NUMBER values may carry a ':S' marker: strip any ':' / 'S' characters
# from both ends of each value and convert what is left to an integer
subDept_df['DUKE_NUMBER'] = [int(raw.strip(':S')) for raw in subDept_df.DUKE_NUMBER]

In [123]:
subDept_df.dtypes

SUB_DEPARTMENT_SPLIT_ID     int64
ROOM_DEPARTMENT_SPLIT_ID    int64
DUKE_NUMBER                 int64
dtype: object

First off, just how many OrgBFRs do not have a corresponding location in orgLoc_df?

In [68]:
# all unique OrgBFRs reprsented in collaboration table
src_OrgBFR = np.unique(collab_df.src_OrgBFR)
dst_OrgBFR = np.unique(collab_df.dst_OrgBFR)

uniqueOrgBFR = list(set().union(src_OrgBFR, dst_OrgBFR))

In [80]:
# count the rows of orgLoc_df whose index value (DUKE_NUMBER) is one of the OrgBFRs
orgsWithBuildings = int(orgLoc_df.index.isin(uniqueOrgBFR).sum())
totalOrgs = len(uniqueOrgBFR)
msg = """
There are %s unique OrgBFRs with a matching building out of %s total OrgBFRs
""" % (orgsWithBuildings, totalOrgs)
print(msg)


There are 56 unique OrgBFRs with a matching building out of 210 total OrgBFRs



In [156]:
def findDUKENUMBER(OrgBFR, orgLoc=None, subDept=None, roomDept=None):
    """
    Link an OrgBFR to a DUKE_NUMBER that is present in the organization
    location table.

    Lookup order:
      1. If the OrgBFR is itself in the index of the location table, it is the answer.
      2. Otherwise find the OrgBFR in the DUKE_NUMBER column of the
         SUBDEPARTMENTSPLIT table and take the ROOM_DEPARTMENT_SPLIT_ID of the
         first matching row. Look that ID up in the ROOMDEPARTMENTSPLIT table
         and take the DUKE_NUMBER of the first matching row. The result is
         accepted only if it is in the index of the location table.

    Parameters:
        OrgBFR   -- organization number to resolve
        orgLoc   -- optional location dataframe indexed by DUKE_NUMBER;
                    defaults to the notebook-level orgLoc_df
        subDept  -- optional SUBDEPARTMENTSPLIT dataframe; defaults to subDept_df
        roomDept -- optional ROOMDEPARTMENTSPLIT dataframe; defaults to roomDept_df

    Returns:
        The linked DUKE_NUMBER as an int, or 0 if no link could be made.
    """
    if orgLoc is None:
        orgLoc = orgLoc_df
    if subDept is None:
        subDept = subDept_df
    if roomDept is None:
        roomDept = roomDept_df

    # first, check if this OrgBFR is in the location table as-is
    if OrgBFR in orgLoc.index:
        return int(OrgBFR)

    # if it's not, go OrgBFR -> ROOM_DEPARTMENT_SPLIT_ID -> DUKE_NUMBER
    splitIDs = subDept.loc[subDept.DUKE_NUMBER == OrgBFR, 'ROOM_DEPARTMENT_SPLIT_ID']
    if splitIDs.empty:
        return 0
    roomDeptSplitID = splitIDs.iloc[0]

    linked = roomDept.loc[roomDept.ROOM_DEPARTMENT_SPLIT_ID == roomDeptSplitID, 'DUKE_NUMBER']
    if linked.empty:
        return 0

    # take the first row instead of requiring exactly one, so a split ID that
    # is listed more than once does not raise
    roomDept_DUKE_NUMBER = linked.iloc[0]

    # make sure this DUKE_NUMBER is represented in the location table
    if roomDept_DUKE_NUMBER not in orgLoc.index:
        return 0
    return int(roomDept_DUKE_NUMBER)
        
        

In [166]:
# find the appropriate DUKE NUMBER that is tied to a building fOR each member in the collaboration table
# findDUKENUMBER filters the split tables on every call and its result depends
# only on the OrgBFR, so resolve each distinct OrgBFR once and map the answers
# onto both columns rather than repeating the lookup for every row.
distinctOrgBFRs = pd.unique(collab_df[['src_OrgBFR', 'dst_OrgBFR']].values.ravel())
dukeNumByOrgBFR = dict((orgBFR, findDUKENUMBER(orgBFR)) for orgBFR in distinctOrgBFRs)

collab_df['src_DUKE_NUMBER'] = collab_df.src_OrgBFR.map(dukeNumByOrgBFR)
collab_df['dst_DUKE_NUMBER'] = collab_df.dst_OrgBFR.map(dukeNumByOrgBFR)

#### find out how many OrgBFRs could not be linked to a DUKE_NUMBER in the buildings table

In [175]:
src_noDukeNum = collab_df.loc[collab_df.src_DUKE_NUMBER == 0, :]

In [176]:
# report how many collaborations have a src that is still unlinked
msg = """
There are %s entries out of %s where the orgBFR number for src could not be linked to a DUKE_NUMBER w/ associated BLD
""" % (len(src_noDukeNum), len(collab_df))

print(msg)


There are 2243 entries out of 50153 where the orgBFR number for src could not be linked to a DUKE_NUMBER w/ associated BLD



In [177]:
len(np.unique(src_noDukeNum.src_OrgBFR))

14

Go through these and try to manually assign the correct DUKE_NUMBER

*Approach*: Grab the associated DUID or Name, look up online or in table, find address, look up address in orgLoc_df, assign DUKE_NUMBER associated with that address

In addition to the tables already here, this site was also helpful:
http://maps.duke.edu/map/accessible.php?id=21#3896

In [179]:
np.unique(src_noDukeNum.src_OrgBFR)

array([6840103000, 6840202000, 6840202030, 6860201020, 6860201080,
       6860201511, 6860201574, 6860201580, 6860201592, 6860201594,
       6860205080, 6860206040, 6860206050, 6860207076])

In [196]:
# for each one, assign the corrected DUKE_NUMBER to both the src and dst columns
# DUKE_NUMBER chosen by hand for each OrgBFR that could not be linked
# automatically: (OrgBFR, corrected DUKE_NUMBER)
manualDukeNumbers = [
    (6840103000, 6840000000),   # School of Nursing
    (6840202000, 6840000000),   # School of Nursing
    (6840202030, 6840000000),   # School of Nursing
    (6860201020, 41211000),     # Comm & Family Medicine
    (6860201080, 30214011),     # Fam Medicine Center
    (6860201511, 3280020200),   # LSRC
    (6860201574, 6860500100),   # Molec. Phys. Inst.
    (6860201580, 6860201500),   # Gen. Int. Medicine
    (6860201592, 6860509500),   # Human Vaccine Inst.
    (6860201594, 6860509500),   # Human Vaccine Inst.
    (6860205080, 30206000),     # Anesthesiology
    (6860206040, 6860205500),   # Obst. & Gynecology
    (6860206050, 30216200),     # Eye-institute
    (6860207076, 6860509500),   # Human Vaccine Inst.
]

# for each one, assign the corrected DUKE_NUMBER to both the src and dst columns
for orgBFR, dukeNum in manualDukeNumbers:
    for orgCol, dukeCol in (('src_OrgBFR', 'src_DUKE_NUMBER'), ('dst_OrgBFR', 'dst_DUKE_NUMBER')):
        collab_df.loc[collab_df[orgCol] == orgBFR, dukeCol] = int(dukeNum)

Now do the same thing for the remaining dst entries with no DUKE_NUMBER

In [198]:
# collaborations whose dst still has no DUKE_NUMBER (0 marks "not linked")
dst_noDukeNum = collab_df[collab_df['dst_DUKE_NUMBER'] == 0]
msg = """
There are %s entries out of %s where the orgBFR number for dst could not be linked to a DUKE_NUMBER w/ associated BLD
""" % (len(dst_noDukeNum), len(collab_df))

print(msg)


There are 0 entries out of 50153 where the orgBFR number for dst could not be linked to a DUKE_NUMBER w/ associated BLD



Excellent, turns out there are none

#### Checkpoint the collaboration table at this point
Easier to just load in from this stage later on instead of having to go through all previous steps

In [203]:
collab_df.to_csv('data/processed/collaborations.tsv', sep='\t', index=False)

### Ok, now finally should be able to add building info to the collaborations table

build functions to add building info to the collaboration dataframe

In [10]:
def findBuildingID(dukeNum):
    """
    Return the BL_ID that orgLoc_df lists for the given DUKE_NUMBER,
    or '' when the DUKE_NUMBER is not in the index of orgLoc_df.
    """
    if dukeNum not in orgLoc_df.index:
        return ''
    return orgLoc_df.loc[dukeNum, 'BL_ID']

def _findBuildingField(bl_id, field, orgLoc=None):
    """
    Shared lookup behind the findBuilding* functions: return the value of
    `field` from the first row whose BL_ID equals bl_id.

    Returns '' when bl_id is '' (the value findBuildingID returns for an
    unknown organization) or when no row has that BL_ID, so a missing
    building yields an empty entry rather than an exception.
    orgLoc defaults to the notebook-level orgLoc_df.
    """
    # pass the "unknown" sentinel through without comparing it to the BL_ID column
    if isinstance(bl_id, str) and not bl_id:
        return ''
    if orgLoc is None:
        orgLoc = orgLoc_df

    matches = orgLoc.loc[orgLoc.BL_ID == bl_id, field]
    if matches.empty:
        return ''
    return matches.iloc[0]


def findBuildingAddress(bl_id, orgLoc=None):
    """Return the address associated with this building id ('' if unknown)."""
    return _findBuildingField(bl_id, 'ADDRESS', orgLoc)


def findBuildingCity(bl_id, orgLoc=None):
    """Return the city associated with this building id ('' if unknown)."""
    return _findBuildingField(bl_id, 'CITY', orgLoc)


def findBuildingZip(bl_id, orgLoc=None):
    """Return the zipcode associated with this building id ('' if unknown)."""
    return _findBuildingField(bl_id, 'ZIP', orgLoc)


def findBuildingLon(bl_id, orgLoc=None):
    """Return the longitude associated with this building id ('' if unknown)."""
    return _findBuildingField(bl_id, 'LON', orgLoc)


def findBuildingLat(bl_id, orgLoc=None):
    """Return the latitude associated with this building id ('' if unknown)."""
    return _findBuildingField(bl_id, 'LAT', orgLoc)

In [225]:
# For both members of each collaboration, map <side>_DUKE_NUMBER to <side>_BL_ID
for side in ('src', 'dst'):
    collab_df[side + '_BL_ID'] = collab_df[side + '_DUKE_NUMBER'].apply(findBuildingID)

In [244]:
# With the building IDs in place, derive the remaining location columns from them.
# Each (suffix, lookup) pair becomes one column per side; the src_* columns are
# created first, then the dst_* columns, each group in the order listed here.
buildingFieldLookups = [
    ('ADDRESS', findBuildingAddress),
    ('CITY', findBuildingCity),
    ('ZIP', findBuildingZip),
    ('LON', findBuildingLon),
    ('LAT', findBuildingLat),
]

for side in ('src', 'dst'):
    for field, lookup in buildingFieldLookups:
        collab_df['%s_%s' % (side, field)] = collab_df[side + '_BL_ID'].apply(lookup)

In [246]:
collab_df.head()

Unnamed: 0,PUBLICATION_ID,PUBLICATION_TYPE,PUBLICATION_URI,PUBLISHED_DATE,TOTAL_AUTHORS,dst_DUID,dst_FIRST_NAME,dst_LAST_NAME,dst_MIDDLE_NAME,src_DUID,...,src_ADDRESS,src_CITY,src_ZIP,src_LON,src_LAT,dst_ADDRESS,dst_CITY,dst_ZIP,dst_LON,dst_LAT
0,0,Journal Article,https://scholars.duke.edu/individual/pub1000033,5/1/2013 12:00:00 AM,2,272070,Shelby,Reed,Derene,99016,...,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337
1,1,Journal Article,https://scholars.duke.edu/individual/pub1000035,7/1/2013 12:00:00 AM,2,591120,John,Reynolds,Michael,119821,...,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337
2,2,Journal Article,https://scholars.duke.edu/individual/pub1000091,10/3/2012 12:00:00 AM,2,98973,John,Alexander,Hunter Peel,314783,...,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337
3,3,Journal Article,https://scholars.duke.edu/individual/pub1000099,11/1/2013 12:00:00 AM,2,449347,Endi,Wang,,435364,...,40 Duke Medicine Cir,DURHAM,27705,-78.93695,36.003063,40 Duke Medicine Cir,DURHAM,27705,-78.93695,36.003063
4,4,Journal Article,https://scholars.duke.edu/individual/pub1000158,1/1/2013 12:00:00 AM,3,135843,Evan,Myers,Robert,373496,...,203 Research Dr,DURHAM,27705,-78.940748,36.007164,40 Duke Medicine Cir,DURHAM,27705,-78.935105,36.003996


#### Checkpoint the collaboration table
now has all of the location info for each member in each collaboration

In [247]:
# tab-separated, without the row index; written to the collaborations.tsv checkpoint path
collab_df.to_csv('data/processed/collaborations.tsv', sep='\t', index=False)

---
# Use the Google Maps API to get the distances and durations between unique buildings

#### First, make a list of all of the unique buildings in the collaborations file

In [250]:
# create a list of each unique building ID (BL_ID) mentioned in the collaboration dataframe
# np.unique gives the distinct IDs seen on the src side and on the dst side separately
src_BL_IDs = np.unique(collab_df.src_BL_ID)
dst_BL_IDs = np.unique(collab_df.dst_BL_ID)

# merge both sides through a set so each building appears once; the order of the resulting list is arbitrary
uniqueBL_IDs = list(set().union(src_BL_IDs, dst_BL_IDs))

In [252]:
len(uniqueBL_IDs)

51


### Test out how to retrieve distance and duration values from the Google Maps API
Take the latitude and longitude for each building, and find the distance and duration (for walking?) to every other building. Test this out with one building-to-building combo, but put the final code in a separate script so that the API access limits are not hit by accidentally running the cell multiple times

In [19]:
# take the row at integer position 2 of orgLoc_df as the first test building
bl1 = orgLoc_df.iloc[2]
bl1

BL_ID                      7508
ADDRESS    40 Duke Medicine Cir
CITY                     DURHAM
STATE                        NC
ZIP                       27705
LON                    -78.9359
LAT                     36.0033
Name: 20112000, dtype: object

In [20]:
# take the row at integer position 42 of orgLoc_df as the second test building
bl2 = orgLoc_df.iloc[42]
bl2

BL_ID               7546
ADDRESS    2301 Erwin Rd
CITY              DURHAM
STATE                 NC
ZIP                27705
LON             -78.9384
LAT              36.0071
Name: 30211022, dtype: object

In [21]:
import googlemaps

In [263]:
# create gmaps Client object using my API key
# The key lives in a local text file rather than in the notebook; only its first line is read.
with open('data/gmaps_API_key.txt' , 'r') as f:
    # readline() keeps the trailing newline; strip it so it is not passed along as part of the key
    gmapKey = f.readline().strip()

gmaps = googlemaps.Client(key=gmapKey)

In [264]:
# query the Distance Matrix API for the two test buildings, each passed as a (LAT, LON) tuple;
# no travel mode is specified in this test call
distMatrix = gmaps.distance_matrix((bl1.LAT, bl1.LON), (bl2.LAT, bl2.LON))

In [265]:
distMatrix

{u'destination_addresses': [u'2301 Erwin Rd, Durham, NC 27705, USA'],
 u'origin_addresses': [u'Flowers Dr, Durham, NC, USA'],
 u'rows': [{u'elements': [{u'distance': {u'text': u'1.1 km', u'value': 1060},
     u'duration': {u'text': u'6 mins', u'value': 351},
     u'status': u'OK'}]}],
 u'status': u'OK'}

In [266]:
distMatrix['rows'][0]['elements'][0]['duration']['value']

351

In [267]:
distMatrix['rows'][0]['elements'][0]['distance']['value']

1060

### create distance and duration dataframes

In [356]:
# Rows and columns of every building-to-building table carry the same sorted building IDs
idx = cols = np.sort(uniqueBL_IDs)

# the tables are square: one row and one column per unique building
n_buildings = len(uniqueBL_IDs)
squareShape = (n_buildings, n_buildings)

# zero-filled backing arrays, one per table
distances = np.zeros(squareShape)
durations = np.zeros(squareShape)
wouldWalk = np.zeros(squareShape)

# wrap each array in a DataFrame labelled by building ID
# (wouldWalk is rebound here from the raw array to its DataFrame)
axisLabels = dict(index=idx, columns=cols)
dist_df = pd.DataFrame(distances, **axisLabels)
dur_df = pd.DataFrame(durations, **axisLabels)
wouldWalk = pd.DataFrame(wouldWalk, **axisLabels)

### Loop through unique building combos, use gmaps to get distance and duration between them

# WARNING: this makes at least 1275 API requests (one per building pair, plus a second driving request for every pair more than a mile apart; limit: 2500/day)

In [357]:
import itertools   # combinations() is used below; imported here since this cell is its only user

gmaps = googlemaps.Client(key=gmapKey)

# walking routes longer than this (~1 mile, in metres) are re-queried as driving routes
WALKING_LIMIT_M = 1609


def _buildingAddress(bl_id):
    """Return 'ADDRESS,CITY,STATE,ZIP' built from the first orgLoc_df row with this BL_ID."""
    rows = orgLoc_df.loc[orgLoc_df.BL_ID == bl_id]
    return ','.join([rows['ADDRESS'].iloc[0],
                     rows['CITY'].iloc[0],
                     rows['STATE'].iloc[0],
                     str(rows['ZIP'].iloc[0])])


def _routeMetrics(origin, destination, **options):
    """Query the Distance Matrix API for a single origin/destination pair.

    Extra keyword arguments (e.g. mode='walking') are passed straight through
    to gmaps.distance_matrix.

    Returns a (distance in m, duration in s) tuple.
    Raises ValueError when the returned element's status is not 'OK', so a
    failed lookup is reported with the addresses involved instead of as a
    KeyError on the missing 'duration'/'distance' entries.
    """
    response = gmaps.distance_matrix(origin, destination, **options)
    element = response['rows'][0]['elements'][0]
    if element['status'] != 'OK':
        raise ValueError('Distance Matrix returned status %s for %r -> %r'
                         % (element['status'], origin, destination))
    return element['distance']['value'], element['duration']['value']


# build every address string once, before any API request is made, instead of
# re-scanning orgLoc_df eight times for each pair of buildings
addresses = dict((bl_id, _buildingAddress(bl_id)) for bl_id in uniqueBL_IDs)

# loop through unique, non-repeated combinations of 2 buildings
for b1, b2 in itertools.combinations(uniqueBL_IDs, 2):
    # explicit formatting prints "b1 b2" identically on Python 2 and Python 3
    print('%s %s' % (b1, b2))

    # get the addresses for each location
    b1_addr = addresses[b1]
    b2_addr = addresses[b2]

    # try to get the walking route first
    distance, duration = _routeMetrics(b1_addr, b2_addr, mode='walking')

    # If walking distance is more than 1 mile (~1609 m), recalculate for driving
    walk = distance <= WALKING_LIMIT_M
    if not walk:
        distance, duration = _routeMetrics(b1_addr, b2_addr)

    # add these values to the appropriate locations in the tables
    # Note, each value gets added twice, reflected along the diagonal of the matrix
    dur_df.loc[b1, b2] = duration
    dur_df.loc[b2, b1] = duration

    dist_df.loc[b1, b2] = distance
    dist_df.loc[b2, b1] = distance

    wouldWalk.loc[b1, b2] = walk
    wouldWalk.loc[b2, b1] = walk

    # pause
    time.sleep(.015) # max of 100 requests/sec on API


7756 7593
7756 8239
7756 8304
7756 8329
7756 7224
7756 8116
7756 7765
7756 7710
7756 7576
7756 7579
7756 7202
7756 7201
7756 7261
7756 7739
7756 7759
7756 7516
7756 7515
7756 7514
7756 7513
7756 7512
7756 7534
7756 7550
7756 7531
7756 7530
7756 7735
7756 7747
7756 8084
7756 8166
7756 8141
7756 7758
7756 7738
7756 7754
7756 7753
7756 7703
7756 7706
7756 7705
7756 7251
7756 7560
7756 7709
7756 7548
7756 7549
7756 7506
7756 7507
7756 7760
7756 7501
7756 7540
7756 7776
7756 7749
7756 7508
7756 7545
7593 8239
7593 8304
7593 8329
7593 7224
7593 8116
7593 7765
7593 7710
7593 7576
7593 7579
7593 7202
7593 7201
7593 7261
7593 7739
7593 7759
7593 7516
7593 7515
7593 7514
7593 7513
7593 7512
7593 7534
7593 7550
7593 7531
7593 7530
7593 7735
7593 7747
7593 8084
7593 8166
7593 8141
7593 7758
7593 7738
7593 7754
7593 7753
7593 7703
7593 7706
7593 7705
7593 7251
7593 7560
7593 7709
7593 7548
7593 7549
7593 7506
7593 7507
7593 7760
7593 7501
7593 7540
7593 7776
7593 7749
7593 7508
7593 7545
8239 8304


In [375]:
# The pair loop never visits a building paired with itself, so the diagonal of
# wouldWalk still holds its initial 0; set each (building, building) entry to True
for bl_id in wouldWalk.index:
    wouldWalk.loc[bl_id, bl_id] = True

Save these output tables so the previous cell doesn't have to be run again and eat up the API limits

In [376]:
# Persist the three building-to-building tables as tab-separated files, keeping the
# building-ID index as the first column.
# NOTE(review): 'bldg2bldf_duration.tsv' looks like a typo for 'bldg2bldg_duration.tsv';
# the path is left unchanged here because other code may read it - confirm before renaming.
for table, outPath in [(dur_df, 'data/processed/bldg2bldf_duration.tsv'),
                       (dist_df, 'data/processed/bldg2bldg_distance.tsv'),
                       (wouldWalk, 'data/processed/bldg2bldg_mode.tsv')]:
    table.to_csv(outPath, sep='\t', index=True)

# Add the distance, crow_distance, duration, and mode of transport to the collab table

Write functions to look up the relevant values

In [372]:
def getDistance(row):
    """Return the dist_df entry for this row's (src_BL_ID, dst_BL_ID) building pair."""
    return dist_df.loc[row['src_BL_ID'], row['dst_BL_ID']]


def getDuration(row):
    """Return the dur_df entry for this row's (src_BL_ID, dst_BL_ID) building pair."""
    return dur_df.loc[row['src_BL_ID'], row['dst_BL_ID']]


def getModeOfTransport(row):
    """Return 'walk' when the wouldWalk entry for the row's building pair is truthy, else 'drive'."""
    src, dst = row['src_BL_ID'], row['dst_BL_ID']
    return 'walk' if wouldWalk.loc[src, dst] else 'drive'

In [377]:
# Derive the three travel columns row by row (axis=1) from each collaboration's building pair
for column, lookup in [('distance', getDistance),
                       ('duration', getDuration),
                       ('transportMode', getModeOfTransport)]:
    collab_df[column] = collab_df.apply(lookup, axis=1)

In [378]:
collab_df.head()

Unnamed: 0,PUBLICATION_ID,PUBLICATION_TYPE,PUBLICATION_URI,PUBLISHED_DATE,TOTAL_AUTHORS,dst_DUID,dst_FIRST_NAME,dst_LAST_NAME,dst_MIDDLE_NAME,src_DUID,...,src_LON,src_LAT,dst_ADDRESS,dst_CITY,dst_ZIP,dst_LON,dst_LAT,distance,duration,transportMode
0,0,Journal Article,https://scholars.duke.edu/individual/pub1000033,5/1/2013 12:00:00 AM,2,272070,Shelby,Reed,Derene,99016,...,-78.935915,36.003337,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337,0.0,0.0,walk
1,1,Journal Article,https://scholars.duke.edu/individual/pub1000035,7/1/2013 12:00:00 AM,2,591120,John,Reynolds,Michael,119821,...,-78.935915,36.003337,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337,0.0,0.0,walk
2,2,Journal Article,https://scholars.duke.edu/individual/pub1000091,10/3/2012 12:00:00 AM,2,98973,John,Alexander,Hunter Peel,314783,...,-78.935915,36.003337,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337,0.0,0.0,walk
3,3,Journal Article,https://scholars.duke.edu/individual/pub1000099,11/1/2013 12:00:00 AM,2,449347,Endi,Wang,,435364,...,-78.93695,36.003063,40 Duke Medicine Cir,DURHAM,27705,-78.93695,36.003063,0.0,0.0,walk
4,4,Journal Article,https://scholars.duke.edu/individual/pub1000158,1/1/2013 12:00:00 AM,3,135843,Evan,Myers,Robert,373496,...,-78.940748,36.007164,40 Duke Medicine Cir,DURHAM,27705,-78.935105,36.003996,1110.0,883.0,walk


In [385]:
# Approximate metres per degree of latitude / longitude at 36 degrees latitude
lat2meters = 110958.98
lon2meters = 90163.66


def calculateCrowDistance(row):
    """Distance in metres, as the crow flies, between the two buildings in a collaboration.

    Reads src_LAT/src_LON and dst_LAT/dst_LON from the row, scales the absolute
    difference along each axis to metres with lat2meters / lon2meters, and
    returns the hypotenuse of the resulting right triangle.
    """
    northSouth_m = abs(row['dst_LAT'] - row['src_LAT']) * lat2meters
    eastWest_m = abs(row['dst_LON'] - row['src_LON']) * lon2meters
    return np.sqrt(northSouth_m**2 + eastWest_m**2)


In [386]:
# apply row-wise (axis=1) so each collaboration gets the straight-line distance between its two buildings
collab_df['crowDistance'] = collab_df.apply(calculateCrowDistance, axis=1)

In [387]:
collab_df.head()

Unnamed: 0,PUBLICATION_ID,PUBLICATION_TYPE,PUBLICATION_URI,PUBLISHED_DATE,TOTAL_AUTHORS,dst_DUID,dst_FIRST_NAME,dst_LAST_NAME,dst_MIDDLE_NAME,src_DUID,...,src_LAT,dst_ADDRESS,dst_CITY,dst_ZIP,dst_LON,dst_LAT,distance,duration,transportMode,crowDistance
0,0,Journal Article,https://scholars.duke.edu/individual/pub1000033,5/1/2013 12:00:00 AM,2,272070,Shelby,Reed,Derene,99016,...,36.003337,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337,0.0,0.0,walk,0.0
1,1,Journal Article,https://scholars.duke.edu/individual/pub1000035,7/1/2013 12:00:00 AM,2,591120,John,Reynolds,Michael,119821,...,36.003337,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337,0.0,0.0,walk,0.0
2,2,Journal Article,https://scholars.duke.edu/individual/pub1000091,10/3/2012 12:00:00 AM,2,98973,John,Alexander,Hunter Peel,314783,...,36.003337,40 Duke Medicine Cir,DURHAM,27705,-78.935915,36.003337,0.0,0.0,walk,0.0
3,3,Journal Article,https://scholars.duke.edu/individual/pub1000099,11/1/2013 12:00:00 AM,2,449347,Endi,Wang,,435364,...,36.003063,40 Duke Medicine Cir,DURHAM,27705,-78.93695,36.003063,0.0,0.0,walk,0.0
4,4,Journal Article,https://scholars.duke.edu/individual/pub1000158,1/1/2013 12:00:00 AM,3,135843,Evan,Myers,Robert,373496,...,36.007164,40 Duke Medicine Cir,DURHAM,27705,-78.935105,36.003996,1110.0,883.0,walk,618.435335


# Done with collaboration dataframe!
write the final output collaboration dataframe

In [None]:
# final write of the collaboration table: tab-separated, row index omitted
collab_df.to_csv('data/processed/collaborations.tsv', sep='\t', index=False)

---
# Add location info to table of building names
For each unique building id in the collaborations dataframe, find the "name" of that building based on its address

In [3]:
# load the tab-separated table of building names
bl_names_df = pd.read_table('data/processed/buildingNames.tsv', sep='\t')

In [11]:
bl_names_df.head()

Unnamed: 0,BL_ID,ADDRESS,NAME
0,7201,1304 Campus Dr,East Duke Building
1,7202,1364 Campus Dr,West Duke Building
2,7224,1316 Campus Dr,Friedl Building
3,7251,1 Brodie Gym Dr,Art Building
4,7261,114 S Buchanan Blvd,Smith Warehouse


In [10]:
# show the fields available in the blinfo table (loaded way at the top of this notebook)
blinfo.head()

Unnamed: 0,BL_ID,ADDRESS1,CITY_ID,STATE_ID,ZIP,LON,LAT
0,7101,400 Gattis St,DURHAM,NC,27701,-78.91848,36.000104
1,7503,40 Duke Medicine Cir,DURHAM,NC,27705,-78.937025,36.004101
2,7505,40 Duke Medicine Cir,DURHAM,NC,27705,-78.937028,36.004417
3,7506,40 Duke Medicine Cir,DURHAM,NC,27705,-78.935105,36.003996
4,7507,40 Duke Medicine Cir,DURHAM,NC,27705,-78.93695,36.003063


In [16]:
# use the getBuildingInfo function (defined up near the top of the notebook) to get lat/lon coords
# BL_ID is handed to it as a string, with the wanted field name as the extra positional argument
bl_id_strings = bl_names_df['BL_ID'].astype(str)
for coord in ('LON', 'LAT'):
    bl_names_df[coord] = bl_id_strings.apply(getBuildingInfo, args=(coord,))

In [17]:
bl_names_df.head()

Unnamed: 0,BL_ID,ADDRESS,NAME,LON,LAT
0,7201,1304 Campus Dr,East Duke Building,-78.914135,36.004908
1,7202,1364 Campus Dr,West Duke Building,-78.91539,36.004917
2,7224,1316 Campus Dr,Friedl Building,-78.914253,36.006383
3,7251,1 Brodie Gym Dr,Art Building,-78.917126,36.009341
4,7261,114 S Buchanan Blvd,Smith Warehouse,-78.914736,36.001974


In [18]:
# NOTE: this writes back to the same path bl_names_df was read from, so the file is overwritten
# with the version that includes the added LON and LAT columns
bl_names_df.to_csv('data/processed/buildingNames.tsv', sep='\t', index=False)