In [1]:
import json
import time
from urllib.parse import quote

import pandas as pd
import requests

# Labeling of users based on their provided name on GitHub


### The steps shown here identify UU employees based on their GitHub profile name and on the users retrieved via the UU profile pages. The UU employee search API is used to find candidate matches, and each match is then manually verified.

In [2]:
# Load the enriched user table and keep only the rows whose final_decision
# flag equals 1; the filtered frame is displayed as the cell output.
users = pd.read_excel("data/users_enriched.xlsx")
users_filtered = users[users["final_decision"].eq(1)]
users_filtered

Unnamed: 0,user_id,source,service,date,id,node_id,avatar_url,gravatar_id,url,html_url,...,followers,following,created_at,updated_at,is_student,is_employee,is_currently_employed,is_research_group,final_decision,note
3,73616e646572,profile_page_uu.csv,github.com,2022-07-13,6605233.0,MDQ6VXNlcjY2MDUyMzM=,https://avatars.githubusercontent.com/u/660523...,,https://api.github.com/users/73616e646572,https://github.com/73616e646572,...,4.0,22.0,2014-02-06T13:51:56Z,2022-06-24T17:36:55Z,False,1,1.0,0.0,1,"Collected via profile pages, therefore must be..."
4,a-quei,github_search_users.csv,github.com,2022-07-13,6829836.0,MDQ6VXNlcjY4Mjk4MzY=,https://avatars.githubusercontent.com/u/682983...,,https://api.github.com/users/a-quei,https://github.com/a-quei,...,12.0,17.0,2014-03-02T08:21:10Z,2022-07-12T16:38:19Z,False,0,0.0,0.0,1,0
7,ajueling,github_search_repos.csv,github.com,2022-07-13,20947797.0,MDQ6VXNlcjIwOTQ3Nzk3,https://avatars.githubusercontent.com/u/209477...,,https://api.github.com/users/AJueling,https://github.com/AJueling,...,25.0,31.0,2016-08-10T11:21:34Z,2022-01-08T17:08:22Z,False,1,1.0,0.0,1,0
10,aldertzomer,github_search_users.csv,github.com,2022-07-13,10392411.0,MDQ6VXNlcjEwMzkyNDEx,https://avatars.githubusercontent.com/u/103924...,,https://api.github.com/users/aldertzomer,https://github.com/aldertzomer,...,9.0,3.0,2015-01-04T19:49:33Z,2022-06-20T14:13:47Z,False,0,0.0,0.0,1,0
16,amices,pure.csv,github.com,2022-07-13,69854869.0,MDEyOk9yZ2FuaXphdGlvbjY5ODU0ODY5,https://avatars.githubusercontent.com/u/698548...,,https://api.github.com/users/amices,https://github.com/amices,...,0.0,0.0,2020-08-18T13:11:20Z,2022-03-17T17:14:44Z,False,0,0.0,1.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,wouterdroogers,github_search_users.csv,github.com,2022-07-13,38694532.0,MDQ6VXNlcjM4Njk0NTMy,https://avatars.githubusercontent.com/u/386945...,,https://api.github.com/users/WouterDroogers,https://github.com/WouterDroogers,...,0.0,0.0,2018-04-24T13:44:53Z,2021-01-07T15:10:19Z,False,1,1.0,0.0,1,0
556,xiedanghan,profile_page_uu.csv,github.com,2022-07-13,57119295.0,MDQ6VXNlcjU3MTE5Mjk1,https://avatars.githubusercontent.com/u/571192...,,https://api.github.com/users/xiedanghan,https://github.com/xiedanghan,...,0.0,0.0,2019-10-29T00:35:46Z,2022-06-07T12:52:39Z,False,1,,,1,"Collected via profile pages, therefore must be..."
562,yuntaoj,github_search_users.csv,github.com,2022-07-13,74980142.0,MDQ6VXNlcjc0OTgwMTQy,https://avatars.githubusercontent.com/u/749801...,,https://api.github.com/users/yuntaoj,https://github.com/yuntaoj,...,1.0,0.0,2020-11-24T16:38:30Z,2021-03-08T10:03:10Z,False,0,0.0,0.0,1,0
563,zievathustra,github_search_repos.csv,github.com,2022-07-13,61464979.0,MDQ6VXNlcjYxNDY0OTc5,https://avatars.githubusercontent.com/u/614649...,,https://api.github.com/users/zievathustra,https://github.com/zievathustra,...,2.0,0.0,2020-02-25T14:22:18Z,2022-07-08T09:40:10Z,False,1,1.0,0.0,1,0


### Load data from UU profile pages. This data has been manually cleaned to remove nonsense data. It is advised to take a quick look at the retrieved data from this method.

In [3]:
# Read the semicolon-separated profile-page export and expose its UU_id
# column under the name employee_url, which is the name the merge with the
# user table expects.
uu_profile_pages_without_orgs = (
    pd.read_csv("data/profile_page_uu_without_orgs.csv", sep=";")
    .rename(columns={"UU_id": "employee_url"})
)
uu_profile_pages_without_orgs

Unnamed: 0,service,date,user_id,employee_url
0,github.com,14/07/2022,kovvalsky,LAbzianidze
1,github.com,14/07/2022,lukavdplas,LPvanderPlas
2,github.com,14/07/2022,73616e646572,SPrins
3,github.com,14/07/2022,hugoquene,HQuene
4,github.com,14/07/2022,oktaal,SJJSpoel
...,...,...,...,...
78,github.com,14/07/2022,ShNadi,SNadiNajafabadi
79,github.com,14/07/2022,jelletreep,HJTreep
80,github.com,14/07/2022,Southparkfan,FTufan
81,github.com,14/07/2022,MartineDeVos,MGdeVos


### Merge both datasets

In [4]:
# Attach the already-known employee_url (from the UU profile pages) to every
# selected user.
#
# The join is done on a lower-cased copy of user_id: the user table appears
# to store lower-cased GitHub logins (e.g. "ajueling" for AJueling), while
# the profile-page export keeps the original casing (e.g. "ShNadi"), so a
# case-sensitive join would silently drop those known URLs.
profile_urls = uu_profile_pages_without_orgs[["user_id", "employee_url"]].assign(
    user_id_lower=lambda frame: frame["user_id"].astype(str).str.lower()
)
# One row per login: a duplicated key would multiply rows in the left merge
# and break the one-row-per-user assumption of the later labeling steps.
profile_urls = profile_urls.drop(columns="user_id").drop_duplicates(subset="user_id_lower")

users_merged = (
    users_filtered
    .assign(user_id_lower=users_filtered["user_id"].astype(str).str.lower())
    .merge(profile_urls, how="left", on="user_id_lower")
    .drop(columns="user_id_lower")
)
users_merged

Unnamed: 0,user_id,source,service,date,id,node_id,avatar_url,gravatar_id,url,html_url,...,following,created_at,updated_at,is_student,is_employee,is_currently_employed,is_research_group,final_decision,note,employee_url
0,73616e646572,profile_page_uu.csv,github.com,2022-07-13,6605233.0,MDQ6VXNlcjY2MDUyMzM=,https://avatars.githubusercontent.com/u/660523...,,https://api.github.com/users/73616e646572,https://github.com/73616e646572,...,22.0,2014-02-06T13:51:56Z,2022-06-24T17:36:55Z,False,1,1.0,0.0,1,"Collected via profile pages, therefore must be...",SPrins
1,a-quei,github_search_users.csv,github.com,2022-07-13,6829836.0,MDQ6VXNlcjY4Mjk4MzY=,https://avatars.githubusercontent.com/u/682983...,,https://api.github.com/users/a-quei,https://github.com/a-quei,...,17.0,2014-03-02T08:21:10Z,2022-07-12T16:38:19Z,False,0,0.0,0.0,1,0,
2,ajueling,github_search_repos.csv,github.com,2022-07-13,20947797.0,MDQ6VXNlcjIwOTQ3Nzk3,https://avatars.githubusercontent.com/u/209477...,,https://api.github.com/users/AJueling,https://github.com/AJueling,...,31.0,2016-08-10T11:21:34Z,2022-01-08T17:08:22Z,False,1,1.0,0.0,1,0,
3,aldertzomer,github_search_users.csv,github.com,2022-07-13,10392411.0,MDQ6VXNlcjEwMzkyNDEx,https://avatars.githubusercontent.com/u/103924...,,https://api.github.com/users/aldertzomer,https://github.com/aldertzomer,...,3.0,2015-01-04T19:49:33Z,2022-06-20T14:13:47Z,False,0,0.0,0.0,1,0,
4,amices,pure.csv,github.com,2022-07-13,69854869.0,MDEyOk9yZ2FuaXphdGlvbjY5ODU0ODY5,https://avatars.githubusercontent.com/u/698548...,,https://api.github.com/users/amices,https://github.com/amices,...,0.0,2020-08-18T13:11:20Z,2022-03-17T17:14:44Z,False,0,0.0,1.0,1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,wouterdroogers,github_search_users.csv,github.com,2022-07-13,38694532.0,MDQ6VXNlcjM4Njk0NTMy,https://avatars.githubusercontent.com/u/386945...,,https://api.github.com/users/WouterDroogers,https://github.com/WouterDroogers,...,0.0,2018-04-24T13:44:53Z,2021-01-07T15:10:19Z,False,1,1.0,0.0,1,0,
197,xiedanghan,profile_page_uu.csv,github.com,2022-07-13,57119295.0,MDQ6VXNlcjU3MTE5Mjk1,https://avatars.githubusercontent.com/u/571192...,,https://api.github.com/users/xiedanghan,https://github.com/xiedanghan,...,0.0,2019-10-29T00:35:46Z,2022-06-07T12:52:39Z,False,1,,,1,"Collected via profile pages, therefore must be...",DXie
198,yuntaoj,github_search_users.csv,github.com,2022-07-13,74980142.0,MDQ6VXNlcjc0OTgwMTQy,https://avatars.githubusercontent.com/u/749801...,,https://api.github.com/users/yuntaoj,https://github.com/yuntaoj,...,0.0,2020-11-24T16:38:30Z,2021-03-08T10:03:10Z,False,0,0.0,0.0,1,0,
199,zievathustra,github_search_repos.csv,github.com,2022-07-13,61464979.0,MDQ6VXNlcjYxNDY0OTc5,https://avatars.githubusercontent.com/u/614649...,,https://api.github.com/users/zievathustra,https://github.com/zievathustra,...,0.0,2020-02-25T14:22:18Z,2022-07-08T09:40:10Z,False,1,1.0,0.0,1,0,


# Faculties
* Faculty of Geosciences
* Faculty of Humanities
* Faculty of Law, Economics and Governance
* Faculty of Medicine
* Faculty of Science
* Faculty of Social and Behavioural Sciences
* Faculty of Veterinary Medicine

In [5]:
# Example name used to try out search_employee_url at the end of this cell.
name = "Anna-Lena Lamprecht"
def search_employee_url(name, timeout=30):
    """Search the UU employee directory by name and return the first hit's slug.

    Parameters
    ----------
    name : str
        Full name as shown on the GitHub profile. The string ``"nan"``
        (what ``str()`` makes of a missing value) and names without a space
        are not searched.
    timeout : float, optional
        Seconds to wait for the UU API before ``requests`` raises a timeout
        error instead of blocking indefinitely.

    Returns
    -------
    str or None
        The ``Url`` field of the first entry in the response's ``Employees``
        list, or ``None`` when the name is not searchable or the response
        contains no usable hit. Only the first hit is considered, so the
        result still has to be verified manually.
    """
    # Non-string input (None, float NaN) is treated like a missing name
    # instead of failing on the substring test below.
    if not isinstance(name, str) or name == "nan" or " " not in name:
        return None
    # Build the JSON search expression and percent-encode it as a whole, so
    # characters such as '&', '"' or '#' inside a name cannot break the query
    # string. For plain names this yields the same URL as replacing spaces
    # with %20 by hand.
    expression = json.dumps({"q": name, "mode": "naam"}, separators=(",", ":"), ensure_ascii=False)
    url = (
        "https://www.uu.nl/medewerkers/RestApi/Public/search"
        f"?expression={quote(expression, safe='')}&t=null&l=EN"
    )
    response = requests.get(url, timeout=timeout)
    try:
        return response.json()["Employees"][0]["Url"]
    except (ValueError, KeyError, IndexError, TypeError):
        # Body is not JSON, or it holds no employee entry: no match.
        return None
    
# Try the search with the example name; the cell displays the returned value.
search_employee_url(name)

'ALLamprecht'

In [6]:
# Example employee page slug used to try out get_employee_info at the end of this cell.
employee_url = "ALLamprecht"
def get_employee_info(employee_url, timeout=30):
    """Fetch name, faculty and position hierarchy for one UU employee page.

    Parameters
    ----------
    employee_url : str or None
        Employee page slug (e.g. ``"ALLamprecht"``), sent as the ``page``
        query parameter of the ``getEmployeeData`` endpoint.
    timeout : float, optional
        Seconds to wait for the UU API before ``requests`` raises a timeout
        error instead of blocking indefinitely.

    Returns
    -------
    list
        ``[name_short, faculty, positions]``. ``positions`` is
        ``[Level1, Level2, Level3, Level4, Position]`` taken from the first
        entry of ``Positions``. Returns ``[None, None, None]`` when
        ``employee_url`` is ``None`` or the response carries no ``Employee``
        record; ``faculty`` / ``positions`` are ``None`` when the respective
        list in the record is empty or absent.
    """
    if employee_url is None:
        return [None] * 3
    response = requests.get(
        "https://www.uu.nl/medewerkers/RestApi/Public/getEmployeeData",
        params={"page": employee_url, "l": "EN"},
        timeout=timeout,
    )
    employee = response.json().get("Employee")
    if not employee:
        # Same "nothing known" shape as for a missing slug, so batch loops
        # over many slugs are not aborted by a single unknown page.
        return [None] * 3

    # Faculties is an array, but the retrieved data always had only one entry,
    # so any extra faculties are discarded.
    faculties = employee.get("Faculties") or []
    fac = faculties[0]["Faculty"] if faculties else None

    # Only the first listed position is used.
    listed_positions = employee.get("Positions") or []
    if listed_positions:
        first_position = listed_positions[0]
        positions = [
            first_position[key]
            for key in ("Level1", "Level2", "Level3", "Level4", "Position")
        ]
    else:
        positions = None

    return [employee.get("NameShort"), fac, positions]

# Try the lookup with the example slug; the cell displays the returned list.
get_employee_info(employee_url)

['A.L. (Anna-Lena) Lamprecht',
 'Science',
 ['Science',
  'Information and computing sciences',
  'Intelligent Software Systems',
  'Software Technology',
  'Assistant Professor']]

In [7]:
def get_flattened_info(employee_urls, emplyee_infos):
    """Flatten per-employee info into rows of seven values.

    Parameters
    ----------
    employee_urls : iterable
        Employee page slugs (or ``None``), one per user.
    emplyee_infos : iterable
        Matching ``[name, faculty, positions]`` triples as returned by
        ``get_employee_info``; ``positions`` is ``None`` or a list of
        ``[level1, level2, level3, level4, position]``. The parameter keeps
        its historical (misspelled) name so existing callers keep working.

    Returns
    -------
    list of list
        One ``[employee_url, faculty, level1, level2, level3, level4,
        position]`` row per input pair. Empty strings in ``positions`` are
        converted to ``None``; the name is dropped.
    """
    flat_list = []
    # Iterate over the argument itself. The previous body read a global named
    # `employee_infos`, so the value passed by the caller was ignored.
    for employee_url, (_, faculty, positions) in zip(employee_urls, emplyee_infos):
        if positions is None:
            levels = [None] * 5
        else:
            levels = [None if position == '' else position for position in positions]
            # Always emit exactly five position columns.
            levels = (levels + [None] * 5)[:5]
        flat_list.append([employee_url, faculty, *levels])
    return flat_list

## Retrieve employee_urls based on names. If an employee_url is already known from the profile pages, it is kept and no name search is performed.

In [8]:
# Collect one employee_url per user: reuse the URL that came with the
# profile-page data when present, otherwise search the UU directory by the
# user's GitHub display name. Progress is printed per user.
employee_urls = []
for user, employee_url in zip(users_merged["name"], users_merged["employee_url"]):
    if not pd.isna(employee_url):
        # Known from the profile pages: keep it as is.
        print(employee_url)
        employee_urls.append(employee_url)
        continue
    result = search_employee_url(str(user))
    employee_urls.append(result)
    print(user, result)
    # NOTE(review): a `time.sleep(1)` used to be sketched here (disabled);
    # presumably a pause between API calls - confirm whether one is needed.
print(employee_urls)

SPrins
Guanyi Chen GChen
André Jüling AJuling2
Aldert Zomer ALZomer
MICE None
Anastasia Giachanou AGiachanou
Andreas Tataris ATataris
ALLamprecht
nan None
AMWegmann
Arno Bastenhof None
ARomanowski
ASReview None
basm92 None
nan None
nan None
ASobhani
BJCornejoCostas
JdeBoer
SMelnikov
Bovi-Analytics None
Bram Vaes BVaes
CZhu
CJvanLissa
CLARIAH None
Computational Geography None
D-score None
Daan Reijnders BJHRReijnders
Daniel Oberski DLOberski
DAsscheman
DICOCortesOrtuno
David Gernaat None
Derek Karssenberg DJKarssenberg
DH-IT Portal Development None
Dieter Stoker DGGStoker
Dorien Huijser DCHuijser
DSIslakoglu
Daniel Zamrsky DZamrsky
Dr ir Egil A.J. Fischer None
YKatrukha
nan None
EAarts
nan None
Fouad Sfarijlani None
Francisca Pessanha MFPessanhadeMenesesRibeirodosReis
FriedrichFoerster None
Geostatistical Algorithms & Image Analysis None
Jorge Vega Briones JEVegaBriones
Gabriel Beckers GJLBeckers
Geertje None
nan None
Gerbrand Sterrenburg ACJGSterrenburg
GVink
nan None
nan None
nan None

In [9]:
# Display the collected employee_urls (None marks users without a match).
employee_urls

['SPrins',
 'GChen',
 'AJuling2',
 'ALZomer',
 None,
 'AGiachanou',
 'ATataris',
 'ALLamprecht',
 None,
 'AMWegmann',
 None,
 'ARomanowski',
 None,
 None,
 None,
 None,
 'ASobhani',
 'BJCornejoCostas',
 'JdeBoer',
 'SMelnikov',
 None,
 'BVaes',
 'CZhu',
 'CJvanLissa',
 None,
 None,
 None,
 'BJHRReijnders',
 'DLOberski',
 'DAsscheman',
 'DICOCortesOrtuno',
 None,
 'DJKarssenberg',
 None,
 'DGGStoker',
 'DCHuijser',
 'DSIslakoglu',
 'DZamrsky',
 None,
 'YKatrukha',
 None,
 'EAarts',
 None,
 None,
 'MFPessanhadeMenesesRibeirodosReis',
 None,
 None,
 'JEVegaBriones',
 'GJLBeckers',
 None,
 None,
 'ACJGSterrenburg',
 'GVink',
 None,
 None,
 None,
 None,
 None,
 'GTSpeed',
 'GJMVelders',
 None,
 'HAmiri',
 'HIOberman',
 None,
 None,
 'HNunner',
 'HQuene',
 None,
 'IVKryven',
 'IBanalesBelaunde',
 'IMvanderWulp2',
 None,
 None,
 None,
 'JdeBruin1',
 None,
 None,
 'IJKocken',
 'JBikker',
 None,
 'HJTreep',
 None,
 'JDMulder',
 'JHNienhuis',
 None,
 None,
 'JMCorreiaTeixeira',
 'JdeVries5',
 'J

## Retrieve employee info based on employee_url

In [10]:
# Look up [name, faculty, positions] for every collected employee_url, in the
# same order as employee_urls, printing each result as it arrives.
employee_infos = []
for employee_url in employee_urls:
    result = get_employee_info(employee_url)
    employee_infos.append(result)
    print(result)

['Sander Prins', 'Humanities', ['Humanities', 'Digital Humanities IT', None, None, 'Technical Support Assistant for Teaching and Research - O profile']]
['G. (Guanyi) Chen', 'Science', ['Science', 'Information and computing sciences', 'Intelligent Software Systems', 'Natural Language Processing', 'Lecturer']]
['A. (Andre) Jüling', 'UU', ['UU', '', '', '', None]]
['A.L. (Aldert) Zomer', 'Veterinary Medicine', ['Veterinary Medicine', 'Department Biomolecular Health Sciences', 'Infectious Diseases & Immunology', ' Clinical Infectiology', 'Assistant Professor']]
[None, None, None]
['Anastasia Giachanou', 'Social and Behavioural Sciences', ['Social and Behavioural Sciences', '', None, None, 'Researcher']]
['Andreas Tataris', 'Geosciences', ['Geosciences', 'Earth Sciences', 'Seismology', '', 'PhD Candidate']]
['A.L. (Anna-Lena) Lamprecht', 'Science', ['Science', 'Information and computing sciences', 'Intelligent Software Systems', 'Software Technology', 'Assistant Professor']]
[None, None, N

['Joost Gadellaa', 'University Corporate Offices', ['University Corporate Offices', 'Information and Technology Services', 'Support and Monitoring Services', '', 'ICT Support Assistant']]
['Joris Eggenhuisen', 'Geosciences', ['Geosciences', 'Earth Sciences', 'Sedimentology', '', 'Associate Professor']]
[None, None, None]
['Jolanda Theeuwen', 'Geosciences', ['Geosciences', 'Copernicus Institute of Sustainable Development', 'Environmental Sciences', None, 'PhD Candidate']]
['J. (Jurriaan) Hage', 'Science', ['Science', 'Information and computing sciences', 'Intelligent Software Systems', 'Software Technology', 'Professor']]
['Jan-Willem Simons', 'Social and Behavioural Sciences', ['Social and Behavioural Sciences', 'Social Sciences', 'Sociology', None, 'PhD Candidate']]
['K. (Kaisa) Kajala', 'Science', ['Science', 'Biology', 'Environmental Biology', 'Plant-Environment Signaling', 'Assistant Professor']]
[None, None, None]
['K. (Keven) Quach', 'Science', ['Science', '', None, None, 'Studen

['Yongchao Ma', 'Social and Behavioural Sciences', ['Social and Behavioural Sciences', '', None, None, 'Research and Education Assistant']]
[None, None, None]
['Thom Volker', 'Social and Behavioural Sciences', ['Social and Behavioural Sciences', 'Social Sciences', 'Methodology and Statistics', '', 'PhD Candidate']]
[None, None, None]
[None, None, None]
[None, None, None]
[None, None, None]
['Haili Hu', 'University Corporate Offices', ['University Corporate Offices', 'Information and Technology Services', 'Research and Data Management Services', '', 'ICT Developer']]
[None, None, None]
[None, None, None]
[None, None, None]
[None, None, None]
[None, None, None]
[None, None, None]
[None, None, None]
[None, None, None]
['Erik-Jan van Kesteren', 'Social and Behavioural Sciences', ['Social and Behavioural Sciences', '', None, None, 'Assistant Professor']]
['Vera Bouwman', 'Social and Behavioural Sciences', ['Social and Behavioural Sciences', 'Psychology', 'Clinical Psychology', None, 'PhD Ca

In [11]:
# Per the original author's check (printing employee_info[1] for every entry
# of employee_infos), no person had more than one faculty, even though the
# REST API allows several.

# Flatten the nested info into one 7-value row per user and display the rows.
flat_list = get_flattened_info(employee_urls, employee_infos)
flat_list

[['SPrins',
  'Humanities',
  'Humanities',
  'Digital Humanities IT',
  None,
  None,
  'Technical Support Assistant for Teaching and Research - O profile'],
 ['GChen',
  'Science',
  'Science',
  'Information and computing sciences',
  'Intelligent Software Systems',
  'Natural Language Processing',
  'Lecturer'],
 ['AJuling2', 'UU', 'UU', None, None, None, None],
 ['ALZomer',
  'Veterinary Medicine',
  'Veterinary Medicine',
  'Department Biomolecular Health Sciences',
  'Infectious Diseases & Immunology',
  ' Clinical Infectiology',
  'Assistant Professor'],
 [None, None, None, None, None, None, None],
 ['AGiachanou',
  'Social and Behavioural Sciences',
  'Social and Behavioural Sciences',
  None,
  None,
  None,
  'Researcher'],
 ['ATataris',
  'Geosciences',
  'Geosciences',
  'Earth Sciences',
  'Seismology',
  None,
  'PhD Candidate'],
 ['ALLamprecht',
  'Science',
  'Science',
  'Information and computing sciences',
  'Intelligent Software Systems',
  'Software Technology',
 

### After this last step for automated labeling, an additional manual check will be done. If a user can't be found via the UU API, that person will be removed since we can't relate the person to a faculty. Organizational accounts are manually labeled based on available information.

In [12]:
# Column layout of the rows in flat_list.
columns = ["employee_url", "faculty", "level1", "level2", "level3", "level4", "position"]
pd_info = pd.DataFrame(flat_list, columns=columns)

# Rebind instead of reset_index(inplace=True): users_filtered is a filtered
# selection of `users`, and mutating it in place makes the cell harder to
# reason about on re-runs. The resulting frame is the same.
users_filtered = users_filtered.reset_index(drop=True)

# The merge below pairs rows purely by position (0..n-1 on both sides), so a
# length mismatch would silently drop users or attach labels to the wrong
# user. Fail loudly instead.
if len(pd_info) != len(users_filtered):
    raise ValueError(
        f"Row count mismatch: {len(users_filtered)} users vs {len(pd_info)} label rows"
    )

pd_merged = pd.merge(users_filtered, pd_info, left_index=True, right_index=True)
pd_merged.to_excel("data/users_labeled.xlsx", index=False)
pd_merged

Unnamed: 0,user_id,source,service,date,id,node_id,avatar_url,gravatar_id,url,html_url,...,is_research_group,final_decision,note,employee_url,faculty,level1,level2,level3,level4,position
0,73616e646572,profile_page_uu.csv,github.com,2022-07-13,6605233.0,MDQ6VXNlcjY2MDUyMzM=,https://avatars.githubusercontent.com/u/660523...,,https://api.github.com/users/73616e646572,https://github.com/73616e646572,...,0.0,1,"Collected via profile pages, therefore must be...",SPrins,Humanities,Humanities,Digital Humanities IT,,,Technical Support Assistant for Teaching and R...
1,a-quei,github_search_users.csv,github.com,2022-07-13,6829836.0,MDQ6VXNlcjY4Mjk4MzY=,https://avatars.githubusercontent.com/u/682983...,,https://api.github.com/users/a-quei,https://github.com/a-quei,...,0.0,1,0,GChen,Science,Science,Information and computing sciences,Intelligent Software Systems,Natural Language Processing,Lecturer
2,ajueling,github_search_repos.csv,github.com,2022-07-13,20947797.0,MDQ6VXNlcjIwOTQ3Nzk3,https://avatars.githubusercontent.com/u/209477...,,https://api.github.com/users/AJueling,https://github.com/AJueling,...,0.0,1,0,AJuling2,UU,UU,,,,
3,aldertzomer,github_search_users.csv,github.com,2022-07-13,10392411.0,MDQ6VXNlcjEwMzkyNDEx,https://avatars.githubusercontent.com/u/103924...,,https://api.github.com/users/aldertzomer,https://github.com/aldertzomer,...,0.0,1,0,ALZomer,Veterinary Medicine,Veterinary Medicine,Department Biomolecular Health Sciences,Infectious Diseases & Immunology,Clinical Infectiology,Assistant Professor
4,amices,pure.csv,github.com,2022-07-13,69854869.0,MDEyOk9yZ2FuaXphdGlvbjY5ODU0ODY5,https://avatars.githubusercontent.com/u/698548...,,https://api.github.com/users/amices,https://github.com/amices,...,1.0,1,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,wouterdroogers,github_search_users.csv,github.com,2022-07-13,38694532.0,MDQ6VXNlcjM4Njk0NTMy,https://avatars.githubusercontent.com/u/386945...,,https://api.github.com/users/WouterDroogers,https://github.com/WouterDroogers,...,0.0,1,0,WJDroogers,Science,Science,Biology,Biodynamics and Biocomplexity,Cell Biology,PhD Candidate
197,xiedanghan,profile_page_uu.csv,github.com,2022-07-13,57119295.0,MDQ6VXNlcjU3MTE5Mjk1,https://avatars.githubusercontent.com/u/571192...,,https://api.github.com/users/xiedanghan,https://github.com/xiedanghan,...,,1,"Collected via profile pages, therefore must be...",DXie,Geosciences,Geosciences,Physical Geography,"Coastal Dynamics, Fluvial Systems and Global C...",Biogeomorphology of Rivers and Estuaries,Researcher
198,yuntaoj,github_search_users.csv,github.com,2022-07-13,74980142.0,MDQ6VXNlcjc0OTgwMTQy,https://avatars.githubusercontent.com/u/749801...,,https://api.github.com/users/yuntaoj,https://github.com/yuntaoj,...,0.0,1,0,,,,,,,
199,zievathustra,github_search_repos.csv,github.com,2022-07-13,61464979.0,MDQ6VXNlcjYxNDY0OTc5,https://avatars.githubusercontent.com/u/614649...,,https://api.github.com/users/zievathustra,https://github.com/zievathustra,...,0.0,1,0,JASieverink,University Corporate Offices,University Corporate Offices,Information and Technology Services,Research and Data Management Services,,IT Engineer


### The following code is used for manually retrieving user information. You can either look up the employee_url by employee name (first cell) or assign a known employee_url directly to the `employee` variable (second cell). The third cell prints the information as a single string separated by `|`, which is pasted into the Excel file and then split into columns.

### The output from the third cell is manually inserted into the csv/excel file. Use this to label all users that could not be automatically labeled.

In [13]:
# Manual lookup, option 1: search the UU directory by full name and show the
# employee_url that was found (None when there is no match).
employee_name = "Keven Quach"
employee = search_employee_url(employee_name)
employee

'KQuach'

In [14]:
# Manual lookup, option 2: set the employee_url by hand. Running this cell
# overwrites the value stored in `employee` by the name search.
employee = "Kquach"

In [15]:
# Build the pipe-separated line "name|faculty|level1|level2|level3|level4|position"
# for manual insertion into the spreadsheet.
employee_info = get_employee_info(employee)
# get_employee_info returns [None, None, None] when `employee` is None (e.g.
# the name search found nothing). Fall back to five empty position slots so
# the line keeps its seven fields instead of failing on a None positions list.
positions = employee_info[2] or [None] * 5
newlist = [employee_info[0], employee_info[1], *positions]
newlist = ['' if value is None else value for value in newlist]
"|".join(newlist)

'K. (Keven) Quach|Science|Science||||Student Assistant'

# Label repositories with faculty information

### This step assumes that the input file has all users fully labeled. Then, the retrieved repositories from phase 2 of SWORDS receive the faculty label of the user through a merge.

In [37]:
# Load the labeled users and the filtered repositories. Note that `users` is
# rebound here: it now holds users_labeled.xlsx rather than the enriched user
# table loaded at the top of the notebook.
users = pd.read_excel("data/users_labeled.xlsx")
repos = pd.read_excel("data/repositories_filtered.xlsx")
repos

  warn(msg)


Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,visibility,forks,open_issues,watchers,default_branch,permissions,date,html_url_clickable,repo_type,Note
0,393154427,MDEwOlJlcG9zaXRvcnkzOTMxNTQ0Mjc=,bert-chinese-classifier,a-quei/bert-chinese-classifier,False,a-quei,https://github.com/a-quei/bert-chinese-classifier,,False,https://api.github.com/repos/a-quei/bert-chine...,...,public,0,0,2,main,,2022-07-18,https://github.com/a-quei/bert-chinese-classifier,Empty,
1,114394344,MDEwOlJlcG9zaXRvcnkxMTQzOTQzNDQ=,guanyi.cls,a-quei/guanyi.cls,False,a-quei,https://github.com/a-quei/guanyi.cls,A costumed article class for latex,False,https://api.github.com/repos/a-quei/guanyi.cls,...,public,0,0,0,master,,2022-07-18,https://github.com/a-quei/guanyi.cls,Non-RS,
2,306983465,MDEwOlJlcG9zaXRvcnkzMDY5ODM0NjU=,mtuna-annotated,a-quei/mtuna-annotated,False,a-quei,https://github.com/a-quei/mtuna-annotated,The annotated MTUNA corpus,False,https://api.github.com/repos/a-quei/mtuna-anno...,...,public,0,0,0,main,,2022-07-18,https://github.com/a-quei/mtuna-annotated,Empty,
3,468485794,R_kgDOG-yGog,neuralreg-re-evaluation,a-quei/neuralreg-re-evaluation,False,a-quei,https://github.com/a-quei/neuralreg-re-evaluation,,False,https://api.github.com/repos/a-quei/neuralreg-...,...,public,0,0,0,main,,2022-07-18,https://github.com/a-quei/neuralreg-re-evaluation,Empty,
4,393154581,MDEwOlJlcG9zaXRvcnkzOTMxNTQ1ODE=,probe-neuralreg,a-quei/probe-neuralreg,False,a-quei,https://github.com/a-quei/probe-neuralreg,,False,https://api.github.com/repos/a-quei/probe-neur...,...,public,0,0,2,main,,2022-07-18,https://github.com/a-quei/probe-neuralreg,Empty,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1516,69224362,MDEwOlJlcG9zaXRvcnk2OTIyNDM2Mg==,onderzoeksdag,wouter-swierstra/onderzoeksdag,False,wouter-swierstra,https://github.com/wouter-swierstra/onderzoeksdag,,False,https://api.github.com/repos/wouter-swierstra/...,...,public,0,0,0,master,,2022-07-18,https://github.com/wouter-swierstra/onderzoeksdag,,
1517,105548131,MDEwOlJlcG9zaXRvcnkxMDU1NDgxMzE=,predicate-transformers,wouter-swierstra/predicate-transformers,False,wouter-swierstra,https://github.com/wouter-swierstra/predicate-...,,False,https://api.github.com/repos/wouter-swierstra/...,...,public,0,0,7,master,,2022-07-18,https://github.com/wouter-swierstra/predicate-...,,
1518,24751970,MDEwOlJlcG9zaXRvcnkyNDc1MTk3MA==,SoftwareProject,wouter-swierstra/SoftwareProject,False,wouter-swierstra,https://github.com/wouter-swierstra/SoftwarePr...,,False,https://api.github.com/repos/wouter-swierstra/...,...,public,1,0,0,master,,2022-07-18,https://github.com/wouter-swierstra/SoftwarePr...,,
1519,4482341,MDEwOlJlcG9zaXRvcnk0NDgyMzQx,xmonad,wouter-swierstra/xmonad,False,wouter-swierstra,https://github.com/wouter-swierstra/xmonad,xmonad in Coq,False,https://api.github.com/repos/wouter-swierstra/...,...,public,8,0,42,master,,2022-07-18,https://github.com/wouter-swierstra/xmonad,,


In [39]:
# Give every repository the faculty of its owner. Owners are matched
# case-insensitively through a lower-cased helper column on both tables.
# rename() leaves the frame's contents untouched when there is no user_id
# column, so no explicit column check is needed.
users = users.rename(columns={"user_id": "owner"})
users = users.assign(owner_lower=users["owner"].str.lower())
repos = repos.assign(owner_lower=repos["owner"].str.lower())

repos_with_faculty = (
    repos
    .merge(users[["owner_lower", "faculty"]], on="owner_lower", how="left")
    .drop(columns="owner_lower")
)
repos_with_faculty.to_excel("data/repositories_labeled_faculty.xlsx", index=False)
repos_with_faculty

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,forks,open_issues,watchers,default_branch,permissions,date,html_url_clickable,repo_type,Note,faculty
0,393154427,MDEwOlJlcG9zaXRvcnkzOTMxNTQ0Mjc=,bert-chinese-classifier,a-quei/bert-chinese-classifier,False,a-quei,https://github.com/a-quei/bert-chinese-classifier,,False,https://api.github.com/repos/a-quei/bert-chine...,...,0,0,2,main,,2022-07-18,https://github.com/a-quei/bert-chinese-classifier,Empty,,Science
1,114394344,MDEwOlJlcG9zaXRvcnkxMTQzOTQzNDQ=,guanyi.cls,a-quei/guanyi.cls,False,a-quei,https://github.com/a-quei/guanyi.cls,A costumed article class for latex,False,https://api.github.com/repos/a-quei/guanyi.cls,...,0,0,0,master,,2022-07-18,https://github.com/a-quei/guanyi.cls,Non-RS,,Science
2,306983465,MDEwOlJlcG9zaXRvcnkzMDY5ODM0NjU=,mtuna-annotated,a-quei/mtuna-annotated,False,a-quei,https://github.com/a-quei/mtuna-annotated,The annotated MTUNA corpus,False,https://api.github.com/repos/a-quei/mtuna-anno...,...,0,0,0,main,,2022-07-18,https://github.com/a-quei/mtuna-annotated,Empty,,Science
3,468485794,R_kgDOG-yGog,neuralreg-re-evaluation,a-quei/neuralreg-re-evaluation,False,a-quei,https://github.com/a-quei/neuralreg-re-evaluation,,False,https://api.github.com/repos/a-quei/neuralreg-...,...,0,0,0,main,,2022-07-18,https://github.com/a-quei/neuralreg-re-evaluation,Empty,,Science
4,393154581,MDEwOlJlcG9zaXRvcnkzOTMxNTQ1ODE=,probe-neuralreg,a-quei/probe-neuralreg,False,a-quei,https://github.com/a-quei/probe-neuralreg,,False,https://api.github.com/repos/a-quei/probe-neur...,...,0,0,2,main,,2022-07-18,https://github.com/a-quei/probe-neuralreg,Empty,,Science
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1516,69224362,MDEwOlJlcG9zaXRvcnk2OTIyNDM2Mg==,onderzoeksdag,wouter-swierstra/onderzoeksdag,False,wouter-swierstra,https://github.com/wouter-swierstra/onderzoeksdag,,False,https://api.github.com/repos/wouter-swierstra/...,...,0,0,0,master,,2022-07-18,https://github.com/wouter-swierstra/onderzoeksdag,,,Science
1517,105548131,MDEwOlJlcG9zaXRvcnkxMDU1NDgxMzE=,predicate-transformers,wouter-swierstra/predicate-transformers,False,wouter-swierstra,https://github.com/wouter-swierstra/predicate-...,,False,https://api.github.com/repos/wouter-swierstra/...,...,0,0,7,master,,2022-07-18,https://github.com/wouter-swierstra/predicate-...,,,Science
1518,24751970,MDEwOlJlcG9zaXRvcnkyNDc1MTk3MA==,SoftwareProject,wouter-swierstra/SoftwareProject,False,wouter-swierstra,https://github.com/wouter-swierstra/SoftwarePr...,,False,https://api.github.com/repos/wouter-swierstra/...,...,1,0,0,master,,2022-07-18,https://github.com/wouter-swierstra/SoftwarePr...,,,Science
1519,4482341,MDEwOlJlcG9zaXRvcnk0NDgyMzQx,xmonad,wouter-swierstra/xmonad,False,wouter-swierstra,https://github.com/wouter-swierstra/xmonad,xmonad in Coq,False,https://api.github.com/repos/wouter-swierstra/...,...,8,0,42,master,,2022-07-18,https://github.com/wouter-swierstra/xmonad,,,Science
