In [1]:
import pandas as pd
import os
import numpy as np
import yaml
from ast import literal_eval
from pandarallel import pandarallel

In [2]:
# Start pandarallel so the `parallel_apply` calls used further down are available;
# progress_bar=True makes each parallel_apply render a progress widget.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [4]:
# Load the committer records collected from pull requests and from pushes,
# stack them, drop the two columns that are not used in this notebook, and
# remove exact duplicate rows.
committer_sources = [
    "data/merged_data/committers_info_pr.parquet",
    "data/merged_data/committers_info_push.parquet",
]
indivs_commits = pd.concat([pd.read_parquet(path) for path in committer_sources])
indivs_commits = (
    indivs_commits
    .drop(columns=['commit_repo', 'committer_info'])
    .reset_index(drop=True)
    .drop_duplicates()
)
# Exclude people who don't have "@" in their email - can't properly trace their
# status (and motivations). The isinstance check also rejects missing values and
# any non-string entries instead of raising on them.
has_at_sign = indivs_commits['email'].apply(lambda x: isinstance(x, str) and "@" in x)
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments in the following cells do not trigger SettingWithCopyWarning.
indivs_commits = indivs_commits[has_at_sign].copy()

In [5]:
# The domain is whatever follows the last "@" of the address.
indivs_commits['domain'] = indivs_commits['email'].map(lambda address: address.rsplit("@", 1)[-1])

In [6]:
# company data source 1
with open('data/inputs/company_domain_match_list.yaml', 'r') as f:
    # NOTE(review): yaml.safe_load is the recommended loader when a file only
    # holds plain mappings/lists - confirm the file contents before switching.
    company_info = yaml.load(f, Loader=yaml.FullLoader)
df_company = pd.DataFrame(company_info)
# One row per domain, holding the list of companies that declare that domain.
df_domains = (
    df_company[['company', 'domains']]
    .explode('domains')
    .drop_duplicates()
    .groupby('domains')
    .agg({'company': list})
)


def _company_label(address, names_by_domain=df_domains['company'].to_dict()):
    """Return the "|"-joined companies whose domain `address` ends with.

    A domain counts as a match when it is the whole address or when the
    character directly in front of it is neither a digit nor a letter
    (e.g. "@" or "."), so "x@notibm.com" does not match "ibm.com".
    For each matching domain only the first listed company is used.
    GitHub no-reply addresses always yield ''.

    Parameters
    ----------
    address : str
        E-mail address to classify.
    names_by_domain : dict
        Maps a domain to its list of company names.

    Returns
    -------
    str
        Matching company names, de-duplicated, sorted, joined with "|";
        '' when nothing matches.
    """
    if address.endswith('users.noreply.github.com'):
        return ''
    # Every suffix that starts right after a non-alphanumeric character is a
    # candidate domain; the full address covers the "nothing in front" case.
    # Looking at the character immediately before the suffix (instead of
    # str.split, which cuts at the FIRST occurrence of the domain) keeps the
    # boundary check correct when the domain text also appears earlier in the
    # address, and avoids an IndexError when nothing precedes it.
    suffixes = [address] + [address[pos + 1:] for pos, char in enumerate(address)
                            if not (char.isdigit() or char.isalpha())]
    matched = {names_by_domain[suffix][0] for suffix in suffixes
               if suffix and suffix in names_by_domain}
    # Sorted so the label is identical from one kernel session to the next
    # (plain set iteration order of strings is not).
    # NOTE(review): a "-" in front of the domain also counts as a boundary, so
    # "x@my-ibm.com" matches "ibm.com" - same as before; tighten if unwanted.
    return "|".join(sorted(matched))


indivs_commits['company'] = indivs_commits['email'].apply(_company_label)

In [7]:
# Read the free e-mail provider list: one domain per line, with surrounding
# whitespace (including the newline) stripped from every entry.
with open("data/inputs/free_email_domains.txt", "r") as free_domain:
    free_domains = [raw_line.strip() for raw_line in free_domain]

In [8]:
# Domains that are removed from the free-provider list, so addresses on them
# are not flagged as personal further down.
no_free = [
    'oath.com',
    'charter.com',
    'cable.comcast.com',
    'comcast.com',
    'nus.edu.sg',
    'windstream.com',
    'sky.com',
    'alibaba.com',
    'ancestry.com',
]
free_domains = list(filter(lambda candidate: candidate not in no_free, free_domains))

In [9]:
# Flag addresses whose domain is on the free-provider list. Series.isin does the
# membership test with a hash lookup in a single vectorized call, instead of a
# linear scan of the list for every row spread over parallel workers.
indivs_commits['personal'] = indivs_commits['domain'].isin(free_domains)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13557), Label(value='0 / 13557')))…

In [10]:
# company data source 2
company_info2 = pd.read_parquet('data/inputs/more_company_domains.parquet')
# One row per 'email' entry, holding the sorted unique company names for it.
df_domains2 = company_info2.groupby('email').agg({'name': lambda x: list(np.unique(x))})


def _company2_label(address, names_by_domain=df_domains2['name'].to_dict()):
    """Return the "|"-joined company names whose entry `address` ends with.

    An entry counts as a match when it is the whole address or when the
    character directly in front of it is neither a digit nor a letter
    (e.g. "@" or "."). For each matching entry only the first name in its
    list is used. GitHub no-reply addresses always yield ''.

    Parameters
    ----------
    address : str
        E-mail address to classify.
    names_by_domain : dict
        Maps an entry of the 'email' column to its list of company names.

    Returns
    -------
    str
        Matching names, de-duplicated, sorted, joined with "|"; '' when
        nothing matches.
    """
    if address.endswith('users.noreply.github.com'):
        return ''
    # Candidate suffixes: the full address, plus everything that starts right
    # after a non-alphanumeric character. This checks the character that
    # immediately precedes the matched suffix; str.split would instead cut at
    # the FIRST occurrence of the entry and look at the wrong character when
    # the same text also appears earlier in the address.
    suffixes = [address] + [address[pos + 1:] for pos, char in enumerate(address)
                            if not (char.isdigit() or char.isalpha())]
    matched = {names_by_domain[suffix][0] for suffix in suffixes
               if suffix and suffix in names_by_domain}
    # Sorted so the label is reproducible across kernel sessions.
    return "|".join(sorted(matched))


indivs_commits['company2'] = indivs_commits['email'].parallel_apply(_company2_label)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13557), Label(value='0 / 13557')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13557), Label(value='0 / 13557')))…

In [11]:
# university data source
df_university = pd.read_json('data/inputs/world_universities_and_domains.json')
# One row per domain, holding the list of university names that declare it.
df_university = (
    df_university
    .explode('domains')[['name', 'domains']]
    .drop_duplicates()
    .groupby('domains')
    .agg({'name': list})
)


def _university_label(address, names_by_domain=df_university['name'].to_dict()):
    """Return the "|"-joined universities whose domain `address` ends with.

    A domain counts as a match when it is the whole address or when the
    character directly in front of it is neither a digit nor a letter
    (e.g. "@" or "."). For each matching domain only the first listed
    university is used. GitHub no-reply addresses always yield ''.

    Parameters
    ----------
    address : str
        E-mail address to classify.
    names_by_domain : dict
        Maps a domain to its list of university names.

    Returns
    -------
    str
        Matching names, de-duplicated, sorted, joined with "|"; '' when
        nothing matches.
    """
    if address.endswith('users.noreply.github.com'):
        return ''
    # Candidate suffixes: the full address, plus everything that starts right
    # after a non-alphanumeric character. Unlike str.split(domain)[0][-1], this
    # inspects the character immediately before the matched suffix and cannot
    # raise IndexError when nothing precedes the domain.
    suffixes = [address] + [address[pos + 1:] for pos, char in enumerate(address)
                            if not (char.isdigit() or char.isalpha())]
    matched = {names_by_domain[suffix][0] for suffix in suffixes
               if suffix and suffix in names_by_domain}
    # Sorted so the label is reproducible across kernel sessions.
    return "|".join(sorted(matched))


indivs_commits['university'] = indivs_commits['email'].parallel_apply(_university_label)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=13557), Label(value='0 / 13557')))…

In [12]:
# Institution: the university match wins; otherwise fall back to the first
# company source, then to the second one.
affiliation = indivs_commits['university']
affiliation = affiliation.mask(
    (affiliation == '') & (indivs_commits['company'] != ''), indivs_commits['company'])
affiliation = affiliation.mask(
    (affiliation == '') & (indivs_commits['company2'] != ''), indivs_commits['company2'])
indivs_commits['institution'] = affiliation

# Domain type, filled in order of precedence: a university match, then any
# institution, then a free-provider address. GitHub no-reply addresses are
# labelled last and override whatever was assigned before.
kind = indivs_commits['university'].where(indivs_commits['university'] == '', 'academic institution')
kind = kind.mask((kind == '') & (indivs_commits['institution'] != ''), 'company')
kind = kind.mask((kind == '') & indivs_commits['personal'].astype(bool), 'personal')
kind = kind.mask(indivs_commits['domain'] == 'users.noreply.github.com', 'untraceable')
indivs_commits['domain_type'] = kind

In [14]:
# Pack name, email, institution and domain type into one " :|: "-delimited
# string per row, so the four values stay together through the aggregation.
packed_info = indivs_commits['name']
for field in ('email', 'institution', 'domain_type'):
    packed_info = packed_info + " :|: " + indivs_commits[field]
indivs_commits['info'] = packed_info
# A missing piece makes the whole concatenation missing; such rows are dropped.
indivs_commits = indivs_commits[indivs_commits['info'].notna()]
# One row per GitHub login with the distinct ids and packed strings seen for it.
indivs_commits_grouped = (
    indivs_commits
    .groupby('actor_login')
    .agg({'actor_id': 'unique', 'info': 'unique'})
    .reset_index()
)

In [15]:
def _info_field(packed_infos, position):
    """Return the sorted unique non-empty values found at `position` of each packed string.

    Each non-null element of `packed_infos` is split on " :|: " and the piece
    at index `position` is collected.
    """
    pieces = [entry.split(" :|: ")[position] for entry in packed_infos if not pd.isnull(entry)]
    return [piece for piece in np.unique(pieces) if piece != '']


# Ids become strings of their integer value; missing ids are skipped.
indivs_commits_grouped['actor_id'] = indivs_commits_grouped['actor_id'].apply(
    lambda ids: [str(int(one_id)) for one_id in ids if not pd.isnull(one_id)])
# Unpack the delimited strings back into one list column per field.
for position, column in enumerate(['names', 'emails', 'institutions', 'user types']):
    indivs_commits_grouped[column] = indivs_commits_grouped['info'].apply(_info_field, position=position)

In [16]:
# Persist the per-login table built above as a parquet file.
indivs_commits_grouped.to_parquet('data/merged_data/committer_detailed_info.parquet')

## Analysis

In [None]:
# The 50 most frequent domains that did not receive any domain type.
indivs_commits.loc[indivs_commits['domain_type'] == '', 'domain'].value_counts().head(50)

In [None]:
# Share of each domain type among rows that have an actor login.
indivs_commits.loc[indivs_commits['actor_login'].notna(), 'domain_type'].value_counts(normalize=True)

In [None]:
# 'institutions' holds a Python list per login, and lists are unhashable, so
# value_counts() on the raw column raises TypeError. Exploding first gives one
# row per (login, institution) and counts how many logins list each institution;
# logins with no institution become NaN and are ignored by value_counts.
indivs_commits_grouped['institutions'].explode().value_counts().head(10)