In [1]:
import os
os.chdir('../')

In [132]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from dateutil.relativedelta import relativedelta
pd.set_option('display.max_columns', None)
import seaborn as sns
import wrds
import time
from ast import literal_eval
import requests
from bs4 import BeautifulSoup

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [10]:
# TODO: edit file so it can handle pushes
df_committers_info = pd.concat([pd.read_csv(indir_committers / 'committers_info_pr.csv', index_col = 0).dropna(),
                                pd.read_csv(indir_committers / 'committers_info_push.csv', index_col = 0).dropna()])
df_committers_info['committer_info'] = df_committers_info['committer_info'].apply(literal_eval)
df_committers_info['commit_repo'] = df_committers_info['commit_repo'].apply(literal_eval)

In [11]:
df_committers_info = df_committers_info[df_committers_info['committer_info'].apply(lambda x: len(x) in [1, 2, 4])]
df_committers_info['committer_info_length'] = df_committers_info['committer_info'].apply(len)

In [16]:
df_committers_info['actor_name'] = df_committers_info.apply(
    lambda x: x['committer_info'][0] if x['committer_info_length'] in [1,4] else x['committer_info'][1], axis = 1)
df_committers_info['actor_id'] = df_committers_info.apply(
    lambda x: x['committer_info'][1] if x['committer_info_length'] == 4 else 
    x['committer_info'][0] if x['committer_info_length'] == 2 else np.nan, axis = 1)
df_committers_info['repo_name'] = df_committers_info['commit_repo'].apply(lambda x: list(set([ele.split("_")[1] for ele in x])))
df_committers_info = df_committers_info.explode('repo_name')

In [23]:
# some cleaning functions, etc
def CleanName(name):
    if pd.isnull(name):
        return name
    name = name.title()
    if len(name.split(" "))<2:
        return np.nan
    else:
        if any([len(word)<=1 for word in name.split(" ")]):
            return np.nan
    return name

def DropNAList(lst):
    return [x for x in lst if not pd.isnull(x)]

df_committers_info['human_name'] = df_committers_info['name'].apply(lambda x: CleanName(x))
df_committers_info['actor_name'] = df_committers_info['actor_name'].fillna('')
df_committers_info['email_address'] = df_committers_info['email'].apply(lambda x: np.nan if pd.isnull(x) or x.endswith("@users.noreply.github.com") else x)

In [48]:
df_aggregated_committers = df_committers_info[['actor_id','actor_name','repo_name','human_name','email_address','name','email']].drop_duplicates()\
    .groupby(['actor_id','actor_name','repo_name'])\
    .agg({'human_name':'unique','email_address':'unique','name':'unique','email':'unique'})\
    .reset_index()

In [55]:
df_aggregated_committers['actor_id'] = pd.to_numeric(df_aggregated_committers['actor_id'], errors = 'coerce')\
    .replace('',np.nan)
df_aggregated_committers['actor_name'] = df_aggregated_committers['actor_name'].apply(
    lambda x: x if x.replace("_","").replace("-","").replace("[","").replace("]","").isalnum() else np.nan)
df_aggregated_committers['actor_name'] = df_aggregated_committers['actor_name'].replace('',np.nan)

In [65]:
for col in ['human_name','email_address']:
    df_aggregated_committers[col] = df_aggregated_committers[col].apply(DropNAList)
df_aggregated_committers.rename({'name':'commit_name','email':'commit_email'}, axis = 1, inplace = True)
df_aggregated_committers.dropna(subset = ['actor_id','actor_name'], thresh = 1, inplace = True)

In [89]:
def GetGitHubAPIData(user_attr, extract_attr):
    if extract_attr == 'login':
        url = f"https://api.github.com/user/{user_attr}"
    if extract_attr == 'id':
        url = f"https://api.github.com/users/{user_attr}"
    response = requests.get(url)
    if response.status_code != 200:
        return np.nan
    data = response.json()
    return data.get(extract_attr, np.nan)

In [90]:
df_aggregated_committers['actor_name'] = df_aggregated_committers.apply(
    lambda x: x['actor_name'] if not pd.isnull(x['actor_name']) else GetGitHubAPIData(x['actor_id'], 'login'), axis = 1)

In [91]:
df_aggregated_committers['actor_id'] = df_aggregated_committers.apply(
    lambda x: x['actor_id'] if not pd.isnull(x['actor_id']) else GetGitHubAPIData(x['actor_name'], 'id'), axis = 1)
df_aggregated_committers['id'] = df_aggregated_committers.apply(
    lambda x: x['actor_id'] if not pd.isnull(x['actor_id']) else GetGitHubAPIData(x['actor_id'], 'login'), axis = 1)
df_aggregated_committers.dropna(subset = ['actor_id','actor_name'], thresh = 2, inplace = True)

In [101]:
df_aggregated_committers['profile_url'] = df_aggregated_committers['actor_name'].apply(
    lambda x: f"https://github.com/{x}")

In [146]:
def ScrapeGitHubProfile(profile_url):
    response = requests.get(profile_url)
    if response.status_code != 200:
        return False
    
    soup = BeautifulSoup(response.text, 'html.parser')
    
    name_tag = soup.select_one('.p-name')
    name = name_tag.get_text(strip=True) if name_tag else None
    
    company_tag = soup.select_one('.p-org')
    company = company_tag.get_text(strip=True) if company_tag else None
    
    location_tag = soup.select_one('.p-label')
    location = location_tag.get_text(strip=True) if location_tag else None
    
    bio_tag = soup.select_one('.user-profile-bio')
    bio = bio_tag.get_text(strip=True) if bio_tag else None
    
    linkedin_url = None
    for link_tag in soup.select('a[href]'):
        href = link_tag['href']
        if "linkedin.com" in href.lower():
            linkedin_url = href
            break
    
    blog_url = None
    blog_link_tags = soup.select('a.Link--primary[rel="nofollow me"]')
    for link_tag in blog_link_tags:
        href = link_tag['href']
        if "linkedin.com" not in href.lower():
            blog_url = href
            break
    # Return collected data in a dictionary
    return {
        "name": name,
        "company": company,
        "location": location,
        "bio": bio,
        "linkedin_url": linkedin_url,
        "blog_url": blog_url
    }

In [152]:
df_profiles = df_aggregated_committers[['profile_url']].drop_duplicates().reset_index()

In [176]:
df_aggregated_committers.to_csv('issue/df_aggregated_committers.csv')

In [173]:
df_profile_data = pd.merge(df_aggregated_committers, df_profiles)

Unnamed: 0,actor_id,actor_name,repo_name,human_name,email_address,commit_name,commit_email,profile_url,profile_data
0,17.0,vanpelt,wandb/wandb,[Chris Van Pelt],"[vanpelt@gmail.com, vanpelt@wandb.com]",[Chris Van Pelt],"[vanpelt@gmail.com, vanpelt@wandb.com]",https://github.com/vanpelt,"[{'name': 'Chris Van Pelt (CVP)', 'company': '..."
1,26.0,topfunky,pygments/pygments,[Geoffrey Grosenbach],[boss@topfunky.com],[Geoffrey Grosenbach],[boss@topfunky.com],https://github.com/topfunky,"[{'name': 'Geoffrey Grosenbach', 'company': '@..."
2,29.0,lukas,wandb/wandb,[Lukas Biewald],[lukas@wandb.com],[Lukas Biewald],[lukas@wandb.com],https://github.com/lukas,"[{'name': 'Lukas Biewald', 'company': 'Weights..."
3,34.0,nitay,pantsbuild/pants,[Nitay Joffe],[nitay@actioniq.co],[Nitay Joffe],[nitay@actioniq.co],https://github.com/nitay,"[{'name': 'Nitay Joffe', 'company': 'ActionIQ'..."
4,35.0,kevwil,fabric/fabric,[Kevin Williams],[kevwil@gmail.com],[Kevin Williams],[kevwil@gmail.com],https://github.com/kevwil,
...,...,...,...,...,...,...,...,...,...
232108,99654151.0,serg-music,great-expectations/great,[],[],[serg-music],[99654151+serg-music@users.noreply.github.com],https://github.com/serg-music,
232109,45845474.0,eladkal,apache/airflow,[Elad Kalif],[eladkal@amazon.com],[Elad Kalif],"[45845474+eladkal@users.noreply.github.com, el...",https://github.com/eladkal,
232110,11886801.0,fabienpe,mdomke/schwifty,[],[],[fabienpe],[fabienpe+fabienpe@users.noreply.github.com],https://github.com/fabienpe,
232111,38999128.0,jmahlik,aws/sagemaker-python-sdk,[],[],[jmahlik],[j38999128+jmahlik@users.noreply.github.com],https://github.com/jmahlik,


In [170]:
%%time
for i in df_profiles.index:
    print(i)
    scrape_result = ScrapeGitHubProfile(df_profiles.loc[i, 'profile_url'])
    iter_count = 0
    while type(scrape_result) == bool and iter_count < 5:
        time.sleep(2)
        scrape_result = ScrapeGitHubProfile(df_profiles.loc[i, 'profile_url'])
        iter_count+=1
    df_profiles.loc[i, 'profile_data'] = [scrape_result]

0
1
2
3
4


KeyboardInterrupt: 