In [None]:
import math
import os
import pickle
from collections import Counter, defaultdict

import networkx as nx
import numpy as np
import pandas as pd
import yaml
from gensim.models import doc2vec
from github import Github
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Read config YAML

In [None]:
# Location of the YAML file holding dataset paths, thresholds and output paths.
config_path = "../github.yaml"

# safe_load only builds plain Python objects, so the file cannot run code.
with open(config_path, "r") as config_file:
    config = yaml.safe_load(config_file)

# Display the parsed configuration as the cell output.
config

# Initialize Github API

In [None]:
# Read the token from the GITHUB_TOKEN environment variable so the secret never
# has to be written into (and committed with) the notebook. The placeholder is
# kept as a fallback for the previous "paste your token here" workflow.
github_token = os.environ.get("GITHUB_TOKEN", "token")
git_api = Github(github_token)

# Retrieve list of repositories

In this section, we gather the list of repositories that we want to crawl from two CSV datasets. We first clean the data from the datasets.

In [None]:
csv_paths = config['dataset_paths']

# First dataset: drop rows scraped from "awesome" pages, keep the needed columns
# and normalise the column names.
repos = pd.read_csv(csv_paths[0])
repos = repos.loc[
    repos.topic != "awesome",
    ["topic", "name", "star", "topic_tag", "discription_text", "url"],
]
repos = repos.rename(columns={"discription_text": "description", "topic_tag": "tags"})

# Second dataset: keep the matching columns under the same names.
repos2 = pd.read_csv(csv_paths[1])[["Name", "URL", "Stars", "Topics", "Description"]]
repos2 = repos2.rename(
    columns={
        "Name": "name",
        "URL": "url",
        "Stars": "star",
        "Topics": "tags",
        "Description": "description",
    }
)

# Merge both sources, keeping the first occurrence of every repository name.
combined = pd.concat([repos, repos2]).drop_duplicates(subset="name")

# Discard repositories whose name matches one of these patterns (case-insensitive).
name_blacklist = "awesome|tutorial|interview|book|roadmap|list|cheat|how|best|book|scratch"
is_blacklisted = combined.name.str.contains(name_blacklist, case=False, na=False)
repos = combined[~is_blacklisted]

# Convert strings which contain project tags to lists
def convert_tags(item):
    """Convert a stringified tag list such as ``"['a', 'b']"`` to ``['a', 'b']``.

    Parameters
    ----------
    item : str, list, tuple or other
        Raw cell value from the ``tags`` column.

    Returns
    -------
    list
        The individual tags without brackets or quotes. Empty tags are dropped,
        so ``"[]"`` yields ``[]``. A value that is already a list/tuple is
        returned as a list (this keeps the cell safe to re-run), and any other
        non-string value (e.g. NaN for a missing cell) yields ``[]``.
    """
    if isinstance(item, (list, tuple)):
        return list(item)
    if not isinstance(item, str):
        return []
    # str.strip removes any of the given characters from both ends, which takes
    # off the surrounding brackets and the outermost quotes.
    stripped = item.strip("['").strip("']")
    tags = (part.strip("'") for part in stripped.split(', '))
    return [tag for tag in tags if tag]
# Replace every stringified tag list in the "tags" column with a real Python list.
repos["tags"] = repos["tags"].map(convert_tags)

In [None]:
# Since repo stars are in format of 12.3k, we convert these strings to numbers.
def value_to_float(value):
    """Convert a star count such as ``"12.3k"``, ``"850"`` or ``4200`` to a number.

    Parameters
    ----------
    value : int, float, str or other
        Raw star count. Strings may carry a ``k``/``K`` suffix meaning
        "thousands" and may contain ``,`` as a thousands separator.

    Returns
    -------
    int or float
        ``int``/``float`` inputs are returned unchanged. Other values are
        parsed from their string form; a bare ``"k"`` is 1000.0 and anything
        that is not a finite number is 0.0.
    """
    if isinstance(value, (int, float)):
        return value
    text = str(value).strip().lower().replace(',', '')
    if text == 'k':
        return 1000.0
    multiplier = 1.0
    if text.endswith('k'):
        multiplier = 1000.0
        text = text[:-1]
    try:
        number = float(text)
    except ValueError:
        return 0.0
    # float() also accepts "nan"/"inf"; those are not star counts.
    if not math.isfinite(number):
        return 0.0
    return number * multiplier

# We keep repositories with more than 2000 stars
star_counts = repos.star.apply(value_to_float)
good_repos = repos[star_counts > 2000]
# ... that also carry more than one tag. The explicit copy makes good_repos an
# independent frame, so adding columns to it does not trigger pandas'
# SettingWithCopyWarning.
good_repos = good_repos[good_repos.tags.apply(len) > 1].copy()

# Set project url as "repoOwner/repoName". A trailing "/" is removed first,
# otherwise the last path component would be empty and the title would be wrong.
urls = good_repos.url.apply(lambda item: "/".join(item.rstrip("/").split("/")[-2:]))
good_repos['title'] = urls

# Crawl repository contributors

In [None]:
repo_dict = {}
failed_urls = {}  # "owner/name" -> repr of the error that made us skip it
repo_dict_path = config['repo_dict_path']
for url in tqdm(urls):
    if url in repo_dict:
        continue
    try:
        repo = git_api.get_repo(url)
        contributor_users = list(repo.get_contributors())
    except Exception as error:
        # Crawling stays best-effort, but the failure is recorded instead of
        # being silently discarded, so skipped repositories can be inspected.
        failed_urls[url] = repr(error)
        continue
    repo_dict[url] = {"repo": repo, "contributors": contributor_users}

if failed_urls:
    print(f"Skipped {len(failed_urls)} of {len(urls)} repositories; see failed_urls for details.")

with open(repo_dict_path, "wb") as f:
    pickle.dump(repo_dict, f)

We keep users who have contributed to more than 3 projects, and we remove bots from the user list.

In [None]:
# Reload the crawl result written by the crawling cell.
with open(repo_dict_path, "rb") as cache_file:
    repo_dict = pickle.load(cache_file)

# Count, for every contributor login, the number of crawled repositories it appears in.
user_project_counter = Counter(
    contributor.login
    for repo_items in repo_dict.values()
    for contributor in repo_items['contributors']
)

# Keep logins seen in more than 3 repositories, excluding "[bot]" accounts.
good_users = [
    login
    for login, project_count in user_project_counter.items()
    if project_count > 3 and '[bot]' not in login
]

# Keyword Extraction

We extract keywords from project tags. First, we build the universal keyword list from the non-empty tags that appear in more than 100 projects, which filters out rare tags.

In [None]:
# Number of repositories' tag lists in which each tag occurs.
tag_counter = Counter(tag for tag_list in repos.tags for tag in tag_list)

# Universal keyword list: non-empty tags counted more than 100 times.
good_tags = [tag for tag, occurrences in tag_counter.items() if occurrences > 100 and tag]

We add programming languages used in each repo to its keywords.

In [None]:
languages = set()
repo_language = {}  # "owner/name" (crawl key) -> normalised primary language
for repo_url, repo_items in repo_dict.items():
    language = repo_items['repo'].language
    if not language:
        continue
    language = language.lower().replace(" ", "-")
    languages.add(language)
    # Store the language itself (previously the whole repo_items dict was stored,
    # so no language ever reached the repository tags). The crawl key is used
    # because it is the same "owner/name" string that good_repos.title holds.
    repo_language[repo_url] = language

# Append programming languages to universal keyword list; sorting keeps the
# keyword order (and anything indexed by it) stable between runs.
good_tags = sorted(set(good_tags) | languages)

good_repos = good_repos.assign(
    language=good_repos.title.apply(lambda x: [repo_language[x]] if x in repo_language else [])
)
# Only add a language that is not already among the tags, so re-running this
# cell does not append it a second time.
good_repos = good_repos.assign(
    tags=good_repos.apply(
        lambda x: x.tags + [lang for lang in x.language if lang not in x.tags], axis=1
    )
)

# Find skills of each expert

In this section, we concatenate all keywords from each user's repositories and compute TF-IDF values for each user's keywords. We then assign a skill to an expert when its TF-IDF value exceeds a predefined threshold.

In [None]:
# Set membership is O(1); testing every contributor against the good_users list
# would rescan the whole list each time.
good_user_set = set(good_users)

# Map each retained user login to the crawled repositories they contributed to.
user_repos = defaultdict(list)
for repo_url, repo_items in tqdm(repo_dict.items()):
    for contributor in repo_items['contributors']:
        login = contributor.login
        if login in good_user_set:
            user_repos[login].append(repo_url)
# Plain dict, so a lookup of an unknown user raises instead of creating an entry.
user_repos = dict(user_repos)

In [None]:
user_skills = {}

# Build the lookups once instead of filtering the DataFrame for every
# (user, repository) pair. setdefault keeps the first row of a duplicated title.
good_tag_set = set(good_tags)
repo_tags_by_title = {}
for title, tags in zip(good_repos.title, good_repos.tags):
    repo_tags_by_title.setdefault(title, tags)

# The loop variable is deliberately not called `repos`, so the `repos`
# DataFrame from the loading cell is not overwritten.
for user, repo_titles in tqdm(user_repos.items()):
    skills = []
    for repo_title in repo_titles:
        # Crawled repositories that are absent from good_repos contribute nothing.
        repo_tags = repo_tags_by_title.get(repo_title, [])
        # The str check skips any non-string (possibly unhashable) entry.
        skills.extend(tag for tag in repo_tags if isinstance(tag, str) and tag in good_tag_set)
    user_skills[user] = skills

In [None]:
# Each "document" is a space-separated list of tags, so a token is any run of
# non-whitespace characters. The default token pattern would split
# "machine-learning" into two words and drop one-character tags such as "c",
# neither of which would then match the tag vocabulary. Lower-casing is turned
# off so documents are matched against the vocabulary exactly as written.
# NOTE(review): a tag containing a space is still split by the join below.
vectorizer = TfidfVectorizer(vocabulary=good_tags, token_pattern=r"\S+", lowercase=False)
tfidf_threshold = config['tfidf_threshold']

# One document per user, in the iteration order of user_skills.
all_docs = [" ".join(skills) for skills in user_skills.values()]
vectorizer.fit(all_docs)

# Consecutive integer ids in the same order as all_docs.
user_id_dict = {user: user_id for user_id, user in enumerate(user_skills)}

# Transform every document in one call, then binarise: 1 where the TF-IDF
# weight of a keyword exceeds the threshold, 0 otherwise.
tfidf_matrix = vectorizer.transform(all_docs)
skill_flags = (tfidf_matrix > tfidf_threshold).toarray().astype(int).tolist()
users_skills_dict = dict(enumerate(skill_flags))

# Find collaborators of each user

Here, we find the collaborators of each individual in our dataset. We keep only collaborations that occur more than twice.

In [None]:
# user id -> Counter of (other user id -> number of shared repositories)
user_collab_counter = defaultdict(Counter)

for repo_url, repo_items in tqdm(repo_dict.items()):
    # Only contributors that were assigned an id take part in the network.
    known_members = [
        member for member in repo_items['contributors'] if member.login in user_id_dict
    ]
    for user in known_members:
        for other in known_members:
            if user != other:
                user_collab_counter[user_id_dict[user.login]][user_id_dict[other.login]] += 1

# Keep, per user, the collaborators counted more than 2 times.
user_collaborators = {
    user_id: [other_id for other_id, shared in collab_counts.items() if shared > 2]
    for user_id, collab_counts in user_collab_counter.items()
}

# Create Collaboration network

In [None]:
# Undirected collaboration graph: one node per user id, one edge per retained
# collaboration.
g = nx.Graph()
edges = [
    (user, other)
    for user, collabs in user_collaborators.items()
    for other in collabs
]
g.add_edges_from(edges)

# Attach each user's binary skill vector as node attribute "x". The collaborator
# lists were attached here before, which only duplicated the edges and left the
# computed skill vectors unused. Ids that are not nodes of g are ignored.
nx.set_node_attributes(g, users_skills_dict, name="x")

# Save collaboration network to files

In [None]:
# Pickle every artifact to the path configured under the matching
# config['saving_paths'] key.
artifacts = (
    ("graph", g),
    ("authors_id", user_id_dict),
    ("all_skills", good_tags),
    ("vectorizer", vectorizer),
)
for path_key, artifact in artifacts:
    with open(config['saving_paths'][path_key], "wb") as out_file:
        pickle.dump(artifact, out_file)

# Doc2vec

In [None]:
# One tagged document per repository: its keywords that belong to the universal
# keyword list, tagged with the document's position in `docs`.
docs = []
for _, repo_row in tqdm(good_repos.iterrows()):
    kept_keywords = [keyword for keyword in repo_row.tags if keyword in good_tags]
    docs.append(doc2vec.TaggedDocument(kept_keywords, [len(docs)]))

# Train 64-dimensional document embeddings for 30 epochs.
doc2vec_model = doc2vec.Doc2Vec(docs, vector_size=64, epochs=30)

In [None]:
# Persist the trained Doc2Vec model.
with open("../data/doc2vec_github.pkl", "wb") as model_file:
    pickle.dump(doc2vec_model, model_file)

In [None]:
# Persist the user -> repositories mapping and the filtered repository table.
for pickle_path, payload in (
    ("../data/user_repos.pkl", user_repos),
    ("../data/good_repos.pkl", good_repos),
):
    with open(pickle_path, "wb") as out_file:
        pickle.dump(payload, out_file)