In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

### Contributors Data Cleaning

In [2]:
# Directory holding the scraped feather files, relative to this notebook.
data_path = Path("../") / "data"

# Read every contributors*.feather chunk found in the data directory.
previous_contrib_data_list = [
    pd.read_feather(contrib_file)
    for contrib_file in data_path.glob("contributors*.feather")
]
# Fail loudly here: if nothing matched, `cons` would stay undefined (or keep a
# stale value from an earlier run) and the problem would only surface in a
# later cell.
if not previous_contrib_data_list:
    raise FileNotFoundError(
        f"No contributors*.feather files found in {data_path.resolve()}"
    )
# Stack the chunks into one frame with a fresh 0..n-1 index.
cons = pd.concat(previous_contrib_data_list, ignore_index=True)

In [3]:
# Display the combined contributors frame produced by the concat above.
cons

Unnamed: 0,original_index,url,status,contributors
0,17532,https://api.github.com/repos/WinMin/4616577921...,success,"{'avatar_url': None, 'block': None, 'contribut..."
1,17534,https://api.github.com/repos/Cossack9989/fa971...,success,"{'avatar_url': None, 'block': None, 'contribut..."
2,17537,https://api.github.com/repos/Cossack9989/Vulns,success,{'avatar_url': 'https://avatars.githubusercont...
3,19242,https://api.github.com/repos/jaygreig86/dmitry,success,{'avatar_url': 'https://avatars.githubusercont...
4,19276,https://api.github.com/repos/tendenci/tendenci,success,{'avatar_url': 'https://avatars.githubusercont...
...,...,...,...,...
25213,484935,https://api.github.com/repos/eclipse-ee4j/el-ri,success,{'avatar_url': 'https://avatars.githubusercont...
25214,484935,https://api.github.com/repos/eclipse-ee4j/el-ri,success,{'avatar_url': 'https://avatars.githubusercont...
25215,484935,https://api.github.com/repos/eclipse-ee4j/el-ri,success,{'avatar_url': 'https://avatars.githubusercont...
25216,484938,https://api.github.com/repos/advisories/GHSL-2...,success,"{'avatar_url': None, 'contributions': None, 'd..."


In [4]:
# Flatten the contributor records into [original_index, url, login] rows.
# The columns are walked positionally with zip() instead of being looked up by
# label (`cons.url[i]`), so the cell does not rely on `cons` having a 0..n-1
# index and does not pay for a Series lookup per field per row.
result = [
    [original_index, url, contributor["login"]]
    for original_index, url, contributor in zip(
        cons["original_index"], cons["url"], cons["contributors"]
    )
    # Records without a "login" key are skipped. A null record is skipped too
    # rather than raising TypeError on the membership test. A "login" key whose
    # value is None is still kept at this stage.
    if contributor is not None and "login" in contributor
]

In [5]:
# Turn the flattened rows into a table and preview the first few entries.
login_columns = ["original_index", "url", "logins"]
loginsdf = pd.DataFrame(data=result, columns=login_columns)
loginsdf.head()

Unnamed: 0,original_index,url,logins
0,17532,https://api.github.com/repos/WinMin/4616577921...,
1,17534,https://api.github.com/repos/Cossack9989/fa971...,
2,17537,https://api.github.com/repos/Cossack9989/Vulns,Cossack9989
3,19242,https://api.github.com/repos/jaygreig86/dmitry,jaygreig86
4,19276,https://api.github.com/repos/tendenci/tendenci,jennyq


In [6]:
# Keep only the rows whose login is present, then preview the result.
has_login = loginsdf["logins"].notna()
loginsdf = loginsdf[has_login]
loginsdf.head()

Unnamed: 0,original_index,url,logins
2,17537,https://api.github.com/repos/Cossack9989/Vulns,Cossack9989
3,19242,https://api.github.com/repos/jaygreig86/dmitry,jaygreig86
4,19276,https://api.github.com/repos/tendenci/tendenci,jennyq
5,112255,https://api.github.com/repos/ImpulseAdventure/...,ImpulseAdventure
6,112264,https://api.github.com/repos/matrixssl/matrixssl,matrixssl-admin


In [7]:
# Persist the cleaned logins. The target is built from `data_path` (instead of
# a second hard-coded "../data/..." string) so it always matches the location
# this file is read back from.
# reset_index() restores a default row index after the filtering above; the
# previous index values are written out as an "index" column.
loginsdf.reset_index().to_feather(data_path / "github_usernames_initial.feather")

In [None]:
# Load the parsed CVE references table from the data directory.
cve_references_file = data_path / "all_parsed_cve_references.feather"
cve_data = pd.read_feather(cve_references_file)

In [None]:
# Reload the cleaned logins and attach them to the CVE references: rows of
# `cve_data` are matched by their index against `original_index`, and rows
# without a login are dropped.
contribs = pd.read_feather(data_path / "github_usernames_initial.feather")
contribs_data = cve_data.join(
    contribs.set_index("original_index"), lsuffix="c", rsuffix="u"
).dropna(subset="logins")
# Remove accounts whose login contains the literal text "[bot]".
# regex=False is required: str.contains treats its pattern as a regular
# expression by default, and "[bot]" is then a character class that matches
# any login containing "b", "o" or "t" -- which also threw away every human
# login with one of those letters.
contribs_data = contribs_data[
    ~contribs_data["logins"].str.contains("[bot]", regex=False)
]
# Export the CVE id / login pairs (written to the current working directory).
contribs_data[["cve_id", "logins"]].to_csv("github_usernames_merged.csv", index=False)
contribs_data

### Languages Data Cleaning

In [8]:
# Directory holding the scraped feather files, relative to this notebook.
data_path = Path("../") / "data"

# Read every languages*.feather chunk found in the data directory.
previous_langs_data_list = [
    pd.read_feather(lang_file)
    for lang_file in data_path.glob("languages*.feather")
]
# Fail loudly here: if nothing matched, `langs` would stay undefined (or keep a
# stale value from an earlier run) and the problem would only surface in a
# later cell.
if not previous_langs_data_list:
    raise FileNotFoundError(
        f"No languages*.feather files found in {data_path.resolve()}"
    )
# Stack the chunks into one frame with a fresh 0..n-1 index.
langs = pd.concat(previous_langs_data_list, ignore_index=True)

In [9]:
# Preview the first rows of the combined languages frame.
langs.head()

Unnamed: 0,original_index,url,status,languages
0,402641,https://api.github.com/repos/omniauth/omniauth...,success,"{'ANTLR': None, 'APL': None, 'ASP': None, 'ASP..."
1,402649,https://api.github.com/repos/esotalk/esoTalk,success,"{'ANTLR': None, 'APL': None, 'ASP': None, 'ASP..."
2,402660,https://api.github.com/repos/grymer/CVE,default,"{'ANTLR': None, 'APL': None, 'ASP': None, 'ASP..."
3,402964,https://api.github.com/repos/braekling/WP-Matomo,success,"{'ANTLR': None, 'APL': None, 'ASP': None, 'ASP..."
4,403323,https://api.github.com/repos/scaron/prettyphoto,success,"{'ANTLR': None, 'APL': None, 'ASP': None, 'ASP..."


In [10]:
# Flatten the per-repository language mappings into one
# [original_index, url, language, value-as-string] row per non-null entry.
# The columns are walked positionally with zip() instead of being looked up by
# label (`langs.url[i]`), so the cell does not rely on `langs` having a 0..n-1
# index and does not pay for a Series lookup per field per row.
result = [
    [original_index, url, language, str(value)]
    for original_index, url, languages in zip(
        langs["original_index"], langs["url"], langs["languages"]
    )
    # A null mapping contributes no rows instead of raising on `.items()`.
    for language, value in (languages or {}).items()
    # Entries whose value is None are left out.
    if value is not None
]

In [11]:
# Turn the flattened rows into a table and preview the first few entries.
language_columns = ["original_index", "url", "language", "ratio"]
langsdf = pd.DataFrame(data=result, columns=language_columns)
langsdf.head()

Unnamed: 0,original_index,url,language,ratio
0,402641,https://api.github.com/repos/omniauth/omniauth...,Ruby,1076.0
1,402649,https://api.github.com/repos/esotalk/esoTalk,CSS,46680.0
2,402649,https://api.github.com/repos/esotalk/esoTalk,JavaScript,252946.0
3,402649,https://api.github.com/repos/esotalk/esoTalk,PHP,870210.0
4,402964,https://api.github.com/repos/braekling/WP-Matomo,CSS,794.0


In [12]:
# Persist the flattened languages table. The target is built from `data_path`
# (instead of a second hard-coded "../data/..." string) so it always matches
# the location this file is read back from.
# reset_index() also writes the current row index out as an "index" column.
langsdf.reset_index().to_feather(data_path / "github_langs_initial.feather")

In [None]:
# Reload the flattened languages table (this rebinds `langs`) and attach it to
# the CVE references: rows of `cve_data` are matched by their index against
# `original_index`.
langs = pd.read_feather(data_path / "github_langs_initial.feather")
langs_by_reference = langs.set_index("original_index")
language_data = cve_data.join(langs_by_reference, lsuffix="c", rsuffix="l")

# Keep only rows that received a language.
language_data = language_data.dropna(subset="language")

# Discard entries whose "language" contains "documentation_url".
# NOTE(review): presumably these come from API error payloads rather than real
# language names -- confirm against the scraper.
is_documentation_entry = language_data.language.str.contains("documentation_url")
language_data = language_data[~is_documentation_entry]

# Export the CVE id / language / ratio triples (written to the current working
# directory).
merged_columns = ["cve_id", "language", "ratio"]
language_data[merged_columns].to_csv("github_langs_merged.csv", index=False)
language_data