In [1]:
import pandas as pd
from pyarrow import feather
import os
import requests
from bs4 import BeautifulSoup
import re
import time
import concurrent.futures
from github import Github

In [2]:
# Takes a directory and finds all github links referenced within its files
def find_gits(dirpath):
    gitlinks = []
    for file in os.listdir(dirpath):
        f = os.path.join(dirpath, file)
        if os.path.isfile(f) and 'json' in f:
            with open(f) as file:
                lines = file.readlines()
                for line in lines:
                    if "github" in line and "url\" :" in line:
                        line = line.replace("\"url\" : \"", "").replace("\",", "").strip()
                        gitlinks.append(line)
    return gitlinks

In [3]:
# Find all git links in data directory files
gitlinks = find_gits('data')

In [4]:
len(gitlinks)

40471

In [5]:
# Find referenced CVEs and Languages used in Git Repos

def get_data(link):
    cve_exp = 'CVE-([A-Za-z0-9\-\_]+)'
    repo_exp = '.*github.com(\/).*?(\/).*?(\/)'
    loc_exp = '(?<=github.com)(\/).*?(\/).*?(\/)'
    
    cve1 = 'null'
    repo = 'null'
    cve2 = 'null'
    langsl = 'null'
    contributors = 'null'
    
    try:
        page = requests.get(link)
        text = page.text
        found = re.findall(cve_exp, text)
        cve1 = list(set(found))
        data = link, cve1, repo, cve2, langsl, contributors
    except:
        data = link, cve1, repo, cve2, langsl, contributors

    try:
        slash_count = link.count('/')
        if slash_count <= 4:
            repo = f"{link}/"
            repo_page = requests.get(repo)
            repo_text = repo_page.text
            found = re.findall(cve_exp, repo_text)
            cve2 = list(set(found))
            data = link, cve1, repo, cve2, langsl, contributors
        else:
            repo = re.search(repo_exp, link).group(0)
            repo_page = requests.get(repo)
            repo_text = repo_page.text
            found = re.findall(cve_exp, repo_text)
            cve2 = list(set(found))
            data = link, cve1, repo, cve2, langsl, contributors
    except:
        data = link, cve1, repo, cve2, langsl, contributors

    try:
        soup = BeautifulSoup(repo_page.content, 'html.parser')
        langs = soup.find_all("span", {"class":"Progress-item color-bg-success-emphasis"})
        langsl = []
        for l in langs:
            langsl.append(l['aria-label'])
        data = link, cve1, repo, cve2, langsl, contributors
    except:
        data = link, cve1, repo, cve2, langsl, contributors

    try:
        loc = re.search(loc_exp, repo).group(0)
        url = f"https://api.github.com/repos{loc}contributors"
        c_json = requests.get(url).json()
        contributors = []
        for c in c_json:
            contributors.append(c["login"])
        data = link, cve1, repo, cve2, langsl, contributors
    except:
        data = link, cve1, repo, cve2, langsl, contributors
    
    return data

In [6]:
# Example Test
x = get_data('https://github.com/ImageMagick/ImageMagick6/commit/553054c1cb1e4e05ec86237afef76a32cd7c464d')
x

('https://github.com/ImageMagick/ImageMagick6/commit/553054c1cb1e4e05ec86237afef76a32cd7c464d',
 [],
 'https://github.com/ImageMagick/ImageMagick6/',
 [],
 ['C 67.8',
  'HTML 18.4',
  'Makefile 4.8',
  'C++ 2.9',
  'XS 1.8',
  'Shell 1.7',
  'Other 2.6'],
 ['dlemstra',
  'glennrp',
  'Biswa96',
  'jcupitt',
  'Lastique',
  'joonsung90',
  'remicollet',
  'dlech',
  'Hrnchamd',
  'jeroen',
  'dependabot[bot]',
  'liclicli',
  'winterheart',
  'bgK',
  'staticfloat',
  'emcconville',
  'ralt',
  'ploki',
  'anolivetree',
  'chipitsine',
  'urban-warrior',
  'ijt',
  'mirakui',
  'xhorak',
  'jonsneyers',
  'lbartoletti',
  'maruno',
  'Schnouki',
  'tpett',
  'yoyap'])

In [7]:
time.sleep(10800)

In [8]:
data = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []
    data = []
    for link in gitlinks:
        futures.append(executor.submit(get_data, link=link))
    for future in concurrent.futures.as_completed(futures):
        # print(future.result())
        data.append(future.result())



In [9]:
dd = []
for i in data:
    if i is not None:
        ii = list(i)
        dd.append(ii)
df = pd.DataFrame(dd, columns = ['Link', 'CVEs', 'Repo', 'Repo CVEs', 'Repo Languages', 'Contributors'])

In [10]:
df

Unnamed: 0,Link,CVEs,Repo,Repo CVEs,Repo Languages,Contributors
0,https://github.com/orangecertcc/security-resea...,[2021-1383],https://github.com/orangecertcc/security-resea...,"[2019-17570, 2021-42372, YYYY-XXXX, 2021-42770...","[Python 80.8, Java 18.4, Makefile 0.8]",[orange-cert-cc]
1,https://github.com/orangecertcc/security-resea...,[2021-1385],https://github.com/orangecertcc/security-resea...,"[2019-17570, 2021-42372, YYYY-XXXX, 2021-42770...","[Python 80.8, Java 18.4, Makefile 0.8]",[orange-cert-cc]
2,https://github.com/orangecertcc/security-resea...,[2021-0253],https://github.com/orangecertcc/security-resea...,"[2019-17570, 2021-42372, YYYY-XXXX, 2021-42770...","[Python 80.8, Java 18.4, Makefile 0.8]",[orange-cert-cc]
3,https://github.com/orangecertcc/security-resea...,[2021-0252],https://github.com/orangecertcc/security-resea...,"[2019-17570, 2021-42372, YYYY-XXXX, 2021-42770...","[Python 80.8, Java 18.4, Makefile 0.8]",[orange-cert-cc]
4,https://github.com/BlackFan/client-side-protot...,[2021-20088],https://github.com/BlackFan/client-side-protot...,"[2021-20083, 2021-20085, 2021-20089, 2021-2008...",[],"[BlackFan, amlnspqr, msrkp, p4fg, ardigan6, po..."
...,...,...,...,...,...,...
40466,https://github.com/clastix/capsule/releases/ta...,[],https://github.com/clastix/capsule/,[],[],[]
40467,https://github.com/MatMoul/g810-led/pull/297,[],https://github.com/MatMoul/g810-led/,[],[],[]
40468,https://github.com/jflyfox/jfinal_cms/issues/51,[],https://github.com/jflyfox/jfinal_cms/,[],[],
40469,https://github.com/HKD01l/bug_report/blob/main...,[],https://github.com/HKD01l/bug_report/,,,[]


In [11]:
uni = df.loc[df.astype(str).drop_duplicates().index]

In [12]:
uni.to_pickle('gitdata.pickle')