In [1]:
import json
import glob
import os
import tarfile
import re
import datetime

In [2]:
# Directory that holds the input data (user-repos/*.json, actual-repos-2/*.tgz)
# and receives the output CSV. Set the CMSSW_FORKS_PREFIX environment variable
# to run the notebook against a copy of the data in another location; the
# default is the original location.
PREFIX = os.environ.get(
    "CMSSW_FORKS_PREFIX",
    "/home/jpivarski/storage/data/physicists/forks-of-cmssw-2022-03-04",
)

In [3]:
# Concatenate the repository records stored in every JSON file under
# user-repos/ into one flat list.
userrepos = []
# Sorted so that the record order (and therefore the order of everything
# derived from it) does not depend on the file system's listing order.
for filename in sorted(glob.glob(PREFIX + "/user-repos/*.json")):
    # `with` closes each file as soon as it is parsed instead of leaving the
    # handle open until garbage collection.
    with open(filename, encoding="utf-8") as infile:
        userrepos.extend(json.load(infile))

In [4]:
# Number of distinct "full_name" values among the records that are not forks.
len({repo["full_name"] for repo in userrepos if not repo["fork"]})

16737

In [5]:
# Number of records that are not forks (duplicates, if any, are counted).
sum(1 for repo in userrepos if not repo["fork"])

16737

In [6]:
def _package_regex(name_pattern, extra_alternatives=()):
    """Compile a detector for Python imports of one package.

    name_pattern: regular-expression fragment matching the top-level package
        name (e.g. ``uproot[34]?``).
    extra_alternatives: further regular-expression alternatives (e.g. C++
        ``include`` forms) that also count as a use of the package.

    Returns a compiled ``str`` pattern matching ``import pkg``,
    ``from pkg import`` and ``from pkg.sub.module import``.
    """
    alternatives = [
        rf"import\s+{name_pattern}",
        # Submodule imports (from pkg.sub import x) are uses of the package too.
        rf"from\s+{name_pattern}(?:\.\w+)*\s+import",
    ]
    alternatives.extend(extra_alternatives)
    return re.compile(r"\b(" + "|".join(alternatives) + r")\b")


# One detector per package, keyed by package name. Each entry needs its own
# key: repeating a key in a dict literal silently keeps only the last value.
# Package names are matched exactly; only uproot and awkward accept an
# optional single version digit (uproot3/uproot4, awkward0/awkward1).
regex = {
    "uproot": _package_regex(r"uproot[34]?"),
    "awkward": _package_regex(r"awkward[01]?"),
    "coffea": _package_regex(r"coffea"),
    # NOTE(review): the closing \b means `include "TH1F.h"` or
    # `include "TTreeReader.h"` do not match, only names that end right after
    # the listed prefix (TH1.h, TTree.h, TMVA/..., ROOT/...) -- confirm that
    # this is intended.
    "ROOT": _package_regex(
        r"ROOT",
        extra_alternatives=(
            r"include\s*[<\"]TFile",
            r"include\s*[<\"]TTree",
            r"include\s*[<\"]TH[123]",
            r"include\s*[<\"]TMVA",
            r"include\s*[<\"]ROOT",
        ),
    ),
    "numpy": _package_regex(r"numpy"),
    "scipy": _package_regex(r"scipy"),
    "matplotlib": _package_regex(r"matplotlib"),
    "pandas": _package_regex(r"pandas"),
    "sklearn": _package_regex(r"sklearn"),
    "tensorflow": _package_regex(r"tensorflow"),
    "keras": _package_regex(r"keras"),
    "torch": _package_regex(r"torch"),
    "jax": _package_regex(r"jax"),
    "xgboost": _package_regex(r"xgboost"),
}

In [7]:
# Byte patterns applied to raw file contents; group 1 is the imported module
# or the included header.

# A Python identifier: the first name after `import` / `from`.
_PY_IDENTIFIER = rb"[A-Za-z_][A-Za-z_0-9]*"

# import <name>
python_import = re.compile(rb"\bimport\s+(" + _PY_IDENTIFIER + rb")\b")
# from <name> import
python_fromimport = re.compile(rb"\bfrom\s+(" + _PY_IDENTIFIER + rb")\s+import\b")

# #include <header>
cpp_include_bracket = re.compile(
    rb"#include\s*<\s*([^>,]+)\s*>"
)
# #include "header"
cpp_include_quote = re.compile(
    rb"\#include\s*\"\s*([^\",]+)\s*\""
)
# #include \"header\"  (quotes preceded by a literal backslash)
ipynb_include_quotedquote = re.compile(
    rb"\#include\s*\\\"\s*([^\\\",]+)\s*\\\""
)

In [8]:
# Sanity check: group 1 is the module name, even with repeated whitespace.
python_import.search(b"import  something")[1]

b'something'

In [9]:
# Sanity check: group 1 is the module named between `from` and `import`.
python_fromimport.search(b"from something import *")[1]

b'something'

In [10]:
# Sanity check: group 1 is the header between the angle brackets.
cpp_include_bracket.search(b"#include<something>")[1]

b'something'

In [11]:
# Sanity check: group 1 is the header between the double quotes.
cpp_include_quote.search(b"#include\"something\"")[1]

b'something'

In [12]:
# Print (from a raw string, so the backslashes are shown literally) a sample
# line in which the quotes around the included header are backslash-escaped --
# presumably the way an #include appears inside a notebook's JSON source.
print(r'    "#include \"TMVA/MethodRSNNS.h\"\n",')

    "#include \"TMVA/MethodRSNNS.h\"\n",


In [13]:
# Sanity check: group 1 is the header between the backslash-escaped quotes.
ipynb_include_quotedquote.search(rb'    "#include \"TMVA/MethodRSNNS.h\"\n",')[1]

b'TMVA/MethodRSNNS.h'

In [14]:
def _patterns_for(name):
    """Return the ``(how, compiled pattern)`` pairs to apply to a member.

    ``.py``/``.PY`` files are searched for Python imports, ``.ipynb``/``.IPYNB``
    files for Python imports and every include form, and all other files for
    C/C++ includes only. The order of the pairs is the order in which rows
    are written for one file.
    """
    python_patterns = [
        ("python_import", python_import),
        ("python_fromimport", python_fromimport),
    ]
    cpp_patterns = [
        ("cpp_include_bracket", cpp_include_bracket),
        ("cpp_include_quote", cpp_include_quote),
    ]
    if name.endswith((".py", ".PY")):
        return python_patterns
    if name.endswith((".ipynb", ".IPYNB")):
        return python_patterns + cpp_patterns + [
            ("ipynb_include_quotedquote", ipynb_include_quotedquote),
        ]
    return cpp_patterns


def _suffix_field(name):
    """Return the text of the ``suffix`` column for a member called ``name``.

    The suffix is whatever follows the last dot of the final path component,
    or the empty string when that component has no dot (taking it from the
    whole path would yield values such as ``d/Makefile`` for
    ``dir.d/Makefile``). Plain suffixes are written bare; any suffix containing
    other characters (for instance a comma) is JSON-quoted, like the filename
    and package columns, so that it cannot break the row.
    """
    basename = name.rsplit("/", 1)[-1]
    suffix = basename.rsplit(".", 1)[1] if "." in basename else ""
    if re.fullmatch(r"[A-Za-z0-9_+\-]*", suffix):
        return suffix
    return json.dumps(suffix)


# Write one CSV row per import/include found in the archived copy of every
# non-fork repository. The filename and package columns are JSON-quoted.
with open(f"{PREFIX}/GitHub-CMSSW-user-nonfork-dependencies.csv", "w", encoding="utf-8") as outfile:
    outfile.write("owner,repo,created,updated,filename,suffix,package,how\n")
    for repo in userrepos:
        if repo["fork"]:
            continue

        owner, reponame = repo["full_name"].split("/")
        created_at = repo["created_at"].rstrip("Z")
        updated_at = repo["updated_at"].rstrip("Z")

        # Repositories without a downloaded archive are skipped.
        tarname = f"{PREFIX}/actual-repos-2/{owner}-{reponame}.tgz"
        if not os.path.exists(tarname):
            continue

        # These columns are written unquoted, so they must not contain the
        # delimiter.
        assert "," not in owner
        assert "," not in repo["full_name"]
        assert "," not in created_at
        assert "," not in updated_at
        row_prefix = f"{owner},{repo['full_name']},{created_at},{updated_at}"

        with tarfile.open(tarname) as tarball:
            for info in tarball:
                # Hard links and symbolic links are not followed.
                if info.type in (tarfile.LNKTYPE, tarfile.SYMTYPE):
                    continue

                # Passing the TarInfo itself avoids a by-name search through
                # the member list and always reads exactly this entry, even
                # when the archive contains the same name more than once.
                extracted = tarball.extractfile(info)
                if extracted is None:
                    # No file object is available for this member.
                    continue
                with extracted:
                    source = extracted.read()

                name = info.name
                quoted_name = json.dumps(name)
                suffix = _suffix_field(name)

                for how, pattern in _patterns_for(name):
                    for match in pattern.findall(source):
                        # Include patterns may capture arbitrary bytes (every
                        # non-Python file is scanned, binary ones included);
                        # undecodable bytes are replaced instead of aborting
                        # the whole export.
                        package = json.dumps(match.decode(errors="replace"))
                        outfile.write(f"{row_prefix},{quoted_name},{suffix},{package},{how}\n")

In [15]:
# Open one repository archive for interactive inspection. The path is built
# from PREFIX, like the archive paths of the export loop, instead of repeating
# the dataset location.
# NOTE: the archive stays open for the cells that follow; call tarball.close()
# when finished.
tarball = tarfile.open(f"{PREFIX}/actual-repos-2/dabarbosa10-HEP.tgz")

In [16]:
# Is there a member with exactly this name in the archive?
any(
    member_name == "dabarbosa10-HEP/2021/Aug/Func.C"
    for member_name in tarball.getnames()
)

True

In [17]:
# Show what extractfile() returns for this member. print() is needed because
# a bare expression evaluating to None displays nothing in a notebook. The
# recorded output is None, i.e. no file object is returned although the name
# is listed in the archive -- presumably the member is not a regular file
# (TODO confirm).
print(tarball.extractfile("dabarbosa10-HEP/2021/Aug/Func.C"))

None
