In [28]:
from pathlib import Path
import numpy as np
import json
from tqdm.notebook import tqdm
from collections import defaultdict

from zipfile import ZipFile
import hashlib

In [10]:
output = Path("../telegram-ml-contest-data/files")
repos = Path("../telegram-ml-contest-data/repos")

output.mkdir(exist_ok=True)

In [130]:
lang_suffixes = {
    "OpenEdgeABL": [".p", "w"],
    "F#": [".fs", "fsx", "fsi"],
    "Solidity": [".sol"],
    "C++": [".cpp"],
    "1C Enterprise": [".1cd", ".cf", ".os", ".bsl"],
    "APEX": [".cls"],
    "PLSQL": [".sql", ".plsql", ".pks", ".pkb", ".prc", ".fnc"],
    "Keyman": [".kmn"],
    "FunC": [".fc"],
    "FIFT": [".fif"],
    "Icon": [".icn"],
    "Lisp": [".lsp", ".rkt", ".el", ".lisp", ".lsp"],
    "CommonLisp": [".cl", ".lisp", ".l", ".asd", ".ny"],
    "Logo": [".nlogo"],
    "C#": [".cs"],
    "Bison": [".y", ".ypp", ".bison", ".yy", ".ll"],
    "Protobuf": [".proto"],
    "Smalltalk": [".st"],
    "Ada": [".adb", ".ads"],
    "ActionScript": [".as"],
    "ASP": [".asp"],
    "Delphi": [".dpr", ".dfm"],
    "Pascal": [".pas", ".pp"],
    "AWK": [".awk"],
    "D": [".d"],
    "Forth": [".fs"],
    "Scheme": [".scm", ".ss"],
    "VDScript": [".vd"],
    "IDL": [".idl"],
    "Basic": [".bas"],
    "Tcl": [".tcl", ".tk"],
    "Wolfram": [".nb", ".wl"],
    "QML": [".qml"],
    "GraphQL": [".graphql"],
    "AutoHotKey": [".ahk"],
    "Raku": [".raku", ".p6"],
    "AppleScript": [".applescript", ".scpt"],
    "Vala": [".vala"],
    "Elm": [".elm"],
    "NGINX": [".nginx", ".conf"],
    "Textile": [".textile"],
    "Gradle": [".gradle"],
    "UnrealScript": [".uc", ".usc"],
    "Regex": [".regex", ".regexp"],
    "Hack": [".hack", ".hh"],
    "TL": [".tl"],
    "Crystal": [".cr"],
    "SAS": [".sas", ".sas7bdat"],
    "Nim": [".nim", ".nims"],
    "GAMS": [".gms", ".gmy"],
    "ABAP": [".abap", ".ab"],
}

In [159]:
extracted_archives_path = output / "extracted_archives.json"
extracted_files_path = output / "extracted_files.json"

if extracted_archives_path.exists():
    with open(extracted_archives_path, "r") as file:
        extracted_archives = defaultdict(list, json.load(file))
    with open(extracted_files_path, "r") as file: 
        extracted_files = defaultdict(list, json.load(file))

    for lang in extracted_files:
        extracted_files[lang] = np.unique(extracted_files[lang]).tolist()
else:
    extracted_archives = defaultdict(list)
    extracted_files = defaultdict(list)

archives = list(repos.glob("*/*.zip"))

MAX_NUM_FILES = 10000

try:
    for archive in tqdm(archives):
        lang = archive.parent.name
        lang_dir = output / lang
        lang_dir.mkdir(exist_ok=True, parents=True)

        if len(extracted_files[lang]) > MAX_NUM_FILES:
            continue 

        if archive.name in extracted_archives.get(lang, []):
            continue

        with ZipFile(archive, 'r') as zip_ref:

            for fileref in zip_ref.namelist():
                filepath = Path(fileref)
                if filepath.suffix in lang_suffixes[lang]:
                    content = zip_ref.read(fileref)
                    filepath = lang_dir / filepath.name

                    sha = hashlib.sha1(content).hexdigest()
                    filepath = lang_dir / f"{sha}{lang_suffixes[lang][0]}"

                    filepath.write_bytes(content)
                    extracted_files[lang].append(filepath.name)

        extracted_archives[lang].append(archive.name)
        
finally:
    with open(extracted_archives_path, "w", encoding="utf8") as file:
        json.dump(extracted_archives, file, indent=4)
    
    with open(extracted_files_path, "w", encoding="utf8") as file:
        json.dump(extracted_files, file, indent=4)

  0%|          | 0/3864 [00:00<?, ?it/s]

In [160]:
total = 0

for lang, files in extracted_files.items():
    print(lang, len(files))
    total += len(files)

print(total)

FIFT 2141
OpenEdgeABL 8876
Lisp 1074
PLSQL 7679
F# 1227
Solidity 801
Icon 2469
FunC 704
C# 10032
CommonLisp 2976
C++ 11902
Keyman 1180
Logo 336
1C Enterprise 10288
APEX 4336
Bison 160
Protobuf 1580
Smalltalk 10022
Ada 11648
ASP 1268
ActionScript 14102
Pascal 13468
Delphi 6112
AWK 665
D 10396
IDL 159
Scheme 10193
Forth 2162
Tcl 5952
Basic 2898
Wolfram 871
QML 7681
GraphQL 2
Raku 1307
AutoHotKey 7045
Crystal 10204
Nim 4507
SAS 10127
NGINX 583
UnrealScript 12686
GAMS 2547
Hack 2912
AppleScript 967
ABAP 3681
Gradle 3
Elm 7883
Vala 7063
236875
