In [1]:
import json
import os
import random
import subprocess
import tokenize

import javalang

In [None]:
def tokenize_java(filepath):
    """Tokenize a Java source file into one space-separated string.

    Args:
        filepath: Path of the Java file; it is read as ISO-8859-1 text.

    Returns:
        The ``value`` of every token produced by
        ``javalang.tokenizer.tokenize``, joined with single spaces.
    """
    # The context manager closes the handle even when reading fails; the
    # previous bare open() left one descriptor open per processed file.
    with open(filepath, "r", encoding="ISO-8859-1") as java_file:
        source = java_file.read()
    return " ".join(token.value for token in javalang.tokenizer.tokenize(source))

In [None]:
def detokenize_java(s):
    """Re-format Java code given as a string, falling back to the input.

    Args:
        s: Java code as a string (``get_code_list`` passes the output of
            ``tokenize_java``).

    Returns:
        The result of ``javalang.tokenizer.reformat_tokens`` applied to the
        tokens of ``s``. If tokenizing or reformatting raises an
        ``Exception``, ``s`` is returned unchanged.
    """
    try:
        tokens = javalang.tokenizer.tokenize(s)
        return javalang.tokenizer.reformat_tokens(tokens)
    except Exception:
        # Best-effort fallback. Catching Exception instead of using a bare
        # except lets KeyboardInterrupt and SystemExit propagate.
        return s

In [None]:
def tokenize_python(filepath):
    """Tokenize a Python file into one space-separated string.

    Comments are dropped, together with an NL token that directly follows a
    comment. Layout tokens are replaced by marker words:

    * NEWLINE: the token text with ``"\\n"`` replaced by ``NEWLINE``
    * NL: ``NL``
    * INDENT: ``INDENT`` repeated once per whitespace character
    * DEDENT / ENDMARKER: ``DEDENT`` / ``ENDMARKER``

    Every other token contributes its source text unchanged.

    Args:
        filepath: Path of the Python file, opened with ``tokenize.open``.

    Returns:
        The kept token texts joined with single spaces.
    """
    fixed_markers = {
        tokenize.NL: "NL",
        tokenize.DEDENT: "DEDENT",
        tokenize.ENDMARKER: "ENDMARKER",
    }
    pieces = []
    previous = None
    with tokenize.open(filepath) as source:
        for tok in tokenize.generate_tokens(source.readline):
            kind = tok.type
            follows_comment = previous is not None and previous.type == tokenize.COMMENT
            if kind == tokenize.COMMENT or (follows_comment and kind == tokenize.NL):
                # Still remember the skipped token so the NL that ends a
                # comment-only line can be recognised on the next iteration.
                previous = tok
                continue

            if kind == tokenize.NEWLINE:
                text = tok.string.replace("\n", "NEWLINE")
            elif kind == tokenize.INDENT:
                if tok.string.isspace():
                    text = "INDENT" * len(tok.string)
                else:
                    text = tok.string.replace("\t", "INDENT")
            elif kind in fixed_markers:
                text = fixed_markers[kind]
            else:
                text = tok.string

            previous = tok
            pieces.append(text)
    return " ".join(pieces)

In [None]:
# Every marker word emitted by tokenize_python_codebert so far (NEWLINE, NL,
# the INDENT runs, DEDENT, ENDMARKER); shared across calls.
word_set = set()
def tokenize_python_codebert(filepath):
    """Tokenize a Python file while keeping the original token spacing.

    Comments (and an NL token directly after a comment) are dropped. Layout
    tokens become marker words prefixed with one space (DEDENT also gets a
    trailing space) and each marker is recorded in the module-level
    ``word_set``. Any other token is prefixed with a space only when the
    source has a space immediately before it or when it follows a NEWLINE
    token.

    Args:
        filepath: Path of the Python file, opened with ``tokenize.open``.

    Returns:
        The concatenated token texts as a single string.
    """
    pieces = []  # joined once at the end instead of repeated str +=
    with tokenize.open(filepath) as f:
        pre_token = None
        for token in tokenize.generate_tokens(f.readline):
            if token.type == tokenize.COMMENT or (
                token.type == tokenize.NL
                and pre_token is not None
                and pre_token.type == tokenize.COMMENT
            ):
                pre_token = token
                continue

            marker = None
            if token.type == tokenize.NEWLINE:
                marker = token.string.replace("\n", "NEWLINE")
            elif token.type == tokenize.NL:
                marker = "NL"
            elif token.type == tokenize.INDENT:
                if token.string.isspace():
                    marker = "INDENT" * len(token.string)
                else:
                    marker = token.string.replace("\t", "INDENT")
            elif token.type == tokenize.DEDENT:
                marker = "DEDENT"
            elif token.type == tokenize.ENDMARKER:
                marker = "ENDMARKER"

            if marker is not None:
                word_set.add(marker)
                val = " " + marker
                if token.type == tokenize.DEDENT:
                    val += " "
            else:
                column = token.start[1]
                # A token at column 0 has no preceding character. Indexing
                # line[column - 1] there would wrap around to the END of the
                # line (or raise IndexError on an empty line), so guard it.
                after_space = (
                    0 < column <= len(token.line)
                    and token.line[column - 1] == " "
                )
                after_newline = (
                    pre_token is not None and pre_token.type == tokenize.NEWLINE
                )
                if after_space or after_newline:
                    val = " " + token.string
                else:
                    val = token.string

            pre_token = token
            pieces.append(val)
    return "".join(pieces)

In [None]:
def write_to_file(lst, remark):
    """Append one summary entry to ``dataset.txt`` in the working directory.

    Args:
        lst: Iterable of numbers; only their sum is written.
        remark: Label placed before the sum.
    """
    entry = "{}: {} \n\n".format(remark, sum(lst))
    with open("dataset.txt", "a") as log_file:
        log_file.write(entry)

In [None]:
def get_leetcode_filepath(path, mini):
    """Collect ``[java_path, python_path]`` pairs from a LeetCode tree.

    The tree is walked as ``path/<directory>/<folder>/<files>``. Each folder
    contributes at most one pair, made of its ``.java`` file and one ``.py``
    file.

    Args:
        path: Root directory of the LeetCode data.
        mini: When true, prefer the ``.py`` file whose name starts with
            ``mini_``; otherwise prefer the one that does not. If the
            preferred kind is missing the other kind is used.

    Returns:
        A list of ``[java_path, python_path]`` lists. Folders that lack a
        Java file or a Python file are skipped.
    """
    file_count = 0
    file_paths = []
    # Listings are sorted because os.listdir order is arbitrary.
    for directory in sorted(os.listdir(path)):
        directory_path = os.path.join(path, directory)
        if not os.path.isdir(directory_path):
            continue
        for folder in sorted(os.listdir(directory_path)):
            folder_path = os.path.join(directory_path, folder)
            if not os.path.isdir(folder_path):
                continue

            # Reset per folder so a file from the previous folder can never
            # be paired with this one.
            javafile = None
            mini_python = None
            plain_python = None
            for file in sorted(os.listdir(folder_path)):
                # splitext copes with names that have no dot or several dots.
                extension = os.path.splitext(file)[1]
                if extension == ".java":
                    javafile = file
                elif extension == ".py":
                    if file.startswith("mini_"):
                        mini_python = file
                    else:
                        plain_python = file

            if mini:
                pythonfile = mini_python or plain_python
            else:
                pythonfile = plain_python or mini_python
            if javafile is None or pythonfile is None:
                continue

            file_count += 1
            file_paths.append([
                os.path.join(path, directory, folder, javafile),
                os.path.join(path, directory, folder, pythonfile),
            ])
    print(f"LeetCode File Count:{file_count}")
    return file_paths

In [None]:
def get_codejam_filepath(path, mini):
    """Collect ``[java_path, python_path]`` pairs from a CodeJam tree.

    The tree is walked as ``path/<directory>/<folder>/<files>``. Every
    ``<name>.java`` file is paired with ``<name>.py`` (or ``mini_<name>.py``
    when ``mini`` is true) if that Python file exists in the same folder.

    Args:
        path: Root directory of the CodeJam data.
        mini: Whether to pair with the ``mini_``-prefixed Python file.

    Returns:
        A list of ``[java_path, python_path]`` lists.
    """
    file_count = 0
    file_paths = []
    python_prefix = "mini_" if mini else ""

    # Listings are sorted because os.listdir order is arbitrary.
    for directory in sorted(os.listdir(path)):
        directory_path = os.path.join(path, directory)
        # Stray files would make os.listdir raise NotADirectoryError.
        if not os.path.isdir(directory_path):
            continue
        for folder in sorted(os.listdir(directory_path)):
            folder_path = os.path.join(directory_path, folder)
            if not os.path.isdir(folder_path):
                continue
            files = sorted(os.listdir(folder_path))
            available = set(files)
            for file in files:
                # Compare the real extension: a substring test for "java"
                # would also accept names such as "javascript.py".
                stem, extension = os.path.splitext(file)
                if extension != ".java":
                    continue
                pythonfile = python_prefix + stem + ".py"
                if pythonfile not in available:
                    continue
                file_count += 1
                file_paths.append([
                    os.path.join(path, directory, folder, file),
                    os.path.join(path, directory, folder, pythonfile),
                ])
    print(f"CodeJam File Count:{file_count}")
    return file_paths

In [None]:
def get_codeforces_filepath(path, mini):
    """Collect ``[java_path, python_path]`` pairs from a CodeForces tree.

    The tree is walked as ``path/<directory>/<folder>/<files>``. Every
    ``<name>.java`` file is paired with ``<name>.py`` (or ``mini_<name>.py``
    when ``mini`` is true) if that Python file exists in the same folder.

    Args:
        path: Root directory of the CodeForces data.
        mini: Whether to pair with the ``mini_``-prefixed Python file.

    Returns:
        A list of ``[java_path, python_path]`` lists.
    """
    file_count = 0
    file_paths = []
    python_prefix = "mini_" if mini else ""

    # Listings are sorted because os.listdir order is arbitrary.
    for directory in sorted(os.listdir(path)):
        directory_path = os.path.join(path, directory)
        if not os.path.isdir(directory_path):
            continue
        for folder in sorted(os.listdir(directory_path)):
            folder_path = os.path.join(directory_path, folder)
            # A stray file at this level would make os.listdir raise.
            if not os.path.isdir(folder_path):
                continue
            files = sorted(os.listdir(folder_path))
            available = set(files)
            for file in files:
                # Compare the real extension: a substring test for "java"
                # would also accept names such as "javascript.py".
                stem, extension = os.path.splitext(file)
                if extension != ".java":
                    continue
                pythonfile = python_prefix + stem + ".py"
                if pythonfile not in available:
                    continue
                file_count += 1
                file_paths.append([
                    os.path.join(path, directory, folder, file),
                    os.path.join(path, directory, folder, pythonfile),
                ])
    print(f"CodeForces File Count:{file_count}")
    return file_paths

In [None]:
def get_geeksforgeeks_filepath(path, mini):
    """Collect ``[java_path, python_path]`` pairs from a GeeksForGeeks tree.

    The tree is walked as ``path/<directory>/<folder>/<files>``. Each folder
    contributes at most one pair, made of its ``.java`` file and one ``.py``
    file.

    Args:
        path: Root directory of the GeeksForGeeks data.
        mini: When true, prefer the ``.py`` file whose name starts with
            ``mini_``; otherwise prefer the one that does not. If the
            preferred kind is missing the other kind is used.

    Returns:
        A list of ``[java_path, python_path]`` lists. Folders that lack a
        Java file or a Python file are skipped.
    """
    file_count = 0
    file_paths = []

    # Listings are sorted because os.listdir order is arbitrary.
    for directory in sorted(os.listdir(path)):
        directory_path = os.path.join(path, directory)
        if not os.path.isdir(directory_path):
            continue
        for folder in sorted(os.listdir(directory_path)):
            folder_path = os.path.join(directory_path, folder)
            if not os.path.isdir(folder_path):
                continue

            # Reset per folder so a file from the previous folder can never
            # be paired with this one.
            javafile = None
            mini_python = None
            plain_python = None
            for file in sorted(os.listdir(folder_path)):
                # Only .py files count as Python sources. Previously every
                # non-Java file did, and both non-Java branches were the
                # same, so `mini` had no effect on the choice.
                extension = os.path.splitext(file)[1]
                if extension == ".java":
                    javafile = file
                elif extension == ".py":
                    if file.startswith("mini_"):
                        mini_python = file
                    else:
                        plain_python = file

            if mini:
                pythonfile = mini_python or plain_python
            else:
                pythonfile = plain_python or mini_python
            if javafile is None or pythonfile is None:
                continue

            file_count += 1
            file_paths.append([
                os.path.join(path, directory, folder, javafile),
                os.path.join(path, directory, folder, pythonfile),
            ])
    print(f"GeetksForGeeks File Count:{file_count}")
    return file_paths

In [None]:
def get_projecteuler_filepath(path, mini):
    """Collect ``[java_path, python_path]`` pairs from a ProjectEuler tree.

    The tree is walked as ``path/<directory>/<folder>/<files>``. Each folder
    contributes at most one pair, made of its ``.java`` file and one ``.py``
    file.

    Args:
        path: Root directory of the ProjectEuler data.
        mini: When true, prefer the ``.py`` file whose name starts with
            ``mini_``; otherwise prefer the one that does not. If the
            preferred kind is missing the other kind is used.

    Returns:
        A list of ``[java_path, python_path]`` lists. Folders that lack a
        Java file or a Python file are skipped.
    """
    file_count = 0
    file_paths = []

    # Listings are sorted because os.listdir order is arbitrary.
    for directory in sorted(os.listdir(path)):
        directory_path = os.path.join(path, directory)
        if not os.path.isdir(directory_path):
            continue
        for folder in sorted(os.listdir(directory_path)):
            folder_path = os.path.join(directory_path, folder)
            if not os.path.isdir(folder_path):
                continue

            # Reset per folder so a file from the previous folder can never
            # be paired with this one.
            javafile = None
            mini_python = None
            plain_python = None
            for file in sorted(os.listdir(folder_path)):
                # Only .py files count as Python sources. Previously every
                # non-Java file did, and both non-Java branches were the
                # same, so `mini` had no effect on the choice.
                extension = os.path.splitext(file)[1]
                if extension == '.java':
                    javafile = file
                elif extension == '.py':
                    if file.startswith("mini_"):
                        mini_python = file
                    else:
                        plain_python = file

            if mini:
                pythonfile = mini_python or plain_python
            else:
                pythonfile = plain_python or mini_python
            if javafile is None or pythonfile is None:
                continue

            file_count += 1
            file_paths.append([
                os.path.join(path, directory, folder, javafile),
                os.path.join(path, directory, folder, pythonfile),
            ])
    print(f"ProjectEuler File Count:{file_count}")
    return file_paths

In [None]:
def cleanup(filepaths):
    """Reformat the Python file of every pair in place with ``autopep8``.

    Args:
        filepaths: Iterable of ``[java_path, python_path]`` pairs; only the
            element at index 1 is used.

    Each file is processed with ``autopep8 --in-place --aggressive
    --aggressive``. A failure for one file is reported and does not stop the
    remaining files.
    """
    for filepath in filepaths:
        python_path = filepath[1]
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact, and run() waits for the tool to finish so
        # the completion message is truthful.
        command = ["autopep8", "--in-place", "--aggressive", "--aggressive", python_path]
        try:
            result = subprocess.run(command, check=False)
        except OSError as exc:
            print(f"Failed cleaning: {python_path} ({exc})")
            continue
        if result.returncode != 0:
            print(f"Failed cleaning: {python_path} (exit code {result.returncode})")
            continue
        print(f"Completed cleaning: {python_path}")

In [None]:
def minify(filepaths):
    """Write a ``pyminifier`` version of every Python file next to it.

    Args:
        filepaths: Iterable of ``[java_path, python_path]`` pairs.

    Returns:
        A list with one ``[java_path, mini_python_path]`` entry per input
        pair, where ``mini_python_path`` is the Python path with ``mini_``
        prepended to the file name. An entry is returned even when the tool
        failed for that file (the failure is printed).
    """
    new_filepaths = []
    for filepath in filepaths:
        java_path, python_path = filepath[0], filepath[1]
        mini_filepath = os.path.join(
            os.path.dirname(python_path),
            "mini_" + os.path.basename(python_path),
        )
        try:
            # The tool's stdout is sent straight to the target file. No shell
            # is involved, so unusual characters in paths are safe, and run()
            # waits for completion before the message is printed.
            with open(mini_filepath, "wb") as mini_file:
                result = subprocess.run(
                    ["pyminifier", python_path], stdout=mini_file, check=False
                )
        except OSError as exc:
            print(f"Failed processing {python_path} ({exc})")
        else:
            if result.returncode != 0:
                print(f"Failed processing {python_path} (exit code {result.returncode})")
            else:
                print(f"Completed processing {python_path} to {mini_filepath}")
        new_filepaths.append([java_path, mini_filepath])
    return new_filepaths

In [None]:
def get_code_list(filepaths, min_length=5, max_length=450):
    """Tokenize every Java/Python pair and build the dataset records.

    Args:
        filepaths: Iterable of ``[java_path, python_path]`` pairs.
        min_length: Smallest accepted length, in characters, of the
            ASCII-only plain-tokenized Java and Python strings.
        max_length: Largest accepted length, in characters, of those strings.

    Returns:
        ``(dic_list, codebert_dic_list)``: two parallel lists of dicts with
        the keys ``id``, ``java_code`` and ``python_code``. The first list
        holds the plain tokenization, the second the CodeBERT-style one.
        ``id`` is the Java file's directory path with ``/`` replaced by
        ``_`` followed by a running counter of accepted pairs.

    Pairs whose Python tokenization is empty, whose lengths fall outside the
    bounds, or whose processing raises an ``Exception`` are skipped.
    """
    valid_count = 0
    error_count = 0
    dic_list = []
    codebert_dic_list = []
    for filepath in filepaths:
        try:
            java_code = tokenize_java(filepath[0])
            python_code = tokenize_python(filepath[1])

            ## for codebert
            # Reuse java_code instead of tokenizing the same file again.
            codebert_java_code = " ".join(detokenize_java(java_code).split())
            codebert_python_code = tokenize_python_codebert(filepath[1])

            # A Python file with no real tokens yields only these markers.
            if python_code == "NL ENDMARKER" or python_code == "ENDMARKER":
                continue

            print("Counting files...")
            print(f"File paths: {filepath}")
            print(f"Java Code Length: {len(java_code)}, Python Code Length: {len(python_code)}")
            # Drop every non-ASCII character.
            decoded_java = java_code.encode("ascii", "ignore").decode()
            decoded_python = python_code.encode("ascii", "ignore").decode()

            ## for codebert
            codebert_decoded_java = codebert_java_code.encode("ascii", "ignore").decode()
            codebert_decoded_python = codebert_python_code.encode("ascii", "ignore").decode()

            if not (min_length <= len(decoded_java) <= max_length):
                continue
            if not (min_length <= len(decoded_python) <= max_length):
                continue
            valid_count += 1
            print(f"Valid File Count:{valid_count}, Java Code Length: {len(java_code)}, Python Code Length: {len(python_code)}")

            directory_parts = filepath[0].split("/")[:-1]
            record_id = "_".join(directory_parts) + "_" + str(valid_count)

            dic_list.append({
                "id": record_id,
                "java_code": decoded_java,
                "python_code": decoded_python,
            })

            ## for codebert
            codebert_dic_list.append({
                "id": record_id,
                "java_code": codebert_decoded_java,
                "python_code": codebert_decoded_python,
            })
        except Exception as exc:
            # Skip the broken pair but say which one failed and why. A bare
            # except would also swallow KeyboardInterrupt, making a long run
            # impossible to stop.
            error_count += 1
            print(f"An error occurred while processing {filepath}: {exc!r}")
    print(f"Error Count:{error_count}")
    return dic_list, codebert_dic_list

In [None]:
# Gather the non-minified [java_path, python_path] pairs of every source.
filepaths = []
for collect_pairs, source_dir in (
    (get_leetcode_filepath, "java-python/LeetCode"),
    (get_codejam_filepath, "java-python/CodeJam"),
    (get_codeforces_filepath, "java-python/CodeForces"),
    (get_geeksforgeeks_filepath, "java-python/GeeksForGeeks"),
):
    filepaths.extend(collect_pairs(source_dir, False))

# ProjectEuler pairs are also kept in their own list.
pe_filepaths = get_projecteuler_filepath("java-python/ProjectEuler", False)
filepaths.extend(pe_filepaths)

# Disabled one-off steps (left commented out, as in the original workflow):
# cleanup(pe_filepaths)
# mini_filepaths = minify(filepaths)
print(len(filepaths))

In [None]:
# Gather the [java_path, python_path] pairs again, this time asking every
# collector for the "mini_" Python files.
mini_filepaths = []
for collect_pairs, source_dir in (
    (get_leetcode_filepath, "java-python/LeetCode"),
    (get_codejam_filepath, "java-python/CodeJam"),
    (get_codeforces_filepath, "java-python/CodeForces"),
    (get_geeksforgeeks_filepath, "java-python/GeeksForGeeks"),
    (get_projecteuler_filepath, "java-python/ProjectEuler"),
):
    mini_filepaths.extend(collect_pairs(source_dir, True))

mini_pair_count = len(mini_filepaths)
write_to_file([mini_pair_count], "File Count (after minification)")
print(mini_pair_count)

In [None]:
# Shuffle with a fixed seed: get_code_list numbers the accepted pairs in
# iteration order and embeds that number in each record id, so an unseeded
# shuffle produced different ids and record order on every run.
# NOTE(review): full reproducibility also needs mini_filepaths itself to be
# built in a stable order.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(mini_filepaths)
code_list, codebert_code_list = get_code_list(mini_filepaths)

In [None]:
# Debug aid: dump every collected [java_path, python_path] pair of the
# non-minified set (the whole list is printed, which can be long).
print(filepaths)

In [None]:
# Log how many programs passed the length filter, then show the sizes of
# both record lists (get_code_list appends to them in lockstep).
write_to_file([len(code_list)], "No. of Programs (5 <= length <= 450)")
print(len(code_list))
print(len(codebert_code_list))

In [None]:
# Wrap the plain-tokenized records under a top-level "codes" key and write
# them, pretty-printed with a 4-space indent, to code450.json.
data = {"codes": code_list}
json_object = json.dumps(data, indent = 4)

with open("code450.json", "w") as outfile:
    outfile.write(json_object)                

In [None]:
# Same layout as code450.json, but holding the CodeBERT-style records.
codebert_data = {"codes": codebert_code_list}
codebert_json_object = json.dumps(codebert_data, indent = 4)

with open("codebert_code450.json", "w") as outfile:
    outfile.write(codebert_json_object)                

In [None]:
# Persist the marker words collected by tokenize_python_codebert as one
# comma-separated line. The words are sorted because iterating a set of
# strings can yield a different order on every interpreter run, which made
# the file differ between otherwise identical runs.
with open("words450.txt", "w") as outfile:
    outfile.write(",".join(sorted(word_set)))

In [None]:
# Spot-check: display the first record (raises IndexError if code_list is empty).
code_list[0]

In [None]:
# Tally the final records per source by looking for the source name inside
# each record id, then append one summary line per source to the log file.
source_names = ("LeetCode", "CodeJam", "CodeForces", "GeeksForGeeks", "ProjectEuler")
source_counts = dict.fromkeys(source_names, 0)
for code in code_list:
    code_id = code["id"]
    # The first matching name wins, exactly like an if/elif chain.
    matched_source = next((name for name in source_names if name in code_id), None)
    if matched_source is not None:
        source_counts[matched_source] += 1

leetcode = source_counts["LeetCode"]
code_jam = source_counts["CodeJam"]
code_forces = source_counts["CodeForces"]
geeks_for_geeks = source_counts["GeeksForGeeks"]
project_euler = source_counts["ProjectEuler"]

for final_count, remark in (
    (leetcode, "Final LeetCode File Count"),
    (code_jam, "Final CodeJam File Count"),
    (code_forces, "Final CodeForces File Count"),
    (geeks_for_geeks, "Final GeeksForGeeks File Count"),
    (project_euler, "Final ProjectEuler File Count"),
):
    write_to_file([final_count], remark)
    