In [1]:
import re

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gensim
from gensim.models import Word2Vec
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split

from src.dataset import read_lang_dataset, tokenize_dataset, get_vocab_mapping

In [2]:
from src.dataset import gateways, classpath
gateways[0].jvm.com.codetokenizer.Tokenizer.tokenizeCsharp("""
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Linq;
using Microsoft.CodeAnalysis.CSharp.Symbols;

namespace Microsoft.CodeAnalysis.CSharp
{
    internal static class CSharpCompilationExtensions
    {
        int member;
        internal static bool IsFeatureEnabled(this CSharpCompilation compilation, MessageID feature)
        {
            var a = 5;
            int b = 0x5 + a;
            var tst = "abc" + 'd';
            var t = 56f + member;
            return null;
        }
    }
}
""")

['using', 'System', '.', 'Linq', ';', 'using', 'Microsoft', '.', 'CodeAnalysis', '.', 'CSharp', '.', 'Symbols', ';', 'namespace', 'Microsoft', '.', 'CodeAnalysis', '.', 'CSharp', '{', 'internal', 'static', 'class', 'CSharpCompilationExtensions', '{', 'int', 'VARIABLE', ';', 'internal', 'static', 'bool', 'IsFeatureEnabled', '(', 'this', 'CSharpCompilation', 'VARIABLE', ',', 'MessageID', 'VARIABLE', ')', '{', 'var', 'VARIABLE', '=', 'INT_LITERAL', ';', 'int', 'VARIABLE', '=', 'INT_LITERAL', '+', 'VARIABLE', ';', 'var', 'VARIABLE', '=', 'STRING_LITERAL', '+', 'STRING_LITERAL', ';', 'var', 'VARIABLE', '=', 'FLOAT_LITERAL', '+', 'VARIABLE', ';', 'return', 'null', ';', '}', '}', '}', '<EOF>']

In [3]:
gateways[0].jvm.com.codetokenizer.Tokenizer.tokenizeCpp("""
#include <iostream>
using namespace std;

int main()
{
    int a = 5, b = 10, temp;

    cout << "Before swapping." << endl;
    cout << "a = " << a << ", b = " << b << endl;

    temp = a;
    a = b;
    b = temp;

    cout << "\nAfter swapping." << endl;
    cout << "a = " << a << ", b = " << b << endl;

    return 0;
}
""")

['using', 'namespace', 'std', ';', 'int', 'main', '(', ')', '{', 'int', 'VARIABLE', '=', 'INT_LITERAL', ',', 'VARIABLE', '=', 'INT_LITERAL', ',', 'VARIABLE', ';', 'cout', '<', '<', 'STRING_LITERAL', '<', '<', 'endl', ';', 'cout', '<', '<', 'STRING_LITERAL', '<', '<', 'VARIABLE', '<', '<', 'STRING_LITERAL', '<', '<', 'VARIABLE', '<', '<', 'endl', ';', 'VARIABLE', '=', 'VARIABLE', ';', 'VARIABLE', '=', 'VARIABLE', ';', 'VARIABLE', '=', 'VARIABLE', ';', 'cout', '<', '<', 'After', 'using', 'namespace', 'std', ';', 'int', 'VARIABLE', '(', ')', '{', 'int', 'VARIABLE', '=', 'INT_LITERAL', ',', 'VARIABLE', '=', 'INT_LITERAL', ',', 'VARIABLE', ';', 'cout', '<', '<', 'STRING_LITERAL', '<', '<', 'endl', ';', 'cout', '<', '<', 'STRING_LITERAL', '<', '<', 'VARIABLE', '<', '<', 'STRING_LITERAL', '<', '<', 'VARIABLE', '<', '<', 'endl', ';', 'VARIABLE', '=', 'VARIABLE', ';', 'VARIABLE', '=', 'VARIABLE', ';', 'VARIABLE', '=', 'VARIABLE', ';', 'cout', '<', '<', 'After', 'swapping', '.', 'cout', '<', '<'

In [4]:
gateways[0].jvm.com.codetokenizer.Tokenizer.tokenizeGo("""
package samples

import "fmt"

type HpType struct {
}

func (c HpType) HP() {
	password := `hardcoded`

	fmt.Printf("Hello, world\nYou type the password=%v\n", password)
	letters := []string{"a", "b", "c", "d"}
	letters[3] = "e"

	p := make([]string, 10)
	p = append(letters, "e", "f")
	fmt.Println(letters, len(letters), cap(letters))
	fmt.Println(p, len(p), cap(p))
}
""")

['package', 'samples', '\n\n', 'import', 'STRING_LITERAL', '\n\n', 'type', 'HpType', 'struct', '{', '}', '\n\n', 'func', '(', 'VARIABLE', 'HpType', ')', 'HP', '(', ')', '{', 'VARIABLE', ':=', 'STRING_LITERAL', '\n\n', 'fmt', '.', 'Printf', '(', 'STRING_LITERAL', ',', 'VARIABLE', ')', '\n', 'VARIABLE', ':=', '[', ']', 'string', '{', 'STRING_LITERAL', ',', 'STRING_LITERAL', ',', 'STRING_LITERAL', ',', 'STRING_LITERAL', '}', '\n', 'VARIABLE', '[', 'INT_LITERAL', ']', '=', 'STRING_LITERAL', '\n\n', 'VARIABLE', ':=', 'make', '(', '[', ']', 'string', ',', 'INT_LITERAL', ')', '\n', 'VARIABLE', '=', 'append', '(', 'VARIABLE', ',', 'STRING_LITERAL', ',', 'STRING_LITERAL', ')', '\n', 'fmt', '.', 'Println', '(', 'VARIABLE', ',', 'len', '(', 'VARIABLE', ')', ',', 'cap', '(', 'VARIABLE', ')', ')', '\n', 'fmt', '.', 'Println', '(', 'VARIABLE', ',', 'len', '(', 'VARIABLE', ')', ',', 'cap', '(', 'VARIABLE', ')', ')', '\n', '}', '\n', '<EOF>']

In [5]:
pycode = " ".join(gateways[0].jvm.com.codetokenizer.Tokenizer.tokenizePython3("""
from functools import partial
from typing import Dict, List, Optional, Tuple
from multiprocessing import Pool
import platform

from tqdm import tqdm
import pandas as pd
import numpy as np
import swifter
from swifter import set_defaults
import sqlite3
from py4j.java_gateway import JavaGateway, launch_gateway, GatewayParameters

set_defaults(allow_dask_on_strings=True, progress_bar=True)

STRING_LITERAL_TOKEN = "STRING_LITERAL"
INT_LITERAL_TOKEN = "INT_LITERAL"


def _generic_regex_tokenization(code: pd.Series):
    # TODO: maybe consider \t seperately for python and other such languages
    vars_or_keywords = r"\w+"
    dot_operator = r"\."
    # parantheses and other similar constructs
    parantheses_like = r"[<>/\\{}[\]()'\"]"
    # almost \W, but with some whitespaces. Captures rest of characters.
    non_words = r"[^a-zA-Z0-9_ \t\n\.<>/\\{}[\]()'\"]+"
    generic_regex = (
        rf"({vars_or_keywords}|{dot_operator}|{parantheses_like}|{non_words})"
    )

    return (
        code.str.lower()
        .str.replace(r"'(\\.|[^'\\])*'", f" {STRING_LITERAL_TOKEN} ", regex=True)
        .str.replace(r'"(\\.|[^"\\])*"', f" {STRING_LITERAL_TOKEN} ", regex=True)
        .str.replace(r"0x(\d|\w)+", f" {INT_LITERAL_TOKEN} ", regex=True)
        .str.replace(r"\d+", f" {INT_LITERAL_TOKEN} ", regex=True)
        .str.findall(generic_regex)
    )


pool_size = 8
# "create_parser.(sh|bat)" script will create this
jarpath = (
    "./src/w2vtokenizer/target/w2vtokenizer-0.0.1-SNAPSHOT-jar-with-dependencies.jar"
)
classpath_seperator = ";" if platform.system() == "Windows" else ":"
classpath = classpath_seperator.join([jarpath])

gateway_port = launch_gateway(classpath=classpath, die_on_exit=True)
gateways = [
    JavaGateway(gateway_parameters=GatewayParameters(port=gateway_port))
    for _ in range(pool_size)
]


def code_tokenize_par(t, function_name):
    i, code = t
    res = getattr(
        gateways[i % pool_size].jvm.com.codetokenizer.Tokenizer, function_name
    )(code)
    return list(res)


def _antlr_tokenization(code: pd.Series, function_name: str):
    with Pool(pool_size) as p:
        tokenized_code = pd.Series(
            tqdm(
                p.imap(
                    partial(code_tokenize_par, function_name=function_name),
                    enumerate(code),
                ),
                total=len(code),
                smoothing=0.01,
            ),
            index=code.index,
            dtype=object,
        )
    return tokenized_code


def _cpp_tokenization(code: pd.Series):
    return _antlr_tokenization(code, "tokenizeCpp")


def _csharp_tokenization(code: pd.Series):
    return _antlr_tokenization(code, "tokenizeCsharp")


SPECIALIZED_TOKENIZATION = {"C++": _cpp_tokenization, "C#": _csharp_tokenization}


def read_snippets_dataset(
    db_file_path: str, programming_language: Optional[str] = None
) -> pd.DataFrame:
    conn = sqlite3.connect(db_file_path)
    cur = conn.cursor()

    # check if database contains table "progress"
    foo = cur.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='progress'"
    )
    # "our" database
    if len(list(foo)) == 0:
        if programming_language is None:
            snippets = cur.execute("SELECT language, snippet FROM snippets")
        else:
            snippets = cur.execute(
                f"SELECT language, snippet FROM snippets WHERE language='{programming_language}'"
            )
    else:
        if programming_language is None:
            snippets = cur.execute("SELECT language, content FROM code")
        else:
            snippets = cur.execute(
                f"SELECT language, content FROM code WHERE language='{programming_language}'"
            )

    return pd.DataFrame(snippets, columns=["language", "code"])


def read_lang_dataset(db_file_path: str) -> pd.DataFrame:
    conn = sqlite3.connect(db_file_path)
    cur = conn.cursor()
    data = cur.execute("SELECT language, content FROM code")
    data = pd.DataFrame(data, columns=["language", "code"])
    data.code = data.code.str.decode("utf-8", errors="replace")
    return data


def tokenize_dataset(dataset: pd.DataFrame):
    dataset = dataset.copy()
    for language in dataset.language.unique():
        code_selection = dataset.code[dataset.language == language]
        if language in SPECIALIZED_TOKENIZATION:
            dataset.code[dataset.language == language] = SPECIALIZED_TOKENIZATION[
                language
            ](code_selection)
        else:
            dataset.code[dataset.language == language] = _generic_regex_tokenization(
                code_selection
            )
    return dataset


def get_vocab_mapping(
    whole_tokenized_dataset: pd.DataFrame,
) -> Tuple[Dict[str, int], List[str]]:
    # TODO: initialize with whole vocab
    words = set()
    whole_tokenized_dataset["code"].apply(words.update)
    words = sorted(words)
    int2word = words
    word2int = {w: i for i, w in enumerate(words)}
    return word2int, int2word
"""))
pycode

'\n from functools import partial \n from typing import Dict , List , Optional , Tuple \n from multiprocessing import Pool \n import platform \n BOOL_LITERAL , BOOL_LITERAL \n if BOOL_LITERAL :        pass \n \n from tqdm import tqdm \n import pandas as pd \n import numpy as np \n import swifter \n from swifter import set_defaults \n import sqlite3 \n from py4j . java_gateway import JavaGateway , launch_gateway , GatewayParameters \n set_defaults ( VARIABLE = BOOL_LITERAL , VARIABLE = BOOL_LITERAL ) \n VARIABLE = STRING_LITERAL \n VARIABLE = STRING_LITERAL \n def _generic_regex_tokenization ( VARIABLE : pd . Series ) :        VARIABLE = STRING_LITERAL   VARIABLE = STRING_LITERAL   VARIABLE = STRING_LITERAL   VARIABLE = r [ VARIABLE - VARIABLE - VARIABLE - FLOAT_LITERAL _ \n { } [ ] ( ) STRING_LITERAL   VARIABLE = ( STRING_LITERAL )   return ( code . str . lower ( ) . str . replace ( STRING_LITERAL , STRING_LITERAL , regex = BOOL_LITERAL ) . str . replace ( STRING_LITERAL , STRING_LITER

In [15]:
print(pycode)


 from functools import partial 
 from typing import Dict , List , Optional , Tuple 
 from multiprocessing import Pool 
 import platform 
 from tqdm import tqdm 
 import pandas as pd 
 import numpy as np 
 import swifter 
 from swifter import set_defaults 
 import sqlite3 
 from py4j . java_gateway import JavaGateway , launch_gateway , GatewayParameters 
 set_defaults ( VARIABLE = True , VARIABLE = True ) 
 VARIABLE = STRING_LITERAL 
 VARIABLE = STRING_LITERAL 
 def _generic_regex_tokenization ( VARIABLE : pd . Series ) :        VARIABLE = STRING_LITERAL   VARIABLE = STRING_LITERAL   VARIABLE = STRING_LITERAL   VARIABLE = r [ VARIABLE - VARIABLE - VARIABLE - 9 _ 
 { } [ ] ( ) STRING_LITERAL   VARIABLE = ( STRING_LITERAL )   return ( code . str . lower ( ) . str . replace ( STRING_LITERAL , STRING_LITERAL , regex = True ) . str . replace ( STRING_LITERAL , STRING_LITERAL , regex = True ) . str . replace ( STRING_LITERAL , STRING_LITERAL , regex = True ) . str . replace ( STRING_LITERAL 

In [6]:
csharp_df = read_lang_dataset("data/csharp_codes.db")
csharp_df

Unnamed: 0,language,code
0,C#,using System;\nusing System.Collections.Generi...
1,C#,using System;\nusing System.Collections.Generi...
2,C#,using System.Collections.Generic;\nusing Syste...
3,C#,using System;\nusing System.Windows;\nusing Sy...
4,C#,using System;\nusing System.Collections.Generi...
...,...,...
1108,C#,using System;\nusing System.Collections.Generi...
1109,C#,// <auto-generated />\nusing System;\nusing Mi...
1110,C#,"#pragma checksum ""C:\Users\gault\Desktop\Effic..."
1111,C#,using System;\nusing System.Collections.Generi...


In [7]:
sample = csharp_df.sample(2)
sample

Unnamed: 0,language,code
1024,C#,/*********************************************...
51,C#,//--------------------------------------------...


In [8]:
tokenized_sample = tokenize_dataset(sample)
tokenized_sample

100%|██████████| 2/2 [00:01<00:00,  1.06it/s]


Unnamed: 0,language,code
1024,C#,"[using, System, ;, using, System, ., Component..."
51,C#,"[using, System, ., Collections, ., Generic, ;,..."


In [9]:
sample.code.iloc[1]

'//---------------------------------------------------------------------------\n//\n// Copyright (C) Microsoft Corporation.  All rights reserved.\n// \n// File: ContentTextElementAutomationPeer.cs\n//\n// Description: Base class for AutomationPeers associated with TextPattern.\n//\n//---------------------------------------------------------------------------\n\nusing System.Collections.Generic;           // List<T>\nusing System.Windows.Automation.Provider;   // IRawElementProviderSimple\nusing System.Windows.Documents;             // ITextPointer\n\nnamespace System.Windows.Automation.Peers\n{\n    /// <summary>\n    /// Base class for AutomationPeers associated with TextPattern.\n    /// </summary>\n    public abstract class TextAutomationPeer : FrameworkElementAutomationPeer\n    {\n        /// <summary>\n        /// Constructor.\n        /// </summary>\n        protected TextAutomationPeer(FrameworkElement owner)\n            : base(owner)\n        {}\n\n        /// <summary>\n    

In [10]:
' '.join(tokenized_sample.code.iloc[1][:550])

'using System . Collections . Generic ; using System . Windows . Automation . Provider ; using System . Windows . Documents ; namespace System . Windows . Automation . Peers { public abstract class TextAutomationPeer : FrameworkElementAutomationPeer { protected TextAutomationPeer ( FrameworkElement VARIABLE ) : base ( VARIABLE ) { } override protected string GetNameCore ( ) { string VARIABLE = AutomationProperties . GetName ( this . Owner ) ; if ( string . IsNullOrEmpty ( VARIABLE ) ) { AutomationPeer VARIABLE = GetLabeledByCore ( ) ; if ( VARIABLE != null ) { VARIABLE = VARIABLE . GetName ( ) ; } } return VARIABLE ?? string . Empty ; } internal new IRawElementProviderSimple ProviderFromPeer ( AutomationPeer VARIABLE ) { return base . ProviderFromPeer ( VARIABLE ) ; } internal DependencyObject ElementFromProvider ( IRawElementProviderSimple VARIABLE ) { DependencyObject VARIABLE = null ; AutomationPeer VARIABLE = PeerFromProvider ( VARIABLE ) ; if ( VARIABLE is UIElementAutomationPeer 

In [11]:
np.random.seed(42)
sample = data.sample(3)
for i, language, code in sample.itertuples():
    print(f"###### Index {i:7} ######")
    print(code)
    print(f"###########################")

NameError: name 'data' is not defined

In [None]:
sample = tokenize_dataset(sample)
sample

100%|██████████| 3/3 [00:01<00:00,  2.06it/s]


Unnamed: 0,language,code
77165,C++,[]
307910,C++,[]
79469,C++,[]


In [None]:
for language, code in sample.itertuples(False):
    print(code)

[]
[]
[]


In [None]:
tokenize_dataset(data)

100%|██████████| 327126/327126 [04:00<00:00, 1357.60it/s]


Unnamed: 0,language,code
0,C++,"[http, :, Unless, required, http, Unless, requ..."
1,C++,[<EOF>]
2,C++,[]
3,C++,"[AddError, (, INT_LITERAL, ,, INT_LITERAL, ,, ..."
4,C++,[]
...,...,...
327121,C++,[]
327122,C++,[<EOF>]
327123,C++,[]
327124,C++,"[assert, (, not_found, ==, answer1, ), ;, answ..."


In [None]:
datasets = [read_dataset(dataset_path, language) for language in ["Python", "Go", "C", "C++"]]
datasets = pd.concat(datasets, ignore_index=True)
datasets

Unnamed: 0,language,code
0,Python,"version = '7'\nhtml_title = ""Guzzle Documentat..."
1,Python,"# Path to a touch icon\n # ""touch_icon""..."
2,Python,"""base_url"": ""http://guzzlephp.org""\n\n ..."
3,Python,from tensorflow.python.estimator.model_fn impo...
4,Python,self.assertFalse(gfile.Exists('ram://exist...
...,...,...
1546444,C++,* * Worst Time Complexity O(log n)\n * * Best...
1546445,C++,/**\n * \file\n * \brief [Interpolation\n * se...
1546446,C++,std::cin >> n;\n\n int *array = new int...
1546447,C++,assert(not_found == answer1);\n // Test...


In [None]:
tokenized = tokenize_dataset(datasets)
tokenized

 54%|█████▍    | 176115/327126 [02:03<01:46, 1422.12it/s]


KeyboardInterrupt: 

In [None]:
words2int, int2word = get_vocab_mapping(tokenized)

: 