In [1]:
pip install -U transformers>=4.48.0

In [1]:
from transformers import AutoTokenizer
import pandas as pd
import utils
# Load the three tokenizers under comparison with AutoTokenizer.from_pretrained,
# in this order: ModernBERT, BERT, CodeBERT.
tokenizer_modernbert, tokenizer_bert, tokenizer_codebert = (
    AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in (
        "answerdotai/ModernBERT-base",
        "google-bert/bert-base-uncased",
        "microsoft/codebert-base",
    )
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [3]:
# Accumulator for the comparison metrics. Every list holds one value per
# tokenizer in the fixed order BERT, CodeBERT, ModernBERT.
results = {
    "Tokenizer": ["BERT", "CodeBERT", "ModernBERT"],
    "Vocab Size": [
        tok.vocab_size
        for tok in (tokenizer_bert, tokenizer_codebert, tokenizer_modernbert)
    ],
}

## Whitespaces & Tabs

### Whitespaces

In [4]:
from utils import longest_whitespace_token

In [5]:
# Call longest_whitespace_token once per tokenizer and reuse the value for both
# the printout and the results entry, instead of computing everything twice.
whitespace_scores = []
for label, tok in (
    ("BERT", tokenizer_bert),
    ("CodeBERT", tokenizer_codebert),
    ("ModernBERT", tokenizer_modernbert),
):
    score = longest_whitespace_token(tok)
    print(f"{label}:", score)
    whitespace_scores.append(score)

# Order matches the "Tokenizer" entry: BERT, CodeBERT, ModernBERT.
results["Longest Whitespace Token"] = whitespace_scores

BERT: 0
CodeBERT: 1
ModernBERT: 24


### Indentation (tabs)

In [6]:
# Tokenize a lone tab character with each tokenizer and print the raw tokens.
for label, tok in (
    ("BERT", tokenizer_bert),
    ("CodeBERT", tokenizer_codebert),
    ("ModernBERT", tokenizer_modernbert),
):
    print(label, tok.tokenize("\t"))

BERT []
CodeBERT ['ĉ']
ModernBERT ['ĉ']


We notice that BERT ignores tabs completely, whereas CodeBERT and ModernBERT both map a single tab to the same token. <br>
It is not clear whether CodeBERT uses BPE, but ModernBERT does.
Let's dig deeper to see how both tokenizers handle multiple tabs.

In [7]:
from utils import tab_tokenization

In [8]:
# Call tab_tokenization once per tokenizer and reuse the value for both the
# printout and the results entry, instead of computing everything twice.
tab_scores = []
for label, tok in (
    ("BERT", tokenizer_bert),
    ("CodeBERT", tokenizer_codebert),
    ("ModernBERT", tokenizer_modernbert),
):
    score = tab_tokenization(tok)
    print(f"{label}:", score)
    tab_scores.append(score)

# Order matches the "Tokenizer" entry: BERT, CodeBERT, ModernBERT.
results["Longest Tabs Token"] = tab_scores

BERT: 0
CodeBERT: 1
ModernBERT: 6


### Tab-Space Merging

We notice that ModernBERT is the only tokenizer that has a single token for a space followed by a tab (in this order).
This means shorter sequences, lower memory usage, and faster inference.

In [9]:
# Alternating tabs and spaces: a tokenizer that merges a space with the tab
# that follows it emits a token containing both characters.
tab_space_sample = "\t \t \t"

tab_space_merging = []
for label, tok in (
    ("BERT", tokenizer_bert),
    ("CodeBERT", tokenizer_codebert),
    ("ModernBERT", tokenizer_modernbert),
):
    tokens = tok.tokenize(tab_space_sample)
    print(label, tokens)
    # In the token strings printed by these tokenizers, "Ġ" stands for a space
    # and "ĉ" for a tab, so a token holding both means the pair was merged.
    # Derive the flag from the tokens instead of hard-coding "True"/"False".
    # NOTE(review): a tokenizer using other whitespace markers would need a
    # different check — confirm before adding more tokenizers.
    tab_space_merging.append(any("Ġ" in t and "ĉ" in t for t in tokens))

results["Tab Space merging"] = tab_space_merging

BERT []
CodeBERT ['ĉ', 'Ġ', 'ĉ', 'Ġ', 'ĉ']
ModernBERT ['ĉ', 'Ġĉ', 'Ġĉ']


# Keywords and Operators
We'll compare keyword and operator tokenization across Python, Java, and C++.

### Keywords
To get the most accurate keyword sets for the three languages: <br>
For Python: we will use the built-in list. <br>
For Java: the official list (50 keywords, Java SE 8+): https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-Keyword <br>
For C++ (C++11 only): https://en.cppreference.com/w/cpp/keyword


In [10]:
import pickle


keywords_path = "../data/keywords.pkl"

# NOTE(review): pickle.load can execute arbitrary code while deserializing.
# Only load this file when it comes from a trusted source.
with open(keywords_path, "rb") as f:
    keywords = pickle.load(f)

# One keyword collection per language, looked up by dictionary key.
python_keywords = keywords["python_keywords"]
java_keywords = keywords["java_keywords"]
cpp_keywords = keywords["cpp_keywords"]

# Report how many keywords are tested per language (typo "kyewords" fixed).
print("Tested python keywords:", len(python_keywords))
print("Tested Java keywords:", len(java_keywords))
print("Tested cpp keywords:", len(cpp_keywords))

Tested python kyewords: 35
Tested Java keywords: 50
Tested cpp keywords: 85


In [None]:
from utils import keywords_with_single_token

In [12]:
# Keyword coverage per language, as reported by keywords_with_single_token.
# Each score is computed once and reused for both the printout and the results
# entry, replacing three copy-pasted sections that called the helper twice.
keyword_sets = {
    "Python": python_keywords,
    "Java": java_keywords,
    "CPP": cpp_keywords,
}
named_tokenizers = (
    ("BERT", tokenizer_bert),
    ("CodeBERT", tokenizer_codebert),
    ("ModernBERT", tokenizer_modernbert),
)

for language, language_keywords in keyword_sets.items():
    print(language)
    scores = []
    for name, tok in named_tokenizers:
        score = keywords_with_single_token(tok, language_keywords)
        print(f"{name}:", score)
        scores.append(score)
    # The language label doubles as the key prefix, e.g. "CPP Keywords (%)".
    results[f"{language} Keywords (%)"] = scores

Python
BERT: 91.4
CodeBERT: 82.9
ModernBERT: 91.4
Java
BERT: 88.0
CodeBERT: 76.0
ModernBERT: 86.0
CPP
BERT: 56.5
CodeBERT: 62.4
ModernBERT: 71.8


The results may seem odd! How come BERT performs better than, or on par with, CodeBERT and ModernBERT even though it was not intended to be trained on code? <br>
The likely reason is that Java and Python have many keywords that are just regular English words (e.g. while, else, if, true, false). This also explains why BERT performs very poorly on C++ keywords, since they are more complex. <br>

Let's test this in a simple way by tokenizing "elif", which is very specific to Python and not a proper English word.

In [13]:
# Show how each tokenizer splits the Python-specific keyword "elif".
for label, tok in (
    ("BERT", tokenizer_bert),
    ("CodeBERT", tokenizer_codebert),
    ("ModernBERT", tokenizer_modernbert),
):
    print(f"{label}:", tok.tokenize("elif"))

BERT: ['eli', '##f']
CodeBERT: ['el', 'if']
ModernBERT: ['elif']


This strengthens our hypothesis. However, it is unclear why CodeBERT's tokenizer doesn't perform as well as expected. This may be due to the training data: CodeBERT is trained on a large-scale dataset with 2.1 million bimodal data points and 6.4 million unimodal code samples from six programming languages (Python, Java, JavaScript, PHP, Ruby, and Go). The data is derived from publicly accessible open-source GitHub repositories.

### Operators

**Why Operators Matter** <br>
Operators are fundamental to how code functions, and how well they are tokenized impacts the efficiency and effectiveness of any model working with code. Tokenizing operators properly is crucial for:

- Maintaining the integrity of the code.
- Ensuring correct representation of syntax for downstream tasks like code completion, code summarization, or error detection.
- Improving efficiency in tokenization: if operators are tokenized as single tokens, it can reduce the overall number of tokens needed to represent a piece of code.

For operators, we will use the documentation of each language: <br>
For Python: https://docs.python.org/3/library/operator.html <br>
For Java: https://docs.oracle.com/javase/tutorial/java/nutsandbolts/operators.html <br>
For C++: https://www.programiz.com/cpp-programming/operators <br>

In [24]:
operators_path = "../data/operators.pkl"

# Deserialize the operator data; `pickle` must already be imported at this point.
with open(operators_path, "rb") as f:
    operators = pickle.load(f)

# Pull out the per-language operator collections by their dictionary keys.
python_operators, java_operators, cpp_operators = (
    operators[key]
    for key in ("python_operators", "java_operators", "cpp_operators")
)

In [15]:
from utils import operators_with_single_token

In [16]:
# Operator coverage per language, as reported by operators_with_single_token.
# Each score is computed once and reused for both the printout and the results
# entry, replacing three copy-pasted sections that called the helper twice.
operator_sets = {
    "Python": python_operators,
    "Java": java_operators,
    "Cpp": cpp_operators,
}
named_tokenizers = (
    ("BERT", tokenizer_bert),
    ("CodeBERT", tokenizer_codebert),
    ("ModernBERT", tokenizer_modernbert),
)

for language, language_operators in operator_sets.items():
    print(language)
    scores = []
    for name, tok in named_tokenizers:
        score = operators_with_single_token(tok, language_operators)
        print(name, score)
        scores.append(score)
    # The language label doubles as the key prefix, e.g. "Cpp Operators (%)".
    results[f"{language} Operators (%)"] = scores



Python
BERT 50.0
CodeBERT 67.6
ModernBERT 82.4
Java
BERT 43.8
CodeBERT 75.0
ModernBERT 90.6
Cpp
BERT 38.9
CodeBERT 66.7
ModernBERT 86.1


Unlike keywords, operators pose a real challenge for BERT. As it was not trained on a large code corpus, it struggles a lot with operators. CodeBERT and ModernBERT perform decently, but ModernBERT's tokenizer seems to be the best.

# Token Visualization

In [21]:
from utils import show_tokens

In [23]:
def test_language_specific_constructs(tokenizers):
    python_code = """
    @decorator
    def my_function():
        pass

    my_list = [x**2 for x in range(10)]
    lambda x: x + 1
    """

    cpp_code = """
    std::vector<int> myVector;
    namespace std { int x = 5; }
    int* ptr = nullptr;
    """

    java_code = """
    List<String> myList = new ArrayList<>();
    public void myMethod(int x, String y) { }
    (x) -> x + 1
    """

    def show_tokens_for_language(code, tokenizer, language):
        print(f"Tokens for {language} code:")
        show_tokens(code, tokenizer)

    for tokenizer_name, tokenizer in tokenizers.items():
        print(f"\nTesting {tokenizer_name}")
        show_tokens_for_language(python_code, tokenizer, "Python")
        show_tokens_for_language(cpp_code, tokenizer, "C++")
        show_tokens_for_language(java_code, tokenizer, "Java")

# Map display names to the loaded tokenizers (insertion order: BERT, CodeBERT,
# ModernBERT) and print the token view for each of them.
tokenizers = dict(
    BERT=tokenizer_bert,
    CodeBERT=tokenizer_codebert,
    ModernBERT=tokenizer_modernbert,
)
test_language_specific_constructs(tokenizers)



Testing BERT
Tokens for Python code:
[0;30;48;2;102;194;165m[CLS][0m [0;30;48;2;252;141;98m@[0m [0;30;48;2;141;160;203mdecor[0m [0;30;48;2;231;138;195m##ator[0m [0;30;48;2;166;216;84mdef[0m [0;30;48;2;255;217;47mmy[0m [0;30;48;2;102;194;165m_[0m [0;30;48;2;252;141;98mfunction[0m [0;30;48;2;141;160;203m([0m [0;30;48;2;231;138;195m)[0m [0;30;48;2;166;216;84m:[0m [0;30;48;2;255;217;47mpass[0m [0;30;48;2;102;194;165mmy[0m [0;30;48;2;252;141;98m_[0m [0;30;48;2;141;160;203mlist[0m [0;30;48;2;231;138;195m=[0m [0;30;48;2;166;216;84m[[0m [0;30;48;2;255;217;47mx[0m [0;30;48;2;102;194;165m*[0m [0;30;48;2;252;141;98m*[0m [0;30;48;2;141;160;203m2[0m [0;30;48;2;231;138;195mfor[0m [0;30;48;2;166;216;84mx[0m [0;30;48;2;255;217;47min[0m [0;30;48;2;102;194;165mrange[0m [0;30;48;2;252;141;98m([0m [0;30;48;2;141;160;203m10[0m [0;30;48;2;231;138;195m)[0m [0;30;48;2;166;216;84m][0m [0;30;48;2;255;217;47mlambda[0m [0;30;48;2;102;194;165mx[0m [0;

# Results

In [17]:
# Build the summary table from the accumulated metrics: each `results` key
# becomes a column. The bare `df` below displays the table as the cell output.
df = pd.DataFrame(data=results)
df

Unnamed: 0,Tokenizer,Vocab Size,Longest Whitespace Token,Longest Tabs Token,Tab Space merging,Python Keywords (%),Java Keywords (%),CPP Keywords (%),Python Operators (%),Java Operators (%),Cpp Operators (%)
0,BERT,30522,0,0,False,91.4,88.0,56.5,50.0,43.8,38.9
1,CodeBERT,50265,1,1,False,82.9,76.0,62.4,67.6,75.0,66.7
2,ModernBERT,50280,24,6,True,91.4,86.0,71.8,82.4,90.6,86.1


In [18]:
# Write the summary table to a CSV file in the current working directory. With
# pandas' default settings the row index is written as the first column.
df.to_csv(path_or_buf="tokenizer_insights_v0.csv")