# Python vs Java vs C vs Multilang Experiments

### Setting things up

In [1]:
%load_ext autoreload
%autoreload 2

### Corpora stats

In [5]:
%%bash
# `tc` is not defined in this notebook; presumably a token-count helper that
# ~/.bashrc provides — TODO confirm.
source ~/.bashrc

BASIC_PREP_DATASETS_DIR="$HOME/.cache/dataprep/1.0.0-alpha.0/prep_datasets"

# Each count is reported in thousands (integer division by 1000).
echo "N tokens (basic splitting)"
j=$(tc "$BASIC_PREP_DATASETS_DIR/small_chunk_19-05-22T17-59-55_00100")
echo "Java: $(( j / 1000 ))k"
p=$(tc "$BASIC_PREP_DATASETS_DIR/python_19-05-20T00-48-25_00100")
echo "Python: $(( p / 1000 ))k"
c=$(tc "$BASIC_PREP_DATASETS_DIR/c_19-05-24T08-55-52_00100")
echo "c: $(( c / 1000 ))k"
m=$(tc "$BASIC_PREP_DATASETS_DIR/multilang_19-05-31T17-50-18_00100")
echo "Multilang: $(( m / 1000 ))k"

N tokens (basic splitting)
Java: 13444k
Python: 44141k
c: 94010k
Multilang: 136254k


### Loading merges

In [3]:
import os
from metrics.merge import read_merges
from metrics.matrix import merge_similarity_rate_matrix
from pandas import DataFrame

from dataprep.config import USER_BPE_DIR

N_MERGES = 10000

def load_merges(dir, n_merges):
    """Read the merges file stored under ``USER_BPE_DIR`` for one BPE run.

    :param dir: name of the BPE run directory, relative to ``USER_BPE_DIR``.
    :param n_merges: name of the sub-directory holding ``merges.txt``
        (the number of merges the codes were learnt with).
    :return: whatever ``read_merges`` returns for that file.
    """
    relative_path = f"{dir}/{n_merges}/merges.txt"
    return read_merges(os.path.join(USER_BPE_DIR, relative_path))

# Case-sensitive merges, one list per corpus (the Java corpus is stored as "small_chunk").
java_merges = load_merges("small_chunk_19-05-22T17-59-55", N_MERGES)
python_merges = load_merges("python_19-05-20T00-48-25", N_MERGES)
c_merges = load_merges("c_19-05-24T08-55-52", N_MERGES)
multilang_merges = load_merges("multilang_19-05-31T17-50-18", N_MERGES)

# Same corpora, read from the "_nocase" BPE directories.
java_nocase_merges = load_merges("small_chunk_19-05-22T17-59-55_nocase", N_MERGES)
python_nocase_merges = load_merges("python_19-05-20T00-48-25_nocase", N_MERGES)
c_nocase_merges = load_merges("c_19-05-24T08-55-52_nocase", N_MERGES)
multilang_nocase_merges = load_merges("multilang_19-05-31T17-50-18_nocase", N_MERGES)

[MainProcess] DEBUG - 2019-06-03 08:43:54,439 - matplotlib: $HOME=/home/lv71161/hlibbabii
[MainProcess] DEBUG - 2019-06-03 08:43:54,442 - matplotlib: CONFIGDIR=/home/lv71161/hlibbabii/.config/matplotlib
[MainProcess] DEBUG - 2019-06-03 08:43:54,443 - matplotlib: matplotlib data path: /home/lv71161/hlibbabii/log-recommender-dataprep/venv/lib/python3.6/site-packages/matplotlib/mpl-data
[MainProcess] DEBUG - 2019-06-03 08:43:54,448 - matplotlib: loaded rc file /home/lv71161/hlibbabii/log-recommender-dataprep/venv/lib/python3.6/site-packages/matplotlib/mpl-data/matplotlibrc
[MainProcess] DEBUG - 2019-06-03 08:43:54,509 - matplotlib: matplotlib version 3.0.3
[MainProcess] DEBUG - 2019-06-03 08:43:54,510 - matplotlib: interactive is False
[MainProcess] DEBUG - 2019-06-03 08:43:54,511 - matplotlib: platform is linux


In [3]:
# Sanity check: number of loaded merges, then the first 10 case / no-case Python merges.
print(f"N merges: {len(python_merges)}")
python_merges[:10], python_nocase_merges[:10]

N merges: 10000


([('\\', "'"): 1534342,
  ('s', 'e'): 1113174,
  ('t', 'e'): 855782,
  ('i', 'n'): 851819,
  ('o', 'n'): 851736,
  ('r', 'e'): 840122,
  ('s', 't'): 711730,
  ('l', 'e'): 650486,
  ('a', 'l'): 548867,
  ('m', 'e'): 520902],
 [('\\', "'"): 1534342,
  ('s', 'e'): 1161198,
  ('i', 'n'): 934483,
  ('t', 'e'): 928378,
  ('r', 'e'): 895176,
  ('o', 'n'): 879485,
  ('s', 't'): 745309,
  ('l', 'e'): 688386,
  ('a', 'l'): 577227,
  ('e', 'r'): 566812])

In [4]:
# Sanity check: number of loaded merges, then the first 10 case / no-case Java merges.
print(f"N merges: {len(java_merges)}")
java_merges[:10], java_nocase_merges[:10]

N merges: 10000


([('e', 'r'): 440973,
  ('o', 'n'): 404289,
  ('t', 'i'): 291637,
  ('i', 'n'): 273818,
  ('o', 'r'): 247026,
  ('e', 'n'): 243529,
  ('e', 's'): 242396,
  ('e', 't'): 240037,
  ('ti', 'on'): 189408,
  ('t', 'h'): 181834],
 [('e', 'r'): 456435,
  ('o', 'n'): 423924,
  ('i', 'n'): 346600,
  ('r', 'e'): 317780,
  ('s', 't'): 313742,
  ('t', 'i'): 296648,
  ('e', 'n'): 247095,
  ('o', 'r'): 243885,
  ('e', 't'): 235980,
  ('t', 'h'): 212814])

In [5]:
# Sanity check: number of loaded merges, then the first 10 case / no-case C merges.
print(f"N merges: {len(c_merges)}")
c_merges[:10], c_nocase_merges[:10]

N merges: 10000


([('i', 'n'): 1582413,
  ('d', 'e'): 1414205,
  ('r', 'e'): 1384120,
  ('e', 'r'): 1126689,
  ('0', 'x'): 985483,
  ('a', 't'): 974505,
  ('0', '0'): 960344,
  ('s', 't'): 915636,
  ('e', 'n'): 732643,
  ('o', 'n'): 691025],
 [('i', 'n'): 1897930,
  ('r', 'e'): 1631360,
  ('d', 'e'): 1620094,
  ('e', 'r'): 1369360,
  ('a', 't'): 1126483,
  ('s', 't'): 1089666,
  ('0', 'x'): 985483,
  ('0', '0'): 960344,
  ('e', 'n'): 896423,
  ('o', 'n'): 839317])

In [3]:
# Sanity check: number of loaded merges, then the first 10 case / no-case multilang merges.
print(f"N merges: {len(multilang_merges)}")
multilang_merges[:10], multilang_nocase_merges[:10]

N merges: 10000


([('e', 'r'): 3038644,
  ('o', 'n'): 2544820,
  ('i', 'n'): 2386821,
  ('r', 'e'): 2125644,
  ('a', 't'): 2013993,
  ('e', 'n'): 1935670,
  ('s', 't'): 1764978,
  ('e', 't'): 1762244,
  ('o', 'r'): 1686343,
  ('h', 'e'): 1329511],
 [('e', 'r'): 3268142,
  ('i', 'n'): 2902997,
  ('o', 'n'): 2763695,
  ('r', 'e'): 2630328,
  ('s', 't'): 2489487,
  ('e', 'n'): 2197990,
  ('a', 't'): 2169292,
  ('o', 'r'): 1833935,
  ('e', 't'): 1776228,
  ('h', 'e'): 1438036])

### Metric 1. Pearson

In [10]:
from metrics.matrix import pearson_matrix
from pandas import DataFrame

# Pearson matrix over the first 1k case-sensitive merges of each corpus.
# Input order is java, python, c, multilang; presumably rows/columns 0-3 of the
# result follow that order — confirm against pearson_matrix.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, case")
m = pearson_matrix([java_merges[:n_merges], python_merges[:n_merges], c_merges[:n_merges], multilang_merges[:n_merges]])
DataFrame(m).round(3)

N merges: 1k, case


Unnamed: 0,0,1,2,3
0,1.0,0.567,0.613,0.761
1,0.567,1.0,0.626,0.63
2,0.613,0.626,1.0,0.732
3,0.761,0.63,0.732,1.0


In [5]:
from metrics.matrix import pearson_matrix
from pandas import DataFrame

# Pearson matrix over the first 1k no-case merges of each corpus.
# Input order is java, python, c, multilang; presumably rows/columns 0-3 of the
# result follow that order — confirm against pearson_matrix.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, NO case")
m = pearson_matrix([java_nocase_merges[:n_merges], python_nocase_merges[:n_merges], c_nocase_merges[:n_merges], multilang_nocase_merges[:n_merges]])
DataFrame(m).round(3)

N merges: 1k, NO case


Unnamed: 0,0,1,2,3
0,1.0,0.71,0.704,0.905
1,0.71,1.0,0.656,0.715
2,0.704,0.656,1.0,0.759
3,0.905,0.715,0.759,1.0


In [11]:
from metrics.matrix import pearson_matrix
from pandas import DataFrame

# Pearson matrix over the first 10k case-sensitive merges of each corpus.
# Input order is java, python, c, multilang; presumably rows/columns 0-3 of the
# result follow that order — confirm against pearson_matrix.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, case")
m = pearson_matrix([java_merges[:n_merges], python_merges[:n_merges], c_merges[:n_merges], multilang_merges[:n_merges]])
DataFrame(m).round(3)

N merges: 10k, case


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  


Unnamed: 0,0,1,2,3
0,1.0,0.604,0.654,0.787
1,0.604,1.0,0.662,0.664
2,0.654,0.662,1.0,0.764
3,0.787,0.664,0.764,1.0


In [6]:
from metrics.matrix import pearson_matrix
from pandas import DataFrame

# Pearson matrix over the first 10k no-case merges of each corpus.
# Input order is java, python, c, multilang; presumably rows/columns 0-3 of the
# result follow that order — confirm against pearson_matrix.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, NO case")
m = pearson_matrix([java_nocase_merges[:n_merges], python_nocase_merges[:n_merges], c_nocase_merges[:n_merges], multilang_nocase_merges[:n_merges]])
DataFrame(m).round(3)

N merges: 10k, NO case


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  


Unnamed: 0,0,1,2,3
0,1.0,0.736,0.733,0.915
1,0.736,1.0,0.689,0.742
2,0.733,0.689,1.0,0.785
3,0.915,0.742,0.785,1.0


### Metric 2. BPE merge co-occurrences

In [12]:
from metrics.matrix import cooccurence_matrix
from pandas import DataFrame

# Co-occurrence matrix over the first 1k case-sensitive merges of each corpus.
# The same list (java, python, c, multilang) is passed for both arguments, so
# every corpus is compared with every other one and with itself.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, case")
lst = [java_merges[:n_merges], python_merges[:n_merges], c_merges[:n_merges], multilang_merges[:n_merges]]
m = cooccurence_matrix(lst, lst)
DataFrame(m).round(3)

N merges: 1k, case


Unnamed: 0,0,1,2,3
0,1.0,0.365,0.351,0.538
1,0.365,1.0,0.415,0.44
2,0.351,0.415,1.0,0.434
3,0.538,0.44,0.434,1.0


In [7]:
from metrics.matrix import cooccurence_matrix
from pandas import DataFrame

# Co-occurrence matrix over the first 1k no-case merges of each corpus.
# The same list (java, python, c, multilang) is passed for both arguments, so
# every corpus is compared with every other one and with itself.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, NO case")
lst = [java_nocase_merges[:n_merges], python_nocase_merges[:n_merges], c_nocase_merges[:n_merges], multilang_nocase_merges[:n_merges]]
m = cooccurence_matrix(lst, lst)
DataFrame(m).round(3)

N merges: 1k, NO case


Unnamed: 0,0,1,2,3
0,1.0,0.481,0.432,0.627
1,0.481,1.0,0.455,0.538
2,0.432,0.455,1.0,0.477
3,0.627,0.538,0.477,1.0


In [9]:
from metrics.matrix import cooccurence_matrix
from pandas import DataFrame

# Co-occurrence matrix over the first 10k case-sensitive merges of each corpus.
# The same list (java, python, c, multilang) is passed for both arguments, so
# every corpus is compared with every other one and with itself.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, case")
lst = [java_merges[:n_merges], python_merges[:n_merges], c_merges[:n_merges], multilang_merges[:n_merges]]
m = cooccurence_matrix(lst, lst)
DataFrame(m).round(3)

N merges: 10k, case


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  import sys


Unnamed: 0,0,1,2,3
0,1.0,0.35,0.311,0.466
1,0.35,1.0,0.369,0.417
2,0.311,0.369,1.0,0.392
3,0.466,0.417,0.392,1.0


In [10]:
from metrics.matrix import cooccurence_matrix
from pandas import DataFrame

# Co-occurrence matrix over the first 10k no-case merges of each corpus.
# The same list (java, python, c, multilang) is passed for both arguments, so
# every corpus is compared with every other one and with itself.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, NO case")
lst = [java_nocase_merges[:n_merges], python_nocase_merges[:n_merges], c_nocase_merges[:n_merges], multilang_nocase_merges[:n_merges]]
m = cooccurence_matrix(lst, lst)
DataFrame(m).round(3)

N merges: 10k, NO case


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  import sys


Unnamed: 0,0,1,2,3
0,1.0,0.404,0.326,0.482
1,0.404,1.0,0.352,0.45
2,0.326,0.352,1.0,0.398
3,0.482,0.45,0.398,1.0


### Loading vocabs

In [6]:
from dataprep import vocabloader
# NOTE(review): n_merges is fixed to 1000 here, so every vocab id below ends in
# "-1000". Later cells that set n_merges = 10000 still use these vocabs unless
# this cell is re-run with a different value.
n_merges = 1000

# "base" vocabs, case-sensitive. The Metric 3 cells label these
# "without keywords and operators".
java_bpe_vocab = vocabloader.base(f'small_chunk-{n_merges}')
python_bpe_vocab = vocabloader.base(f'python-{n_merges}')
c_bpe_vocab = vocabloader.base(f'c-{n_merges}')
multilang_bpe_vocab = vocabloader.base(f'multilang-{n_merges}')

# "all" vocabs, case-sensitive. The Metric 3 cells label these
# "including keywords and operators".
java_all_vocab = vocabloader.all(f'small_chunk-{n_merges}')
python_all_vocab = vocabloader.all(f'python-{n_merges}')
c_all_vocab = vocabloader.all(f'c-{n_merges}')
multilang_all_vocab = vocabloader.all(f'multilang-{n_merges}')

# "base" vocabs for the "_nocase" preprocessing.
java_nocase_bpe_vocab = vocabloader.base(f'small_chunk_nocase-{n_merges}')
python_nocase_bpe_vocab = vocabloader.base(f'python_nocase-{n_merges}')
c_nocase_bpe_vocab = vocabloader.base(f'c_nocase-{n_merges}')
multilang_nocase_bpe_vocab = vocabloader.base(f'multilang_nocase-{n_merges}')

# "all" vocabs for the "_nocase" preprocessing.
java_nocase_all_vocab = vocabloader.all(f'small_chunk_nocase-{n_merges}')
python_nocase_all_vocab = vocabloader.all(f'python_nocase-{n_merges}')
c_nocase_all_vocab = vocabloader.all(f'c_nocase-{n_merges}')
multilang_nocase_all_vocab = vocabloader.all(f'multilang_nocase-{n_merges}')

In [9]:
# First 200 items of the Java "base" vocab and of the Java "all" vocab.
list(java_bpe_vocab.items())[:200], list(java_all_vocab.items())[:200]

([('get', 109533),
  ('the', 107519),
  ('_', 76549),
  ('String', 63861),
  ('Exception', 42147),
  ('org', 41943),
  ('of', 36659),
  ('set', 34374),
  ('is', 33670),
  ('to', 33549),
  ('0', 31921),
  ('1', 31221),
  ('License', 30577),
  ('a', 27753),
  ('2', 24865),
  ('Name', 22715),
  ("\\'", 22005),
  ('List', 19667),
  ('in', 19399),
  ('or', 19357),
  ('java', 18767),
  ('Id', 18171),
  ('param', 16273),
  ('add', 15881),
  ('value', 15793),
  ('and', 15603),
  ('com', 15008),
  ('i', 14807),
  ('Request', 14334),
  ('Context', 13357),
  ('Class', 13348),
  ('File', 12920),
  ('Object', 12839),
  ('not', 12651),
  ('e', 12532),
  ('under', 12425),
  ('file', 12277),
  ('Map', 12062),
  ('The', 11932),
  ('Type', 11817),
  ('Test', 11796),
  ('util', 11577),
  ('Override', 11069),
  ('it', 11058),
  ('with', 11042),
  ('name', 10936),
  ('Value', 10808),
  ('apache', 10636),
  ('Service', 10491),
  ('test', 10256),
  ('Connection', 10149),
  ('This', 10016),
  ('by', 9947),
  

In [10]:
# First 200 items of the Python "base" vocab and of the Python "all" vocab.
list(python_bpe_vocab.items())[:200], list(python_all_vocab.items())[:200]

([('_', 2141551),
  ("\\'", 1534342),
  ('self', 512311),
  ('the', 197719),
  ('1', 196537),
  ('0', 153846),
  ('name', 137854),
  ('2', 128548),
  ('to', 127496),
  ('is', 119042),
  ('a', 118998),
  ('in', 117430),
  ('module', 115204),
  ('type', 95603),
  ('test', 95080),
  ('of', 87981),
  ('None', 87886),
  ('get', 77563),
  ('not', 75591),
  ('s', 72919),
  ('and', 69156),
  ('True', 69080),
  ('x', 69009),
  ('3', 67721),
  ('description', 64366),
  ('Error', 62479),
  ('n', 60433),
  ('result', 59142),
  ('data', 58996),
  ('dict', 57981),
  ('False', 57683),
  ('Equal', 55747),
  ('value', 52791),
  ('str', 52733),
  ('or', 50920),
  ('b', 47146),
  ('path', 45128),
  ('list', 43987),
  ('params', 41620),
  ('state', 41609),
  ('be', 40938),
  ('default', 40574),
  ('5', 38854),
  ('set', 38348),
  ('np', 38142),
  ('4', 35886),
  ('id', 35326),
  ('key', 34978),
  ('check', 34612),
  ('f', 34223),
  ('c', 33787),
  ('required', 33391),
  ('index', 32779),
  ('X', 32445),
 

In [11]:
# First 200 items of the C "base" vocab and of the C "all" vocab.
list(c_bpe_vocab.items())[:200], list(c_all_vocab.items())[:200]

([('_', 7575058),
  ('0', 722165),
  ('1', 370480),
  ('the', 334675),
  ('2', 316855),
  ('x', 298388),
  ('dev', 285889),
  ('i', 255619),
  ('to', 223745),
  ('n', 167517),
  ('0x00', 156906),
  ('p', 155743),
  ('data', 154789),
  ('u', 139929),
  ('s', 132765),
  ('3', 130403),
  ('h', 125696),
  ('4', 124168),
  ('t', 122854),
  ('device', 121651),
  ('is', 120049),
  ('of', 118207),
  ('32', 112964),
  ('a', 109715),
  ('include', 109687),
  ('info', 106842),
  ('8', 103470),
  ('lock', 100797),
  ('in', 100630),
  ('d', 100158),
  ('c', 99019),
  ("\\'", 98297),
  ('err', 95965),
  ('and', 93896),
  ('NULL', 92467),
  ('flags', 87340),
  ('set', 85740),
  ('len', 84494),
  ('define', 81766),
  ('size', 78770),
  ('port', 78340),
  ('addr', 77843),
  ('ret', 77163),
  ('write', 76397),
  ('state', 74211),
  ('get', 73918),
  ('buf', 73777),
  ('init', 73367),
  ('name', 72680),
  ('16', 72489),
  ('status', 69646),
  ('skb', 69088),
  ('read', 68855),
  ('priv', 67686),
  ('linu

In [12]:
# First 200 items of the multilang "base" vocab and of the multilang "all" vocab.
list(multilang_bpe_vocab.items())[:200], list(multilang_all_vocab.items())[:200]

([('_', 3906677),
  ('the', 775102),
  ('get', 726688),
  ('0', 641680),
  ("\\'", 509695),
  ('1', 509498),
  ('String', 388705),
  ('2', 367919),
  ('to', 324794),
  ('org', 297820),
  ('is', 284369),
  ('i', 275282),
  ('Exception', 261763),
  ('of', 243236),
  ('a', 240921),
  ('set', 236944),
  ('s', 224302),
  ('in', 203650),
  ('License', 183263),
  ('Name', 162560),
  ('n', 159778),
  ('3', 155637),
  ('and', 151476),
  ('x', 147452),
  ('or', 133965),
  ('name', 132224),
  ('4', 128960),
  ('data', 128204),
  ('List', 123494),
  ('add', 123385),
  ('file', 121143),
  ('apache', 120165),
  ('p', 119630),
  ('Type', 114752),
  ('value', 111657),
  ('c', 111565),
  ('test', 110688),
  ('java', 109846),
  ('self', 106322),
  ('r', 106008),
  ('not', 104829),
  ('Object', 104762),
  ('t', 103979),
  ('size', 102979),
  ('8', 98324),
  ('code', 97445),
  ('Id', 96356),
  ('Value', 95851),
  ('param', 93686),
  ('src', 92142),
  ('Test', 91669),
  ('Context', 91140),
  ('File', 90907

### Metric 3. Merge similarity rate matrix
Cell `[i,j]` compares how the tokens of corpus `i` are split when it is preprocessed with bpe codes `i` vs. bpe codes `j`. It consists of 5 floats:
*    the percentage of tokens that are split differently;
*    the percentage of tokens which are split into MORE subwords when preprocessed with bpe codes `i` than with bpe codes `j`;
*    the percentage of tokens which are split into FEWER subwords when preprocessed with bpe codes `i` than with bpe codes `j`;
*    the percentage of tokens which are NOT split when preprocessed with bpe codes `i` but ARE split when preprocessed with bpe codes `j`;
*    the percentage of tokens which ARE split when preprocessed with bpe codes `i` but are NOT split when preprocessed with bpe codes `j`.

In [8]:
import collections
import collections.abc
import numbers
def pformat(thing, formatfunc):
    """Recursively apply ``formatfunc`` to every number nested inside ``thing``.

    Dicts and other containers are rebuilt with their own type, so a list of
    tuples of floats comes back as a list of tuples of formatted values.

    :param thing: a number, a (possibly nested) dict/list/tuple/set, or any other object.
    :param formatfunc: callable applied to each ``numbers.Number`` found.
    :return: a structure of the same shape with the numbers replaced by
        ``formatfunc(number)``; strings, bytes and other objects are returned unchanged.
    """
    if isinstance(thing, dict):
        # Python 3 dicts have no iteritems(); keys are kept, only values are formatted.
        return type(thing)((key, pformat(value, formatfunc)) for key, value in thing.items())
    if isinstance(thing, (str, bytes)):
        # A string is a container of one-character strings, so descending into
        # it would recurse forever; text is passed through unchanged.
        return thing
    # The container ABCs live in collections.abc; the aliases in ``collections``
    # were removed in Python 3.10.
    if isinstance(thing, collections.abc.Container):
        return type(thing)(pformat(value, formatfunc) for value in thing)
    if isinstance(thing, numbers.Number):
        return formatfunc(thing)
    return thing

In [17]:
# Merge similarity on the "base" vocabs with the first 1k case-sensitive merges.
# Both lists are ordered java, python, c, multilang.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, case, without keywords and operators")
m = merge_similarity_rate_matrix([java_bpe_vocab, python_bpe_vocab, c_bpe_vocab, multilang_bpe_vocab], 
                                 [java_merges[:n_merges], python_merges[:n_merges], c_merges[:n_merges], multilang_merges[:n_merges]])

# Every number in the matrix is rendered as a percentage with one decimal.
DataFrame(pformat(m, lambda s: f'{s*100:.1f}'))

N merges: 1k, case, without keywords and operators


Unnamed: 0,0,1,2,3
0,"(0.0, 0.0, 0.0, 0.0, 0.0)","(49.4, 33.6, 9.4, 22.3, 3.9)","(55.6, 41.9, 7.9, 28.0, 3.1)","(28.7, 14.6, 8.3, 8.4, 2.9)"
1,"(38.4, 28.6, 5.1, 19.4, 1.9)","(0.0, 0.0, 0.0, 0.0, 0.0)","(35.7, 26.9, 5.0, 18.2, 1.7)","(30.6, 20.5, 5.6, 13.5, 2.0)"
2,"(42.2, 34.4, 4.1, 19.8, 1.7)","(34.6, 26.4, 5.0, 14.3, 2.1)","(0.0, 0.0, 0.0, 0.0, 0.0)","(33.2, 25.0, 4.6, 14.3, 1.8)"
3,"(28.1, 16.3, 6.2, 7.3, 2.9)","(37.7, 24.2, 7.7, 13.6, 3.3)","(41.4, 28.4, 8.0, 16.5, 3.1)","(0.0, 0.0, 0.0, 0.0, 0.0)"


In [18]:
# Merge similarity on the "all" vocabs with the first 1k case-sensitive merges.
# Both lists are ordered java, python, c, multilang.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, case, including keywords and operators")
m = merge_similarity_rate_matrix([java_all_vocab, python_all_vocab, c_all_vocab, multilang_all_vocab], 
                                 [java_merges[:n_merges], python_merges[:n_merges], c_merges[:n_merges], multilang_merges[:n_merges]])

# Every number in the matrix is rendered as a percentage with one decimal.
DataFrame(pformat(m, lambda s: f'{s*100:.1f}'))

N merges: 1k, case, including keywords and operators


Unnamed: 0,0,1,2,3
0,"(0.0, 0.0, 0.0, 0.0, 0.0)","(21.5, 12.4, 5.1, 8.1, 2.7)","(23.0, 15.8, 4.2, 9.7, 2.2)","(12.4, 6.2, 3.1, 3.3, 1.0)"
1,"(17.9, 12.0, 3.6, 8.2, 1.8)","(0.0, 0.0, 0.0, 0.0, 0.0)","(16.6, 11.2, 3.6, 7.5, 1.7)","(13.7, 8.8, 2.4, 5.9, 0.9)"
2,"(18.6, 14.7, 1.8, 8.4, 0.7)","(16.7, 12.0, 3.1, 6.5, 1.6)","(0.0, 0.0, 0.0, 0.0, 0.0)","(15.5, 11.3, 2.5, 6.6, 0.7)"
3,"(12.1, 6.0, 3.3, 2.6, 1.6)","(16.1, 8.8, 4.5, 4.8, 2.3)","(17.8, 10.8, 4.9, 5.8, 2.5)","(0.0, 0.0, 0.0, 0.0, 0.0)"


In [19]:
# Merge similarity on the no-case "base" vocabs with the first 10k no-case merges.
# Both lists are ordered java, python, c, multilang.
# NOTE(review): as written, the "Loading vocabs" cell builds these vocabs with
# n_merges = 1000 while the merges here are cut at 10000 — confirm this
# combination is intended.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, NO case, without keywords and operators")
m = merge_similarity_rate_matrix([java_nocase_bpe_vocab, python_nocase_bpe_vocab, c_nocase_bpe_vocab, multilang_nocase_bpe_vocab], 
                                 [java_nocase_merges[:n_merges], python_nocase_merges[:n_merges], c_nocase_merges[:n_merges], multilang_nocase_merges[:n_merges]])

# Every number in the matrix is rendered as a percentage with one decimal.
DataFrame(pformat(m, lambda s: f'{s*100:.1f}'))

N merges: 10k, NO case, without keywords and operators


Unnamed: 0,0,1,2,3
0,"(0.0, 0.0, 0.0, 0.0, 0.0)","(8.8, 8.5, 0.2, 8.3, 0.1)","(15.6, 15.4, 0.1, 15.1, 0.1)","(4.9, 4.6, 0.2, 4.5, 0.1)"
1,"(9.1, 8.4, 0.4, 7.9, 0.2)","(0.0, 0.0, 0.0, 0.0, 0.0)","(11.6, 10.9, 0.4, 10.3, 0.2)","(7.2, 6.5, 0.4, 6.1, 0.2)"
2,"(18.0, 16.7, 0.6, 15.2, 0.4)","(15.6, 14.1, 0.7, 12.8, 0.5)","(0.0, 0.0, 0.0, 0.0, 0.0)","(12.6, 11.1, 0.7, 10.1, 0.5)"
3,"(6.9, 6.0, 0.5, 5.2, 0.3)","(8.5, 7.5, 0.5, 6.7, 0.4)","(12.6, 11.5, 0.6, 10.8, 0.3)","(0.0, 0.0, 0.0, 0.0, 0.0)"


In [20]:
# Merge similarity on the no-case "all" vocabs with the first 10k no-case merges.
# Both lists are ordered java, python, c, multilang.
# NOTE(review): as written, the "Loading vocabs" cell builds these vocabs with
# n_merges = 1000 while the merges here are cut at 10000 — confirm this
# combination is intended.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, NO case, including keywords and operators")
m = merge_similarity_rate_matrix([java_nocase_all_vocab, python_nocase_all_vocab, c_nocase_all_vocab, multilang_nocase_all_vocab], 
                                 [java_nocase_merges[:n_merges], python_nocase_merges[:n_merges], c_nocase_merges[:n_merges], multilang_nocase_merges[:n_merges]])

# Every number in the matrix is rendered as a percentage with one decimal.
DataFrame(pformat(m, lambda s: f'{s*100:.1f}'))

N merges: 10k, NO case, including keywords and operators


Unnamed: 0,0,1,2,3
0,"(0.0, 0.0, 0.0, 0.0, 0.0)","(4.7, 2.5, 2.1, 2.5, 2.0)","(8.9, 4.6, 3.9, 4.6, 1.9)","(5.0, 1.9, 3.0, 1.8, 0.7)"
1,"(3.7, 3.3, 0.2, 3.1, 0.1)","(0.0, 0.0, 0.0, 0.0, 0.0)","(6.4, 4.3, 2.0, 4.0, 0.5)","(4.6, 2.9, 1.6, 2.7, 0.1)"
2,"(12.4, 11.4, 0.5, 6.0, 0.4)","(11.4, 10.2, 0.8, 4.9, 0.8)","(0.0, 0.0, 0.0, 0.0, 0.0)","(5.6, 4.7, 0.5, 4.4, 0.4)"
3,"(6.1, 5.2, 0.6, 2.1, 0.6)","(6.8, 5.0, 1.6, 2.1, 1.5)","(5.7, 3.8, 1.6, 3.6, 1.5)","(0.0, 0.0, 0.0, 0.0, 0.0)"


### Metric 4. Corpora sizes

###  Preprocessing the corpora with different bpe codes

In [2]:
import subprocess, os
from subprocess import PIPE
from typing import List, Dict

def _run_prep(command: List[str], cwd: str) -> None:
    """Run one dataprep command in ``cwd`` and echo its captured stdout and stderr."""
    try:
        proc = subprocess.run(command, cwd=cwd, stdout=PIPE, stderr=PIPE, check=True, universal_newlines=True)
    except subprocess.CalledProcessError as e:
        # check=True raises before anything is printed; show the captured
        # output first so the failure can be diagnosed, then propagate.
        print(e.stdout, e.stderr)
        raise
    print(proc.stdout, proc.stderr)


def cross_prep(datasets: List[Dict[str, str]], n_merges: int, output: str):
    """Preprocess every dataset with the BPE codes of every dataset.

    For each (corpus, codes) pair the dataprep CLI is invoked twice from
    ``$HOME/dataprep``: first with the ``<code>_nocase-<n_merges>`` codes and
    ``--no-case``, then with the ``<code>-<n_merges>`` codes.

    :param datasets: one dict per dataset with the keys ``'path'`` (corpus
        location), ``'ext'`` (value passed to ``--ext``) and ``'code'``
        (prefix of the BPE codes id).
    :param n_merges: number of merges, used as the suffix of the BPE codes id.
    :param output: directory passed to the CLI via ``-o``.
    :raises subprocess.CalledProcessError: if any invocation exits with a non-zero status.
    """
    dataprep_dir = os.path.join(os.environ['HOME'], 'dataprep')
    for corpus in datasets:
        for codes in datasets:
            code = codes['code']
            cli_prefix = ["python", "dataprep/__main__.py", "bpe"]
            io_args = ["-p", corpus['path'], "-o", output]
            ext_args = ["--ext", corpus['ext']]

            command_nocase = cli_prefix + [f"{code}_nocase-{n_merges}"] + io_args + ["--no-case"] + ext_args
            command = cli_prefix + [f"{code}-{n_merges}"] + io_args + ext_args

            # Same handling for both runs: stderr is reported for each of them.
            _run_prep(command_nocase, dataprep_dir)
            _run_prep(command, dataprep_dir)

In [None]:
import os

# NOTE(review): absolute, user-specific paths — adjust before running elsewhere.
RAW_DATASETS="/home/lv71161/hlibbabii/raw_datasets"
OUTPUT="/home/lv71161/hlibbabii/prep-1.0.0-alpha.0"

# Preprocess each of the four corpora with the 1000-merge codes of every corpus.
cross_prep([{'path': os.path.join(RAW_DATASETS, 'python'), 'ext': 'py', 'code': 'python'},
            {'path': os.path.join(RAW_DATASETS, 'allamanis/small_chunk'), 'ext': 'java', 'code': 'small_chunk'},
            {'path': os.path.join(RAW_DATASETS, 'c'), 'ext': 'c', 'code': 'c'},
            {'path': os.path.join(RAW_DATASETS, 'multilang'), 'ext': 'c|java|py', 'code': 'multilang'},
           ], 1000, OUTPUT)

In [None]:
import os

# NOTE(review): absolute, user-specific paths — adjust before running elsewhere.
RAW_DATASETS="/home/lv71161/hlibbabii/raw_datasets"
OUTPUT="/home/lv71161/hlibbabii/prep-1.0.0-alpha.0"

# Preprocess each of the four corpora with the 5000-merge codes of every corpus.
cross_prep([{'path': os.path.join(RAW_DATASETS, 'python'), 'ext': 'py', 'code': 'python'},
            {'path': os.path.join(RAW_DATASETS, 'allamanis/small_chunk'), 'ext': 'java', 'code': 'small_chunk'},
            {'path': os.path.join(RAW_DATASETS, 'c'), 'ext': 'c', 'code': 'c'},
            {'path': os.path.join(RAW_DATASETS, 'multilang'), 'ext': 'c|java|py', 'code': 'multilang'},
           ], 5000, OUTPUT)

In [None]:
import os

# NOTE(review): absolute, user-specific paths — adjust before running elsewhere.
RAW_DATASETS="/home/lv71161/hlibbabii/raw_datasets"
OUTPUT="/home/lv71161/hlibbabii/prep-1.0.0-alpha.0"

# Preprocess each of the four corpora with the 10000-merge codes of every corpus.
cross_prep([{'path': os.path.join(RAW_DATASETS, 'python'), 'ext': 'py', 'code': 'python'},
            {'path': os.path.join(RAW_DATASETS, 'allamanis/small_chunk'), 'ext': 'java', 'code': 'small_chunk'},
            {'path': os.path.join(RAW_DATASETS, 'c'), 'ext': 'c', 'code': 'c'},
            {'path': os.path.join(RAW_DATASETS, 'multilang'), 'ext': 'c|java|py', 'code': 'multilang'},
           ], 10000, OUTPUT)

### Token counts of each corpus preprocessed with different BPE codes

In [1]:
import re, os

RAW_DATASETS_PATH = "/home/lv71161/hlibbabii/prep-1.0.0-alpha.0"

import collections
import collections.abc

def update(d, u):
    """Recursively merge mapping ``u`` into dict ``d`` in place and return ``d``.

    Nested mappings in ``u`` are merged key by key into the matching nested
    dict of ``d``; every other value simply overwrites the entry in ``d``.

    :param d: dict to be updated (mutated in place).
    :param u: mapping with the new values.
    :return: ``d`` itself.
    """
    for k, v in u.items():
        # The ABCs live in collections.abc; ``collections.Mapping`` was
        # removed in Python 3.10.
        if isinstance(v, collections.abc.Mapping):
            existing = d.get(k)
            if not isinstance(existing, collections.abc.MutableMapping):
                # Missing key, or a non-mapping value replaced by a mapping:
                # start from a fresh dict instead of failing on the old value.
                existing = {}
            d[k] = update(existing, v)
        else:
            d[k] = v
    return d

def get_all_prep_datasets(dir: str):
    """Index the preprocessed datasets found directly inside ``dir``.

    Only entries whose name matches
    ``<dataset>_<timestamp>_0090[01]_<code>[_nocase]-<n_merges>_prep`` are kept.

    :param dir: directory that contains the preprocessed dataset folders.
    :return: nested dict ``{n_merges: {'case'|'nocase': {dataset: {code: path}}}}``
        where ``n_merges`` is kept as a string and ``path`` is the entry joined
        with ``dir``.
    """
    REGEX = "(.*)_[0-9]{2}-[0-9]{2}-[0-9]{2}T[0-9]{2}-[0-9]{2}-[0-9]{2}_0090[01]_(.*)-([0-9]+)_prep"
    NOCASE_SUFFIX = "_nocase"
    prep_datasets = {}
    # A separate loop name keeps the ``dir`` argument intact for the path join below.
    for entry in os.listdir(dir):
        match = re.fullmatch(REGEX, entry)
        if not match:
            continue
        dataset, bpe_id, n_merges = match[1], match[2], match[3]
        if bpe_id.endswith(NOCASE_SUFFIX):
            code, case = bpe_id[:-len(NOCASE_SUFFIX)], 'nocase'
        else:
            code, case = bpe_id, 'case'

        # Paths are built from the directory that was actually listed rather
        # than from a module-level constant.
        by_code = prep_datasets.setdefault(n_merges, {}).setdefault(case, {}).setdefault(dataset, {})
        by_code[code] = os.path.join(dir, entry)
    return prep_datasets
        

# Index every matching prepped dataset, then pick the three configurations
# (5k case, 10k case, 10k no-case) that the %%bash token-count cells read.
# The first-level keys are strings because they come from regex groups.
prep_datasets = get_all_prep_datasets(RAW_DATASETS_PATH)
case_5k = prep_datasets['5000']['case']
case_10k = prep_datasets['10000']['case']
nocase_10k = prep_datasets['10000']['nocase']

In [37]:
%%bash -s "{case_5k['small_chunk']['small_chunk']}" "{case_5k['small_chunk']['python']}" "{case_5k['small_chunk']['c']}" "{case_5k['small_chunk']['multilang']}"
# $1..$4: the small_chunk (Java) corpus prepped with the small_chunk, python, c and multilang codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "java corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 5k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n1
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

java corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 5k
14740824 (1.000)
16751737 (1.136)
18538071 (1.258)
15570700 (1.056)


In [38]:
%%bash -s "{case_5k['python']['small_chunk']}" "{case_5k['python']['python']}" "{case_5k['python']['c']}" "{case_5k['python']['multilang']}"
# $1..$4: the python corpus prepped with the small_chunk, python, c and multilang codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "python corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 5k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n2
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

python corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 5k
57063135 (1.160)
49202444 (1.000)
57349576 (1.166)
53720408 (1.092)


In [39]:
%%bash -s "{case_5k['c']['small_chunk']}" "{case_5k['c']['python']}" "{case_5k['c']['c']}" "{case_5k['c']['multilang']}"
# $1..$4: the c corpus prepped with the small_chunk, python, c and multilang codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "C corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 5k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n3
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

C corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 5k
127171417 (1.176)
121329011 (1.122)
108163343 (1.000)
119247556 (1.102)


In [2]:
%%bash -s "{case_5k['multilang']['small_chunk']}" "{case_5k['multilang']['python']}" "{case_5k['multilang']['c']}" "{case_5k['multilang']['multilang']}"
# $1..$4: the multilang corpus prepped with the small_chunk, python, c and multilang codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "Multilang corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 5k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n4
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

Multilang corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 5k
163899158 (1.053)
168195949 (1.080)
178548761 (1.147)
155673541 (1.000)


#### prep with 10K merges

In [8]:
%%bash -s "{case_10k['small_chunk']['small_chunk']}" "{case_10k['small_chunk']['python']}" "{case_10k['small_chunk']['c']}" "{case_10k['small_chunk']['multilang']}"
# $1..$4: the small_chunk (Java) corpus prepped with the small_chunk, python, c and multilang codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "java corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 10k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n1
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

java corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 10k
13967128 (1.000)
15561461 (1.114)
16852196 (1.207)
14693720 (1.052)


In [9]:
%%bash -s "{case_10k['python']['small_chunk']}" "{case_10k['python']['python']}" "{case_10k['python']['c']}" "{case_10k['python']['multilang']}"
# $1..$4: the python corpus prepped with the small_chunk, python, c and multilang codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "python corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 10k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n2
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

python corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 10k
51728981 (1.106)
46782807 (1.000)
53034001 (1.134)
50152730 (1.072)


In [3]:
%%bash -s "{case_10k['c']['small_chunk']}" "{case_10k['c']['python']}" "{case_10k['c']['c']}" "{case_10k['c']['multilang']}"
# $1..$4: the c corpus prepped with the small_chunk, python, c and multilang codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "C corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 10k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n3
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

C corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 10k
117734468 (1.150)
114997850 (1.124)
102343447 (1.000)
112004234 (1.094)


In [4]:
%%bash -s "{case_10k['multilang']['small_chunk']}" "{case_10k['multilang']['python']}" "{case_10k['multilang']['c']}" "{case_10k['multilang']['multilang']}"
# $1..$4: the multilang corpus prepped with the small_chunk, python, c and multilang codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "Multilang corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 10k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n4
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

Multilang corpus prep with bpe codes learn on the corpora: java, python, c, multilang; CASE, 10k
153638097 (1.046)
157756331 (1.074)
165114259 (1.124)
146940192 (1.000)


#### prep with 10K merges, NO case

In [3]:
%%bash -s "{nocase_10k['small_chunk']['small_chunk']}" "{nocase_10k['small_chunk']['python']}" "{nocase_10k['small_chunk']['c']}" "{nocase_10k['small_chunk']['multilang']}"
# $1..$4: the small_chunk (Java) corpus prepped with the small_chunk, python, c and multilang no-case codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "java corpus prep with bpe codes learn on the corpora: java, python, c, multilang; NO CASE, 10k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n1
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

java corpus prep with bpe codes learn on the corpora: java, python, c, multilang; NO CASE, 10k
15796897 (1.000)
16635850 (1.053)
17426390 (1.103)
16317394 (1.033)


In [4]:
%%bash -s "{nocase_10k['python']['small_chunk']}" "{nocase_10k['python']['python']}" "{nocase_10k['python']['c']}" "{nocase_10k['python']['multilang']}"
# $1..$4: the python corpus prepped with the small_chunk, python, c and multilang no-case codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "python corpus prep with bpe codes learn on the corpora: java, python, c, multilang; NO CASE, 10k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n2
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

python corpus prep with bpe codes learn on the corpora: java, python, c, multilang; NO CASE, 10k
51548877 (1.075)
47961837 (1.000)
52520017 (1.095)
50778376 (1.059)


In [5]:
%%bash -s "{nocase_10k['c']['small_chunk']}" "{nocase_10k['c']['python']}" "{nocase_10k['c']['c']}" "{nocase_10k['c']['multilang']}"
# $1..$4: the c corpus prepped with the small_chunk, python, c and multilang no-case codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "C corpus prep with bpe codes learn on the corpora: java, python, c, multilang; NO CASE, 10k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n3
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

C corpus prep with bpe codes learn on the corpora: java, python, c, multilang; NO CASE, 10k
118138217 (1.115)
116201726 (1.097)
105908316 (1.000)
113480561 (1.071)


In [7]:
%%bash -s "{nocase_10k['multilang']['small_chunk']}" "{nocase_10k['multilang']['python']}" "{nocase_10k['multilang']['c']}" "{nocase_10k['multilang']['multilang']}"
# $1..$4: the multilang corpus prepped with the small_chunk, python, c and multilang no-case codes.
# `tc` is not defined in this notebook; presumably provided by ~/.bashrc — TODO confirm.
echo "Multilang corpus prep with bpe codes learn on the corpora: java, python, c, multilang; NO CASE, 10k"
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
n4=$(tc "$4")
# Baseline: the count obtained with the corpus's own codes.
n=$n4
for ni in $n1 $n2 $n3 $n4; do
    # Python prints each count with its ratio to the baseline, 3 decimals.
    echo "print(f\"${ni} ({${ni}/${n}:.3f})\")" | python
done

Multilang corpus prep with bpe codes learn on the corpora: java, python, c, multilang; NO CASE, 10k
163945551 (1.034)
164895222 (1.040)
169527200 (1.069)
158592401 (1.000)
