# Python vs Java vs C Experiments

### Setting things up

In [28]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
import os
from pandas import DataFrame
from typing import List

from metrics.vector import pearson, cooccurences, summarize_cooccurences
from metrics.merge import Merge, read_merges
from metrics.matrix import pearson_matrix, cooccurence_matrix

### Loading merges

In [30]:
# Merge files for the case-sensitive BPE runs; the Java run is the one stored
# under the "small_chunk" directory.
java_merges_file = "/home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha/bpe/small_chunk_19-05-22T17-59-55/10000/merges.txt"
python_merges_file = "/home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha/bpe/python_19-05-20T00-48-25/10000/merges.txt"
c_merges_file = "/home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha/bpe/c_19-05-24T08-55-52/10000/merges.txt"

# Merge files for the "nocase" BPE runs of the same three languages.
java_nocase_merges_file = "/home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha/bpe/small_chunk_nocase_19-05-24T19-40-05/10000/merges.txt"
python_nocase_merges_file = "/home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha/bpe/python_nocase_19-05-24T19-41-52/10000/merges.txt"
c_nocase_merges_file = "/home/lv71161/hlibbabii/.config/dataprep/1.0.0-alpha/bpe/c_nocase_19-05-24T19-41-10/10000/merges.txt"

# Parse the files in the order Java, Python, C: first the case-sensitive set,
# then the "nocase" set.
java_merges, python_merges, c_merges = map(
    read_merges,
    (java_merges_file, python_merges_file, c_merges_file),
)
java_nocase_merges, python_nocase_merges, c_nocase_merges = map(
    read_merges,
    (java_nocase_merges_file, python_nocase_merges_file, c_nocase_merges_file),
)

In [31]:
# Report how many Python merges were loaded, then display the first ten entries
# of the case-sensitive and the "nocase" merge lists as a pair for comparison.
print(f"N merges: {len(python_merges)}")
python_merges[:10], python_nocase_merges[:10]

N merges: 10000


([('\\', '\\'): 5050789,
  ('\\', "'"): 1534342,
  ('s', 'e'): 1113570,
  ('t', 'e'): 855855,
  ('i', 'n'): 852704,
  ('o', 'n'): 851730,
  ('r', 'e'): 840195,
  ('s', 't'): 711704,
  ('l', 'e'): 650553,
  ('a', 'l'): 548889],
 [('\\', '\\'): 10101578,
  ('\\', "'"): 3068684,
  ('s', 'e'): 2323188,
  ('i', 'n'): 1870736,
  ('t', 'e'): 1856902,
  ('r', 'e'): 1790498,
  ('o', 'n'): 1758958,
  ('s', 't'): 1490566,
  ('l', 'e'): 1376906,
  ('a', 'l'): 1154498])

In [32]:
# Report how many Java merges were loaded, then display the first ten entries
# of the case-sensitive and the "nocase" merge lists as a pair for comparison.
print(f"N merges: {len(java_merges)}")
java_merges[:10], java_nocase_merges[:10]

N merges: 10000


([('\\', '\\'): 1880641,
  ('e', 'r'): 447509,
  ('o', 'n'): 404945,
  ('\\', 't'): 319763,
  ('o', 'r'): 311230,
  ('t', 'i'): 308509,
  ('i', 'n'): 274370,
  ('e', 't'): 255731,
  ('e', 'n'): 243685,
  ('e', 's'): 242396],
 [('\\', '\\'): 1880641,
  ('e', 'r'): 462971,
  ('o', 'n'): 424580,
  ('i', 'n'): 350414,
  ('r', 'e'): 334290,
  ('\\', 't'): 319763,
  ('t', 'i'): 314714,
  ('s', 't'): 309712,
  ('o', 'r'): 308089,
  ('e', 'n'): 247251])

In [33]:
# Report how many C merges were loaded, then display the first ten entries
# of the case-sensitive and the "nocase" merge lists as a pair for comparison.
print(f"N merges: {len(c_merges)}")
c_merges[:10], c_nocase_merges[:10]

N merges: 10000


([('\\', '\\'): 16840145,
  ('i', 'n'): 1721676,
  ('r', 'e'): 1717501,
  ('d', 'e'): 1430342,
  ('s', 't'): 1307662,
  ('a', 't'): 1189279,
  ('e', 'r'): 1139720,
  ('0', '0'): 986828,
  ('t', 'i'): 825342,
  ('s', 'e'): 771446],
 [('\\', '\\'): 16779596,
  ('i', 'n'): 2026487,
  ('r', 'e'): 1962547,
  ('d', 'e'): 1634017,
  ('s', 't'): 1480094,
  ('e', 'r'): 1373186,
  ('a', 't'): 1338654,
  ('0', '0'): 960346,
  ('s', 'e'): 928712,
  ('t', 'i'): 899107])

### Metric 1. Pearson

In [34]:
# Metric 1 on the first 1000 merges of the case-sensitive lists.
# Inputs are passed in the order Java, Python, C; the matrix indices 0/1/2
# presumably follow that order -- confirm against pearson_matrix.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, case")
m = pearson_matrix(
    [merges[:n_merges] for merges in (java_merges, python_merges, c_merges)]
)
DataFrame(m).round(3)

N merges: 1k, case


Unnamed: 0,0,1,2
0,1.0,0.864,0.895
1,0.864,1.0,0.893
2,0.895,0.893,1.0


In [35]:
# Metric 1 on the first 1000 merges of the "nocase" lists.
# Inputs are passed in the order Java, Python, C; the matrix indices 0/1/2
# presumably follow that order -- confirm against pearson_matrix.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, NO case")
m = pearson_matrix(
    [
        merges[:n_merges]
        for merges in (java_nocase_merges, python_nocase_merges, c_nocase_merges)
    ]
)
DataFrame(m).round(3)

N merges: 1k, NO case


Unnamed: 0,0,1,2
0,1.0,0.896,0.901
1,0.896,1.0,0.892
2,0.901,0.892,1.0


In [36]:
# Metric 1 on the first 10000 merges of the case-sensitive lists.
# Inputs are passed in the order Java, Python, C; the matrix indices 0/1/2
# presumably follow that order -- confirm against pearson_matrix.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, case")
m = pearson_matrix(
    [merges[:n_merges] for merges in (java_merges, python_merges, c_merges)]
)
DataFrame(m).round(3)

N merges: 10k, case


Unnamed: 0,0,1,2
0,1.0,0.869,0.896
1,0.869,1.0,0.894
2,0.896,0.894,1.0


In [37]:
# Metric 1 on the first 10000 merges of the "nocase" lists.
# Inputs are passed in the order Java, Python, C; the matrix indices 0/1/2
# presumably follow that order -- confirm against pearson_matrix.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, NO case")
m = pearson_matrix(
    [
        merges[:n_merges]
        for merges in (java_nocase_merges, python_nocase_merges, c_nocase_merges)
    ]
)
DataFrame(m).round(3)

N merges: 10k, NO case


Unnamed: 0,0,1,2
0,1.0,0.9,0.902
1,0.9,1.0,0.894
2,0.902,0.894,1.0


### Metric 2. BPE merge co-occurrences

In [38]:
# Metric 2 on the first 1000 merges of the case-sensitive lists.
# The same list (Java, Python, C) is supplied for both arguments, so every
# language is compared with every language, itself included.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, case")
lst = [merges[:n_merges] for merges in (java_merges, python_merges, c_merges)]
m = cooccurence_matrix(lst, lst)
DataFrame(m).round(3)

N merges: 1k, case


Unnamed: 0,0,1,2
0,1.0,0.365,0.32
1,0.365,1.0,0.404
2,0.32,0.404,1.0


In [39]:
# Metric 2 on the first 1000 merges of the "nocase" lists.
# The same list (Java, Python, C) is supplied for both arguments, so every
# language is compared with every language, itself included.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, NO case")
lst = [
    merges[:n_merges]
    for merges in (java_nocase_merges, python_nocase_merges, c_nocase_merges)
]
m = cooccurence_matrix(lst, lst)
DataFrame(m).round(3)

N merges: 1k, NO case


Unnamed: 0,0,1,2
0,1.0,0.47,0.413
1,0.47,1.0,0.43
2,0.413,0.43,1.0


In [40]:
# Metric 2 on the first 10000 merges of the case-sensitive lists.
# The same list (Java, Python, C) is supplied for both arguments, so every
# language is compared with every language, itself included.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, case")
lst = [merges[:n_merges] for merges in (java_merges, python_merges, c_merges)]
m = cooccurence_matrix(lst, lst)
DataFrame(m).round(3)

N merges: 10k, case


Unnamed: 0,0,1,2
0,1.0,0.347,0.299
1,0.347,1.0,0.359
2,0.299,0.359,1.0


In [41]:
# Metric 2 on the first 10000 merges of the "nocase" lists.
# The same list (Java, Python, C) is supplied for both arguments, so every
# language is compared with every language, itself included.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, NO case")
lst = [
    merges[:n_merges]
    for merges in (java_nocase_merges, python_nocase_merges, c_nocase_merges)
]
m = cooccurence_matrix(lst, lst)
DataFrame(m).round(3)

N merges: 10k, NO case


Unnamed: 0,0,1,2
0,1.0,0.391,0.322
1,0.391,1.0,0.333
2,0.322,0.333,1.0


### Loading vocabs

In [42]:
from dataprep import bperegistry
from metrics.matrix import merge_similarity_rate_matrix
from pandas import DataFrame

# Every group below is loaded in the order Java ("small_chunk"), Python, C.

# Case-sensitive runs: "base" vocabularies.
java_bpe_vocab, python_bpe_vocab, c_bpe_vocab = map(
    bperegistry.load_base_vocab,
    ('small_chunk-10000', 'python-10000', 'c-10000'),
)

# Case-sensitive runs: "all" vocabularies.
java_all_vocab, python_all_vocab, c_all_vocab = map(
    bperegistry.load_all_vocab,
    ('small_chunk-10000', 'python-10000', 'c-10000'),
)

# "nocase" runs: "base" vocabularies.
java_nocase_bpe_vocab, python_nocase_bpe_vocab, c_nocase_bpe_vocab = map(
    bperegistry.load_base_vocab,
    ('small_chunk_nocase-10000', 'python_nocase-10000', 'c_nocase-10000'),
)

# "nocase" runs: "all" vocabularies.
java_nocase_all_vocab, python_nocase_all_vocab, c_nocase_all_vocab = map(
    bperegistry.load_all_vocab,
    ('small_chunk_nocase-10000', 'python_nocase-10000', 'c_nocase-10000'),
)

In [43]:
# Show the first 200 (key, value) pairs of the Java base vocabulary.
# The whole mapping is turned into a list before it is sliced.
list(java_bpe_vocab.items())[:200]

[('\\\\', 1880641),
 ('n', 1133884),
 ('t', 488942),
 ('\\t', 319763),
 ('get', 108939),
 ('the', 107340),
 ('_', 76549),
 ('String', 63861),
 ('nimport', 61279),
 ('Exception', 42147),
 ('org', 41915),
 ('of', 36648),
 ('0', 34004),
 ('is', 33581),
 ('to', 33490),
 ('set', 33190),
 ('1', 31542),
 ('License', 30577),
 ('tpublic', 27799),
 ('a', 27661),
 ('2', 24965),
 ('Name', 22715),
 ("\\'", 22005),
 ('List', 19667),
 ('or', 19356),
 ('in', 19339),
 ('java', 18556),
 ('Id', 18171),
 ('param', 16241),
 ('treturn', 15694),
 ('value', 15614),
 ('and', 15602),
 ('add', 15198),
 ('i', 14702),
 ('Request', 14334),
 ('com', 14189),
 ('Context', 13357),
 ('Class', 13348),
 ('File', 12920),
 ('Object', 12839),
 ('not', 12626),
 ('under', 12425),
 ('tif', 12359),
 ('file', 12128),
 ('Map', 12062),
 ('The', 11932),
 ('e', 11838),
 ('Type', 11817),
 ('Test', 11796),
 ('util', 11577),
 ('Override', 11069),
 ('it', 11054),
 ('with', 11035),
 ('name', 10841),
 ('Value', 10808),
 ('tprivate', 10728)

In [44]:
# Show the first 200 (key, value) pairs of the Python base vocabulary.
# The whole mapping is turned into a list before it is sliced.
list(python_bpe_vocab.items())[:200]

[('\\\\', 5050789),
 ('n', 3221967),
 ('_', 2141551),
 ("\\'", 1534342),
 ('self', 511907),
 ('\\t', 217064),
 ('1', 207439),
 ('the', 197437),
 ('0', 189946),
 ('name', 137381),
 ('2', 132927),
 ('to', 127392),
 ('a', 120004),
 ('is', 118939),
 ('in', 117364),
 ('module', 112276),
 ('type', 95567),
 ('test', 94931),
 ('None', 87886),
 ('of', 87883),
 ('x', 85059),
 ('get', 77428),
 ('not', 75567),
 ('s', 73069),
 ('3', 70776),
 ('True', 69080),
 ('and', 69037),
 ('Error', 62479),
 ('description', 61324),
 ('result', 59035),
 ('data', 58871),
 ('dict', 57964),
 ('False', 57683),
 ('Equal', 55747),
 ('value', 52758),
 ('str', 52715),
 ('or', 50878),
 ('b', 48141),
 ('path', 44879),
 ('list', 43963),
 ('5', 43276),
 ('params', 41616),
 ('state', 41487),
 ('be', 40961),
 ('default', 40519),
 ('set', 38323),
 ('np', 38127),
 ('4', 37726),
 ('nfrom', 35580),
 ('f', 35243),
 ('c', 35197),
 ('id', 35104),
 ('key', 34944),
 ('check', 34599),
 ('required', 33381),
 ('index', 32760),
 ('X', 3244

In [45]:
# Show the first 200 (key, value) pairs of the C base vocabulary.
# The whole mapping is turned into a list before it is sliced.
list(c_bpe_vocab.items())[:200]

[('\\\\', 16840145),
 ('_', 7596271),
 ('n', 7531353),
 ('t', 5777498),
 ('0', 1003819),
 ('x', 560986),
 ('\\t', 487831),
 ('tif', 426522),
 ('1', 381609),
 ('the', 333974),
 ('2', 325263),
 ('treturn', 260662),
 ('dev', 255920),
 ('i', 246728),
 ('to', 223360),
 ('nstatic', 207303),
 ('tstruct', 156039),
 ('data', 145395),
 ('3', 139210),
 ('4', 131559),
 ('p', 131009),
 ('s', 126695),
 ('h', 124464),
 ('is', 119234),
 ('device', 119054),
 ('0x00', 117527),
 ('of', 116942),
 ('32', 116044),
 ('a', 114089),
 ('8', 113436),
 ('include', 109705),
 ('d', 102598),
 ('c', 102308),
 ('in', 99357),
 ("\\'", 99278),
 ('lock', 99030),
 ('info', 97579),
 ('tint', 94022),
 ('and', 93588),
 ('u', 92819),
 ('NULL', 92492),
 ('00', 86579),
 ('flags', 85326),
 ('define', 81467),
 ('tcase', 80298),
 ('len', 77907),
 ('set', 76541),
 ('size', 76022),
 ('addr', 75307),
 ('16', 74868),
 ('port', 74310),
 ('get', 71993),
 ('name', 71523),
 ('state', 69902),
 ('init', 69831),
 ('tbreak', 69128),
 ('write'

### Metric 3. Merge similarity rate matrix

In [46]:
# Metric 3 with the base vocabularies and the first 1000 case-sensitive merges,
# both supplied in the order Java, Python, C.
# NOTE(review): the recorded result is not symmetric, so it matters which axis
# is the vocabulary and which is the merge list -- confirm against
# merge_similarity_rate_matrix.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, case, without keywords and operators")
m = merge_similarity_rate_matrix(
    [java_bpe_vocab, python_bpe_vocab, c_bpe_vocab],
    [merges[:n_merges] for merges in (java_merges, python_merges, c_merges)],
)

DataFrame(m).round(3)

N merges: 1k, case, without keywords and operators


Unnamed: 0,0,1,2
0,1.0,0.72,0.673
1,0.737,1.0,0.751
2,0.753,0.784,1.0


In [47]:
# Metric 3 with the "all" vocabularies and the first 1000 case-sensitive merges,
# both supplied in the order Java, Python, C.
# NOTE(review): the recorded result is not symmetric, so it matters which axis
# is the vocabulary and which is the merge list -- confirm against
# merge_similarity_rate_matrix.
n_merges = 1000
print(f"N merges: {n_merges//1000}k, case, including keywords and operators")
m = merge_similarity_rate_matrix(
    [java_all_vocab, python_all_vocab, c_all_vocab],
    [merges[:n_merges] for merges in (java_merges, python_merges, c_merges)],
)

DataFrame(m).round(3)

N merges: 1k, case, including keywords and operators


Unnamed: 0,0,1,2
0,1.0,0.814,0.785
1,0.822,1.0,0.832
2,0.837,0.86,1.0


In [48]:
# Metric 3 with the "nocase" base vocabularies and the first 10000 "nocase"
# merges, both supplied in the order Java, Python, C.
# NOTE(review): the recorded result is not symmetric, so it matters which axis
# is the vocabulary and which is the merge list -- confirm against
# merge_similarity_rate_matrix.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, NO case, without keywords and operators")
m = merge_similarity_rate_matrix(
    [java_nocase_bpe_vocab, python_nocase_bpe_vocab, c_nocase_bpe_vocab],
    [
        merges[:n_merges]
        for merges in (java_nocase_merges, python_nocase_merges, c_nocase_merges)
    ],
)

DataFrame(m).round(3)

N merges: 10k, NO case, without keywords and operators


Unnamed: 0,0,1,2
0,1.0,0.939,0.897
1,0.935,1.0,0.916
2,0.89,0.893,1.0


In [49]:
# Metric 3 with the "nocase" "all" vocabularies and the first 10000 "nocase"
# merges, both supplied in the order Java, Python, C.
# NOTE(review): the recorded result is not symmetric, so it matters which axis
# is the vocabulary and which is the merge list -- confirm against
# merge_similarity_rate_matrix.
n_merges = 10000
print(f"N merges: {n_merges//1000}k, NO case, including keywords and operators")
m = merge_similarity_rate_matrix(
    [java_nocase_all_vocab, python_nocase_all_vocab, c_nocase_all_vocab],
    [
        merges[:n_merges]
        for merges in (java_nocase_merges, python_nocase_merges, c_nocase_merges)
    ],
)

DataFrame(m).round(3)

N merges: 10k, NO case, including keywords and operators


Unnamed: 0,0,1,2
0,1.0,0.957,0.917
1,0.953,1.0,0.935
2,0.891,0.894,1.0


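### Metric 4. Corpora sizes

Each cell below takes one corpus preprocessed with BPE codes learned on Java, Python and C, and prints the size reported for each variant by the `tc` shell helper. The number in parentheses is the ratio to the size obtained with the corpus's own codes.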

### prep with 5K merges

In [58]:
# Preprocessed corpora for the 5000-merge experiments.
# Naming: <corpus>_<codes>_path is the <corpus> dataset preprocessed with the
# BPE codes named by the path suffix ("small_chunk" is the Java run).
# NOTE(review): the 10K-merge section assigns to these same variable names, so
# the %%bash cells read whichever path cell was executed most recently.

# Java corpus.
java_java_path = "/home/lv71161/hlibbabii/raw_datasets/allamanis/small_chunk_19-05-22T17-59-55_preprocessed_00900_small_chunk-5000"
java_python_path = "/home/lv71161/hlibbabii/raw_datasets/allamanis/small_chunk_19-05-22T17-59-55_preprocessed_00900_python-5000"
java_c_path = "/home/lv71161/hlibbabii/raw_datasets/allamanis/small_chunk_19-05-22T17-59-55_preprocessed_00900_c-5000"

# Python corpus.
python_java_path = "/home/lv71161/hlibbabii/raw_datasets/python_19-05-20T00-48-25_preprocessed_00900_small_chunk-5000"
python_python_path = "/home/lv71161/hlibbabii/raw_datasets/python_19-05-20T00-48-25_preprocessed_00900_python-5000"
python_c_path = "/home/lv71161/hlibbabii/raw_datasets/python_19-05-20T00-48-25_preprocessed_00900_c-5000"

# C corpus.
c_java_path = "/home/lv71161/hlibbabii/raw_datasets/c_19-05-24T08-55-52_preprocessed_00900_small_chunk-5000"
c_python_path = "/home/lv71161/hlibbabii/raw_datasets/c_19-05-24T08-55-52_preprocessed_00900_python-5000"
c_c_path= "/home/lv71161/hlibbabii/raw_datasets/c_19-05-24T08-55-52_preprocessed_00900_c-5000"

In [51]:
%%bash -s "$java_java_path" "$java_python_path" "$java_c_path"
# $1..$3: the Java corpus preprocessed with BPE codes learned on Java, Python
# and C respectively.
echo "java corpus prep with bpe codes learn on the corpora: java, python, c; CASE"
# `tc` is not defined in this cell; it is expected to come from ~/.bashrc.
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
# Baseline for the ratios: the corpus encoded with its own (Java) codes.
n=$n1
# Print every count followed by its ratio to the baseline. awk performs the
# floating-point division directly, so the cell no longer generates Python
# source and pipes it into whichever `python` happens to be first on PATH
# (which fails on an interpreter without f-string support).
for ni in $n1 $n2 $n3; do
    awk -v ni="$ni" -v n="$n" 'BEGIN { printf "%s (%.3f)\n", ni, ni / n }'
done

java corpus prep with bpe codes learn on the corpora: java, python, c; CASE
16646436 (1.000)
18744497 (1.126)
20742165 (1.246)


In [52]:
%%bash -s "$python_java_path" "$python_python_path" "$python_c_path"
# $1..$3: the Python corpus preprocessed with BPE codes learned on Java, Python
# and C respectively.
echo "python corpus prep with bpe codes learn on the corpora: java, python, c; CASE"
# `tc` is not defined in this cell; it is expected to come from ~/.bashrc.
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
# Baseline for the ratios: the corpus encoded with its own (Python) codes.
n=$n2
# Print every count followed by its ratio to the baseline. awk performs the
# floating-point division directly, so the cell no longer generates Python
# source and pipes it into whichever `python` happens to be first on PATH
# (which fails on an interpreter without f-string support).
for ni in $n1 $n2 $n3; do
    awk -v ni="$ni" -v n="$n" 'BEGIN { printf "%s (%.3f)\n", ni, ni / n }'
done

python corpus prep with bpe codes learn on the corpora: java, python, c; CASE
56605830 (1.164)
48636302 (1.000)
57059920 (1.173)


In [53]:
%%bash -s "$c_java_path" "$c_python_path" "$c_c_path"
# $1..$3: the C corpus preprocessed with BPE codes learned on Java, Python
# and C respectively.
echo "c corpus prep with bpe codes learn on the corpora: java, python, c; CASE"
# `tc` is not defined in this cell; it is expected to come from ~/.bashrc.
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
# Baseline for the ratios: the corpus encoded with its own (C) codes.
n=$n3
# Print every count followed by its ratio to the baseline. awk performs the
# floating-point division directly, so the cell no longer generates Python
# source and pipes it into whichever `python` happens to be first on PATH
# (which fails on an interpreter without f-string support).
for ni in $n1 $n2 $n3; do
    awk -v ni="$ni" -v n="$n" 'BEGIN { printf "%s (%.3f)\n", ni, ni / n }'
done

c corpus prep with bpe codes learn on the corpora: java, python, c; CASE
150253792 (1.162)
148516447 (1.149)
129288362 (1.000)


### prep with 10K merges

In [60]:
# Preprocessed corpora for the 10000-merge experiments.
# Naming: <corpus>_<codes>_path is the <corpus> dataset preprocessed with the
# BPE codes named by the path suffix ("small_chunk" is the Java run).
# NOTE(review): these assignments rebind the variable names used by the
# 5K-merge section, so the %%bash cells read whichever path cell was executed
# most recently.

# Java corpus.
java_java_path = "/home/lv71161/hlibbabii/raw_datasets/allamanis/small_chunk_19-05-22T17-59-55_preprocessed_00900_small_chunk-10000"
java_python_path = "/home/lv71161/hlibbabii/raw_datasets/allamanis/small_chunk_19-05-22T17-59-55_preprocessed_00900_python-10000"
java_c_path = "/home/lv71161/hlibbabii/raw_datasets/allamanis/small_chunk_19-05-22T17-59-55_preprocessed_00900_c-10000"

# Python corpus.
python_java_path = "/home/lv71161/hlibbabii/raw_datasets/python_19-05-20T00-48-25_preprocessed_00900_small_chunk-10000"
python_python_path = "/home/lv71161/hlibbabii/raw_datasets/python_19-05-20T00-48-25_preprocessed_00900_python-10000"
python_c_path = "/home/lv71161/hlibbabii/raw_datasets/python_19-05-20T00-48-25_preprocessed_00900_c-10000"

# C corpus.
c_java_path = "/home/lv71161/hlibbabii/raw_datasets/c_19-05-24T08-55-52_preprocessed_00900_small_chunk-10000"
c_python_path = "/home/lv71161/hlibbabii/raw_datasets/c_19-05-24T08-55-52_preprocessed_00900_python-10000"
c_c_path= "/home/lv71161/hlibbabii/raw_datasets/c_19-05-24T08-55-52_preprocessed_00900_c-10000"

In [61]:
%%bash -s "$java_java_path" "$java_python_path" "$java_c_path"
# $1..$3: the Java corpus preprocessed with BPE codes learned on Java, Python
# and C respectively.
echo "java corpus prep with bpe codes learn on the corpora: java, python, c; CASE"
# `tc` is not defined in this cell; it is expected to come from ~/.bashrc.
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
# Baseline for the ratios: the corpus encoded with its own (Java) codes.
n=$n1
# Print every count followed by its ratio to the baseline. awk performs the
# floating-point division directly, so the cell no longer generates Python
# source and pipes it into whichever `python` happens to be first on PATH
# (which fails on an interpreter without f-string support).
for ni in $n1 $n2 $n3; do
    awk -v ni="$ni" -v n="$n" 'BEGIN { printf "%s (%.3f)\n", ni, ni / n }'
done

java corpus prep with bpe codes learn on the corpora: java, python, c; CASE
15776027 (1.000)
17508868 (1.110)
19036770 (1.207)


In [65]:
%%bash -s "$python_java_path" "$python_python_path" "$python_c_path"
# $1..$3: the Python corpus preprocessed with BPE codes learned on Java, Python
# and C respectively.
echo "python corpus prep with bpe codes learn on the corpora: java, python, c; CASE"
# `tc` is not defined in this cell; it is expected to come from ~/.bashrc.
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
# Baseline for the ratios: the corpus encoded with its own (Python) codes.
n=$n2
# Print every count followed by its ratio to the baseline. awk performs the
# floating-point division directly, so the cell no longer generates Python
# source and pipes it into whichever `python` happens to be first on PATH
# (which fails on an interpreter without f-string support).
for ni in $n1 $n2 $n3; do
    awk -v ni="$ni" -v n="$n" 'BEGIN { printf "%s (%.3f)\n", ni, ni / n }'
done

python corpus prep with bpe codes learn on the corpora: java, python, c; CASE
51359176 (1.113)
46159411 (1.000)
52852048 (1.145)


In [63]:
%%bash -s "$c_java_path" "$c_python_path" "$c_c_path"
# $1..$3: the C corpus preprocessed with BPE codes learned on Java, Python
# and C respectively.
echo "c corpus prep with bpe codes learn on the corpora: java, python, c; CASE"
# `tc` is not defined in this cell; it is expected to come from ~/.bashrc.
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
# Baseline for the ratios: the corpus encoded with its own (C) codes.
n=$n3
# Print every count followed by its ratio to the baseline. awk performs the
# floating-point division directly, so the cell no longer generates Python
# source and pipes it into whichever `python` happens to be first on PATH
# (which fails on an interpreter without f-string support).
for ni in $n1 $n2 $n3; do
    awk -v ni="$ni" -v n="$n" 'BEGIN { printf "%s (%.3f)\n", ni, ni / n }'
done

c corpus prep with bpe codes learn on the corpora: java, python, c; CASE
140787203 (1.144)
140231349 (1.139)
123069450 (1.000)


### prep with 10K merges, NO case

In [54]:
# Preprocessed "nocase" corpora for the 10000-merge experiments.
# Naming: <corpus>_<codes>_nocase_path is the <corpus> dataset preprocessed with
# the BPE codes named by the path suffix ("small_chunk_nocase" is the Java run).

# Java corpus.
java_java_nocase_path="/home/lv71161/hlibbabii/raw_datasets/allamanis/small_chunk_nocase_19-05-24T19-40-05_preprocessed_00901_small_chunk_nocase-10000"
java_python_nocase_path="/home/lv71161/hlibbabii/raw_datasets/allamanis/small_chunk_nocase_19-05-24T19-40-05_preprocessed_00901_python_nocase-10000"
java_c_nocase_path="/home/lv71161/hlibbabii/raw_datasets/allamanis/small_chunk_nocase_19-05-24T19-40-05_preprocessed_00901_c_nocase-10000"

# Python corpus.
python_java_nocase_path="/home/lv71161/hlibbabii/raw_datasets/python_nocase_19-05-24T19-41-52_preprocessed_00901_small_chunk_nocase-10000"
python_python_nocase_path="/home/lv71161/hlibbabii/raw_datasets/python_nocase_19-05-24T19-41-52_preprocessed_00901_python_nocase-10000"
python_c_nocase_path="/home/lv71161/hlibbabii/raw_datasets/python_nocase_19-05-24T19-41-52_preprocessed_00901_c_nocase-10000"

# C corpus.
c_java_nocase_path="/home/lv71161/hlibbabii/raw_datasets/c_nocase_19-05-24T19-41-10_preprocessed_00901_small_chunk_nocase-10000"
c_python_nocase_path="/home/lv71161/hlibbabii/raw_datasets/c_nocase_19-05-24T19-41-10_preprocessed_00901_python_nocase-10000"
c_c_nocase_path="/home/lv71161/hlibbabii/raw_datasets/c_nocase_19-05-24T19-41-10_preprocessed_00901_c_nocase-10000"

In [55]:
%%bash -s "$java_java_nocase_path" "$java_python_nocase_path" "$java_c_nocase_path"
# $1..$3: the "nocase" Java corpus preprocessed with BPE codes learned on Java,
# Python and C respectively.
echo "java corpus prep with bpe codes learn on the corpora: java, python, c; NO case"
# `tc` is not defined in this cell; it is expected to come from ~/.bashrc.
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
# Baseline for the ratios: the corpus encoded with its own (Java) codes.
n=$n1
# Print every count followed by its ratio to the baseline. awk performs the
# floating-point division directly, so the cell no longer generates Python
# source and pipes it into whichever `python` happens to be first on PATH
# (which fails on an interpreter without f-string support).
for ni in $n1 $n2 $n3; do
    awk -v ni="$ni" -v n="$n" 'BEGIN { printf "%s (%.3f)\n", ni, ni / n }'
done

java corpus prep with bpe codes learn on the corpora: java, python, c; NO case
17532443 (1.000)
18533237 (1.057)
19495142 (1.112)


In [56]:
%%bash -s "$python_java_nocase_path" "$python_python_nocase_path" "$python_c_nocase_path"
# $1..$3: the "nocase" Python corpus preprocessed with BPE codes learned on
# Java, Python and C respectively.
echo "python corpus prep with bpe codes learn on the corpora: java, python, c; NO case"
# `tc` is not defined in this cell; it is expected to come from ~/.bashrc.
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
# Baseline for the ratios: the corpus encoded with its own (Python) codes.
n=$n2
# Print every count followed by its ratio to the baseline. awk performs the
# floating-point division directly, so the cell no longer generates Python
# source and pipes it into whichever `python` happens to be first on PATH
# (which fails on an interpreter without f-string support).
for ni in $n1 $n2 $n3; do
    awk -v ni="$ni" -v n="$n" 'BEGIN { printf "%s (%.3f)\n", ni, ni / n }'
done

python corpus prep with bpe codes learn on the corpora: java, python, c; NO case
103404272 (1.100)
94029664 (1.000)
104268904 (1.109)


In [57]:
%%bash -s "$c_java_nocase_path" "$c_python_nocase_path" "$c_c_nocase_path"
# $1..$3: the "nocase" C corpus preprocessed with BPE codes learned on Java,
# Python and C respectively.
echo "c corpus prep with bpe codes learn on the corpora: java, python, c; NO case"
# `tc` is not defined in this cell; it is expected to come from ~/.bashrc.
source "$HOME/.bashrc"
n1=$(tc "$1")
n2=$(tc "$2")
n3=$(tc "$3")
# Baseline for the ratios: the corpus encoded with its own (C) codes.
n=$n3
# Print every count followed by its ratio to the baseline. awk performs the
# floating-point division directly, so the cell no longer generates Python
# source and pipes it into whichever `python` happens to be first on PATH
# (which fails on an interpreter without f-string support).
for ni in $n1 $n2 $n3; do
    awk -v ni="$ni" -v n="$n" 'BEGIN { printf "%s (%.3f)\n", ni, ni / n }'
done

c corpus prep with bpe codes learn on the corpora: java, python, c; NO case
140329184 (1.113)
140093125 (1.111)
126129377 (1.000)
