In [2]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
     |████████████████████████████████| 11.3 MB 5.0 MB/s
Installing collected packages: pandas
Successfully installed pandas-1.3.3
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.


In [1]:
import time
import sys
import os
import copy
import random
import torch 
import numpy as np
# Expose only GPU 0 to CUDA, then pick the device string used by every `.to(DEVICE)` below.
# NOTE(review): this is set after `import torch`; it should still take effect as long as
# CUDA has not been initialised yet -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Make project-local helper modules importable; presumably `generate` and `cka`
# (imported right after this line) live in ../tools -- confirm.
sys.path.append('../tools')
from generate import generate, sent_to_ids, load_bart, load_sents, load_dict, ids_to_tokens
from cka import encdec_cka_sim, cal_encdec_mean

In [2]:
def random_sampling(sents1, sents2, num):
    """Draw `num` position-aligned pairs at random, without replacement.

    The two sequences are zipped into pairs before sampling, so the i-th item
    of the first returned list and the i-th item of the second returned list
    always come from the same position of the inputs. If the inputs differ in
    length, pairs are only formed up to the shorter one.

    Sampling uses the module-level `random` generator, so callers make the
    draw repeatable with `random.seed(...)`.

    Args:
        sents1: First sequence of sentences.
        sents2: Second sequence, paired position-by-position with `sents1`.
        num: Number of pairs to draw.

    Returns:
        A tuple `(sampled_sents1, sampled_sents2)` of two lists of length `num`.

    Raises:
        ValueError: Propagated from `random.sample` when `num` is negative or
            larger than the number of available pairs.
    """
    pairs = list(zip(sents1, sents2))
    chosen = random.sample(pairs, num)

    firsts = []
    seconds = []
    for first, second in chosen:
        firsts.append(first)
        seconds.append(second)
    return firsts, seconds

# Japanese BART

In [3]:
# pre-trained JaBART
# Checkpoint directory and file name handed to the project's `load_bart` helper.
jabart_jako_path = "../pretrained_bart/trim/jabart_jako"
jabart_jako_name = "ja_bart_base.pt"
# `pre_model` is the reference model passed as `pre=` to `encdec_cka_sim` in the cells
# below; the loaded object is moved to DEVICE.
# NOTE(review): whether `load_bart` already puts the model in eval mode is not visible
# here -- confirm, since dropout would perturb the similarity scores.
pre_model = load_bart(
    path=jabart_jako_path, model_name=jabart_jako_name
).to(DEVICE)

## Korean/Japanese

In [4]:
# dict
# Dictionary stored next to the JaBART checkpoint; passed as `pre_d` / `ft_d` below.
koja_d = load_dict("../pretrained_bart/trim/jabart_jako/dict.txt")

# sentences
# Read with an explicit encoding instead of the platform default, so the result does not
# depend on the locale of the machine running the notebook (assumes the corpora are
# UTF-8 -- confirm). `readlines()` keeps each line's trailing newline.
file_path = "../data/dev.ja"
with open(file_path, "r", encoding="utf-8") as f:
    sentences_ja = f.readlines()

file_path = "../data/dev.ko"
with open(file_path, "r", encoding="utf-8") as f:
    sentences_ko = f.readlines()

In [7]:
# ja-ko
# Fine-tuned checkpoint that is compared against `pre_model`.
ft_path = "../ja-ko/bart/checkpoints_"
ft_name = "checkpoint_best.pt"
ft_model = load_bart(
    path=ft_path, model_name=ft_name
).to(DEVICE)

# Run 1: first 100 lines; both models receive the same `sentences_ja` slice.
# perf_counter() is the clock intended for measuring elapsed intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=koja_d, ft_d=koja_d,
    pre_sents=sentences_ja[:100], ft_sents=sentences_ja[:100],
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")
print()

# Run 2: 100 randomly drawn lines, seeded so the draw is repeatable.
# Both arguments are the same list, so the two returned lists are identical; keep only
# the first instead of unpacking both results into the same name.
random.seed(1)
sampled_sents_ja = random_sampling(
    sentences_ja, sentences_ja, 100
)[0]

start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=koja_d, ft_d=koja_d,
    pre_sents=sampled_sents_ja, ft_sents=sampled_sents_ja,
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.9949951773347993
Layer 1 0.978321613472265
Layer 2 0.9707902466643927
Layer 3 0.9646707396818045
Layer 4 0.9646002217313576
Layer 5 0.9685885867051048
Layer 6 0.9706604459363375
Layer 7 0.9782752876191654

Decoder CKA
Layer 0 0.12777481907275934
Layer 1 0.29481993677082186
Layer 2 0.35793248291284
Layer 3 0.3891760720934722
Layer 4 0.4576973959278487
Layer 5 0.44532457896215877
Layer 6 0.15896837357511515

Decoder up to self attention CKA
Layer 0 0.19339110636848317
Layer 1 0.33576518366058417
Layer 2 0.41018147017254963
Layer 3 0.42937301169240033
Layer 4 0.44847606941201223
Layer 5 0.393520348780849
elapsed_time:111.73702645301819[sec]

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.9971703786815697
Layer 1 0.9812738013289333
Layer 2 0.9739238246902284
Layer 3 0.9650739860483872
Layer 4 0.9618400017708343
Layer 5 0.9660422979403386
Layer 6 0.9681227

In [8]:
# ko-ja
# Fine-tuned checkpoint that is compared against `pre_model`.
ft_path = "../ko-ja/bart/checkpoints_"
ft_name = "checkpoint_best.pt"
ft_model = load_bart(
    path=ft_path, model_name=ft_name
).to(DEVICE)

# Run 1: first 100 lines; the reference model gets `sentences_ja`, the fine-tuned model
# gets the same positions of `sentences_ko`.
# perf_counter() is the clock intended for measuring elapsed intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=koja_d, ft_d=koja_d,
    pre_sents=sentences_ja[:100], ft_sents=sentences_ko[:100],
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")
print()

# Run 2: 100 randomly drawn pairs, seeded so the draw is repeatable. `random_sampling`
# samples zipped pairs, so the two lists stay position-aligned.
random.seed(1)
sampled_sents_ja, sampled_sents_ko = random_sampling(
    sentences_ja, sentences_ko, 100
)

start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=koja_d, ft_d=koja_d,
    pre_sents=sampled_sents_ja, ft_sents=sampled_sents_ko,
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.8670065408395228
Layer 1 0.8895461912213435
Layer 2 0.8873928891970647
Layer 3 0.8871296692502944
Layer 4 0.8912271119372739
Layer 5 0.8894015121453398
Layer 6 0.8686754130533374
Layer 7 0.8919541208527568

Decoder CKA
Layer 0 0.14147046357928023
Layer 1 0.26892974630178157
Layer 2 0.33252863252456805
Layer 3 0.365625348645496
Layer 4 0.4467685761657224
Layer 5 0.42780138813384616
Layer 6 0.17588759510507052

Decoder up to self attention CKA
Layer 0 0.20081955237683285
Layer 1 0.3132207114217684
Layer 2 0.3832599460245699
Layer 3 0.4211763134670955
Layer 4 0.4506675530866451
Layer 5 0.37938939485516543
elapsed_time:108.37133860588074[sec]

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.8828052578510951
Layer 1 0.9040147464892014
Layer 2 0.8906150178104792
Layer 3 0.8838428647354586
Layer 4 0.8944273267348168
Layer 5 0.9063596033533926
Layer 6 0.888456

## English/Japanese

In [9]:
# dict
# Dictionary for the ja/en fine-tuned models; passed as `ft_d` below.
enja_d = load_dict("../pretrained_bart/trim/jabart_jaen/dict.txt")

# sentences
# NOTE: this rebinds `sentences_ja`, which an earlier cell loaded from ../data/dev.ja;
# re-running earlier comparison cells after this one would silently use this file.
# Read with an explicit encoding instead of the platform default, so the result does not
# depend on the locale of the machine running the notebook (assumes the corpora are
# UTF-8 -- confirm). `readlines()` keeps each line's trailing newline.
file_path = "../data/enja/dev.ja"
with open(file_path, "r", encoding="utf-8") as f:
    sentences_ja = f.readlines()

file_path = "../data/enja/dev.en"
with open(file_path, "r", encoding="utf-8") as f:
    sentences_en = f.readlines()

In [10]:
# ja-en
# Fine-tuned checkpoint that is compared against `pre_model`.
ft_path = "../ja-en/bart/checkpoints_"
ft_name = "checkpoint_best.pt"
ft_model = load_bart(
    path=ft_path, model_name=ft_name
).to(DEVICE)

# Run 1: first 100 lines; both models receive the same `sentences_ja` slice, but the
# reference model is paired with `koja_d` and the fine-tuned model with `enja_d`.
# perf_counter() is the clock intended for measuring elapsed intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=koja_d, ft_d=enja_d,
    pre_sents=sentences_ja[:100], ft_sents=sentences_ja[:100],
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

# Run 2: 100 randomly drawn lines, seeded so the draw is repeatable.
# Both arguments are the same list, so the two returned lists are identical; keep only
# the first instead of unpacking both results into the same name.
print('\nUse random sampled sentences')
random.seed(1)
sampled_sents_ja = random_sampling(
    sentences_ja, sentences_ja, 100
)[0]

start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=koja_d, ft_d=enja_d,
    pre_sents=sampled_sents_ja, ft_sents=sampled_sents_ja,
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.9855443313644157
Layer 1 0.9842778298120605
Layer 2 0.9689511473773461
Layer 3 0.9488710523249988
Layer 4 0.9355105208633925
Layer 5 0.9414986054875325
Layer 6 0.9442956888864806
Layer 7 0.9594235106182454

Decoder CKA
Layer 0 0.22672378119550454
Layer 1 0.3058987004779762
Layer 2 0.3696682207217582
Layer 3 0.40461809484243816
Layer 4 0.47326659327561416
Layer 5 0.35687222645448163
Layer 6 0.0452675229508782

Decoder up to self attention CKA
Layer 0 0.2742708872149076
Layer 1 0.3323725391104788
Layer 2 0.39381109783899926
Layer 3 0.4624094692957962
Layer 4 0.42897521509000486
Layer 5 0.2590982724671253
elapsed_time:93.67176389694214[sec]

Use random sampled sentences
(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.989563281497098
Layer 1 0.974816472648417
Layer 2 0.9528309253401753
Layer 3 0.9221236680843773
Layer 4 0.905613568223484
Layer 5 0.91970688

In [11]:
# en-ja
# Fine-tuned checkpoint that is compared against `pre_model`.
ft_path = "../en-ja/bart/checkpoints_"
ft_name = "checkpoint_best.pt"
ft_model = load_bart(
    path=ft_path, model_name=ft_name
).to(DEVICE)

# Run 1: first 100 lines; the reference model gets `sentences_ja` (with `koja_d`), the
# fine-tuned model gets the same positions of `sentences_en` (with `enja_d`).
# perf_counter() is the clock intended for measuring elapsed intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=koja_d, ft_d=enja_d,
    pre_sents=sentences_ja[:100], ft_sents=sentences_en[:100],
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

# Run 2: 100 randomly drawn pairs, seeded so the draw is repeatable. `random_sampling`
# samples zipped pairs, so the two lists stay position-aligned.
print('\nUse random sampled sentences')
random.seed(1)
sampled_sents_ja, sampled_sents_en = random_sampling(
    sentences_ja, sentences_en, 100
)

start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=koja_d, ft_d=enja_d,
    pre_sents=sampled_sents_ja, ft_sents=sampled_sents_en,
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.755642804883526
Layer 1 0.7259107216948855
Layer 2 0.7298314655374136
Layer 3 0.7247183757701026
Layer 4 0.7424009289116854
Layer 5 0.7604993429803047
Layer 6 0.748688414409073
Layer 7 0.7792510965659427

Decoder CKA
Layer 0 0.25251256084564844
Layer 1 0.283276554135944
Layer 2 0.3501964215181492
Layer 3 0.3955378594849012
Layer 4 0.43331348061618863
Layer 5 0.3106614850973206
Layer 6 0.05336002335811889

Decoder up to self attention CKA
Layer 0 0.2955529599390394
Layer 1 0.3063162122481898
Layer 2 0.38072998388793894
Layer 3 0.4341193086095668
Layer 4 0.39252758036673513
Layer 5 0.21263450161342567
elapsed_time:87.59473395347595[sec]

Use random sampled sentences
(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.43952244795753864
Layer 1 0.45148635766727835
Layer 2 0.45407653326032255
Layer 3 0.456910254492851
Layer 4 0.44128720776525304
Layer 5 0.43211

# English BART

In [12]:
# pre-trained EnBART
# NOTE: rebinds `pre_model`, which an earlier cell bound to the JaBART checkpoint; the
# comparison cells below this point therefore use this checkpoint as the reference model.
enbart_enja_path = "../pretrained_bart/trim/enbart_jaen"
enbart_enja_name = "jaen_en_bart_base.pt"
# The loaded object is moved to DEVICE before being stored.
pre_model = load_bart(
    path=enbart_enja_path, model_name=enbart_enja_name
).to(DEVICE)

## Japanese/English

In [13]:
# dict
# NOTE: rebinds `enja_d`, which an earlier cell loaded from the jabart_jaen directory;
# from here on it refers to the dictionary stored next to the EnBART checkpoint.
enja_d = load_dict("../pretrained_bart/trim/enbart_jaen/dict.txt")

# sentences
# NOTE: `sentences_ja` and `sentences_en` are rebound as well (previously read from
# ../data/enja/).
# Read with an explicit encoding instead of the platform default, so the result does not
# depend on the locale of the machine running the notebook (assumes the corpora are
# UTF-8 -- confirm). `readlines()` keeps each line's trailing newline.
file_path = "../data/enja_2/enBART/dev.ja"
with open(file_path, "r", encoding="utf-8") as f:
    sentences_ja = f.readlines()

file_path = "../data/enja_2/enBART/dev.en"
with open(file_path, "r", encoding="utf-8") as f:
    sentences_en = f.readlines()

In [14]:
# ja-en
# Fine-tuned checkpoint that is compared against `pre_model`.
ft_path = "../ja-en/enbart/v2/checkpoints"
ft_name = "checkpoint_best.pt"
ft_model = load_bart(
    path=ft_path, model_name=ft_name
).to(DEVICE)

# Run 1: first 100 lines; the reference model gets `sentences_en`, the fine-tuned model
# gets the same positions of `sentences_ja`.
# perf_counter() is the clock intended for measuring elapsed intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=enja_d, ft_d=enja_d,
    pre_sents=sentences_en[:100], ft_sents=sentences_ja[:100],
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

# Run 2: 100 randomly drawn pairs, seeded so the draw is repeatable. `random_sampling`
# samples zipped pairs, so the two lists stay position-aligned.
print('\nUse random sampled sentences')
random.seed(1)
sampled_sents_ja, sampled_sents_en = random_sampling(
    sentences_ja, sentences_en, 100
)

start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=enja_d, ft_d=enja_d,
    pre_sents=sampled_sents_en, ft_sents=sampled_sents_ja,
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.654298177716511
Layer 1 0.7688231598694694
Layer 2 0.7613042981440016
Layer 3 0.7550312213702418
Layer 4 0.759050423254224
Layer 5 0.7555643831752289
Layer 6 0.7405338358666992
Layer 7 0.7405338358666992

Decoder CKA
Layer 0 0.11193837400885838
Layer 1 0.1367184881352814
Layer 2 0.1873775117338924
Layer 3 0.2853384773446727
Layer 4 0.20546009958102718
Layer 5 0.13299156414864938
Layer 6 0.449261108132735

Decoder up to self attention CKA
Layer 0 0.11247151019520017
Layer 1 0.21111405889740117
Layer 2 0.3054620242484335
Layer 3 0.3228863073744893
Layer 4 0.2926185589791821
Layer 5 0.1741042261593089
elapsed_time:89.07417297363281[sec]

Use random sampled sentences
(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.44308230789884906
Layer 1 0.5502702209143098
Layer 2 0.5519916281037587
Layer 3 0.5563345593379978
Layer 4 0.5585082009441193
Layer 5 0.55471259

In [15]:
# en-ja
# Fine-tuned checkpoint that is compared against `pre_model`.
ft_path = "../en-ja/enbart/checkpoints"
ft_name = "checkpoint_best.pt"
ft_model = load_bart(
    path=ft_path, model_name=ft_name
).to(DEVICE)

# Run 1: first 100 lines; both models receive the same `sentences_en` slice.
# perf_counter() is the clock intended for measuring elapsed intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=enja_d, ft_d=enja_d,
    pre_sents=sentences_en[:100], ft_sents=sentences_en[:100],
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

# Run 2: 100 randomly drawn lines, seeded so the draw is repeatable.
# Both arguments are the same list, so the two returned lists are identical; keep only
# the first instead of unpacking both results into the same name.
print('\nUse random sampled sentences')
random.seed(1)
sampled_sents_en = random_sampling(
    sentences_en, sentences_en, 100
)[0]

start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=enja_d, ft_d=enja_d,
    pre_sents=sampled_sents_en, ft_sents=sampled_sents_en,
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.9965538477834348
Layer 1 0.975280185956344
Layer 2 0.9437826740859443
Layer 3 0.9362823855683542
Layer 4 0.9306819536456685
Layer 5 0.9308850844551042
Layer 6 0.9296681597115906
Layer 7 0.9296681597115906

Decoder CKA
Layer 0 0.11343461072372857
Layer 1 0.15391131681694342
Layer 2 0.1982631370654231
Layer 3 0.23597873938762637
Layer 4 0.279553364208512
Layer 5 0.30365452656952513
Layer 6 0.6037295704306445

Decoder up to self attention CKA
Layer 0 0.12466590918386924
Layer 1 0.16705264895038846
Layer 2 0.19181686344986004
Layer 3 0.2722902299273229
Layer 4 0.2996745973027566
Layer 5 0.3568976167906651
elapsed_time:143.360004901886[sec]

Use random sampled sentences
(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.9947663859591692
Layer 1 0.9690050812960682
Layer 2 0.9267659871455628
Layer 3 0.9259282590568918
Layer 4 0.9225745072622071
Layer 5 0.9233199

## French/English

In [16]:
# dict
# Dictionary for the en/fr fine-tuned models; passed as `ft_d` below.
enfr_d = load_dict("../pretrained_bart/trim/enbart_fren/dict.txt")

# sentences
# NOTE: this rebinds `sentences_en`, which an earlier cell loaded from
# ../data/enja_2/enBART/dev.en; re-running earlier comparison cells after this one would
# silently use this file.
# Read with an explicit encoding instead of the platform default, so the result does not
# depend on the locale of the machine running the notebook (assumes the corpora are
# UTF-8 -- confirm). `readlines()` keeps each line's trailing newline.
file_path = "../data/enfr/random/dev.fr"
with open(file_path, "r", encoding="utf-8") as f:
    sentences_fr = f.readlines()

file_path = "../data/enfr/random/dev.en"
with open(file_path, "r", encoding="utf-8") as f:
    sentences_en = f.readlines()

In [17]:
# fr-en
# Fine-tuned checkpoint that is compared against `pre_model`.
ft_path = "../fr-en/bart/1M/v2/checkpoints"
ft_name = "checkpoint_best.pt"
ft_model = load_bart(
    path=ft_path, model_name=ft_name
).to(DEVICE)

# Run 1: first 100 lines; the reference model gets `sentences_en` (with `enja_d`), the
# fine-tuned model gets the same positions of `sentences_fr` (with `enfr_d`).
# perf_counter() is the clock intended for measuring elapsed intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=enja_d, ft_d=enfr_d,
    pre_sents=sentences_en[:100], ft_sents=sentences_fr[:100],
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")


# Run 2: 100 randomly drawn pairs, seeded so the draw is repeatable. `random_sampling`
# samples zipped pairs, so the two lists stay position-aligned.
print('\nUse random sampled sentences')
random.seed(1)
sampled_sents_en, sampled_sents_fr = random_sampling(
    sentences_en, sentences_fr, 100
)

start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=enja_d, ft_d=enfr_d,
    pre_sents=sampled_sents_en, ft_sents=sampled_sents_fr,
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.7471251485767328
Layer 1 0.7778515655411314
Layer 2 0.7695343506641981
Layer 3 0.7402289693809095
Layer 4 0.7444042194525834
Layer 5 0.7603124588891431
Layer 6 0.7451640155456389
Layer 7 0.7451640155456389

Decoder CKA
Layer 0 0.0865684785174013
Layer 1 0.08488433385486086
Layer 2 0.1574676924823072
Layer 3 0.23263037164746725
Layer 4 0.277752855277543
Layer 5 0.2747906411352265
Layer 6 0.38647164713488663

Decoder up to self attention CKA
Layer 0 0.08890460576881483
Layer 1 0.15645316283939398
Layer 2 0.24675014853677568
Layer 3 0.31262805062130006
Layer 4 0.3556898458948217
Layer 5 0.28861438428914443
elapsed_time:93.26017045974731[sec]

Use random sampled sentences
(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.7574330674767051
Layer 1 0.801269538625719
Layer 2 0.7998796712120713
Layer 3 0.7890301957637343
Layer 4 0.7976716919757547
Layer 5 0.80152

In [18]:
# en-fr
# Fine-tuned checkpoint that is compared against `pre_model`.
ft_path = "../en-fr/bart/1M/checkpoints"
ft_name = "checkpoint_best.pt"
ft_model = load_bart(
    path=ft_path, model_name=ft_name
).to(DEVICE)

# Run 1: first 100 lines; both models receive the same `sentences_en` slice, but the
# reference model is paired with `enja_d` and the fine-tuned model with `enfr_d`.
# perf_counter() is the clock intended for measuring elapsed intervals; time.time() is
# wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=enja_d, ft_d=enfr_d,
    pre_sents=sentences_en[:100], ft_sents=sentences_en[:100],
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

# Run 2: 100 randomly drawn lines, seeded so the draw is repeatable.
# Both arguments are the same list, so the two returned lists are identical; keep only
# the first instead of unpacking both results into the same name.
print('\nUse random sampled sentences')
random.seed(1)
sampled_sents_en = random_sampling(
    sentences_en, sentences_en, 100
)[0]

start = time.perf_counter()
encdec_cka_sim(
    pre=pre_model, ft=ft_model,
    pre_d=enja_d, ft_d=enfr_d,
    pre_sents=sampled_sents_en, ft_sents=sampled_sents_en,
    batch_size=16
)
end = time.perf_counter()
print(f"elapsed_time:{end-start}[sec]")

(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.9860679393196332
Layer 1 0.8838703663785191
Layer 2 0.8528427986416215
Layer 3 0.8147620972026153
Layer 4 0.7997088820301695
Layer 5 0.798447798704826
Layer 6 0.7600422429275261
Layer 7 0.7600422429275261

Decoder CKA
Layer 0 0.08753203404535702
Layer 1 0.0921635522476793
Layer 2 0.1522007049912804
Layer 3 0.20841556070939177
Layer 4 0.24060344570743167
Layer 5 0.2382468474216198
Layer 6 0.2892679693610796

Decoder up to self attention CKA
Layer 0 0.09026119869653651
Layer 1 0.15120692696250757
Layer 2 0.2189137682652893
Layer 3 0.26889816993114063
Layer 4 0.2831166777517662
Layer 5 0.2377757804361311
elapsed_time:101.05390071868896[sec]

Use random sampled sentences
(8, 100, 768) (7, 100, 768) (6, 100, 768)
(8, 100, 768) (7, 100, 768)
Encoder CKA
Layer 0 0.9771762965273146
Layer 1 0.9073712394139727
Layer 2 0.8965725297233069
Layer 3 0.8815354150381705
Layer 4 0.8775547720352443
Layer 5 0.87244