In [None]:
# libraries from finetuning_parameters.py
from finetuning_parameters import get_args
from future.baseline_trainer import BaselineTuner
from future.modules import ptl2classes
from future.hooks import EvaluationRecorder

from data_loader.wrap_sampler import wrap_sampler
import data_loader.task_configs as task_configs
import data_loader.data_configs as data_configs
from future.collocate_fns import task2collocate_fn

import utils.checkpoint as checkpoint
import utils.logging as logging

import torch
import random
import os

# libraries from future/base.py
from torch.utils.data import SequentialSampler, RandomSampler
from future.hooks import EvaluationRecorder
import utils.eval_meters as eval_meters
from seqeval.metrics import f1_score as f1_score_tagging
import torch

# libraries from future/baseline_trainer.py
import torch
import torch.nn as nn
import numpy as np
from copy import deepcopy
from future.base import BaseTrainer
from future.hooks.base_hook import HookContainer
from future.hooks import EvaluationRecorder
from torch.utils.data import RandomSampler
from collections import defaultdict, Counter
from tqdm import tqdm

# and so on..
from finetuning_baseline import init_config, init_task, init_hooks

# Enumerate GPUs by PCI bus id and expose only device "1" to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

In [None]:
# define task and model (same in finetuning_parameters.py)

In [None]:
# Build the project's argument parser and parse an explicit empty argument
# list, so `conf` starts from the parser's defaults.
# NOTE(review): presumably get_args() returns an argparse.ArgumentParser, in
# which case args=[] also keeps the kernel's own sys.argv out — TODO confirm.
parser = get_args()
conf = parser.parse_args(args=[])

In [None]:
# Comma-separated evaluation languages for each dataset this notebook knows.
EVAL_LANGUAGES_BY_DATASET = {
    'pawsx': 'english,german,chinese,french,japanese,korean,spanish',
    'xnli': 'english,arabic,bulgarian,chinese,french,german,greek,hindi,russian,spanish,swahili,thai,turkish,urdu,vietnamese',
}

conf.dataset_name = 'xnli'
conf.trn_languages = 'english'
# Datasets missing from the table keep whatever eval_languages conf already has.
if conf.dataset_name in EVAL_LANGUAGES_BY_DATASET:
    conf.eval_languages = EVAL_LANGUAGES_BY_DATASET[conf.dataset_name]

# Fine-tuning / evaluation settings overriding the parser defaults.
conf.finetune_epochs = 10
conf.finetune_batch_size = 256
conf.eval_every_batch = 50
conf.override = False
conf.train_fast = False
conf.world = '0'
conf.finetune_lr = 1e-5

In [None]:
# Run the project's config initialisation on `conf`, then build the model,
# tokenizer, datasets, metric name and batch collate function for the task
# (both helpers are imported from finetuning_baseline).
init_config(conf)
model, tokenizer, data_iter, metric_name, collocate_batch_fn = init_task(conf)

# Display data_iter; the next cell iterates it as language -> dataset pairs.
data_iter

In [None]:
# Wrap every language's dataset with the sampler helper, keyed by language.
# NOTE: each dataset object is handed to wrap_sampler as-is (by reference).
adapt_loaders = {
    language: wrap_sampler(
        trn_batch_size=conf.finetune_batch_size,
        infer_batch_size=conf.inference_batch_size,
        language=language,
        language_dataset=language_dataset,
    )
    for language, language_dataset in data_iter.items()
}
hooks = init_hooks(conf, metric_name)

In [None]:
# Display the per-language loaders built in the previous cell.
adapt_loaders

In [None]:
# Instantiate the baseline trainer with the task's collate function and the
# logger stored on `conf`.
trainer = BaselineTuner(
        conf, collocate_batch_fn=collocate_batch_fn, logger=conf.logger
    )
# Display the evaluation languages as the trainer sees them; the cells below
# iterate over this value.
trainer.conf.eval_languages

In [None]:
import pickle

def save_pickle(file, data):
    """Pickle `data` into the file at path `file`, replacing any existing content."""
    with open(file, mode='wb') as sink:
        pickle.dump(data, sink)
        
def load_pickle(file):
    """Return the object unpickled from the file at path `file`.

    Unpickling can execute arbitrary code, so only use this on files this
    notebook (or another trusted source) wrote.
    """
    with open(file, mode='rb') as source:
        payload = pickle.load(source)
    return payload

In [None]:
# Fine-tuned baseline checkpoint to analyse, per dataset.
CHECKPOINT_PATHS = {
    'xnli': './checkpoint_baseline/xnli/debug/1646210965_model_task-xnli_flr-3.0E-05_ftbs-32_ftepcs-10_sd-3_trnfast-False_evalevery-2000_tlang-en_vlang-en-ar-bg-zh-fr-de-el-hi-ru-es-sw-th-tr-ur-vi/state_dicts/best_state.pt',
    'pawsx': './checkpoint_baseline/pawsx/debug/1646200917_model_task-pawsx_flr-1.0E-05_ftbs-32_ftepcs-10_sd-3_trnfast-False_evalevery-200_tlang-en_vlang-en-de-zh-fr-ja-ko-es/state_dicts/best_state.pt',
}

# Fail with a clear message instead of a NameError on an undefined PATH.
if conf.dataset_name not in CHECKPOINT_PATHS:
    raise ValueError(
        'No baseline checkpoint registered for dataset {!r}'.format(conf.dataset_name)
    )
PATH = CHECKPOINT_PATHS[conf.dataset_name]

# Deserialize onto the CPU first: this notebook restricts CUDA_VISIBLE_DEVICES,
# so the device index stored in the checkpoint may not exist in this process.
# load_state_dict then copies the values into the model's own parameters.
model.load_state_dict(torch.load(PATH, map_location='cpu')['best_state_dict'], strict=True)

In [None]:
# Hand the model to the trainer and switch it to evaluation mode. The optimizer
# returned alongside it is not used by this analysis.
opt, model = trainer._init_model_opt(model)
trainer.model = model
trainer.model.eval()

# Create the output directory once instead of on every language.
os.makedirs('./stats/{}'.format(conf.dataset_name), exist_ok=True)

for language in trainer.conf.eval_languages: # trn_languages
    print ("Start language-{}".format(language))
    egs = adapt_loaders[language].val_egs # trn_egs

    # Collect one array per batch and concatenate once at the end; growing an
    # array with np.concatenate inside the loop re-copies all previous rows on
    # every batch. The hidden size is taken from the model output rather than
    # being fixed at 768.
    label_chunks = []
    feature_chunks = []
    with torch.no_grad():
        for batched in egs:
            batched, golds, uids, _golds_tagging = trainer.collocate_batch_fn(
                batched
            )
            hidden = trainer.model.get_last_hidden(**batched)
            label_chunks.append(golds.cpu().numpy())
            feature_chunks.append(hidden.cpu().numpy())

    if not feature_chunks:
        raise ValueError("No batches available for language-{}".format(language))

    labels = np.concatenate(label_chunks)
    # float64 keeps the mean / covariance precision of the previous version,
    # which accumulated into a float64 buffer.
    features = np.concatenate(feature_chunks, axis=0).astype(np.float64)

    # Per-class mean vector and covariance matrix of the last hidden features.
    output_dict = {}
    for i in np.unique(labels):
        feature = features[labels == i]
        mean = np.mean(feature, axis=0)
        cov = np.cov(feature.T)
        output_dict[str(int(i))] = [{"mean": mean, "cov": cov}]
    save_pickle('./stats/{}/{}.pkl'.format(conf.dataset_name, language), output_dict)

# Class - Class

In [None]:
from numpy import dot
from numpy.linalg import norm

def cov_sim(r1, r2):
    """Return matmul(r1, r2) divided by the product of the two norms.

    For 1-D inputs this is the cosine similarity of the two vectors.
    """
    scale = norm(r1) * norm(r2)
    similarity = np.matmul(r1, r2)
    similarity /= scale
    return similarity

# Compare every language's per-class mean / covariance with the English ones.
base_infos = load_pickle('./stats/{}/english.pkl'.format(conf.dataset_name))
class_ids = list(base_infos.keys())

# Flatten the reference covariance once per class rather than once per language.
base_stats = {
    i: (base_infos[i][0]['mean'], base_infos[i][0]['cov'].flatten())
    for i in class_ids
}

# Read each language file exactly once (it used to be re-read for every
# class) and keep only the resulting scalars, grouped by class for printing.
similarities = {i: [] for i in class_ids}
for language in trainer.conf.eval_languages:
    infos = load_pickle('./stats/{}/{}.pkl'.format(conf.dataset_name, language))
    for i in class_ids:
        base_mean, base_cov = base_stats[i]
        mean = infos[i][0]['mean']
        cov = infos[i][0]['cov'].flatten()

        # Cosine similarity of the class means and of the flattened covariances.
        cos_sim = dot(base_mean, mean) / (norm(base_mean) * norm(mean))
        cos_sim2 = dot(base_cov, cov) / (norm(base_cov) * norm(cov))
        similarities[i].append((language, cos_sim, cos_sim2))

# Same report layout as before: one section per class, one line per language.
for i in class_ids:
    print ('Start {}'.format(i))
    for language, cos_sim, cos_sim2 in similarities[i]:
        print (language, round(cos_sim, 4), round(cos_sim2, 4))

    print ('')

In [None]:
"""
Before Source Training
"""

"""
PAWSX
"""
# Start 0
# english 1.0 1.0
# german 0.9038 0.7792
# chinese 0.8951 0.8045
# french 0.9118 0.8259
# japanese 0.9123 0.806
# korean 0.9032 0.8112
# spanish 0.9129 0.8227

# Start 1
# english 1.0 1.0
# german 0.9124 0.7765
# chinese 0.9041 0.7927
# french 0.9156 0.8166
# japanese 0.918 0.8038
# korean 0.9105 0.8002
# spanish 0.9207 0.8039

"""
XNLI
"""
# Start 0
# english 1.0 1.0
# arabic 0.8107 0.6518
# bulgarian 0.8379 0.539
# chinese 0.8258 0.451
# french 0.9006 0.5543
# german 0.914 0.4803
# greek 0.8511 0.5334
# hindi 0.6673 0.587
# russian 0.8461 0.525
# spanish 0.8916 0.5695
# swahili 0.9277 0.6539
# thai 0.8903 0.5036
# turkish 0.9181 0.6889
# urdu 0.7186 0.6183
# vietnamese 0.8612 0.4259

# Start 1
# english 1.0 1.0
# arabic 0.8366 0.5458
# bulgarian 0.8644 0.5385
# chinese 0.8403 0.5122
# french 0.9161 0.5704
# german 0.9278 0.4944
# greek 0.8702 0.4617
# hindi 0.6983 0.6012
# russian 0.8693 0.492
# spanish 0.9085 0.5575
# swahili 0.9388 0.6205
# thai 0.8821 0.5699
# turkish 0.9298 0.6774
# urdu 0.7511 0.6517
# vietnamese 0.8769 0.5679

In [None]:
"""
After Source Training
"""

"""
PAWSX
"""
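# NOTE(review): every PAWSX value listed here equals the one in the "Before Source Training" cell (apart from the final "0.803" vs "0.8039"), which suggests these lines were copy-pasted rather than re-measured — TODO confirm by re-running with the fine-tuned checkpoint.
# Start 0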
# english 1.0 1.0
# german 0.9038 0.7792
# chinese 0.8951 0.8045
# french 0.9118 0.8259
# japanese 0.9123 0.806
# korean 0.9032 0.8112
# spanish 0.9129 0.8227

# Start 1
# english 1.0 1.0
# german 0.9124 0.7765
# chinese 0.9041 0.7927
# french 0.9156 0.8166
# japanese 0.918 0.8038
# korean 0.9105 0.8002
# spanish 0.9207 0.803

"""
XNLI
"""
# Start 0
# english 1.0 1.0
# arabic 0.115 0.3549
# bulgarian 0.1077 0.3603
# chinese 0.153 0.4234
# french 0.3996 0.6083
# german 0.3272 0.5233
# greek 0.1087 0.4416
# hindi 0.0739 0.3432
# russian 0.1007 0.3529
# spanish 0.5172 0.5821
# swahili 0.5264 0.5719
# thai 0.3589 0.5408
# turkish 0.4851 0.5557
# urdu 0.1253 0.3192
# vietnamese 0.4155 0.5573

# Start 1
# english 1.0 1.0
# arabic 0.8239 0.7541
# bulgarian 0.8507 0.8166
# chinese 0.847 0.7968
# french 0.7637 0.7376
# german 0.7562 0.71
# greek 0.8571 0.803
# hindi 0.8533 0.7946
# russian 0.8499 0.8058
# spanish 0.6995 0.7119
# swahili 0.5938 0.5803
# thai 0.6957 0.7109
# turkish 0.6058 0.548
# urdu 0.823 0.7118
# vietnamese 0.6657 0.6466

# Relation - Relation

In [None]:
from numpy import dot
from numpy.linalg import norm

def cov_sim(r1, r2):
    """Return matmul(r1, r2) divided by the product of the two norms.

    For 1-D inputs this is the cosine similarity of the two vectors.
    NOTE(review): same definition as the cov_sim in the "Class - Class" cell.
    """
    scale = norm(r1) * norm(r2)
    similarity = np.matmul(r1, r2)
    similarity /= scale
    return similarity

def _relation_stats(infos, i, j):
    """Return (mean_i - mean_j, average of the flattened covariances of i and j)."""
    mean = infos[i][0]['mean'] - infos[j][0]['mean']
    cov = (infos[i][0]['cov'].flatten() + infos[j][0]['cov'].flatten()) / 2
    return mean, cov


# The English statistics are the reference; load them once (they used to be
# re-read inside the pair loop).
base_infos = load_pickle('./stats/{}/english.pkl'.format(conf.dataset_name))
class_ids = list(base_infos.keys())
# Every unordered class pair once, using the same `i < j` key comparison as before.
class_pairs = [(i, j) for i in class_ids for j in class_ids if i < j]
base_stats = {pair: _relation_stats(base_infos, *pair) for pair in class_pairs}

# Read each language file exactly once and keep only the resulting scalars,
# grouped by class pair for printing.
similarities = {pair: [] for pair in class_pairs}
for language in trainer.conf.eval_languages:
    infos = load_pickle('./stats/{}/{}.pkl'.format(conf.dataset_name, language))
    for pair in class_pairs:
        base_mean, base_cov = base_stats[pair]
        mean, cov = _relation_stats(infos, *pair)

        # Cosine similarity of the mean differences and of the averaged covariances.
        cos_sim = dot(base_mean, mean) / (norm(base_mean) * norm(mean))
        cos_sim2 = dot(base_cov, cov) / (norm(base_cov) * norm(cov))
        similarities[pair].append((language, cos_sim, cos_sim2))

# Same report layout as before: one section per pair, one line per language.
for i, j in class_pairs:
    print ('Start {}-{}'.format(i, j))
    for language, cos_sim, cos_sim2 in similarities[(i, j)]:
        print (language, round(cos_sim, 4), round(cos_sim2, 4))

    print ('')

# PCA & t-SNE (of the class-mean vectors)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Read each language's statistics once and keep only the class means. The
# previous version built a language-major list that was immediately discarded,
# then re-read every file once per class.
means_by_language = {}
for lang in conf.eval_languages:
    infos = load_pickle('./stats/{}/{}.pkl'.format(conf.dataset_name, lang))
    means_by_language[lang] = {k: infos[k][0]['mean'] for k in infos.keys()}

# As before, the class ids come from the last language that was loaded.
class_keys = list(infos.keys())

# Class-major order: all languages for the first class, then the next class, ...
# The "For paws" cells further down rely on this ordering.
mean_vectors = []
labels = []
for k in class_keys:
    for lang in conf.eval_languages:
        mean_vectors.append(means_by_language[lang][k])
        labels.append('{}({})'.format(lang, k))
mean_vectors = np.array(mean_vectors)

In [None]:
# Pairwise Euclidean distance and cosine similarity between all mean vectors.
num_vectors = len(mean_vectors)
euc_matrix = np.zeros([num_vectors, num_vectors])
cos_matrix = np.zeros([num_vectors, num_vectors])

for row in range(euc_matrix.shape[0]):
    vec_a = mean_vectors[row]
    for col in range(euc_matrix.shape[1]):
        vec_b = mean_vectors[col]
        euc_matrix[row,col] = np.linalg.norm(vec_a-vec_b)
        cos_matrix[row,col] = np.sum(vec_a*vec_b) / (np.linalg.norm(vec_a)*np.linalg.norm(vec_b))

# Draw both matrices as heatmaps with identical axes; ticks sit at cell centres.
for matrix, title in (
    (euc_matrix, "Euclidean distance matrix btw mean vectors"),
    (cos_matrix, "Cosine similarity matrix btw mean vectors"),
):
    plt.figure(figsize=(12,8))
    plt.pcolor(matrix)
    plt.xticks(np.arange(0.5, euc_matrix.shape[0], 1), labels, rotation=90)
    plt.yticks(np.arange(0.5, euc_matrix.shape[1], 1), labels)
    plt.colorbar()
    plt.title(title)
    plt.show()
    plt.close()

In [None]:
# Scatter colour per language; a language matches when its name occurs in the label.
_LANGUAGE_COLORS = {
    'english': 'black',
    'german': 'darkgray',
    'chinese': 'rosybrown',
    'french': 'lightcoral',
    'japanese': 'maroon',
    'korean': 'red',
    'spanish': 'chocolate',
    'arabic': 'sandybrown',
    'bulgarian': 'darkorange',
    'greek': 'goldenrod',
    'hindi': 'gold',
    'russian': 'olive',
    'swahili': 'darkolivegreen',
    'thai': 'lightseagreen',
    'turkish': 'blue',
    'urdu': 'fuchsia',
    'vietnamese': 'deeppink',
}
# Colour for labels that name none of the languages above.
_DEFAULT_COLOR = 'gray'
# (class digit, marker) pairs, checked in order; the first digit found in the label wins.
_CLASS_MARKERS = (('0', 'x'), ('1', 'o'), ('2', '*'))
# Marker for labels that contain none of the class digits.
_DEFAULT_MARKER = 'o'


def plot_x_y(x, y, labels):
    """Scatter-plot 2-D points, coloured by language and marked by class.

    Args:
        x: x coordinates, one per point.
        y: y coordinates, one per point.
        labels: one legend label per point, e.g. 'english(0)'. The language
            name picks the colour and the class digit picks the marker.

    Labels naming no known language are drawn in a neutral default colour.
    Previously such a label either raised an unbound-variable error or silently
    reused the colour of the preceding point.
    """
    for x_, y_, label in zip(x, y, labels):
        color = next(
            (shade for name, shade in _LANGUAGE_COLORS.items() if name in label),
            _DEFAULT_COLOR,
        )
        marker = next(
            (symbol for digit, symbol in _CLASS_MARKERS if digit in label),
            _DEFAULT_MARKER,
        )
        plt.scatter(x_, y_, color=color, marker=marker, label=label)
    plt.legend(bbox_to_anchor=(1, 1))
    plt.show()
    plt.close()

In [None]:
from sklearn.decomposition import PCA

# Project the class-mean vectors onto their first two principal components
# and plot them.
pca = PCA(n_components=2)
projected_values = pca.fit_transform(mean_vectors)
x, y = projected_values.T
plot_x_y(x, y, labels)

In [None]:
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits

# Reduce to two dimensions.
n_components = 2

# Create the t-SNE model.
# - Bound to `tsne`, not `model`: `model` is the network loaded earlier, and
#   rebinding it breaks any re-run of the feature-extraction cells.
# - perplexity must stay below the number of samples; 30 is sklearn's default.
# - random_state makes the embedding reproducible across runs.
tsne = TSNE(
    n_components=n_components,
    init='pca',
    learning_rate='auto',
    perplexity=min(30.0, len(mean_vectors) - 1),
    random_state=0,
)

# Fit the model and plot the resulting 2-D coordinates.
projected_values = tsne.fit_transform(mean_vectors)
x, y = projected_values[:,0], projected_values[:,1]
plot_x_y(x, y, labels)

# For PAWS-X (pawsx)

In [None]:
# mean_vectors / labels are class-major: all class-0 rows, then all class-1
# rows. Derive the number of languages from the data instead of assuming 7.
num_languages = len(mean_vectors) // 2
if not (
    len(labels) == 2 * num_languages
    and all(l.endswith('(0)') for l in labels[:num_languages])
    and all(l.endswith('(1)') for l in labels[num_languages:])
):
    raise ValueError(
        'class-difference vectors need exactly two classes (0 and 1) per language'
    )

# Per-language difference between the class-0 and class-1 mean vectors.
diff_vectors = mean_vectors[:num_languages,:] - mean_vectors[num_languages:,:]
diff_labels = [l.replace('(0)','') for l in labels[:num_languages]]

In [None]:
from sklearn.decomposition import PCA

# Project the per-language class-difference vectors onto their first two
# principal components and plot them.
pca = PCA(n_components=2)
projected_values = pca.fit_transform(diff_vectors)
x, y = projected_values.T
plot_x_y(x, y, diff_labels)

In [None]:
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits

# Reduce to two dimensions.
n_components = 2

# Create the t-SNE model.
# - Bound to `tsne`, not `model`: `model` is the network loaded earlier, and
#   rebinding it breaks any re-run of the feature-extraction cells.
# - perplexity must stay below the number of samples; there is only one
#   difference vector per language, so sklearn's default of 30 is too large.
# - random_state makes the embedding reproducible across runs.
tsne = TSNE(
    n_components=n_components,
    init='pca',
    learning_rate='auto',
    perplexity=min(30.0, len(diff_vectors) - 1),
    random_state=0,
)

# Fit the model and plot the resulting 2-D coordinates.
projected_values = tsne.fit_transform(diff_vectors)
x, y = projected_values[:,0], projected_values[:,1]
plot_x_y(x, y, diff_labels)