In [1]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import csv
from numpy.linalg import norm


In [2]:
## Transformation-degree weight per sentence, keyed by the id in column 0.
## Columns 4 and 5 hold comma-separated integer ids (read here as the "more
## transformed" and "less transformed" lists); an empty field means no ids.
## weight = (n_less + 1) / (n_more + n_less + 1), rounded to 2 decimals
trans_degree = {}
with open('COSTRA1.1.tsv') as f:
  for line in csv.reader(f, delimiter='\t'):
    more_trans = list(map(int, line[4].split(","))) if line[4] else []
    less_trans = list(map(int, line[5].split(","))) if line[5] else []
    denominator = len(more_trans) + len(less_trans) + 1
    trans_degree[line[0]] = round((len(less_trans) + 1) / denominator, 2)


In [60]:
def data_process(file_name):
  """Read a tab-separated embedding file into a DataFrame and a seed lookup.

  Every row is read as: column 0 -> 'sent_id', column 1 -> 'seed_id',
  column 2 -> 'label', columns 3.. -> the sentence vector (parsed as floats).

  Returns:
    (data_df, seed_dict) where
    data_df   has columns 'sent_id', 'seed_id', 'label', 'vec' (list of
              floats), 'vec_sub_seed' (numpy array: the row's vec minus the
              vector of the seed named by its seed_id) and 'degree' (the
              module-level trans_degree entry for the row's sent_id);
    seed_dict maps seed_id -> vector of the rows whose label is 'seed'.

  Raises:
    KeyError: a seed_id has no 'seed' row in the file, or a sent_id is not
      a key of trans_degree.
  """
  seed_dict = {}
  data_dict = {'sent_id': [], 'seed_id': [], 'label': [], 'vec': []}
  with open(file_name, "r") as handle:
    for fields in csv.reader(handle, delimiter="\t"):
      sent_id, seed_id, label = fields[0], fields[1], fields[2]
      vec = [float(value) for value in fields[3:]]
      data_dict['sent_id'].append(sent_id)
      data_dict['seed_id'].append(seed_id)
      data_dict['label'].append(label)
      data_dict['vec'].append(vec)
      ## store seed (as its own list, so it never aliases a 'vec' cell)
      if label == 'seed':
        seed_dict[seed_id] = list(vec)

  data_df = pd.DataFrame(data_dict)

  def offset_from_seed(row):
    return np.array(row['vec']) - np.array(seed_dict[row['seed_id']])

  def degree_of(row):
    return trans_degree[row['sent_id']]

  ## the seed lookup is only complete once the whole file has been read,
  ## so the derived columns are filled in afterwards
  data_df['vec_sub_seed'] = data_df.apply(offset_from_seed, axis=1)
  data_df['degree'] = data_df.apply(degree_of, axis=1)

  return data_df, seed_dict

In [61]:
## learn one transformation (offset) vector per label
def get_dist_vec(file_name):
  """Return {label: mean 'vec_sub_seed'} computed on each label's training rows.

  For every distinct label in the frame loaded by data_process, the first 80%
  of that label's rows (in file order, row count rounded down) are averaged;
  the remaining rows are left untouched for testing.
  """
  data_df, _ = data_process(file_name)
  label_column = data_df['label']
  dist_vec = {}
  for label in label_column.unique():
    rows = data_df[label_column == label]
    ## 80% used for training
    n_train = int(len(rows) * 0.8)
    dist_vec[label] = rows['vec_sub_seed'].iloc[:n_train].mean()
  return dist_vec

In [68]:
def test_dist_vec(file_name):
  """Cross-evaluate every label's offset vector on every label's test rows.

  The frame and seed lookup come from data_process(file_name) and the
  per-label offset vectors from get_dist_vec(file_name) (which loads the
  file a second time). For each label, the last 20% of its rows (in file
  order) are the test rows. For each (label_sent, label_dist) pair the
  prediction is `seed vector + offset of label_dist`, and it is compared
  with the true 'vec' of the label_sent test rows by cosine similarity.

  Returns:
    A float DataFrame whose rows are the label of the test sentences, whose
    columns are the label of the applied offset, and whose cells are the mean
    cosine similarity (NaN entries skipped, as pandas' mean does).
  """
  data_df, seed_dict = data_process(file_name)
  labels = data_df['label'].unique()
  dist_vec = get_dist_vec(file_name)
  result = pd.DataFrame(index=labels, columns=labels, dtype=float)

  for label_sent in labels:
    deriv_class = data_df[data_df['label'] == label_sent]

    ## 20% used for testing
    test_data = deriv_class.iloc[int(len(deriv_class)*0.8):]

    ## work on plain arrays instead of adding columns to test_data: it is a
    ## slice of data_df, and assigning to it triggers SettingWithCopyWarning
    true_vecs = np.array(test_data['vec'].tolist(), dtype=float)
    seed_vecs = np.array([seed_dict[seed_id] for seed_id in test_data['seed_id']], dtype=float)
    true_norms = norm(true_vecs, axis=1)

    ## computing new transformation sentence, compare with original
    for label_dist in labels:
      predicted = seed_vecs + np.asarray(dist_vec[label_dist], dtype=float)
      ## cosine similarity = v.p / (||v|| * ||p||); the earlier spelling
      ## norm(v * norm(p)) gave the same number only because norm(p) is a
      ## non-negative scalar
      cos_sim = (true_vecs * predicted).sum(axis=1) / (true_norms * norm(predicted, axis=1))
      result.loc[label_sent, label_dist] = pd.Series(cos_sim).mean()
  return result

In [None]:
## cross-label mean cosine-similarity matrix for the 'bert_vec_unsup.tsv' embeddings
result = test_dist_vec('bert_vec_unsup.tsv')

In [75]:
## display the matrix: rows = label of the test sentences, columns = label of the offset vector applied
result

Unnamed: 0,ban,different meaning,formal sentence,future,generalization,minimal change,nonsense,nonstandard sentence,opposite meaning,paraphrase,past,possibility,seed,simple sentence
ban,0.871844,0.793146,0.79122,0.789131,0.790145,0.790982,0.791761,0.793375,0.821592,0.794079,0.782052,0.788494,0.793061,0.797453
different meaning,0.85003,0.923477,0.920784,0.909969,0.920517,0.923534,0.922951,0.922115,0.908312,0.922635,0.915402,0.910619,0.92391,0.921555
formal sentence,0.829289,0.891663,0.891009,0.877364,0.888005,0.891249,0.891098,0.888911,0.879878,0.891635,0.885515,0.879954,0.892277,0.889357
future,0.864927,0.920428,0.91879,0.931696,0.9196,0.920248,0.919967,0.918745,0.909082,0.920158,0.903692,0.918801,0.921514,0.919268
generalization,0.631272,0.691736,0.687955,0.68403,0.696534,0.691582,0.692569,0.692549,0.672946,0.691128,0.687412,0.686614,0.692666,0.692443
minimal change,0.809584,0.87697,0.874378,0.866275,0.874877,0.877275,0.876696,0.875943,0.861692,0.876235,0.869938,0.866585,0.877547,0.875088
nonsense,0.848263,0.92009,0.917355,0.907366,0.917448,0.919957,0.919999,0.918248,0.90467,0.919115,0.912592,0.907935,0.920077,0.917773
nonstandard sentence,0.778134,0.844138,0.837897,0.8285,0.842419,0.84514,0.84411,0.848726,0.827416,0.842634,0.841032,0.834077,0.844624,0.843647
opposite meaning,0.734323,0.747297,0.744186,0.734799,0.743386,0.745999,0.745298,0.745582,0.758575,0.746946,0.742519,0.735868,0.74715,0.748007
paraphrase,0.759156,0.821712,0.819287,0.808685,0.819826,0.821747,0.821824,0.821478,0.806916,0.821267,0.816591,0.813151,0.822327,0.819816


In [None]:
## remove the 'seed' column; errors='ignore' keeps the cell re-runnable
## (running `del result['seed']` a second time raises KeyError)
result = result.drop(columns='seed', errors='ignore')

In [77]:
## remove the 'seed' row; errors='ignore' keeps the cell re-runnable
result = result.drop(index='seed', errors='ignore')

In [78]:
## highlight the largest value of every row (axis = 1) in light green
result.style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,ban,different meaning,formal sentence,future,generalization,minimal change,nonsense,nonstandard sentence,opposite meaning,paraphrase,past,possibility,simple sentence
ban,0.871844,0.793146,0.79122,0.789131,0.790145,0.790982,0.791761,0.793375,0.821592,0.794079,0.782052,0.788494,0.797453
different meaning,0.85003,0.923477,0.920784,0.909969,0.920517,0.923534,0.922951,0.922115,0.908312,0.922635,0.915402,0.910619,0.921555
formal sentence,0.829289,0.891663,0.891009,0.877364,0.888005,0.891249,0.891098,0.888911,0.879878,0.891635,0.885515,0.879954,0.889357
future,0.864927,0.920428,0.91879,0.931696,0.9196,0.920248,0.919967,0.918745,0.909082,0.920158,0.903692,0.918801,0.919268
generalization,0.631272,0.691736,0.687955,0.68403,0.696534,0.691582,0.692569,0.692549,0.672946,0.691128,0.687412,0.686614,0.692443
minimal change,0.809584,0.87697,0.874378,0.866275,0.874877,0.877275,0.876696,0.875943,0.861692,0.876235,0.869938,0.866585,0.875088
nonsense,0.848263,0.92009,0.917355,0.907366,0.917448,0.919957,0.919999,0.918248,0.90467,0.919115,0.912592,0.907935,0.917773
nonstandard sentence,0.778134,0.844138,0.837897,0.8285,0.842419,0.84514,0.84411,0.848726,0.827416,0.842634,0.841032,0.834077,0.843647
opposite meaning,0.734323,0.747297,0.744186,0.734799,0.743386,0.745999,0.745298,0.745582,0.758575,0.746946,0.742519,0.735868,0.748007
paraphrase,0.759156,0.821712,0.819287,0.808685,0.819826,0.821747,0.821824,0.821478,0.806916,0.821267,0.816591,0.813151,0.819816


In [None]:
## cross-label mean cosine-similarity matrix for the 'LaBSE.tsv' embeddings
result2 = test_dist_vec('LaBSE.tsv')

In [80]:
## remove the 'seed' row and column in one step; errors='ignore' keeps the
## cell re-runnable (the del / drop pair raised KeyError on a second run)
result2 = result2.drop(index='seed', columns='seed', errors='ignore')

In [81]:
## highlight the largest value of every row (axis = 1) in light green
result2.style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,ban,different meaning,formal sentence,future,generalization,minimal change,nonsense,nonstandard sentence,opposite meaning,paraphrase,past,possibility,simple sentence
ban,0.908636,0.85627,0.856759,0.852592,0.847226,0.856818,0.852912,0.852621,0.872373,0.85792,0.84132,0.861245,0.856496
different meaning,0.869263,0.913549,0.911573,0.895953,0.90412,0.913539,0.912387,0.908771,0.908171,0.9126,0.902856,0.891687,0.911063
formal sentence,0.840812,0.881631,0.882586,0.863858,0.872444,0.881459,0.879007,0.874732,0.879097,0.881547,0.875011,0.857639,0.879454
future,0.865414,0.894271,0.892775,0.911796,0.884285,0.894816,0.892792,0.891723,0.891624,0.894266,0.877093,0.884111,0.89226
generalization,0.620812,0.653563,0.650912,0.637312,0.662165,0.651367,0.649258,0.640523,0.653384,0.651273,0.647594,0.636617,0.660528
minimal change,0.872378,0.9161,0.914638,0.901427,0.906769,0.916517,0.91481,0.912244,0.911212,0.915564,0.906616,0.892418,0.913911
nonsense,0.859517,0.903997,0.902001,0.886408,0.894779,0.903761,0.904608,0.899198,0.898237,0.902729,0.893675,0.880871,0.901115
nonstandard sentence,0.788439,0.822197,0.820841,0.808717,0.810171,0.823373,0.820388,0.824879,0.81991,0.822322,0.817431,0.804205,0.820484
opposite meaning,0.803231,0.82034,0.819153,0.804107,0.814031,0.819763,0.816806,0.813126,0.826481,0.819616,0.81181,0.800886,0.820728
paraphrase,0.776258,0.814713,0.814577,0.799571,0.804719,0.814224,0.812788,0.808095,0.811504,0.813986,0.808132,0.795192,0.812987


In [None]:
## cross-label mean cosine-similarity matrix for the 'Phrase_Embedding.tsv' embeddings
result3 = test_dist_vec('Phrase_Embedding.tsv')

In [83]:
## remove the 'seed' row and column in one step; errors='ignore' keeps the
## cell re-runnable (the del / drop pair raised KeyError on a second run)
result3 = result3.drop(index='seed', columns='seed', errors='ignore')
## highlight the largest value of every row (axis = 1) in light green
result3.style.highlight_max(color = 'lightgreen', axis = 1)

Unnamed: 0,ban,different meaning,formal sentence,future,generalization,minimal change,nonsense,nonstandard sentence,opposite meaning,paraphrase,past,possibility,simple sentence
ban,0.985117,0.982955,0.982624,0.982101,0.980696,0.983186,0.983111,0.982632,0.98358,0.983006,0.982865,0.981886,0.982179
different meaning,0.984255,0.985877,0.985268,0.985034,0.983727,0.985932,0.985931,0.985197,0.98581,0.985766,0.985591,0.983822,0.985249
formal sentence,0.965474,0.967207,0.967907,0.966841,0.963156,0.967432,0.967267,0.965947,0.967066,0.967534,0.967522,0.964687,0.965403
future,0.97758,0.97935,0.979246,0.980029,0.975887,0.979507,0.979368,0.978468,0.979178,0.979437,0.979377,0.977477,0.978025
generalization,0.922534,0.922989,0.921744,0.921497,0.924773,0.923049,0.923315,0.922815,0.923849,0.922826,0.922001,0.921211,0.924037
minimal change,0.984193,0.986001,0.985614,0.985461,0.983187,0.986114,0.986003,0.985321,0.985865,0.986001,0.985984,0.98401,0.985152
nonsense,0.982275,0.984076,0.983604,0.983308,0.98218,0.984177,0.984205,0.983032,0.984008,0.983861,0.983739,0.982034,0.983444
nonstandard sentence,0.957036,0.957531,0.956998,0.956887,0.954381,0.957749,0.957547,0.958693,0.957864,0.957791,0.95773,0.956962,0.956922
opposite meaning,0.965654,0.966591,0.966459,0.966064,0.964527,0.966775,0.96675,0.965916,0.967047,0.966667,0.9663,0.964948,0.965908
paraphrase,0.948689,0.94975,0.950072,0.949527,0.9458,0.949994,0.949815,0.949429,0.949784,0.950207,0.950106,0.947919,0.948186


# The code below is from an earlier version of this analysis and is no longer used.

In [4]:
def compute_cos(label):
  """Mean cosine similarity between true and predicted vectors for one label.

  Reads the module-level data_df and seed_dict (as returned by data_process).
  The label's rows are split 80/20 in file order: the mean 'vec_sub_seed' of
  the first part is the label's offset vector, and each remaining row's 'vec'
  is compared with `seed vector + offset`.

  Returns:
    The mean cosine similarity over the test rows (NaN entries skipped, as
    pandas' mean does), or NaN when the label has no rows at all.
  """
  deriv_class = data_df[data_df['label'] == label]
  split = int(len(deriv_class)*0.8)

  test = deriv_class.iloc[split:]
  if test.empty:
    ## unknown label: nothing to score
    return float('nan')

  dist_vec = np.asarray(deriv_class.iloc[0:split]['vec_sub_seed'].mean(), dtype=float)

  ## work on plain arrays instead of adding columns to `test`: it is a slice
  ## of data_df, and assigning to it triggers SettingWithCopyWarning
  true_vecs = np.array(test['vec'].tolist(), dtype=float)
  seed_vecs = np.array([seed_dict[seed_id] for seed_id in test['seed_id']], dtype=float)

  ## predict the vec by computing seed + dist
  predicted = seed_vecs + dist_vec

  ## compare with vec(true) and predicted_vec, cosine similarity:
  ## v.p / (||v|| * ||p||); the earlier spelling norm(v * norm(p)) gave the
  ## same number only because norm(p) is a non-negative scalar
  cos_sim = (true_vecs * predicted).sum(axis=1) / (norm(true_vecs, axis=1) * norm(predicted, axis=1))
  return pd.Series(cos_sim).mean()

In [None]:
## mean cosine similarity per label for 'bert_vec_unsup.tsv', highest first
data_df, seed_dict = data_process('bert_vec_unsup.tsv')
labels = data_df['label'].unique()
result = sorted(
  ([label, compute_cos(label)] for label in labels),
  key=lambda row: row[1],
  reverse=True,
)

In [8]:
# import module (tabulate formats a list of rows as a text table)
from tabulate import tabulate

# create header: one title per element of the [label, score] rows in `result`
head = ['label', "cos_similarity"]

# display table: print `result` as a grid-style text table
print(tabulate(result, headers=head, tablefmt="grid"))


+----------------------+------------------+
| label                |   cos_similarity |
| seed                 |         1        |
+----------------------+------------------+
| possibility          |         0.949993 |
+----------------------+------------------+
| past                 |         0.934856 |
+----------------------+------------------+
| future               |         0.931696 |
+----------------------+------------------+
| different meaning    |         0.923477 |
+----------------------+------------------+
| nonsense             |         0.919999 |
+----------------------+------------------+
| formal sentence      |         0.891009 |
+----------------------+------------------+
| minimal change       |         0.877275 |
+----------------------+------------------+
| ban                  |         0.871844 |
+----------------------+------------------+
| nonstandard sentence |         0.848726 |
+----------------------+------------------+
| simple sentence      |        

In [None]:
## mean cosine similarity per label for 'LaBSE.tsv', highest first
data_df, seed_dict = data_process('LaBSE.tsv')
labels = data_df['label'].unique()
result = sorted(
  ([label, compute_cos(label)] for label in labels),
  key=lambda row: row[1],
  reverse=True,
)

In [10]:
# tabulate formats a list of rows as a text table
from tabulate import tabulate
# create header: one title per element of the [label, score] rows in `result`
head = ['label', "cos_similarity"]

# display table: print `result` as a grid-style text table
print(tabulate(result, headers=head, tablefmt="grid"))

+----------------------+------------------+
| label                |   cos_similarity |
| seed                 |         1        |
+----------------------+------------------+
| possibility          |         0.945029 |
+----------------------+------------------+
| minimal change       |         0.916517 |
+----------------------+------------------+
| different meaning    |         0.913549 |
+----------------------+------------------+
| future               |         0.911796 |
+----------------------+------------------+
| past                 |         0.909999 |
+----------------------+------------------+
| ban                  |         0.908636 |
+----------------------+------------------+
| nonsense             |         0.904608 |
+----------------------+------------------+
| formal sentence      |         0.882586 |
+----------------------+------------------+
| opposite meaning     |         0.826481 |
+----------------------+------------------+
| nonstandard sentence |        

In [None]:
## mean cosine similarity per label for 'Phrase_Embedding.tsv', highest first
data_df, seed_dict = data_process('Phrase_Embedding.tsv')
labels = data_df['label'].unique()
result = sorted(
  ([label, compute_cos(label)] for label in labels),
  key=lambda row: row[1],
  reverse=True,
)

In [12]:
# tabulate formats a list of rows as a text table
from tabulate import tabulate
# create header: one title per element of the [label, score] rows in `result`
head = ['label', "cos_similarity"]

# display table: print `result` as a grid-style text table
print(tabulate(result, headers=head, tablefmt="grid"))

+----------------------+------------------+
| label                |   cos_similarity |
| seed                 |         1        |
+----------------------+------------------+
| minimal change       |         0.986114 |
+----------------------+------------------+
| different meaning    |         0.985877 |
+----------------------+------------------+
| ban                  |         0.985117 |
+----------------------+------------------+
| possibility          |         0.984613 |
+----------------------+------------------+
| nonsense             |         0.984205 |
+----------------------+------------------+
| past                 |         0.98225  |
+----------------------+------------------+
| future               |         0.980029 |
+----------------------+------------------+
| formal sentence      |         0.967907 |
+----------------------+------------------+
| opposite meaning     |         0.967047 |
+----------------------+------------------+
| nonstandard sentence |        

In [None]:
## mean cosine similarity per label for 'mixup_by_seed_bert_unsup.tsv', highest first
data_df, seed_dict = data_process('mixup_by_seed_bert_unsup.tsv')
labels = data_df['label'].unique()
result = sorted(
  ([label, compute_cos(label)] for label in labels),
  key=lambda row: row[1],
  reverse=True,
)

In [None]:
# tabulate formats a list of rows as a text table
from tabulate import tabulate
# create header: one title per element of the [label, score] rows in `result`
head = ['label', "cos_similarity"]

# display table: print `result` as a grid-style text table
print(tabulate(result, headers=head, tablefmt="grid"))

+----------------------+------------------+
| label                |   cos_similarity |
| seed                 |         1        |
+----------------------+------------------+
| different meaning    |         0.787777 |
+----------------------+------------------+
| possibility          |         0.777093 |
+----------------------+------------------+
| nonstandard sentence |         0.771835 |
+----------------------+------------------+
| opposite meaning     |         0.771732 |
+----------------------+------------------+
| future               |         0.762693 |
+----------------------+------------------+
| past                 |         0.761243 |
+----------------------+------------------+
| formal sentence      |         0.759873 |
+----------------------+------------------+
| paraphrase           |         0.755519 |
+----------------------+------------------+
| generalization       |         0.751795 |
+----------------------+------------------+
| nonsense             |        

In [None]:
## mean cosine similarity per label for 'doc2vec_vsize_256.tsv', highest first
data_df, seed_dict = data_process('doc2vec_vsize_256.tsv')
labels = data_df['label'].unique()
result = sorted(
  ([label, compute_cos(label)] for label in labels),
  key=lambda row: row[1],
  reverse=True,
)

In [None]:
# tabulate formats a list of rows as a text table
from tabulate import tabulate
# create header: one title per element of the [label, score] rows in `result`
head = ['label', "cos_similarity"]

# display table: print `result` as a grid-style text table
print(tabulate(result, headers=head, tablefmt="grid"))

+----------------------+------------------+
| label                |   cos_similarity |
| seed                 |         1        |
+----------------------+------------------+
| minimal change       |         0.649096 |
+----------------------+------------------+
| possibility          |         0.620002 |
+----------------------+------------------+
| past                 |         0.575301 |
+----------------------+------------------+
| future               |         0.566623 |
+----------------------+------------------+
| different meaning    |         0.542322 |
+----------------------+------------------+
| ban                  |         0.534627 |
+----------------------+------------------+
| nonsense             |         0.470186 |
+----------------------+------------------+
| opposite meaning     |         0.44971  |
+----------------------+------------------+
| formal sentence      |         0.442651 |
+----------------------+------------------+
| nonstandard sentence |        

In [None]:
## mean cosine similarity per label for 'mixup_by_seed_bert_unsup.tsv'
data_df, seed_dict= data_process('mixup_by_seed_bert_unsup.tsv')
## take the labels from the file just loaded; without this line the loop
## iterates over whatever `labels` an earlier cell left in the kernel
labels = data_df['label'].unique()
result = []
for label in labels:
  result.append([label,compute_cos(label)])

In [None]:
## refresh `labels` from the currently loaded data_df and order the
## [label, score] rows by score, highest first
labels = data_df['label'].unique()
result = sorted(result, key=lambda pair: pair[1], reverse=True)

In [None]:
# import module (tabulate formats a list of rows as a text table)
from tabulate import tabulate

# create header: one title per element of the [label, score] rows in `result`
head = ['label', "cos_similarity"]

# display table: print `result` as a grid-style text table
print(tabulate(result, headers=head, tablefmt="grid"))


+----------------------+------------------+
| label                |   cos_similarity |
| seed                 |         1        |
+----------------------+------------------+
| possibility          |         0.936635 |
+----------------------+------------------+
| past                 |         0.930636 |
+----------------------+------------------+
| future               |         0.923555 |
+----------------------+------------------+
| different meaning    |         0.913798 |
+----------------------+------------------+
| nonsense             |         0.901452 |
+----------------------+------------------+
| formal sentence      |         0.882741 |
+----------------------+------------------+
| minimal change       |         0.866246 |
+----------------------+------------------+
| ban                  |         0.845272 |
+----------------------+------------------+
| paraphrase           |         0.81659  |
+----------------------+------------------+
| nonstandard sentence |        