# Data prep for T-DNA
https://github.com/shizhediao/T-DNA
1. A fastText model from which we get the ngram embeddings (trained below)
2. T-DNA expects:
    * data in the form of `text \t label` - english_snippet_graph_matches_100k.tsv
    * ngrams frequency file in the form `ngram \t count` - english_snippet_graph_matches_100k_ngrams.tsv
    * ngram embeddings file in numpy array format - english_snippet_graph_matches_100k_fasttext.npy


In [25]:
import fasttext
import numpy as np
import pandas as pd
import json

In [2]:
# !pip install fasttext

Collecting fasttext
  Using cached fasttext-0.9.2.tar.gz (68 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.1-py2.py3-none-any.whl (211 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=295068 sha256=d2ea0f87ef3e975ae3d9ec3167bec29028959b2f3b8669d2ab116fd464282428
  Stored in directory: /home/ec2-user/.cache/pip/wheels/c3/5c/d0/4a725c6ee7df3267d818d3bc9d89bb173b94832f2b9eca6368
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.9.1


In [4]:
# Display the installed PyTorch version.
import torch
torch.__version__

'1.7.1'

In [56]:
# Train a fastText model on the 'TAPT' data so we can extract warm-start embeddings
# for the ngrams that are fed to the T-DNA model training code.
# The dimension of the vectors must be the same as that of the LLM we will continue to train.
# Since we will only be using unigrams and bigrams, wordNgrams only needs to be set to 2.

# roberta-large-dim: 1024
# roberta-base-dim: 768

# Language codes to process.
languages = ['fr','de','es','hi','pt','ru','sv','tr','zh','ar']


In [29]:
# Train one skipgram fastText model per language code in `languages` on that
# language's transcript file, then save the model to ../models/fasttext/.
for lang in languages:
    corpus_path = f'../data/transcripts/transcripts-all-{lang}.csv'
    model = fasttext.train_unsupervised(
        corpus_path,
        model='skipgram',
        lr=0.05,
        # Vector size must equal the embedding dimension of the LM that will be
        # trained further (768 for roberta-base, 1024 for roberta-large).
        dim=768,
        ws=4,
        # NOTE(review): fastText documents wordNgrams as a supervised-mode
        # feature; confirm it has any effect with train_unsupervised.
        wordNgrams=2,
        epoch=3,
        thread=12,
    )
    model.save_model(f'../models/fasttext/{lang}768_fasttext.bin')

In [30]:
# Load the saved Arabic ('ar') 768-dimensional model.
model = fasttext.load_model("../models/fasttext/ar768_fasttext.bin")



In [31]:
# quick sanity check: nearest neighbours of 'covid' in the currently loaded model
# NOTE(review): the neighbours in the recorded output are mostly percentages and
# numbers — worth confirming that 'covid' actually occurs in this model's corpus.
model.get_nearest_neighbors('covid')

[(0.9350162148475647, '92%'),
 (0.9100903272628784, '1439'),
 (0.9003435373306274, '37%'),
 (0.8975725769996643, 'اثبتت'),
 (0.8953256607055664, 'انخرطت'),
 (0.8933001160621643, 'اخضرارا'),
 (0.8902260065078735, 'قمم'),
 (0.8894528746604919, '11%'),
 (0.8856233358383179, 'اصطلاحا'),
 (0.8846212029457092, 'هزيمه')]

In [32]:
# Generates, per language, a numpy array of embeddings for all the ngrams for use
# in the T-DNA code. Row i of the saved array is the vector for row i of the TSV.
# languages = ['fr','de','es','hi','pt','ru','sv','tr']
languages = ['ar','zh']
for lang in languages:
    print('language:',lang)
    ngrams = pd.read_csv(
        '../data/ngrams/'+lang+'_ngrams_32768.tsv',
        sep='\t',
        names=['ngram','count'],
        # Keep every ngram as literal text. By default pandas converts tokens
        # such as 'null', 'nan' or 'NA' to float NaN, which cannot be embedded.
        dtype={'ngram': str},
        na_filter=False,
    )
    model = fasttext.load_model('../models/fasttext/'+lang+'768_fasttext.bin')
    # NOTE(review): bigram entries contain a space and are passed to
    # get_word_vector as a single token — confirm this is the intended lookup
    # (as opposed to get_sentence_vector).
    vectors = [model.get_word_vector(w) for w in ngrams['ngram']]
    np.save('../data/ngrams/'+lang+'_ngrams_32768.npy',np.array(vectors))

language: ar
language: zh




In [15]:
# model.get_word_vector('bonjour')

In [33]:
# Print the line count of each language's ngram file.
# `l = !cmd` is IPython syntax: it runs the shell command and captures its
# output lines, so this cell only works inside IPython/Jupyter.
# NOTE(review): the recorded output shows only 627 lines for 'ar' versus 32768
# for 'zh' — confirm the Arabic ngram file is complete.
for lang in languages:
    l = !wc -l ../data/ngrams/{lang}_ngrams_32768.tsv
    print(lang, l)

ar ['627 ../data/ngrams/ar_ngrams_32768.tsv']
zh ['32768 ../data/ngrams/zh_ngrams_32768.tsv']


In [22]:
# Print the first 10 lines (ngram \t count) of each language's ngram file.
# `l = !cmd` is IPython syntax, so this cell only works inside IPython/Jupyter.
for lang in languages:
    l = !head -n 10 ../data/ngrams/{lang}_ngrams_32768.tsv
    print(lang, l)

fr ['-ce\t13566', "aujourd'hui\t8508", 'peut-être\t5146', '-là\t4574', 'justement\t3379', "quelqu'\t2957", 'france\t2795', "jusqu'\t2763", 'demain\t2204', 'disent\t2106']
de ['menschen\t134612', 'leute\t116217', 'sozusagen\t50103', 'sage\t44973', 'prozent\t39343', 'sowas\t31195', 'gucken\t26793', 'irgendwas\t26673', 'z.b.\t25604', 'regierung\t24950']
es ['y a\t16417', 'güey\t14541', 'a a\t8509', 'y y\t6852', 'vas a\t5341', 'viendo\t4710', 'diciendo\t4644', 'decía\t4395', 'poquito\t4304', 'a y\t4219']
hi ['दैट\t9480', 'नॉट\t7357', 'थिस\t6231', 'हैव\t5001', 'वेरी\t4619', 'दट\t4492', 'यू नो\t4457', 'ऐंड\t4189', 'पीपल\t3773', 'बिकॉज़\t3484']
pt ['bolsonaro\t42043', 'Ucrânia\t18689', 'Estados Unidos\t15261', 'Jovem\t15149', 'Jovem Pan\t14536', 'falei\t10384', 'deu\t9949', 'daqui\t9515', 'Olha\t9443', 'Jair\t8933']
ru ['мардан\t2746', 'точки зрения\t2231', 'украины\t2083', 'Комсомольская\t2071', 'причём\t2023', 'Комсомольская правда\t2000', 'радио Комсомольская\t1943', 'россии\t1838', 'эфире\

In [35]:
# Compare embeddings from different languages.
# We do not expect these to be similar. Just a curious thought.

es_model = fasttext.load_model("../models/fasttext/es768_fasttext.bin")
ru_model = fasttext.load_model("../models/fasttext/ru768_fasttext.bin")
de_model = fasttext.load_model("../models/fasttext/de768_fasttext.bin")
fr_model = fasttext.load_model("../models/fasttext/fr768_fasttext.bin")



In [47]:
# "maybe" in Spanish, Russian, German and French, in that order.
text = ['quizás','может быть','vielleicht','peut-être']

In [48]:
# Look up each phrase in the model loaded for its language; `text` is ordered
# Spanish, Russian, German, French.
sp_text, ru_text, de_text, fr_text = (
    lang_model.get_word_vector(phrase)
    for lang_model, phrase in zip((es_model, ru_model, de_model, fr_model), text)
)

In [49]:
import numpy as np

def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    This is the dot product of the two vectors divided by the product of
    their norms.
    """
    magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / magnitude_product

In [50]:
# Spanish vs. Russian
print(cos_sim(sp_text, ru_text))

0.023469687


In [51]:
# Spanish vs. German
print(cos_sim(sp_text, de_text))

0.015318886


In [52]:
# Spanish vs. French
print(cos_sim(sp_text, fr_text))

0.04430275


In [53]:
# Russian vs. German
print(cos_sim(ru_text, de_text))

-0.020328093


In [54]:
# Russian vs. French
print(cos_sim(ru_text, fr_text))

-0.097885266


In [55]:
# German vs. French
print(cos_sim(de_text, fr_text))

0.012663008
