In [1]:
#Word embeddings from small aligned data to compare translational choices

In [79]:
import nltk
import random
from gensim.models import Word2Vec
import numpy as np

In [80]:
random.seed(30) #for reproducibility!

In [81]:
dire ='/Users/yuribizzoni/Downloads/'

In [99]:
#Europarl corpus in English and Spanish
esp = open(dire+"Translation/B7/epuds-parallel/words/epuds.en-es.es").read()
eng = open(dire+"Translation/B7/epuds-parallel/words/epuds.en-es.en").read()
#

In [100]:
len(esp), len(eng) #length in chars

(19400792, 17337842)

In [101]:
#English is the original, Spanish the translation. The two are sentence-aligned.
len(esp.split("\n")), len(eng.split("\n"))

(125853, 125853)

In [102]:
#I tokenize each sentence in both corpora
palabras =  [nltk.wordpunct_tokenize(fra) for fra in esp.lower().split("\n")]
words = [nltk.wordpunct_tokenize(sen) for sen in eng.lower().split("\n")]
print(len(palabras), len(words))
palabras[0]

125853 125853


['me',
 'gustaría',
 'agradecerle',
 'la',
 'oportunidad',
 'que',
 'me',
 'brinda',
 'de',
 'dirigirme',
 'a',
 'la',
 'cámara',
 '.']

In [103]:
#I create a single "sentence" out of the original and the translation

aligned = []
for i in range(len(palabras)):
    fra = palabras[i]
    sen = words[i]
    multi = fra+sen
    
    random.shuffle(multi) #<<< Different shufflings still return good performances
    #It seems necessary with this setting to shuffle words' contexts.
    #Concatenating the sentences makes the far away context too irrelevant and so the languages stay divided. 

    aligned.append(multi)
    
aligned[0] # now Spanish and English are shuffled!

['giving',
 'a',
 'la',
 '.',
 'like',
 '.',
 'this',
 'you',
 'que',
 'me',
 'president',
 'dirigirme',
 'house',
 'i',
 'thank',
 'for',
 'brinda',
 'de',
 'should',
 'oportunidad',
 'address',
 'to',
 'the',
 'la',
 'mr',
 'agradecerle',
 'to',
 ',',
 'opportunity',
 'cámara',
 'me',
 'me',
 'gustaría']

In [105]:
#Another shuffled sentence and how it looks like
aligned[10]

['finish',
 'from',
 'de',
 'that',
 '’.',
 'odiar',
 'él',
 '‘',
 'shaw',
 'gustaría',
 'ultimate',
 'sino',
 'una',
 'the',
 'es',
 ',',
 'no',
 'him',
 'shaw',
 'to',
 'no',
 'inhumano',
 'prójimo',
 'hate',
 'like',
 'is',
 '».',
 'fellow',
 'es',
 'eso',
 'i',
 'but',
 'of',
 'mostrar',
 'be',
 ':',
 'not',
 'essence',
 'to',
 'inhumanity',
 ':',
 '«',
 'por',
 'bernard',
 ',',
 'esencia',
 'with',
 'me',
 'george',
 'alguno',
 'a',
 'apathetic',
 'al',
 'quote',
 'cita',
 'your',
 'pecado',
 'the',
 'bernard',
 'de',
 'interés',
 'mayor',
 'is',
 'towards',
 'man',
 'to',
 'should',
 'lo',
 'george',
 'con',
 'terminar',
 'el',
 'sin',
 'la']

In [106]:
#the average length and standard deviation of our concatenated bilingual sentences
np.mean([len(s) for s in aligned]), np.std([len(s) for s in aligned])

(53.57603712267487, 30.05990216341198)

# Training the model

In [107]:
%%time

#I train a model on the average + std (53+30=83) deviation bilingual context of each word.
# 300 is the standard w2v size
madmod0 = Word2Vec(aligned,window=83,size=300)
# This should take around 1 or 2 minutes

CPU times: user 2min 59s, sys: 862 ms, total: 3min
Wall time: 1min 5s


In [108]:
madmod0.wv.most_similar("germany") # if it worked, we should see a bilingual country cluster
# with Alemania in pole position

[('alemania', 0.9596759080886841),
 ('italy', 0.8487658500671387),
 ('italia', 0.8454217910766602),
 ('francia', 0.8358932733535767),
 ('france', 0.7950721979141235),
 ('españa', 0.79485684633255),
 ('spain', 0.7900786399841309),
 ('belgium', 0.7790611982345581),
 ('bélgica', 0.7689522504806519),
 ('suecia', 0.760866641998291)]

In [109]:
#So, summing up. 

In [110]:
#1. I have to shuffle the data to obtain a working translingual. If I do not shuffle, the spaces keep apart.
# This can be probably improved in a new version.

In [111]:
#2. If I simply zip the data ("the","la","house","casa"), results are better but, it
# seems to me, not as good as when i shuffle.

In [112]:
#3. Function words and prepositions do not return meaningul clusters

In [113]:
#4. Regarding content words, I get frequent translations and semantically related elements
# in both languages.  
madmod0.wv.most_similar("war")

[('guerra', 0.9212468862533569),
 ('terror', 0.7476201057434082),
 ('genocide', 0.714745044708252),
 ('genocidio', 0.7048125863075256),
 ('fría', 0.7038137912750244),
 ('cold', 0.6748931407928467),
 ('guerras', 0.6603832244873047),
 ('brutal', 0.6585128903388977),
 ('tyranny', 0.6539484262466431),
 ('bagdad', 0.6538013815879822)]

In [114]:
#5. If a word is (almost) always translated with another word, they should be very close
# in the space. 
madmod0.wv.most_similar("voz")

[('voice', 0.9031227827072144),
 ('sola', 0.527403712272644),
 ('speak', 0.5132758021354675),
 ('solidaridad', 0.5124000310897827),
 ('vision', 0.5023461580276489),
 ('hable', 0.49363863468170166),
 ('fortaleza', 0.4901423454284668),
 ('solidarity', 0.4867943525314331),
 ('peoples', 0.4831586182117462),
 ('defended', 0.46939510107040405)]

In [115]:
#5. If a word belongs to a semantically related cluster where most words have a 
# *consistent translation* in the corpus, such word has many bilingual, near neighbours.
madmod0.wv.most_similar("france")

[('francia', 0.9057453870773315),
 ('germany', 0.7950721979141235),
 ('alemania', 0.793376088142395),
 ('italy', 0.7632391452789307),
 ('italia', 0.7431871891021729),
 ('belgium', 0.7328687906265259),
 ('holland', 0.7214162349700928),
 ('bélgica', 0.7132459282875061),
 ('suecia', 0.7081535458564758),
 ('españa', 0.7032994031906128)]

In [116]:
#6. If instead a word does not belong to a group of closely related, consistently 
# translated concepts, but does have a consistent translation in the corpus,
# it tends to have a very close neighbourg followed by a "void" (the second nearest is not that near) 
madmod0.wv.most_similar("solidarity")

[('solidaridad', 0.9231983423233032),
 ('peoples', 0.6054810881614685),
 ('uprising', 0.5656891465187073),
 ('vecinos', 0.549111008644104),
 ('sympathy', 0.5339564085006714),
 ('pueblos', 0.5334646105766296),
 ('alliance', 0.5277429223060608),
 ('balkans', 0.5262425541877747),
 ('inspiración', 0.514620304107666),
 ('balcanes', 0.5130844712257385)]

In [117]:
#7. If a word is *not* consistently translated, it should display: 
#1 - various translations, but none very close:
madmod0.wv.most_similar("fear")

[('temo', 0.7443530559539795),
 ('miedo', 0.6412690877914429),
 ('temor', 0.5605615973472595),
 ('afraid', 0.5554525852203369),
 ('temen', 0.5006039142608643),
 ('feel', 0.4887058734893799),
 ('ordinary', 0.4820263981819153),
 ('sienten', 0.471314013004303),
 ('worry', 0.46651700139045715),
 ('sensación', 0.46651220321655273)]

In [118]:
#... or: 2 - no good translation at all, if the translation cannot be closed in one 
# single word for example. The word also appears isolated (closest neighbours are far away)
madmod0.wv.most_similar("somehow")

[('sencillamente', 0.46420466899871826),
 ('impresión', 0.4548487067222595),
 ('impression', 0.4505397081375122),
 ('fuera', 0.4258045554161072),
 ('claim', 0.42454245686531067),
 ('simply', 0.42331182956695557),
 ('foolish', 0.40560412406921387),
 ('wrong', 0.4022558331489563),
 ('simplemente', 0.4015910029411316),
 ('puerta', 0.39861059188842773)]

In [119]:
# this could be a special way of seeing (1) imbalances in translation
madmod0.wv.most_similar("gentes") 
#gentes does not seem to have a systematic translation in this specific corpus

[('indecibles', 0.45576515793800354),
 ('pueblo', 0.4443557560443878),
 ('helping', 0.43870389461517334),
 ('población', 0.43117275834083557),
 ('dictaduras', 0.42869263887405396),
 ('génova', 0.41671663522720337),
 ('pueblos', 0.41389721632003784),
 ('habitantes', 0.4101772904396057),
 ('fledgling', 0.4068302512168884),
 ('bi', 0.4042319357395172)]

In [120]:
# (2) strong divergence in polysemy or in function 
madmod0.wv.most_similar("but"),madmod0.wv.most_similar("the") 

([('however', 0.8158111572265625),
  ('nevertheless', 0.7060055732727051),
  ('yet', 0.671391487121582),
  ('although', 0.6627480983734131),
  ('nonetheless', 0.6286283731460571),
  ('though', 0.6247971653938293),
  ('whilst', 0.4798395335674286),
  ('albeit', 0.4697149097919464),
  ('ohio', 0.4690318703651428),
  ('while', 0.45984405279159546)],
 [('de', 0.24182571470737457),
  ('jefes', 0.24131551384925842),
  ('lanzamiento', 0.2320137917995453),
  ('sensibilidades', 0.2230995148420334),
  ('dup', 0.2217485010623932),
  ('sensitivities', 0.21217766404151917),
  ('presidentes', 0.2098350077867508),
  ('strengthening', 0.20492029190063477),
  ('reclamar', 0.20268955826759338),
  ('competences', 0.20201155543327332)])

In [121]:
#(3) contextual similarities beyond a word's translation...
madmod0.wv.most_similar("palestinian") 
#e.g. palestinian is close to israeli too

[('palestino', 0.9038184881210327),
 ('palestinos', 0.895012617111206),
 ('palestina', 0.8868119716644287),
 ('israeli', 0.8694062232971191),
 ('israelí', 0.8532701730728149),
 ('israelíes', 0.8434242606163025),
 ('palestinians', 0.824909508228302),
 ('hamas', 0.814632773399353),
 ('sharon', 0.8078178763389587),
 ('hamás', 0.7955443859100342)]

In [122]:
#and (4) DISsimilarities between sources and translations. 
madmod0.wv.most_similar("population")#

[('inhabitants', 0.6230965852737427),
 ('living', 0.6135353446006775),
 ('ageing', 0.6059086918830872),
 ('populations', 0.5475015640258789),
 ('starvation', 0.5466575622558594),
 ('viven', 0.5463178157806396),
 ('vive', 0.5347788333892822),
 ('envejecimiento', 0.5248936414718628),
 ('wars', 0.5197626352310181),
 ('rising', 0.5144315958023071)]

In [123]:
#words that do not show consistent translations in the space *tend* to cluster monolingually
# and to have more entropic clusters
madmod0.wv.most_similar("quiero"),madmod0.wv.most_similar("struggle")

([('quisiera', 0.8596170544624329),
  ('deseo', 0.7200720310211182),
  ('desearía', 0.6660672426223755),
  ('querría', 0.6044778823852539),
  ('gustaría', 0.5378729104995728),
  ('quieren', 0.5127868056297302),
  ('quieran', 0.5108271837234497),
  ('centrarme', 0.49832069873809814),
  ('voy', 0.48674386739730835),
  ('desean', 0.4450622498989105)],
 [('fighting', 0.6652612686157227),
  ('islámica', 0.6639626026153564),
  ('islamic', 0.6553363800048828),
  ('dictatorship', 0.6428117752075195),
  ('fundamentalista', 0.6270154118537903),
  ('dictadura', 0.6264300346374512),
  ('brutal', 0.612619161605835),
  ('jihad', 0.6038496494293213),
  ('communist', 0.6018915176391602),
  ('hatred', 0.5954686403274536)])

In [124]:
#words displaying a consistent translation in the space have less entropic distributions,
# depending on how systematic the translation might be (see the City-London translation)
print(madmod0.wv.most_similar("interesting"),"\n")
print(madmod0.wv.most_similar("city"),"\n")
print(madmod0.wv.most_similar("sucesor"),"\n")
print(madmod0.wv.most_similar("bosques"),"\n")

[('interesante', 0.895222544670105), ('interesantes', 0.6426743268966675), ('slightly', 0.5177820324897766), ('observar', 0.4860764145851135), ('extraordinary', 0.47382843494415283), ('subjects', 0.46645110845565796), ('plantearon', 0.4625413715839386), ('recogen', 0.45739126205444336), ('strange', 0.4562941789627075), ('ligeramente', 0.442565381526947)] 

[('ciudad', 0.9179441928863525), ('londres', 0.7915592193603516), ('london', 0.7884848117828369), ('condado', 0.7076954245567322), ('inglaterra', 0.7052062749862671), ('manchester', 0.6953339576721191), ('cork', 0.6923949718475342), ('edimburgo', 0.691721498966217), ('circunscripción', 0.6863150000572205), ('england', 0.6812542080879211)] 

[('successor', 0.7768109440803528), ('sajarov', 0.5914726257324219), ('shamefully', 0.5823409557342529), ('ambassador', 0.5802516937255859), ('robert', 0.5788925886154175), ('gordon', 0.5782333016395569), ('reverend', 0.5698007345199585), ('arzobispo', 0.564502477645874), ('roche', 0.5611132979393

In [125]:

#The original being english, we can look into the "english untranslatables".

In [40]:
# From a random site, a small list of palabras ingles sin traducion. 
sin_traducion = """struggle pump lock realise tip type stand spam insight weekend""".split()
for w in sin_traducion:
    print(w)
    print(madmod0.wv.most_similar(w,topn=3))
    print("\n")

struggle
[('fighting', 0.7119023203849792), ('violent', 0.673387348651886), ('intimidation', 0.6679258346557617)]


pump
[('drama', 0.5519206523895264), ('ingresos', 0.550617516040802), ('loans', 0.5242406129837036)]


lock
[('hacernos', 0.42701491713523865), ('analizan', 0.4033764898777008), ('ansiosos', 0.38876837491989136)]


realise
[('recognise', 0.5132564902305603), ('feel', 0.4402899742126465), ('show', 0.4151257276535034)]


tip
[('iceberg', 0.8228524923324585), ('conserva', 0.6419652700424194), ('contributes', 0.6406188607215881)]


type
[('depósitos', 0.5319403409957886), ('sort', 0.4850086569786072), ('kind', 0.47376930713653564)]


stand
[('defender', 0.5692262649536133), ('defend', 0.5606892704963684), ('send', 0.5229263305664062)]


spam
[('imperdonable', 0.6993660926818848), ('hija', 0.6965476274490356), ('fun', 0.6959306001663208)]


insight
[('maria', 0.5881803035736084), ('apreciación', 0.5480276346206665), ('marxist', 0.5433263182640076)]


weekend
[('semana', 0.6984

In [None]:
# none of them has a close neighbour in the other language, which is the behaviour we expect
# for elements without a systematic translation

In [33]:
#We can even perform Mikolov's man:woman = king:queen kind of operations! 

#woman:man = x:hombre
#man:woman = x:mujer
#sad:happy = x:feliz

#woman-man+hombre, man-woman+mujer, and so forth
print(madmod0.wv.most_similar(positive=["woman","hombre"],negative=["man"]))
# woman is to man as hombre is to ?
print(madmod0.wv.most_similar(positive=["man","mujer"],negative=["woman"]))
madmod0.wv.most_similar(positive=["sad","feliz"],negative=["happy"])
# sad is to happy as feliz is to ?

[('mujer', 0.7455636858940125), ('hijo', 0.6782130002975464), ('padre', 0.6462219953536987), ('murió', 0.6421608924865723), ('detenida', 0.6364172697067261), ('soldado', 0.6324895620346069), ('murieron', 0.6123976707458496), ('escuela', 0.6115037798881531), ('prisión', 0.6111001372337341), ('periodista', 0.6093836426734924)]
[('hombre', 0.7463650703430176), ('profesor', 0.5815767049789429), ('hijo', 0.5586555600166321), ('niño', 0.5477820038795471), ('islam', 0.5423353910446167), ('padre', 0.5271287560462952), ('tibetano', 0.5224722623825073), ('amigo', 0.5223640203475952), ('joven', 0.5094565749168396), ('persona', 0.5087794661521912)]


[('triste', 0.6450665593147278),
 ('historia', 0.5610065460205078),
 ('walked', 0.5431089997291565),
 ('guerras', 0.5388035178184509),
 ('beautiful', 0.5343602299690247),
 ('terrible', 0.5209183692932129),
 ('espectáculo', 0.5177900195121765),
 ('wars', 0.5154507756233215),
 ('empire', 0.5117136240005493),
 ('history', 0.5116448998451233)]

In [49]:
#Finally, we can somtimes sum words and find the translation of their sum!
# For example, the closest word to we+can is podemos, the closest to i+hope is espero etc.

madmod0.predict_output_word(["we","can"],topn=3),madmod0.predict_output_word(["we","know"],topn=3),madmod0.predict_output_word(["i","hope"],topn=3)

([('podemos', 0.9999895),
  ('puede', 1.05047075e-05),
  ('pueden', 1.4487546e-10)],
 [('sabemos', 1.0), ('conocemos', 5.5914613e-12), ('sé', 3.209026e-12)],
 [('espero', 1.0), ('confío', 3.298171e-13), ('hope', 1.5322472e-14)])

In [41]:
#This is more or less it. 

In [None]:
# Now I added a long Appendix about variations on this idea

In [None]:
#APPENDIX

In [43]:
### Anoher possibility is to mark every word's context with the id of the sentence it is occurring in.
alignedO = []
for i in range(len(palabras)):
    fra = [p for p in palabras[i]]
    sen = [w for w in words[i]]#
    
    multi = fra+sen
    fatica =  []#
    for w in multi:
        fatica.append(w)
        fatica.append(str(i))
    #random.shuffle(fatica)
    alignedO.append(fatica)


In [44]:
# so a sentence looks like this:
alignedO[20]

['quiero',
 '20',
 'desearle',
 '20',
 ',',
 '20',
 'en',
 '20',
 'nombre',
 '20',
 'del',
 '20',
 'parlamento',
 '20',
 ',',
 '20',
 'el',
 '20',
 'mayor',
 '20',
 'de',
 '20',
 'los',
 '20',
 'éxitos',
 '20',
 'durante',
 '20',
 'su',
 '20',
 'mandato',
 '20',
 '.',
 '20',
 'i',
 '20',
 'want',
 '20',
 'to',
 '20',
 'wish',
 '20',
 'you',
 '20',
 'every',
 '20',
 'success',
 '20',
 ',',
 '20',
 'on',
 '20',
 'behalf',
 '20',
 'of',
 '20',
 'parliament',
 '20',
 ',',
 '20',
 'during',
 '20',
 'your',
 '20',
 'term',
 '20',
 'of',
 '20',
 'office',
 '20',
 '.',
 '20']

In [51]:
#I train the model on the smallest context: words occurring in the same set of sentences should thus have the EXACT kind of context (the same set of ids)
madmod1= Word2Vec(alignedO[:],window=1,size=300)

In [54]:
#This system works very poorly for any content word
w = "germany"
print(madmod1.wv.most_similar(w, topn=3))
print(madmod0.wv.most_similar(w, topn=3))

[('britain', 0.9982935190200806), ('experience', 0.9982931613922119), ('incluido', 0.9982456564903259)]
[('alemania', 0.9471361637115479), ('italy', 0.8497835993766785), ('italia', 0.8462876081466675)]


In [56]:
# but it is working for function words and prepositions!!
madmod1.wv.most_similar("y"), madmod0.wv.most_similar("y")

([('and', 0.924795925617218),
  ('de', 0.828447699546814),
  ('of', 0.8263126611709595),
  ('the', 0.7952296137809753),
  ('los', 0.7800973653793335),
  ('las', 0.765472412109375),
  ('la', 0.7263686656951904),
  ('in', 0.6723071932792664),
  ('en', 0.6377344131469727),
  ('.', 0.6255273818969727)],
 [('e', 0.4978914260864258),
  ('arts', 0.20701898634433746),
  ('implicarnos', 0.20574888586997986),
  ('societies', 0.18744277954101562),
  ('desechar', 0.186610609292984),
  ('prosperity', 0.18592122197151184),
  ('ponerlos', 0.1841798722743988),
  ('óptimos', 0.1799958199262619),
  ('compaginar', 0.17966225743293762),
  ('owed', 0.17411628365516663)])

In [57]:
madmod1.wv.most_similar("to")

[('a', 0.8716167211532593),
 ('.', 0.7861998677253723),
 ('we', 0.7296077609062195),
 ('dwelling', 0.7217196822166443),
 ('griega', 0.6852811574935913),
 ('malliori', 0.6851369142532349),
 ('will', 0.678860068321228),
 ('que', 0.6760795712471008),
 ('have', 0.671581506729126),
 ('that', 0.671463668346405)]

In [44]:
#APPENDIX B: SAME IN GERMAN
deu = open(dire+"Translation/B7/epuds-parallel/words/epuds.en-de.de").read()
eng = open(dire+"Translation/B7/epuds-parallel/words/epuds.en-de.en").read()

In [45]:
wrt =  [nltk.wordpunct_tokenize(se) for se in deu.lower().split("\n")]
words = [nltk.wordpunct_tokenize(sen) for sen in eng.lower().split("\n")]
print(len(wrt), len(words))
wrt[0]

137814 137814


['herr',
 'präsident',
 ',',
 'vielen',
 'dank',
 ',',
 'daß',
 'sie',
 'mir',
 'gelegenheit',
 'geben',
 ',',
 'mich',
 'an',
 'dieses',
 'hohe',
 'haus',
 'zu',
 'wenden',
 '.']

In [46]:
for i in range(5):
    sen = words[i]
    print(" ".join(sen))
    print(" ".join(wrt[i]))
    print("\n")

mr president , i should like to thank you for giving me this opportunity to address the house .
herr präsident , vielen dank , daß sie mir gelegenheit geben , mich an dieses hohe haus zu wenden .


i want to register my protest at the lack of facilities for me , as a deputy within this chamber .
ich möchte meinen protest über mangelhafte einrichtungen für mich als mitglied dieses parlaments zum ausdruck bringen .


two and a half years ago , i went to the authorities who were dealing with the building of this new structure and informed them of my requirements as a person with a disability within the chamber .
vor zweieinhalb jahren wandte ich mich an die mit der errichtung dieses neuen gebäudes befaßten stellen und informierte sie über meine anforderungen als behinderter abgeordneter .


i was assured at that stage that every facility would be made available .
damals wurde mir versichert , daß alle voraussetzungen gegeben sein würden .


i arrived here last month and spoke with the arc

In [47]:
aligned = []
for i in range(len(wrt)):
    se = wrt[i]
    sen = words[i]
    multi = se+sen
    
    random.shuffle(multi) 
    aligned.append(multi)
    
aligned[0]

['.',
 ',',
 'the',
 'like',
 'to',
 'an',
 'i',
 '.',
 'daß',
 'president',
 'dieses',
 'giving',
 'gelegenheit',
 'vielen',
 'hohe',
 'to',
 'geben',
 'thank',
 'house',
 'zu',
 ',',
 'haus',
 'sie',
 'mir',
 'me',
 'wenden',
 'opportunity',
 'mich',
 'you',
 'mr',
 ',',
 ',',
 'for',
 'dank',
 'address',
 'this',
 'should',
 'herr',
 'präsident']

In [48]:
np.mean([len(s) for s in aligned]), np.std([len(s) for s in aligned])

(51.39989406011, 31.879100414691504)

In [49]:
#I train a model on the average + std deviation context. So its context is bilingual.
madmod_deu = Word2Vec(aligned,window=84,size=300)

In [52]:
madmod_deu.wv.most_similar("germany", topn=3)

[('deutschland', 0.9349461197853088),
 ('frankreich', 0.844580888748169),
 ('france', 0.8314818143844604)]

In [53]:
madmod_deu.wv.most_similar("kraft")

[('force', 0.6560993790626526),
 ('inkrafttreten', 0.5913261771202087),
 ('treten', 0.5754548907279968),
 ('tritt', 0.5448029041290283),
 ('ratify', 0.5393043756484985),
 ('ratifizieren', 0.5074322819709778),
 ('cartegena', 0.4983678162097931),
 ('verlängerung', 0.49187955260276794),
 ('protocol', 0.4881513714790344),
 ('abgeschlossen', 0.48623502254486084)]

In [54]:
madmod_deu.wv.most_similar("happy")

[('glücklich', 0.5722105503082275),
 ('grateful', 0.5053730010986328),
 ('freude', 0.5035439133644104),
 ('froh', 0.47786515951156616),
 ('erfreut', 0.47060608863830566),
 ('helpful', 0.4672287106513977),
 ('zufrieden', 0.46700045466423035),
 ('zusammengearbeitet', 0.46011000871658325),
 ('dankbar', 0.45920276641845703),
 ('speaking', 0.4589686393737793)]

In [55]:
madmod_deu.wv.most_similar("mord")

[('murder', 0.9396565556526184),
 ('brutale', 0.825391948223114),
 ('murdered', 0.820722222328186),
 ('brutal', 0.8183834552764893),
 ('ermordung', 0.8165302872657776),
 ('ermordet', 0.8146328926086426),
 ('killings', 0.8130627870559692),
 ('camp', 0.8076316118240356),
 ('bombing', 0.8040317296981812),
 ('zivilisten', 0.8013164401054382)]

In [56]:
#no near neighs
madmod_deu.wv.most_similar("wozu")

[('commissions', 0.3650393486022949),
 ('likewise', 0.34637245535850525),
 ('including', 0.3458613157272339),
 ('applies', 0.33967074751853943),
 ('worüber', 0.33508801460266113),
 ('arusha', 0.33462971448898315),
 ('welche', 0.3186159133911133),
 ('welchen', 0.31593114137649536),
 ('accumulated', 0.31573671102523804),
 ('jene', 0.3134268522262573)]

In [57]:
#no near english neighs
madmod_deu.wv.most_similar("darf")

[('kann', 0.691245973110199),
 ('dürfen', 0.6539732217788696),
 ('muß', 0.6143087148666382),
 ('könnte', 0.5920522212982178),
 ('muss', 0.5890470743179321),
 ('sollte', 0.5552568435668945),
 ('können', 0.5299324989318848),
 ('dürfe', 0.5298604965209961),
 ('müsse', 0.5247164964675903),
 ('soll', 0.5089102983474731)]

In [58]:
#no near german neighs
madmod_deu.wv.most_similar("wenn")

[('falls', 0.7583459615707397),
 ('sofern', 0.6681891083717346),
 ('wann', 0.6511678695678711),
 ('sobald', 0.48729443550109863),
 ('then', 0.4788084924221039),
 ('obwohl', 0.4600425660610199),
 ('comes', 0.44291406869888306),
 ('aufseiten', 0.4087792634963989),
 ('solange', 0.40547245740890503),
 ('ob', 0.39352846145629883)]

In [59]:
madmod_deu.wv.most_similar("fragen", topn=20)

[('questions', 0.713936984539032),
 ('themen', 0.6174116134643555),
 ('issues', 0.5843456983566284),
 ('beantworten', 0.5346881151199341),
 ('beantwortet', 0.5341669321060181),
 ('matters', 0.5292607545852661),
 ('anfragen', 0.5273733735084534),
 ('answered', 0.5213537812232971),
 ('aufgeworfenen', 0.5114202499389648),
 ('frage', 0.5085440874099731),
 ('raised', 0.4873928427696228),
 ('schlüsselfragen', 0.48508137464523315),
 ('arise', 0.48319876194000244),
 ('raises', 0.47569799423217773),
 ('answers', 0.46668359637260437),
 ('klären', 0.46573853492736816),
 ('klärung', 0.46054574847221375),
 ('addressed', 0.4575481414794922),
 ('question', 0.45689529180526733),
 ('aufgeworfen', 0.4535149335861206)]

In [60]:
w = "freedom"
madmod_deu.wv.most_similar(w), madmod0.wv.most_similar(w)

([('freiheit', 0.8800723552703857),
  ('expression', 0.808447003364563),
  ('meinungsfreiheit', 0.7776405215263367),
  ('meinungsäußerung', 0.720323920249939),
  ('freedoms', 0.6940377354621887),
  ('religionsfreiheit', 0.6920214891433716),
  ('liberty', 0.6800886988639832),
  ('gerechtigkeit', 0.671705961227417),
  ('freiheiten', 0.6646568179130554),
  ('gleichheit', 0.6554666757583618)],
 [('freiheit', 0.8770506381988525),
  ('expression', 0.7552639842033386),
  ('meinungsäußerung', 0.7279289960861206),
  ('meinungsfreiheit', 0.7247045040130615),
  ('freedoms', 0.7050155401229858),
  ('religiösen', 0.6755955219268799),
  ('pressefreiheit', 0.6700924038887024),
  ('gerechtigkeit', 0.6672602891921997),
  ('liberty', 0.6613081693649292),
  ('religionsfreiheit', 0.6567243337631226)])

In [61]:
madmod_deu.wv.most_similar(["freedom","press"]) #concepts at the intersection. 

[('pressefreiheit', 0.7436741590499878),
 ('meinungsfreiheit', 0.741266131401062),
 ('expression', 0.731926441192627),
 ('religionsfreiheit', 0.7109665870666504),
 ('journalists', 0.7109071016311646),
 ('presse', 0.6837912201881409),
 ('inhaftierung', 0.6783128976821899),
 ('journalisten', 0.6756174564361572),
 ('redefreiheit', 0.6665547490119934),
 ('freiheit', 0.654988169670105)]

In [62]:
madmod_deu.wv.most_similar(["religionsfreiheit"])

[('expression', 0.7611088752746582),
 ('meinungsfreiheit', 0.7404330372810364),
 ('religious', 0.7004653215408325),
 ('folter', 0.6977810859680176),
 ('freedom', 0.6920214891433716),
 ('degrading', 0.6678593158721924),
 ('torture', 0.6671411395072937),
 ('minorities', 0.648648202419281),
 ('religiösen', 0.6398242712020874),
 ('repression', 0.635739803314209)]

In [63]:
madmod_deu.wv.most_similar("erfreut")

[('delighted', 0.6927938461303711),
 ('pleased', 0.6917699575424194),
 ('froh', 0.685878336429596),
 ('enttäuscht', 0.667832612991333),
 ('glad', 0.6597051620483398),
 ('disappointed', 0.6068927049636841),
 ('freue', 0.5639588832855225),
 ('freut', 0.5476800203323364),
 ('gefreut', 0.5431020855903625),
 ('ermutigend', 0.5235954523086548)]

In [64]:
madmod_deu.wv.most_similar("unemployment".split())

[('arbeitslosigkeit', 0.908473014831543),
 ('arbeitslosenquote', 0.6700712442398071),
 ('produktivität', 0.6622342467308044),
 ('wachstum', 0.6487407684326172),
 ('productivity', 0.6370849609375),
 ('jobs', 0.6321849226951599),
 ('growth', 0.6256743669509888),
 ('arbeitsplätze', 0.6215499043464661),
 ('living', 0.6189567446708679),
 ('wohlstand', 0.596534252166748)]

In [65]:
#INTERPRETING

Interpreting was a bit complicated to pre-process due to the format of the data. I created a txt file with aligned english-german interpreting transcripts and here I will just use that as my source corpus

In [66]:
import codecs
f = codecs.open("English-German_aligned_interpreting.txt","r","utf8")#.read()
# here I will use it as it is, ofc you can remove </s> and similar signs before starting
mydata = f.read()
doubles = mydata.split("\n#\n")
len(doubles)
print(doubles[100])
print(doubles[100].split())

     <s id="1:7">and we need a little bit of honesty about that</s>    <s id="1:7">ich glaub da bräuchten wir doch n bisschen mehr Ehrlichkeit</s>
['<s', 'id="1:7">and', 'we', 'need', 'a', 'little', 'bit', 'of', 'honesty', 'about', 'that</s>', '<s', 'id="1:7">ich', 'glaub', 'da', 'bräuchten', 'wir', 'doch', 'n', 'bisschen', 'mehr', 'Ehrlichkeit</s>']


In [67]:
import random
tokd_alig = []
for d in doubles:
    magic = nltk.wordpunct_tokenize(d.lower())
    random.shuffle(magic)
    tokd_alig.append(magic)


In [68]:
len(tokd_alig), np.mean([len(e) for e in tokd_alig]), np.std([len(e) for e in tokd_alig])

(3397, 47.97438916691198, 30.19486692753781)

In [69]:
%%time

interp_model = Word2Vec(tokd_alig,window=77,size=300, min_count=1, negative=10, iter=30)

CPU times: user 26 s, sys: 265 ms, total: 26.3 s
Wall time: 11 s


In [70]:
interp_model.wv.most_similar("spain")

[('spanien', 0.9892104864120483),
 ('beneficiaries', 0.9067434668540955),
 ('italy', 0.9027279615402222),
 ('italien', 0.9006606340408325),
 ('hauptempfänger', 0.8998532891273499),
 ('bethe', 0.8667525053024292),
 ('alte', 0.8648932576179504),
 ('died', 0.8122330904006958),
 ('pflegeheimen', 0.8045418858528137),
 ('residential', 0.8031773567199707)]

In [92]:
len(tokd_alig)

3397

In [93]:
tokn = 0
for tokd in tokd_alig:
    tokn+=sum([len(w) for w in tokd])
print(tokn)

568230


In [None]:
# Finally we can make a down-sampled model trained on a translation corpus as small as an interpreting corpus

In [143]:
random.shuffle(aligned)
downsampled_model = Word2Vec(aligned[:len(tokd_alig)],window=80,size=300, min_count=1, negative=10, iter=30)

In [144]:
wort,n = "germany",5 #friends,
madmod_deu.wv.most_similar(wort, topn=n), downsampled_model.wv.most_similar(wort,topn=n), interp_model.wv.most_similar(wort,topn=n)

([('deutschland', 0.9487962126731873),
  ('spain', 0.8551216721534729),
  ('frankreich', 0.8506844639778137),
  ('france', 0.8486232757568359),
  ('belgien', 0.83827805519104)],
 [('deutschland', 0.8962137699127197),
  ('unemployment', 0.8538498878479004),
  ('arbeitslosenquoten', 0.8228492736816406),
  ('population', 0.7792320251464844),
  ('zivilisten', 0.7768558859825134)],
 [('deutschland', 0.9668587446212769),
  ('politically', 0.8618838787078857),
  ('politisch', 0.8386322855949402),
  ('tragbar', 0.8146785497665405),
  ('verbuchen', 0.6270592212677002)])

In [146]:
#some basic analysis
len(downsampled_model.wv.vocab), len(interp_model.wv.vocab)

(18114, 10524)

In [None]:
#from here on it is possible to compare the spaces' inner distances, avg closest neighbour and so on 

In [137]:
smaller_voc = [k for k in downsampled_model.wv.vocab if k in interp_model.wv.vocab]
print(len(smaller_voc))
smaller_voc = smaller_voc[:]
wow = [downsampled_model.wv.distances(w) for w in smaller_voc]
wow2 = [interp_model.wv.distances(w) for w in smaller_voc]
#hm2 = erste_model.wv.distances(wort)

3443


In [147]:
wow3 = [madmod_deu.wv.distances(w) for w in smaller_voc]

KeyboardInterrupt: 

In [None]:
1-np.mean(wow), np.std(wow), 1-np.mean(wow2), np.std(wow2), 1-np.mean(wow3)

In [None]:
1-np.mean(wow), np.std(wow), 1-np.mean(wow2), np.std(wow2)#, 1-np.mean(wow3)

In [None]:
#nearest neighbour avg distance, second nearest neighbour average distance.
nearests_downsampled = [downsampled_model.wv.most_similar(w, topn=10) for w in smaller_voc[:]]

In [None]:
nearests_erste = [erste_model.wv.most_similar(w, topn=10) for w in smaller_voc[:]]

In [None]:
#nearests
first_down = [e[0][1] for e in nearests_downsampled[:]]
first_ers = [e[0][1] for e in nearests_erste[:]]
np.mean(first_down), np.std(first_down), np.mean(first_ers), np.std(first_ers)

In [None]:
second_down = [e[-1][1] for e in nearests_downsampled]
second_ers = [e[-1][1] for e in nearests_erste]
np.mean(second_down), np.mean(second_ers)

In [None]:
second_down = [sum([w[1] for w in e]) for e in nearests_downsampled]
second_ers = [sum([w[1] for w in e]) for e in nearests_erste]
np.mean(second_down), np.mean(second_ers)

In [872]:
wort = "somehow" #friends,
downsampled_model.wv.most_similar(wort), erste_model.wv.most_similar(wort)

([('irgendwie', 0.8418327569961548),
  ('anhängen', 0.833200216293335),
  ('pollute', 0.8023205995559692),
  ('verschmutzt', 0.7767529487609863),
  ('beauftragter', 0.7750450372695923),
  ('friedens', 0.7689111232757568),
  ('einreden', 0.7673659324645996),
  ('island', 0.7669362425804138),
  ('suggesting', 0.7635915875434875),
  ('donated', 0.7626611590385437)],
 [('enjoy', 0.6445189714431763),
  ('speaks', 0.6365010738372803),
  ('volumes', 0.6305739879608154),
  ('historic', 0.5737761855125427),
  ('schlachthöfen', 0.5715336799621582),
  ('kontaminierte', 0.5693942308425903),
  ('vast', 0.5624562501907349),
  ('round', 0.5580256581306458),
  ('schweine', 0.5536667108535767),
  ('leier', 0.5522794127464294)])

In [930]:
wort = "schnell" #friends,doctors,doctor,politician,gefahr,gespielt,understand, vs should,could,must,will,very,gratulieren,verträge
downsampled_model.wv.most_similar(wort), interp_model.wv.most_similar(wort)

([('quickly', 0.722977876663208),
  ('angeboten', 0.6809149980545044),
  ('gut', 0.6630717515945435),
  ('equally', 0.648383378982544),
  ('inconvenient', 0.6144195795059204),
  ('recognise', 0.6134805083274841),
  ('geflügeltes', 0.6119312047958374),
  ('wirtschaftswissenschaftler', 0.6112147569656372),
  ('staatsform', 0.6111538410186768),
  ('zusammenkommen', 0.6096416711807251)],
 [('quickly', 0.7965059280395508),
  ('gelöst', 0.7955560684204102),
  ('asap', 0.784021258354187),
  ('b', 0.735526978969574),
  ('trinkwasser', 0.7343395948410034),
  ('clean', 0.7325656414031982),
  ('c', 0.7299182415008545),
  ('restore', 0.7273295521736145),
  ('passieren', 0.7115418910980225),
  ('strom', 0.7114201784133911)])

In [83]:
madmod_deu.wv.most_similar("sitte")

[('ausschußvorsitzender', 0.9808642268180847),
 ('mitberichterstatter', 0.726360559463501),
 ('schuldenerlässe', 0.7016815543174744),
 ('willensentscheidung', 0.6965516805648804),
 ('führerscheinen', 0.6951477527618408),
 ('vergnaud', 0.6947504878044128),
 ('versicherungsklima', 0.688271164894104),
 ('meeresüberwachung', 0.6873326301574707),
 ('ausblutet', 0.6771560907363892),
 ('vorbereitungszeit', 0.6764340996742249)]