## Data Pre-Processing

In [117]:
import pandas
import re
# load the dataset
dataset = pandas.read_csv('/home/dai/Desktop/project/frenchenglish-bilingual-pairs/fra-eng/fra.txt', delimiter = '\t',header=None)
dataset.head()

Unnamed: 0,0,1
0,Go.,Va !
1,Run!,Cours !
2,Run!,Courez !
3,Wow!,Ça alors !
4,Fire!,Au feu !


In [118]:
 dataset=dataset.rename(columns={0: "English", 1: "French"})

**Removing Punctuations**

In [119]:
dataset["new_column_French"] = dataset['French'].str.replace('[^\w\s]','')
dataset["new_column_English"] = dataset['English'].str.replace('[^\w\s]','')

**Converting to Lower Case**

In [120]:
dataset['new_column_English'] = dataset['new_column_English'].str.lower() 

**Converting French term to unidecode**

In [121]:
from unidecode import unidecode

def split_it(text):
    words= re.findall(r'\w+', text, re.U)
    ## French column converted into lowercase and unidecode
    cap_words = [unidecode(word).lower() for word in words]
    return cap_words

In [122]:
dataset['new_column_French1'] = dataset['new_column_French'].apply(lambda x: split_it(x))
dataset['new_column_French1'].head()

0           [va]
1        [cours]
2       [courez]
3    [ca, alors]
4      [au, feu]
Name: new_column_French1, dtype: object

In [123]:
seperator = ' '
dataset['new_column_French1']= dataset['new_column_French1'].apply(lambda x: seperator.join(x))
dataset['new_column_French1'].head() 
    

0          va
1       cours
2      courez
3    ca alors
4      au feu
Name: new_column_French1, dtype: object

In [124]:
dataset=dataset.drop(columns=['English', 'French','new_column_French'],axis=1)
dataset.head()

Unnamed: 0,new_column_English,new_column_French1
0,go,va
1,run,cours
2,run,courez
3,wow,ca alors
4,fire,au feu


**Removing stop words**

In [125]:
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
#print(stop)

In [126]:
dataset['new_column_English'] = dataset['new_column_English'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))


In [132]:
dataset.head(50)

Unnamed: 0,new_column_English,new_column_French1
0,go,va
1,run,cours
2,run,courez
3,wow,ca alors
4,fire,feu
5,help,laide
6,jump,saute
7,stop,ca suffit
8,stop,stop
9,stop,arretetoi


**for french**

In [128]:
import nltk
from nltk.corpus import stopwords
stop_french = stopwords.words('french')
print(stop_french)

['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aur

In [131]:
dataset['new_column_French1'] = dataset['new_column_French1'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop_french]))
dataset.head(50)

Unnamed: 0,new_column_English,new_column_French1
0,go,va
1,run,cours
2,run,courez
3,wow,ca alors
4,fire,feu
5,help,laide
6,jump,saute
7,stop,ca suffit
8,stop,stop
9,stop,arretetoi


## Converting dataframe column into a separate text file

In [133]:
for c in dataset.columns:
    dataset[c].to_csv(c + '.txt', index=False)

  


## Word Vector for English

In [136]:
import nltk
#nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
  
warnings.filterwarnings(action = 'ignore') 
  
import gensim 
from gensim.models import Word2Vec 
  
#  Reads ‘new_column_English.txt’ file 
sample = open("/home/dai/Desktop/Projectnew/new_column_English.txt", "r") 
s = sample.read() 
    
# Replaces escape character with space 
#f = s.replace("\n", " ") 
  

In [174]:
data = [] 
  
# iterate through each sentence in the file 
for i in sent_tokenize(s): 
    temp = [] 
      
    # tokenize the sentence into words 
    for j in word_tokenize(i): 
        temp.append(j.lower()) 
  
    data.append(temp) 
#print(data)
#Create CBOW model 
model_eng = gensim.models.Word2Vec(data, min_count = 5,  
                              size = 300, window = 5) 

In [None]:
model_eng.wv.save_word2vec_format('model_eng.txt', binary=False)

## Word Vector for French

In [134]:
sample_french = open("/home/dai/Desktop/Projectnew/new_column_French1.txt", "r") 
s_french = sample_french.read() 

In [137]:
data1 = [] 
  
# iterate through each sentence in the file 
for i in sent_tokenize(s_french): 
    temp1 = [] 
      
    # tokenize the sentence into words 
    for j in word_tokenize(i): 
        temp1.append(j.lower()) 
  
    data1.append(temp1) 

#print(data1[0])
#Create CBOW model 
model_fr = gensim.models.Word2Vec(data1, min_count = 5,  
                              size = 300, window = 5) 

In [178]:
model_fr.wv.save_word2vec_format('model_fr.txt', binary=False)

### Load the word-vector file and create a dictionary of words and their ID

In [4]:
import io
import numpy as np

In [5]:
def load_vec(emb_path, nmax=7000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        
        ## Creating dictionary 
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            #print("word:",word)
            #print(type(vect))

            vect = np.fromstring(vect, sep=' ')
            #print((vect))
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            #print(vectors)
            word2id[word] = len(word2id)
            #print(len(word2id))
            #print(word2id[word])
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    
    ##Creating a stack of vectors of words
    embeddings = np.vstack(vectors)
    #print(type(embeddings))
    return embeddings, id2word, word2id

In [6]:
src_path = '/home/dai/Desktop/Projectnew/wiki.en.align.vec'         ### Used aligned vectors
tgt_path = '/home/dai/Desktop/Projectnew/wiki.fr.align.vec'
nmax = 7000 # maximum number of word embeddings to load
#vectors_src
#vectors_tgt

src_embeddings, src_id2word, src_word2id= load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id= load_vec(tgt_path, nmax)

#print(vectors_src)

In [33]:
#vector_src_list=list(vectors_src)
#vector_str=' '.join(vector_src_list)
#vector_str
#print(vector_src_list)

In [59]:
#with open("/home/dai/Desktop/Projectnew/vec_eng.txt", "w") as output:
    #output.write(str(vector_src_list))

## To calculate nearest distance

In [7]:
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    #print("Nearest neighbors of \"%s\":" % word)
    #print(tgt_emb.shape)
    
    word2id = {v: k for k, v in src_id2word.items()}
    
    #if k in word2id:
       
    
    word_emb = src_emb[word2id[word]]
    #print(word_emb.shape)
    
    ## Calculating distance between words
    scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
    #scores = (tgt_emb ).dot(word_emb)
    #print(scores.shape)
    
    
    k_best = scores.argsort()[-K:][::-1]  ## returns index of that score
   # print(k_best)
    
    for i, idx in enumerate(k_best):
        #print(tgt_id2word[idx])
        print('%.4f - %s' % (scores[idx], tgt_id2word[idx]))
        
    #else:
        #pass
    #return tgt_id2word[k_best]

In [9]:
# printing nearest neighbors in the target space
src_word = 'stop'
get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)



Nearest neighbors of "stop":
0.4140 - arrêter
0.3325 - empêcher
0.3168 - continuer
0.3155 - bloquer
0.3035 - train


## creating a dictionary of word and its translation

In [44]:
dictionary={}
for k, v in src_word2id.items():
    word=k
    dictionary[word]=get_nn(word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=1)
    

Nearest neighbors of ",":
0.4591 - ,
Nearest neighbors of ".":
0.5293 - .
Nearest neighbors of "the":
0.4440 - la
Nearest neighbors of "</s>":
0.1880 - </s>
Nearest neighbors of "of":
0.4551 - de
Nearest neighbors of "-":
0.5552 - -
Nearest neighbors of "in":
0.4101 - dans
Nearest neighbors of "and":
0.5491 - et
Nearest neighbors of "'":
0.3636 - '
Nearest neighbors of ")":
0.5530 - )
Nearest neighbors of "(":
0.5674 - (
Nearest neighbors of "to":
0.3183 - à
Nearest neighbors of "a":
0.4401 - une
Nearest neighbors of "is":
0.5679 - est
Nearest neighbors of "was":
0.4915 - fut
Nearest neighbors of "on":
0.5049 - sur
Nearest neighbors of "s":
0.2107 - son
Nearest neighbors of "for":
0.4676 - pour
Nearest neighbors of "as":
0.3801 - comme
Nearest neighbors of "by":
0.6198 - par
Nearest neighbors of "that":
0.3700 - que
Nearest neighbors of "it":
0.3056 - elle
Nearest neighbors of "with":
0.5081 - avec
Nearest neighbors of "from":
0.3150 - provenant
Nearest neighbors of "at":
0.2966 - où
N

0.4634 - articles
Nearest neighbors of "john":
0.4917 - john
Nearest neighbors of "same":
0.3684 - identique
Nearest neighbors of "including":
0.4426 - notamment
Nearest neighbors of "could":
0.3677 - pouvait
Nearest neighbors of "english":
0.5238 - anglais
Nearest neighbors of "album":
0.5298 - album
Nearest neighbors of "number":
0.4789 - nombre
Nearest neighbors of "against":
0.5558 - contre
Nearest neighbors of "family":
0.6758 - famille
Nearest neighbors of "user":
0.3333 - utilisateur
Nearest neighbors of "based":
0.5817 - basée
Nearest neighbors of "area":
0.4045 - zone
Nearest neighbors of "became":
0.5127 - devint
Nearest neighbors of "york":
0.5441 - york
Nearest neighbors of "b":
0.5683 - b
Nearest neighbors of "life":
0.5398 - vie
Nearest neighbors of "me":
0.4493 - me
Nearest neighbors of "british":
0.6218 - britannique
Nearest neighbors of "international":
0.6470 - international
Nearest neighbors of "game":
0.5182 - jeu
Nearest neighbors of """:
0.4222 - "
Nearest neighbo

0.4857 - blanc
Nearest neighbors of "along":
0.2580 - côté
Nearest neighbors of "five":
0.5962 - cinq
Nearest neighbors of "central":
0.5450 - central
Nearest neighbors of "road":
0.4923 - route
Nearest neighbors of "children":
0.6464 - enfants
Nearest neighbors of "free":
0.5028 - libre
Nearest neighbors of "took":
0.3296 - pris
Nearest neighbors of "england":
0.5276 - angleterre
Nearest neighbors of "include":
0.4758 - citer
Nearest neighbors of "association":
0.5584 - association
Nearest neighbors of "down":
0.2153 - down
Nearest neighbors of "j":
0.3978 - j
Nearest neighbors of "given":
0.4119 - donné
Nearest neighbors of "source":
0.4322 - sources
Nearest neighbors of "x":
0.6423 - x
Nearest neighbors of "california":
0.5620 - californie
Nearest neighbors of "man":
0.3988 - man
Nearest neighbors of "version":
0.6046 - version
Nearest neighbors of "written":
0.5571 - écrit
Nearest neighbors of "created":
0.6135 - créé
Nearest neighbors of "media":
0.4263 - médias
Nearest neighbors 

0.3795 - acte
Nearest neighbors of "editor":
0.4499 - rédacteur
Nearest neighbors of "came":
0.2506 - arrivé
Nearest neighbors of "schools":
0.5696 - écoles
Nearest neighbors of "program":
0.5546 - programme
Nearest neighbors of "once":
0.2992 - fois
Nearest neighbors of "issue":
0.2784 - question
Nearest neighbors of "social":
0.4889 - social
Nearest neighbors of "germany":
0.5937 - allemagne
Nearest neighbors of "production":
0.6391 - production
Nearest neighbors of "male":
0.3140 - masculin
Nearest neighbors of "might":
0.3863 - pourrait
Nearest neighbors of "awards":
0.5254 - récompenses
Nearest neighbors of "points":
0.5898 - points
Nearest neighbors of "similar":
0.5413 - similaire
Nearest neighbors of "professional":
0.5832 - professionnel
Nearest neighbors of "say":
0.2999 - dire
Nearest neighbors of "background":
0.3073 - background
Nearest neighbors of "enough":
0.4317 - suffisamment
Nearest neighbors of "lead":
0.2227 - batterie
Nearest neighbors of "either":
0.3759 - ou
Nea

0.5672 - réseau
Nearest neighbors of "win":
0.5447 - remporter
Nearest neighbors of "shows":
0.3397 - montrant
Nearest neighbors of "wife":
0.5194 - épouse
Nearest neighbors of "returned":
0.4068 - retourne
Nearest neighbors of "night":
0.5518 - nuit
Nearest neighbors of "magazine":
0.5668 - magazine
Nearest neighbors of "centre":
0.5567 - centre
Nearest neighbors of "joined":
0.5613 - rejoint
Nearest neighbors of "usually":
0.5280 - généralement
Nearest neighbors of "middle":
0.2519 - moyen
Nearest neighbors of "completed":
0.5069 - achevé
Nearest neighbors of "elected":
0.5271 - élu
Nearest neighbors of "significant":
0.4299 - important
Nearest neighbors of "african":
0.5789 - africaine
Nearest neighbors of "able":
0.3948 - capable
Nearest neighbors of "google":
0.4988 - google
Nearest neighbors of "stage":
0.3840 - étapes
Nearest neighbors of "addition":
0.3760 - également
Nearest neighbors of "ireland":
0.6058 - irlande
Nearest neighbors of "today":
0.5036 - aujourd
Nearest neighbo

0.5991 - conçu
Nearest neighbors of "rule":
0.2450 - domination
Nearest neighbors of "etc":
0.5558 - etc
Nearest neighbors of "lists":
0.5474 - liste
Nearest neighbors of "paris":
0.5284 - paris
Nearest neighbors of "thought":
0.3299 - penser
Nearest neighbors of "brown":
0.3965 - brown
Nearest neighbors of "hand":
0.4289 - main
Nearest neighbors of "needs":
0.2843 - devrait
Nearest neighbors of "reliable":
0.3487 - sources
Nearest neighbors of "smith":
0.4769 - smith
Nearest neighbors of "generally":
0.4932 - généralement
Nearest neighbors of "base":
0.4773 - base
Nearest neighbors of "sometimes":
0.5295 - parfois
Nearest neighbors of "florida":
0.5221 - floride
Nearest neighbors of "capital":
0.4524 - capitale
Nearest neighbors of "valley":
0.5379 - vallée
Nearest neighbors of "bank":
0.5247 - banque
Nearest neighbors of "gave":
0.3367 - donne
Nearest neighbors of "ground":
0.2904 - terrain
Nearest neighbors of "reached":
0.5187 - atteint
Nearest neighbors of "italy":
0.5850 - italie

0.3358 - après
Nearest neighbors of "seasons":
0.5289 - saisons
Nearest neighbors of "journal":
0.4849 - revue
Nearest neighbors of "beginning":
0.4277 - début
Nearest neighbors of "software":
0.4873 - logiciel
Nearest neighbors of "famous":
0.5775 - célèbre
Nearest neighbors of "religious":
0.4997 - religieux
Nearest neighbors of "appear":
0.4489 - apparaissent
Nearest neighbors of "martin":
0.5229 - martin
Nearest neighbors of "el":
0.6402 - el
Nearest neighbors of "god":
0.5279 - dieu
Nearest neighbors of "bit":
0.2298 - peu
Nearest neighbors of "hours":
0.5851 - heures
Nearest neighbors of "running":
0.2426 - course
Nearest neighbors of "brought":
0.3121 - amené
Nearest neighbors of "missing":
0.2822 - disparu
Nearest neighbors of "economic":
0.5335 - économique
Nearest neighbors of "structure":
0.5296 - structure
Nearest neighbors of "rural":
0.4389 - rural
Nearest neighbors of "remained":
0.4327 - resta
Nearest neighbors of "decision":
0.5280 - décision
Nearest neighbors of "cert

0.6503 - mission
Nearest neighbors of "wp":
0.2732 - wp
Nearest neighbors of "lived":
0.5299 - vécu
Nearest neighbors of "claim":
0.2918 - prouver
Nearest neighbors of "seat":
0.4264 - sièges
Nearest neighbors of "bbc":
0.5549 - bbc
Nearest neighbors of "profile":
0.5168 - profil
Nearest neighbors of "dance":
0.5710 - danse
Nearest neighbors of "prize":
0.5774 - prix
Nearest neighbors of "doing":
0.2772 - faire
Nearest neighbors of "georgia":
0.5206 - géorgie
Nearest neighbors of "port":
0.5744 - port
Nearest neighbors of "pacific":
0.4938 - pacifique
Nearest neighbors of "castle":
0.5253 - château
Nearest neighbors of "pass":
0.3499 - passe
Nearest neighbors of "transport":
0.5583 - transport
Nearest neighbors of "organizations":
0.5068 - organisations
Nearest neighbors of "ratio":
0.2886 - pourcentage
Nearest neighbors of "recently":
0.6058 - récemment
Nearest neighbors of "fall":
0.4013 - chute
Nearest neighbors of "global":
0.4369 - global
Nearest neighbors of "era":
0.4166 - époqu

0.3911 - lourd
Nearest neighbors of "alexander":
0.4939 - alexander
Nearest neighbors of "alone":
0.2082 - seul
Nearest neighbors of "understand":
0.4281 - comprendre
Nearest neighbors of "episodes":
0.5835 - épisodes
Nearest neighbors of "gives":
0.4919 - donne
Nearest neighbors of "educational":
0.3605 - pédagogique
Nearest neighbors of "daily":
0.5462 - quotidien
Nearest neighbors of "williams":
0.4807 - williams
Nearest neighbors of "latin":
0.5637 - latine
Nearest neighbors of "completely":
0.5600 - totalement
Nearest neighbors of "products":
0.5069 - produits
Nearest neighbors of "dark":
0.4775 - sombre
Nearest neighbors of "attention":
0.3722 - attention
Nearest neighbors of "religion":
0.5637 - religion
Nearest neighbors of "referred":
0.4966 - appelé
Nearest neighbors of "von":
0.5592 - von
Nearest neighbors of "mind":
0.3666 - conscience
Nearest neighbors of "oppose":
0.1926 - voter
Nearest neighbors of "corps":
0.3317 - armée
Nearest neighbors of "administrative":
0.5610 - a

0.4849 - mur
Nearest neighbors of "immediately":
0.5161 - immédiatement
Nearest neighbors of "urban":
0.4808 - urbain
Nearest neighbors of "pakistan":
0.5484 - pakistan
Nearest neighbors of "becomes":
0.4802 - devient
Nearest neighbors of "marine":
0.4798 - marine
Nearest neighbors of "physical":
0.5149 - physiques
Nearest neighbors of "dec":
0.3831 - oct
Nearest neighbors of "troops":
0.5253 - troupes
Nearest neighbors of "interview":
0.6018 - interview
Nearest neighbors of "coming":
0.2840 - venue
Nearest neighbors of "semi":
0.4755 - semi
Nearest neighbors of "suggest":
0.3954 - suggère
Nearest neighbors of "emperor":
0.5641 - empereur
Nearest neighbors of "letter":
0.6500 - lettre
Nearest neighbors of "couple":
0.3028 - couple
Nearest neighbors of "fellow":
0.3409 - membre
Nearest neighbors of "duke":
0.4947 - duc
Nearest neighbors of "tell":
0.3024 - croire
Nearest neighbors of "gallery":
0.5709 - galerie
Nearest neighbors of "follow":
0.4800 - suivre
Nearest neighbors of "windows

0.5505 - cambridge
Nearest neighbors of "utc_offset":
0.3127 - tmax
Nearest neighbors of "ones":
0.2784 - ceux
Nearest neighbors of "composer":
0.5841 - compositeur
Nearest neighbors of "remove":
0.3980 - retirer
Nearest neighbors of ">u":
0.2888 - //www
Nearest neighbors of "agency":
0.4808 - agence
Nearest neighbors of "reserve":
0.5895 - réserve
Nearest neighbors of "atlantic":
0.4963 - atlantique
Nearest neighbors of "supreme":
0.5371 - suprême
Nearest neighbors of "weight":
0.4786 - poids
Nearest neighbors of "pp":
0.5839 - pp
Nearest neighbors of "ask":
0.4336 - demander
Nearest neighbors of "fighting":
0.5033 - combat
Nearest neighbors of "jackson":
0.4621 - jackson
Nearest neighbors of "widely":
0.4251 - largement
Nearest neighbors of "rose":
0.3980 - rose
Nearest neighbors of "operating":
0.2614 - opérations
Nearest neighbors of "treatment":
0.4395 - traitement
Nearest neighbors of "linked":
0.3526 - liée
Nearest neighbors of "andrew":
0.4190 - andrew
Nearest neighbors of "tri

0.3950 - caractéristique
Nearest neighbors of "defined":
0.5540 - défini
Nearest neighbors of "ocean":
0.5518 - océan
Nearest neighbors of "cell":
0.4908 - cellule
Nearest neighbors of "missouri":
0.4604 - missouri
Nearest neighbors of "concert":
0.6138 - concert
Nearest neighbors of "improve":
0.5656 - améliorer
Nearest neighbors of "biography":
0.5618 - biographie
Nearest neighbors of "loan":
0.4474 - prêt
Nearest neighbors of "shortly":
0.4136 - après
Nearest neighbors of "contact":
0.5986 - contact
Nearest neighbors of "holy":
0.3301 - saints
Nearest neighbors of "tennessee":
0.3541 - missouri
Nearest neighbors of "sub":
0.2569 - catégories
Nearest neighbors of "safety":
0.4697 - sécurité
Nearest neighbors of "competed":
0.3972 - remporté
Nearest neighbors of "stephen":
0.4158 - stephen
Nearest neighbors of "policies":
0.2898 - réformes
Nearest neighbors of "painting":
0.5518 - peinture
Nearest neighbors of "price":
0.3247 - achat
Nearest neighbors of "entirely":
0.4339 - entièreme

0.2541 - pattern_b
Nearest neighbors of "louisiana":
0.4672 - louisiane
Nearest neighbors of "lewis":
0.4747 - lewis
Nearest neighbors of "melbourne":
0.5127 - melbourne
Nearest neighbors of "austria":
0.5994 - autriche
Nearest neighbors of "brigade":
0.4911 - brigade
Nearest neighbors of "screen":
0.5389 - écran
Nearest neighbors of "risk":
0.5231 - risque
Nearest neighbors of "conducted":
0.4108 - menées
Nearest neighbors of "lats":
0.3064 - autres_projets
Nearest neighbors of "ban":
0.4036 - interdiction
Nearest neighbors of "da":
0.6073 - da
Nearest neighbors of "labor":
0.3399 - travailleurs
Nearest neighbors of "legislative":
0.4755 - législative
Nearest neighbors of "definition":
0.5027 - définition
Nearest neighbors of "indeed":
0.3007 - effectivement
Nearest neighbors of "#fefefe":
0.3579 - #aaa
Nearest neighbors of "draw":
0.3013 - match
Nearest neighbors of "application":
0.4403 - application
Nearest neighbors of "un":
0.4159 - onu
Nearest neighbors of "steel":
0.5237 - acie

0.2932 - angle
Nearest neighbors of "jesus":
0.5354 - jésus
Nearest neighbors of "tools":
0.5707 - outils
Nearest neighbors of "colonel":
0.5603 - colonel
Nearest neighbors of "weak":
0.3245 - faibles
Nearest neighbors of "chosen":
0.6153 - choisi
Nearest neighbors of "brand":
0.4321 - marque
Nearest neighbors of "resulting":
0.4022 - provoque
Nearest neighbors of "nfl":
0.4693 - nfl
Nearest neighbors of "rise":
0.2797 - chute
Nearest neighbors of "supply":
0.3860 - alimentation
Nearest neighbors of "tradition":
0.5922 - tradition
Nearest neighbors of "elementary":
0.3894 - primaire
Nearest neighbors of "household":
0.1971 - ménage
Nearest neighbors of "spirit":
0.4464 - esprit
Nearest neighbors of "task":
0.4051 - tâche
Nearest neighbors of "slightly":
0.5729 - légèrement
Nearest neighbors of "howard":
0.4267 - howard
Nearest neighbors of "incident":
0.5637 - incident
Nearest neighbors of "develop":
0.5793 - développer
Nearest neighbors of "southeast":
0.4162 - ouest
Nearest neighbors

Nearest neighbors of "apply":
0.4008 - appliquer
Nearest neighbors of "actresses":
0.3559 - actrice
Nearest neighbors of "competitions":
0.5196 - compétitions
Nearest neighbors of "aid":
0.4804 - aide
Nearest neighbors of "driver":
0.4826 - pilote
Nearest neighbors of "folk":
0.3664 - traditionnels
Nearest neighbors of "dan":
0.5184 - dan
Nearest neighbors of "khan":
0.5364 - khan
Nearest neighbors of "}}}":
0.3762 - }}}
Nearest neighbors of "baby":
0.4896 - bébé
Nearest neighbors of "denmark":
0.5989 - danemark
Nearest neighbors of "tokyo":
0.5966 - tokyo
Nearest neighbors of "billboard":
0.3377 - singles
Nearest neighbors of "calling":
0.2533 - disant
Nearest neighbors of "anne":
0.4555 - anne
Nearest neighbors of "happened":
0.2656 - eu
Nearest neighbors of "danish":
0.6036 - danois
Nearest neighbors of "wants":
0.4358 - souhaite
Nearest neighbors of "formula":
0.6111 - formule
Nearest neighbors of "interior":
0.5406 - intérieur
Nearest neighbors of "kevin":
0.4568 - kevin
Nearest n

0.2530 - fr/sites/default/files/rgc
Nearest neighbors of "clubnat":
0.3445 - bgcolor
Nearest neighbors of "remember":
0.3045 - rappeler
Nearest neighbors of "miami":
0.5032 - miami
Nearest neighbors of "promote":
0.5930 - promouvoir
Nearest neighbors of "values":
0.5677 - valeurs
Nearest neighbors of "spot":
0.2849 - place
Nearest neighbors of "progress":
0.3806 - progrès
Nearest neighbors of "nationalteam":
0.2630 - yyy
Nearest neighbors of "learn":
0.5683 - apprendre
Nearest neighbors of "planet":
0.6153 - planète
Nearest neighbors of "oh":
0.5381 - oh
Nearest neighbors of "or%":
0.2508 - ffffff
Nearest neighbors of "occupied":
0.4796 - occupée
Nearest neighbors of "usage":
0.4660 - usage
Nearest neighbors of "southwest":
0.4535 - ouest
Nearest neighbors of "refused":
0.5141 - refusé
Nearest neighbors of "borough":
0.2453 - circonscription
Nearest neighbors of "truth":
0.5492 - vérité
Nearest neighbors of "clark":
0.4577 - clark
Nearest neighbors of "sufficient":
0.3994 - nécessaire


0.5343 - controverse
Nearest neighbors of "reverted":
0.3195 - supprimé
Nearest neighbors of "expressed":
0.4552 - exprime
Nearest neighbors of "josé":
0.4476 - josé
Nearest neighbors of "bodies":
0.3408 - organes
Nearest neighbors of "conservation":
0.4970 - conservation
Nearest neighbors of "maps":
0.3633 - carte
Nearest neighbors of "ahead":
0.3605 - derrière
Nearest neighbors of "marie":
0.4697 - marie
Nearest neighbors of "arguments":
0.4689 - arguments
Nearest neighbors of "chain":
0.3814 - chaîne
Nearest neighbors of "focused":
0.2605 - essentiellement
Nearest neighbors of "readers":
0.5344 - lecteurs
Nearest neighbors of "carl":
0.5628 - carl
Nearest neighbors of "cm":
0.4713 - cm
Nearest neighbors of "violation":
0.2541 - interdiction
Nearest neighbors of "offices":
0.5443 - bureaux
Nearest neighbors of "wave":
0.4172 - vague
Nearest neighbors of "circle":
0.4953 - cercle
Nearest neighbors of "apart":
0.2328 - quelques
Nearest neighbors of "invasion":
0.6142 - invasion
Nearest

0.5145 - soir
Nearest neighbors of "singing":
0.3908 - chante
Nearest neighbors of "fifa":
0.5479 - fifa
Nearest neighbors of "gender":
0.3605 - sexe
Nearest neighbors of "venues":
0.3454 - concerts
Nearest neighbors of "lakes":
0.5391 - lacs
Nearest neighbors of "mail":
0.4764 - courrier
Nearest neighbors of "jeff":
0.3909 - jeff
Nearest neighbors of "electoral":
0.5277 - électorale
Nearest neighbors of "emergency":
0.5703 - urgence
Nearest neighbors of "mode":
0.4126 - mode
Nearest neighbors of "christopher":
0.4370 - christopher
Nearest neighbors of "heads":
0.2956 - têtes
Nearest neighbors of "proved":
0.2855 - avère
Nearest neighbors of "priest":
0.6089 - prêtre
Nearest neighbors of "funds":
0.4329 - financement
Nearest neighbors of "investment":
0.5521 - investissement
Nearest neighbors of "romanian":
0.5778 - roumain
Nearest neighbors of "session":
0.5954 - session
Nearest neighbors of "capture":
0.5447 - capture
Nearest neighbors of "aspects":
0.5433 - aspects
Nearest neighbors

0.5857 - bible
Nearest neighbors of "matthew":
0.3820 - matthew
Nearest neighbors of "depending":
0.3893 - varie
Nearest neighbors of "serbian":
0.5255 - serbe
Nearest neighbors of "instrument":
0.6295 - instrument
Nearest neighbors of "covering":
0.4742 - couvre
Nearest neighbors of "random":
0.3101 - hasard
Nearest neighbors of "represents":
0.5724 - représente
Nearest neighbors of "participants":
0.5635 - participants
Nearest neighbors of "thorough":
0.1630 - complet
Nearest neighbors of "mentions":
0.4738 - mentionne
Nearest neighbors of "portrait":
0.5940 - portrait
Nearest neighbors of "drivers":
0.5047 - pilotes
Nearest neighbors of "airlines":
0.5450 - airlines
Nearest neighbors of "franklin":
0.5086 - franklin
Nearest neighbors of "viewers":
0.3521 - audience
Nearest neighbors of "finnish":
0.6146 - finlandais
Nearest neighbors of "differences":
0.5692 - différences
Nearest neighbors of "venue":
0.3132 - salle
Nearest neighbors of "vocal":
0.3603 - chant
Nearest neighbors of "

0.2982 - accusé
Nearest neighbors of "filled":
0.2846 - constitué
Nearest neighbors of "nba":
0.5593 - nba
Nearest neighbors of "decide":
0.3568 - décident
Nearest neighbors of "breaking":
0.2433 - rupture
Nearest neighbors of "argentine":
0.5249 - argentin
Nearest neighbors of "resigned":
0.5200 - démissionne
Nearest neighbors of "oblast":
0.4194 - oblast
Nearest neighbors of "handed":
0.2121 - bâton
Nearest neighbors of "drew":
0.1774 - dessins
Nearest neighbors of "hawaii":
0.2943 - californie
Nearest neighbors of "brooklyn":
0.5131 - brooklyn
Nearest neighbors of "whilst":
0.2797 - tandis
Nearest neighbors of "historians":
0.5495 - historiens
Nearest neighbors of "pa":
0.3980 - pa
Nearest neighbors of "speaker":
0.2670 - président
Nearest neighbors of "moth":
0.2322 - aranéomorphes
Nearest neighbors of "⚡":
0.1299 - déposé
Nearest neighbors of "permission":
0.5901 - autorisation
Nearest neighbors of "wounded":
0.4625 - blessé
Nearest neighbors of "racial":
0.1964 - population
Neare

KeyboardInterrupt: 

## Manually Aligning the source and target vectors ##

In [242]:

from fasttext import FastVector   ## fasttext is a Python file 
en_dictionary = FastVector(vector_file='/home/dai/Desktop/Projectnew/model_eng.txt')  
fr_dictionary = FastVector(vector_file='/home/dai/Desktop/Projectnew/model_fr.txt')


reading word vectors from /home/dai/Desktop/Projectnew/model_eng.txt
reading word vectors from /home/dai/Desktop/Projectnew/model_fr.txt


In [246]:
en_vector = en_dictionary["awesome"]
fr_vector = fr_dictionary["fantastique"]
print(FastVector.cosine_similarity(en_vector, fr_vector))

0.032314497346109486


In [244]:
##Using a created matrix(stored in given path) to align source and target vectors

en_dictionary.apply_transform('/home/dai/Desktop/Projectnew/alignment_matrices/en.txt')
fr_dictionary.apply_transform('/home/dai/Desktop/Projectnew/alignment_matrices/fr.txt')

In [None]:
print(FastVector.cosine_similarity(en_dictionary["attack"], fr_dictionary["attaque"]))

**Other Method (Creating our own matrix to align)**

In [None]:
en_vector = en_dictionary["awesome"]
fr_vector = fr_dictionary["fantastique"]
print(FastVector.cosine_similarity(en_vector, fr_vector))   ##Might not give the req similarity

In [214]:
import numpy as np
import fasttext 

# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a,axis=-1, order=2):
    """Utility function to normalize the rows of a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order,axis))
    l2[l2==0] = 1
    return a / np.expand_dims(l2,axis)

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Source and target dictionaries are the FastVector objects of
    source/target languages. bilingual_dictionary is a list of 
    translation pair tuples [(source_word, target_word), ...].
    """
    source_matrix = []
    target_matrix = []

    for (source, target) in bilingual_dictionary:
        if source in source_dictionary and target in target_dictionary:
            source_matrix.append(source_dictionary[source])
            target_matrix.append(target_dictionary[target])
    
    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Source and target matrices are numpy arrays, shape
    (dictionary_length, embedding_dimension). These contain paired
    word vectors from the bilingual dictionary.
    """
    # optionally normalize the training vectors
    if normalize_vectors:
        source_matrix = normalized(source_matrix)
        target_matrix = normalized(target_matrix)

    # perform the SVD
    product = np.matmul(source_matrix.transpose(), target_matrix)
    U, s, V = np.linalg.svd(product)

    # return orthogonal transformation which aligns source language to the target
    return np.matmul(U, V)

In [1]:
### Creating a dictionary of same words of source and target vocabulary

en_words = set(en_dictionary.word2id.keys())
fr_words = set(fr_dictionary.word2id.keys())
overlap = list(en_words & fr_words)
bilingual_dictionary = [(entry, entry) for entry in overlap]
#print(bilingual_dictionary)

NameError: name 'en_dictionary' is not defined

In [240]:
# form the training matrices
source_matrix, target_matrix = make_training_matrices(
    en_dictionary, fr_dictionary, bilingual_dictionary)

# learn and apply the transformation
transform = learn_transformation(source_matrix, target_matrix)
fr_dictionary.apply_transform(transform)


**Checking again after alignment**

In [245]:
print(FastVector.cosine_similarity(en_dictionary["attack"], fr_dictionary["attaque"]))

0.050109074119420755
