## Environment setup

In [None]:
#Install packages. WARNING: if you run this on windows platform install 'pytorch' 
#!pip install nltk==3.7 pandas scipy==1.10.1 fasttext==0.9.2 gensim==4.3.2 scikit-learn==1.5.0 torch --quiet

We need Facebook's MUSE library for embedding alignment. It can only be obtained by cloning the GitHub repository into the project.
Go to the project folder and run the following command in the console:

    git clone https://github.com/facebookresearch/MUSE

After you have cloned it, open the MUSE/src/utils.py file and change 'fastText' to 'fasttext' in lines 76 and 80 (i.e. change the capital T to a lowercase t); otherwise the code will crash. Then run the MUSE script to align the embeddings from the previous step (CAUTION: on Windows, replace slashes with backslashes in the path specifications):

In [None]:
# Clone the Facebook MUSE repository into the current directory:
#!git clone https://github.com/facebookresearch/MUSE

#!sed -i 's/import fastText/import fasttext/' MUSE/src/utils.py
#!sed -i 's/return fastText\.load_model/return fasttext.load_model/' MUSE/src/utils.py

# Library Imports

In [None]:
#import fasttext
import nltk

# Dataset Generation

In [None]:
#Create datasets for embedding training and dictionaries for embedding alignment
def get_datasets():
    """Split the merged corpus by domain and build the alignment dictionaries.

    Reads 'input/C_merged_PD_in_CR_trunc_clean.txt'. Every non-blank line is
    treated as '<id> <tag> <text ...>' (whitespace separated). The text part is
    tokenized with nltk.wordpunct_tokenize and lower-cased, then:

    * lines tagged '!CR' are written to 'input/cr.txt',
    * lines tagged '!PD' are written to 'input/pd.txt',
    * lines with any other tag are ignored.

    Words that occur at least once in '!CR' lines and more than twice in
    '!PD' lines are ranked by their '!PD' frequency (descending). The top 5000
    are written as identity pairs 'word<TAB>word': every third one (ranks
    0, 3, 6, ...) to 'input/en_en_dict_test.txt', the others to
    'input/en_en_dict_train.txt'.

    Raises
    ------
    IndexError
        If a non-blank line contains fewer than two whitespace-separated fields.
    """
    vocab = {}  # word -> [occurrences in !CR lines, occurrences in !PD lines]

    # Context managers close all three files even when a line fails to parse.
    with open('input/pd.txt', 'w', encoding="utf8") as pd_file, \
            open('input/cr.txt', 'w', encoding="utf8") as cr_file, \
            open('input/C_merged_PD_in_CR_trunc_clean.txt', 'r', encoding='utf8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line
            c = fields[1]
            text = " ".join(fields[2:]).strip()
            text = " ".join(nltk.wordpunct_tokenize(text)).lower()

            if c == "!CR":
                out_file, column = cr_file, 0
            elif c == "!PD":
                out_file, column = pd_file, 1
            else:
                continue  # unknown tag: contributes to neither corpus
            out_file.write(text + '\n')
            for w in text.split():
                vocab.setdefault(w, [0, 0])[column] += 1

    # Keep only words shared by both domains (and frequent enough in PD).
    words = [(word, freq[0], freq[1])
             for word, freq in vocab.items()
             if freq[0] > 0 and freq[1] > 2]
    words.sort(reverse=True, key=lambda x: x[-1])

    with open('input/en_en_dict_train.txt', 'w', encoding='utf8') as train, \
            open('input/en_en_dict_test.txt', 'w', encoding='utf8') as test:
        for counter, (w, _cr_freq, _pd_freq) in enumerate(words[:5000]):
            # Every third word goes to the evaluation dictionary.
            target = test if counter % 3 == 0 else train
            target.write(w + '\t' + w + '\n')


# Write the per-domain corpora and the train/test alignment dictionaries under input/.
get_datasets()

# Embedding Learning

In [None]:
def make_embeddings(input, output):
    """Train skip-gram fastText embeddings on a tokenized, lower-cased corpus.

    The corpus at `input` is tokenized with nltk.wordpunct_tokenize,
    lower-cased and written next to the original file as
    '<stem>_preprocessed<ext>'. A fastText skip-gram model (min_count=6) is
    then trained on that file and saved to `output + ".bin"`.

    Parameters
    ----------
    input : str
        Path of the raw UTF-8 text corpus.
    output : str
        Path prefix of the saved model; '.bin' is appended.
    """
    import os
    # Imported here because the notebook's top-level `import fasttext` is
    # commented out; without it the call below raises NameError.
    import fasttext

    with open(input, "r", encoding="utf8") as f:
        text = " ".join(nltk.wordpunct_tokenize(f.read())).lower()

    # splitext only separates the final extension, so paths that contain other
    # dots ('./input/pd.txt', 'corpus.v2.txt') or no extension at all still
    # yield a sensible sibling file name.
    stem, ext = os.path.splitext(input)
    filename = stem + "_preprocessed" + ext
    with open(filename, "w", encoding="utf8") as f:
        f.write(text)

    model = fasttext.train_unsupervised(filename, min_count=6, model='skipgram')
    model.save_model(output + ".bin")

#make_embeddings('input/pd.txt', 'embeddings/pd')
#make_embeddings('input/cr.txt', 'embeddings/cr')

# Embedding Alignment

In [None]:
#!python MUSE/supervised.py --src_lang "cr" --tgt_lang "pd" --emb_dim 100 --max_vocab -1 --n_refinement 20 --dico_train "datasets/en_en_dict_train.txt" --dico_eval "datasets/en_en_dict_test.txt" --src_emb  "embeddings/cr.bin" --tgt_emb  "embeddings/pd.bin" --cuda 0

The aligned embedding models appear in a randomly named subfolder of ./MUSE/dumped/debug/. Go to this folder and copy the files 'vectors-cr.txt' and 'vectors-pd.txt' into the folder 'embeddings'.

In [None]:
#!LATEST_FOLDER=$(find MUSE/dumped/debug -type d -print0 | xargs -0 ls -td | head -n 1) && \
#cp "$LATEST_FOLDER/vectors-cr.txt" "$LATEST_FOLDER/vectors-pd.txt" embeddings/ && \
#echo "Files copied to embeddings/" || echo "Error: Files not found in the latest folder."

In the last step, we use the aligned models to look for new relationships between genes in the plant defense domain, starting from the seed relationships of the circadian rhythm domain. For each seed relationship, we obtain the 10 closest plant defense relationships according to the cosine similarity of the gene embeddings.

In [None]:
import numpy as np
import io
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import nltk


def get_gene_list():
    """Return the set of lower-cased tokens found in plant-defense gene lines.

    Reads 'input/C_merged_PD_in_CR_trunc_clean_synonyms_B_genes_trunc.txt'.
    Every non-blank line is treated as '<id> <tag> <text ...>'; only lines
    whose tag equals '!pd' (compared case-insensitively) contribute. Their text
    is tokenized with nltk.wordpunct_tokenize and lower-cased.

    Returns
    -------
    set of str

    Raises
    ------
    IndexError
        If a non-blank line contains fewer than two whitespace-separated fields.
    """
    genes = set()
    with open("input/C_merged_PD_in_CR_trunc_clean_synonyms_B_genes_trunc.txt", "r", encoding='utf8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line
            # Case-insensitive match: get_datasets spells the same tag '!PD',
            # so accept either spelling instead of silently returning nothing.
            if fields[1].lower() != "!pd":
                continue
            text = " ".join(fields[2:]).strip()
            genes.update(token.lower() for token in nltk.wordpunct_tokenize(text))
    # The result is a set, so no frequency ordering is kept or needed.
    return genes


def load_fasttext(emb_path, nmax=1000000):
    """Load word vectors from a text embedding file.

    The first line of the file is skipped without being parsed. Every further
    non-blank line is read as '<word> <v1> <v2> ...' with the word separated
    from the numbers by a single space.

    Parameters
    ----------
    emb_path : str
        Path of the UTF-8 text file (undecodable bytes are ignored).
    nmax : int
        Stop after this many words have been loaded.

    Returns
    -------
    embeddings : numpy.ndarray
        One row per loaded word, in file order.
    id2word : dict
        Row index -> word.
    word2id : dict
        Word -> row index.

    Raises
    ------
    AssertionError
        If the same word appears twice.
    ValueError
        If the file contains no vectors, or a line has no vector part.
    """
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f, None)  # skip the header line; tolerate a completely empty file
        for line in f:
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            word, vect = line.split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    if not vectors:
        raise ValueError("no word vectors found in " + str(emb_path))
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

def get_emb(word, emb, word2id):
    """Return the element-wise mean embedding of the parts of `word`.

    `word` is split on whitespace; each part is lower-cased, looked up in
    `word2id` to find its row in `emb`, and the rows are averaged.
    A part missing from `word2id` raises KeyError.
    """
    part_vectors = [emb[word2id[part.lower()]].tolist() for part in word.split()]
    return np.average(np.array(part_vectors), axis=0)

def embeds_to_dict(emb, word2id):
    """Map every word in `word2id` to its row `emb[index]`."""
    return {token: emb[row] for token, row in word2id.items()}

def get_most_similar(word, word_emb, embeds, n = 10, word_list=None):
    """Return the `n` entries of `embeds` most cosine-similar to `word_emb`.

    Parameters
    ----------
    word : str
        Label of the query. Candidates whose key contains this label, or is
        contained in it (case-insensitive substring test), are skipped.
    word_emb : numpy.ndarray
        Query vector.
    embeds : dict
        Candidate key -> vector.
    n : int
        Maximum number of results.
    word_list : collection of str, optional
        If non-empty, only candidate keys present in it are returned.
        None or an empty collection disables this filter.

    Returns
    -------
    word_results : list of (key, score)
        Ordered by descending cosine similarity.
    emb_results : list
        The vectors of the returned keys, in the same order.
    """
    if len(embeds) == 0:
        # Nothing to compare against; cosine_similarity would reject an empty matrix.
        return [], []

    keys = list(embeds.keys())
    values = list(embeds.values())
    #print(np.array(values).shape)
    # Take row 0 instead of squeeze(): with a single candidate squeeze()
    # yields a 0-d array that cannot be indexed per candidate.
    cs = cosine_similarity(word_emb.reshape(1, -1), np.array(values))[0]
    neigh = sorted(((keys[i], cs[i]) for i in range(len(keys))),
                   key=lambda x: x[1], reverse=True)

    # A set makes the per-candidate membership test O(1) for list inputs.
    allowed = None if word_list is None or len(word_list) == 0 else set(word_list)
    query = word.lower()

    word_results = []
    emb_results = []
    for w, score in neigh:
        if len(word_results) >= n:
            break
        candidate = w.lower()
        if query in candidate or candidate in query:
            continue  # overlaps with the query label itself
        if allowed is not None and w not in allowed:
            continue
        word_results.append((w, score))
        emb_results.append(embeds[w])
    return word_results, emb_results


def get_all_relations(embeds, word2id):
    """Build a vector for every unordered pair of gene words.

    The words of `word2id` that also appear in get_gene_list() are kept in
    their `word2id` order. For each pair (earlier word, later word) the entry
    '<earlier>-<later>' is set to the sum of the two rows of `embeds`.
    Progress is printed every 1,000,000 pairs.

    Returns
    -------
    dict
        '<word_a>-<word_b>' -> summed embedding.
    """
    gene_vocab = get_gene_list()
    candidates = [token for token in word2id.keys() if token in gene_vocab]
    print("calculating all differences: ", len(candidates) * len(candidates))

    pair_vectors = {}
    processed = 0
    for pos, first in enumerate(candidates):
        first_vec = embeds[word2id[first]]
        for second in candidates[pos + 1:]:
            # Only one orientation of each pair is stored.
            pair_vectors[first + '-' + second] = first_vec + embeds[word2id[second]]
            processed += 1
            if processed % 1000000 == 0:
                print("processing diff: ", processed)
    print("Done")
    return pair_vectors




def get_same_relations_in_domain_2(embeds_1, word2id_1, relations_1, embeds_2, word2id_2):
    """Print, for each seed pair of domain 1, its 10 nearest pairs in domain 2.

    All candidate pairs of domain 2 come from get_all_relations(). Each seed
    pair (el1, el2) is represented by the sum of its two domain-1 embeddings
    and compared to the candidates with get_most_similar(). The ranked matches
    are printed as a tab-separated table; nothing is returned.
    """
    candidate_pairs = get_all_relations(embeds_2, word2id_2)

    for first, second in relations_1:
        first_vec = get_emb(first, embeds_1, word2id_1)
        second_vec = get_emb(second, embeds_1, word2id_1)
        seed_vec = first_vec + second_vec
        seed_label = first + '-' + second.lower()
        matches, _vectors = get_most_similar(seed_label, seed_vec, candidate_pairs, n=10, word_list=[])

        print("Circadian rhythm: ", first + " rel. " + second)
        print("Most similar plant defense rel:\n")
        print("rank\trelation\tcosine sim.")
        for rank, (pair, score) in enumerate(matches, start=1):
            shown = pair.replace('-', ' rel. ')
            print(str(rank) + '.' + '\t' + shown + "\t{:.4f}".format(score))
        print('------------------------------------------')
        print()


def get_analogy(word_1, embeds_1, word2id_1, genes_1, word_2, embeds_2, word2id_2):
    """Print, for each gene of domain 1, its 10 closest analogues in domain 2.

    For every gene in `genes_1` the query vector
    emb(word_1) + emb(gene) - emb(word_2) is formed (word_1 and gene from
    domain 1, word_2 from domain 2) and compared with the domain-2 words via
    get_most_similar(), restricted to the words returned by get_gene_list().
    The ranked matches are printed as a tab-separated table; nothing is
    returned.
    """
    emb_1 = get_emb(word_1, embeds_1, word2id_1)
    emb_2 = get_emb(word_2, embeds_2, word2id_2)
    # Separate name so the `embeds_2` matrix parameter is not rebound to a dict.
    embeds_2_by_word = embeds_to_dict(embeds_2, word2id_2)
    # Loop-invariant: get_gene_list() re-reads and re-tokenizes its input file
    # on every call, so load it once instead of once per gene.
    gene_list = get_gene_list()

    for gene in genes_1:
        emb_gene = get_emb(gene, embeds_1, word2id_1)
        emb_result = emb_1 + emb_gene - emb_2
        word_res, emb_res = get_most_similar(word_2.lower(), emb_result, embeds_2_by_word, n=10, word_list=gene_list)
        print("Circadian rhythm domain: ", word_1 + ' rel. ' + gene.lower())
        print("Most similar in plant defense domain:\n")
        print('rank\trelation\tcosine sim.')
        for idx, (w, score) in enumerate(word_res):
            w = 'plant defense rel. ' + w
            print(str(idx + 1) + '.' + '\t' + w + "\t{:.4f}".format(score))
        print('------------------------------------------')
        print()


#Set our seed relations
# Every pairing of the two first-position genes with the seven second-position
# genes; all CCA1 pairs come first, then all LHY pairs.
relations_1 = [[first, second]
               for first in ('CCA1', 'LHY')
               for second in ('PRR7', 'PRR9', 'PRR5', 'TOC1', 'ELF3', 'ELF4', 'LUX')]

To get exactly the same results as in the paper (reproducibility of results), use the files 'cr-aligned-original.vec' and 'pd-aligned-original.vec' in the 'embeddings' folder instead of the generated aligned embeddings. To do this, set the two paths in the cell below to these original embedding files.

In [None]:
# The paths below point to the original aligned embeddings ('cr-aligned-original.vec' and
# 'pd-aligned-original.vec' in the embeddings folder), which reproduce the results in the paper exactly.
# To use freshly aligned embeddings instead, point them at the files copied from MUSE
# (e.g. 'embeddings/vectors-cr.txt' and 'embeddings/vectors-pd.txt').
path_1 = 'embeddings/cr-aligned-original.vec'
path_2 = 'embeddings/pd-aligned-original.vec'
nmax = 500000  # maximum number of word embeddings to load
# Domain 1 is loaded from the 'cr' file and domain 2 from the 'pd' file.
embeds_1, id2word_1, word2id_1 = load_fasttext(path_1, nmax)
embeds_2, id2word_2, word2id_2 = load_fasttext(path_2, nmax)
# Print the 10 nearest domain-2 pairs for every seed relation in relations_1.
get_same_relations_in_domain_2(embeds_1, word2id_1, relations_1, embeds_2, word2id_2)