In [2]:
import os
import time
import multiprocessing

import gensim
from sklearn.manifold import TSNE
import seaborn as sns

import plotly.plotly as py
import plotly.graph_objs as go
import plotly.offline as offline


In [3]:
# Directory holding the input data, and the compound table inside it.
path = 'data'
comp_info = os.path.join(path, 'comp_info.tsv')

# {compound_id: [compound_name, CAS_number]}
def load_compounds(path):
    """Parse a tab-separated compound table into a dictionary.

    Each data line is split on tabs: the first field becomes the key and the
    remaining fields become the value list, i.e.
    ``{compound_id: [compound_name, CAS_number]}`` for the compound info file.

    Lines whose first character is '#' are treated as comments and skipped.
    Blank (or whitespace-only) lines are skipped as well, so they no longer
    create a spurious ``'' -> []`` entry.

    path -- path of the TSV file to read.
    Returns the dictionary described above (empty if the file has no data lines).
    """
    compounds = {}
    with open(path, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue
            fields = line.rstrip().split('\t')
            # ''.split('\t') yields [''], which is what a blank line turns into.
            if fields == ['']:
                continue
            compounds[fields[0]] = fields[1:]
    return compounds

In [4]:
# Parse the compound table at `comp_info` into a dict keyed by compound id,
# then echo the whole dict as this cell's output.
compounds = load_compounds(comp_info)
compounds

{'344': ['s-methyl_3-methylbutanethioate', '23747-45-7'],
 '0': ['jasmone', '488-10-8'],
 '346': ['4-(2,6,6-trimethyl-cyclohexa-1,3-dienyl)but-2-en-4-one',
  '23696-85-7'],
 '347': ['cinnamic_acid', '621-82-9'],
 '340': ['1-methylnaphthalene', '90-12-0'],
 '341': ['5-ethyl-3-hydroxy-4-methyl-2(5h)-furanone', '698-10-2'],
 '342': ['p-menthane-3,8-diol', '42822-86-6'],
 '343': ['isopropyl_myristate', '110-27-0'],
 '810': ['tannic_acid', '1401-55-4'],
 '811': ['myristic_acid', '544-63-8'],
 '812': ['2,2,3-trimethylcyclopent-3-en-1-yl_acetaldehyde', '4501-58-0'],
 '813': ['d-octalactone', '698-76-0'],
 '348': ['limonene_(d-,l-,_and_dl-)',
  '5989-27-5,7705-14-8,5989-54-8,5989-27-5'],
 '349': ['guaiene', '88-84-6'],
 '816': ['5-_and_6-decenoic_acid', '85392-03-6,85392-04-7'],
 '817': ['2-octen-4-one', '4643-27-0'],
 '595': ['2-octanone', '111-13-7'],
 '719': ['propionic_acid', '79-09-4'],
 '718': ['caryophyllene_alcohol', '4586-22-5'],
 '717': ['phenethyl_formate', '104-62-1'],
 '716': ['me

In [5]:
def compound2character(compounds):
    """Map every compound name to the list of its individual characters.

    compounds -- mapping whose values are sequences with the compound name
                 at index 0 (the ids used as keys are ignored).
    Returns {compound_name: [char, char, ...]}; if two ids share a name the
    result holds a single entry for it.
    """
    names = (entry[0] for entry in compounds.values())
    return {name: list(name) for name in names}

# Character-level view of the compound names: {compound_name: [characters]}.
dict_compound2character = compound2character(compounds)

In [6]:
def read_corpus_char_level(dict_compound2character):
    """Generate one gensim TaggedDocument per compound.

    dict_compound2character -- mapping of compound name to its character list.
    Each yielded document uses the character list as its words and the
    compound name as its single tag.
    """
    for compound, characters in dict_compound2character.items():
        yield gensim.models.doc2vec.TaggedDocument(characters, [compound])

        
# Materialise the generator into a list so the corpus can be passed to both
# build_vocab and train, then echo it as this cell's output.
corpus = list(read_corpus_char_level(dict_compound2character))
corpus

[TaggedDocument(words=['j', 'a', 's', 'm', 'o', 'n', 'e'], tags=['jasmone']),
 TaggedDocument(words=['5', '-', 'm', 'e', 't', 'h', 'y', 'l', 'h', 'e', 'x', 'a', 'n', 'o', 'i', 'c', '_', 'a', 'c', 'i', 'd'], tags=['5-methylhexanoic_acid']),
 TaggedDocument(words=['l', '-', 'g', 'l', 'u', 't', 'a', 'm', 'i', 'n', 'e'], tags=['l-glutamine']),
 TaggedDocument(words=['1', '-', 'm', 'e', 't', 'h', 'y', 'l', '-', '3', '-', 'm', 'e', 't', 'h', 'o', 'x', 'y', '-', '4', '-', 'i', 's', 'o', 'p', 'r', 'o', 'p', 'y', 'l', 'b', 'e', 'n', 'z', 'e', 'n', 'e'], tags=['1-methyl-3-methoxy-4-isopropylbenzene']),
 TaggedDocument(words=['3', '-', 'm', 'e', 'r', 'c', 'a', 'p', 't', 'o', '-', '2', '-', 'm', 'e', 't', 'h', 'y', 'l', 'p', 'e', 'n', 't', 'a', 'n', '-', '1', '-', 'o', 'l', '_', '(', 'r', 'a', 'c', 'e', 'm', 'i', 'c', ')'], tags=['3-mercapto-2-methylpentan-1-ol_(racemic)']),
 TaggedDocument(words=['b', 'u', 't', 'y', 'l', '_', 'i', 's', 'o', 'b', 'u', 't', 'y', 'r', 'a', 't', 'e'], tags=['butyl_is

In [7]:
def make_plot_simple(name, points, labels, publish):
    """Draw a 2-D marker scatter of `points` with hover labels using plotly.

    name    -- base name of the plot; '.html' is appended to form the filename.
    points  -- indexed as points[:, 0] for x and points[:, 1] for y.
    labels  -- passed as the hover text of the markers.
    publish -- truthy: plot through py.iplot; falsy: plot through offline.plot.
    """
    def _bare_axis():
        # Built fresh for each axis so x and y never share one dict object.
        return dict(
            autorange=True,
            showgrid=False,
            zeroline=True,
            showline=True,
            autotick=True,
            ticks='',
            showticklabels=False,
        )

    scatter = go.Scattergl(
        x=points[:, 0],
        y=points[:, 1],
        mode='markers',
        marker=dict(color=sns.xkcd_rgb["black"], size=8, opacity=0.6),
        text=labels,
        hoverinfo='text',
    )
    layout = go.Layout(xaxis=_bare_axis(), yaxis=_bare_axis())

    fig = go.Figure(data=[scatter], layout=layout)
    plotter = py.iplot if publish else offline.plot
    plotter(fig, filename=name + '.html')

In [8]:
"""
Train Doc2Vec Model

"""
time_start = time.time()

cores = multiprocessing.cpu_count()

#dm/m,d50,n5,w5,mc5,s0.001,t3
#model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=5, iter=55)

# PV-DM w/ average
model = gensim.models.doc2vec.Doc2Vec(size=50, window=5, min_count=3, iter=100)
model.build_vocab(corpus, keep_raw_vocab=False)

print "Unique Character Count", len(model.wv.vocab)
print "Total Compoounds Count:", model.corpus_count

%time model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)

print 'Doc2Vec training done! Time elapsed: {} seconds'.format(time.time()-time_start)


save_name = 'embeddings' + os.sep + 'embeddings_flavor_compounds_50dim.bin'
model.save_word2vec_format(save_name, doctag_vec=True, word_vec=False, prefix='*dt_', fvocab=None, binary=True)

Unique Character Count 46
Total Compoounds Count: 1107
CPU times: user 4.24 s, sys: 2.57 s, total: 6.81 s
Wall time: 5.05 s
Doc2Vec training done! Time elapsed: 5.10063791275 seconds


In [9]:
"""
TSNE of Doc2Vec

"""
time_start = time.time()
# NOTE(review): the docvecs container is handed to scikit-learn as-is and
# relies on it being converted to an array - confirm for the installed gensim.
X = model.docvecs
# Fixed seed so the 2-D layout is the same on every run.
tsne = TSNE(n_components=2, random_state=0)
X_tsne = tsne.fit_transform(X)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

t-SNE done! Time elapsed: 5.66017103195 seconds


In [10]:
# One hover label per document vector: the doctag stored at each index.
labels = [model.docvecs.index_to_doctag(doc_id)
          for doc_id in range(len(model.docvecs))]

make_plot_simple(
    name='compound2vec_char2',
    points=X_tsne,
    labels=labels,
    publish=False,
)

In [36]:
load_name = os.path.join('embeddings', 'embeddings_flavor_compounds_50dim.bin')

# The file was written with save_word2vec_format(binary=True), so it is read
# back through KeyedVectors.load_word2vec_format.
from gensim.models.keyedvectors import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format(load_name, binary=True)

TypeError: object of type 'KeyedVectors' has no len()