# Get vectors from fasttext

In [29]:
# common recipe to get access to the source inside src
import os
import sys
# Resolve the src directory (two levels above the current working directory)
# to an absolute path and register it on sys.path exactly once, so the
# project-local imports in this cell can be found.
module_path = os.path.abspath('../../src')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)
del already_registered
##########################################################

import numpy as np
from pathlib import Path

from dataset.MultilabelDataModule import MultilabelDataModule       # pytorch lightning data module for captions and images

from dataset.utils import clean_str

In [30]:
# Settings for the caption data module.
csv_path = '/workspace/data/microscopy_captions_multilabel_kfolds.csv'
batch_size, max_length_size = 32, 200
num_workers, random_state = 72, 443

# Build the data module, then prepare it and select fold 0.
dm = MultilabelDataModule(
    batch_size,
    csv_path,
    20000,  # NOTE(review): third positional argument; presumably a vocabulary-size cap - confirm against MultilabelDataModule
    max_length_size,
    num_workers=num_workers,
    random_state=random_state,
    kfold_col='KFOLD_MULTI',
    preprocess_fn=clean_str,
)
dm.prepare_data()
dm.setup(k_fold_idx=0)

In [31]:
import fasttext
from gensim.models import KeyedVectors

In [32]:
# Load the pre-trained BioWordVec vectors (binary word2vec format) with gensim.
filename = '/workspace/data/biosentvec/BioWordVec_PubMed_MIMICIII_d200.vec.bin'
model = KeyedVectors.load_word2vec_format(
    fname=filename,
    binary=True,
)

In [33]:
# Number of entries in the embedding vocabulary.
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# prefer the new attribute and fall back to `vocab` on older releases so the
# cell runs on either version.
len(model.key_to_index if hasattr(model, 'key_to_index') else model.vocab)

16545452

In [35]:
# Display the embedding vector stored for the token 'cell'.
model['cell']

array([ 3.1645e-01,  5.1833e-01,  1.8313e-01, -2.1142e-01, -4.2523e-01,
       -2.2111e-01, -5.6964e-01, -1.1693e-01,  1.7035e-01, -7.9470e-02,
        1.0547e-01,  5.9332e-02,  1.8595e-01, -1.6181e-01, -5.8722e-02,
        4.1653e-01,  5.5877e-02, -2.6572e-01,  3.7170e-01,  1.6127e-01,
        3.9789e-01, -4.4439e-02, -5.3040e-01, -4.4360e-01,  3.2300e-02,
        2.5166e-01,  1.6691e-01,  1.6848e-01,  3.2593e-01, -1.4861e-01,
        4.3974e-01, -6.0691e-02, -4.9867e-01, -4.1572e-01,  4.4453e-02,
       -1.3696e-01,  9.0605e-01, -7.0538e-02,  2.0980e-01,  1.9117e-01,
       -1.2237e-01,  1.7283e-02,  3.2669e-01, -2.1159e-01, -2.8491e-01,
       -1.2638e-01,  5.4470e-01,  2.4571e-01,  5.5708e-02,  2.2276e-01,
       -4.2089e-01,  4.1498e-01, -7.3469e-01, -5.2261e-01, -8.1986e-01,
       -4.2651e-01, -4.4437e-02, -1.6447e-01, -5.0739e-01, -1.7647e-01,
       -3.4589e-01,  3.1612e-01,  1.9247e-01,  6.7935e-03,  7.1029e-01,
        1.5427e-01,  3.8743e-01,  3.3396e-01,  2.5345e-01,  4.83

In [36]:
# Mapping whose keys are the vocabulary tokens.
# gensim >= 4.0 removed `KeyedVectors.vocab`; `key_to_index` is its replacement.
# Only the keys are used below, so either mapping works; the value types
# differ between the two attributes.
d = model.key_to_index if hasattr(model, 'key_to_index') else model.vocab

In [37]:
# Preview the first 100 vocabulary tokens in the mapping's iteration order.
i = 0
for i, k in enumerate(d.keys(), start=1):
    print(k)
    if i >= 100:
        break

</s>
.
the
of
,
and
in
)
(
to
a
with
:
for
was
is
were
by
that
on
%
as
from
this
patients
at
or
are
an
be
we
not
study
results
these
;
cells
no
after
have
which
between
[
]
than
treatment
2
p
has
using
but
=
been
cell
1
during
group
may
it
two
both
also
had
their
can
all
more
disease
there
clinical
used
data
activity
analysis
increased
other
methods
effect
expression
compared
protein
associated
3
one
effects
levels
<
significant
studies
significantly
patient
cancer
found
human
blood
its
high
different
showed
use


In [38]:
# Take the word index exposed by the data module and report how many
# entries it holds (presumably token -> integer id; verify against the module).
word_index = dm.word_index
print(len(word_index))

7018


In [39]:
# Report every dataset token that has no vector in the pre-trained model and
# count them; the final bare expression displays the total.
# NOTE(review): several tokens in the recorded output look like mis-decoded
# UTF-8 (e.g. 'î¼', 'î²'); check the encoding used when the captions are read.
count = 0
for word, idx in word_index.items():
    if word in model:
        continue  # token is covered by the embeddings
    print(word)
    count += 1
count

î¼
î²
î±
ï¿½ï¿½
ã
â±
â
amnos
î³
â°
ferrt
ã¯
palillary
î´
411575
crb87
amsoxb2
â®
hts1103
ky535
ã¼
rm11430
pavp22
micdys
zgak
zaux
pf7a
ä¸
î¦
54h11
dmm3
pancrb
cg13377
5900lv
radiographicimage
rhabomeres
af272341
m95526
af510728
s72864
m32754
cb997542
by303064
bi082792
bp457576
mb162
aapearance
wtc4
evs4
bb127
em10ca
pexii
mircrographs
meriste
35c4
95i21
476d17
f06715
rp298
siglipr
bxd63
ps1de9xyfp
62.5x
mtdag8
phenyliondole
2000e
rhis23
âµ
384132
133475
229123
151012
398791
222921
314997
mucinuous
lobul
pgch1
hapothalamic
immnopositive
differentiatio
coexpressedin
250742
ã¶
cmt931
mvtv
e997a
fosmid1
fosmid2
fluorosense
41002b
dylight633
41008
sat701b001ea
dabx400
rb1wild
rb1arg621cys
rb1leu607ile
rb1arg698trp
hn2b
majetich
subseros
glutaraladehyde
ï¿½
amsox21
calcyx
pkkgdk
40.2d6
coatin
megagametophyt
porosit
xylema
polycloonal
ë
î¸
ï
ï
stepek
capscle
philophodia
mrnatagging
ureteras
dermi
î
»
mscsg
magnificatio
dx385
trabecul
otocyte
permissio
2842.6
3024.5
cg32572
wtmm4
seyd1
s

139

In [40]:
# Shape of a single word vector, i.e. the embedding dimensionality.
model['cell'].shape

(200,)