# Word2vec
___

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd

In [2]:
# Parent of the current working directory; used as the base path for
# reaching the 01-... and 02-... folders.
# `cwd` is a classmethod, so it is called on the class rather than on a
# throwaway `Path()` instance.
BASEPATH = Path.cwd().resolve().parent

In [3]:
# Read the sequences CSV into `data` and build a lookup from row label to
# sequence string.
# NOTE(review): PATH_DATA is an absolute path on one specific machine; the
# notebook cannot run elsewhere until this points at a local copy of the data.
PATH_DATA = Path('/home/jorge/AlgoLab/Tezi-Marzi/Tezi-Documentation/Master Thesis-20210607T163207Z-001/Master Thesis/Archive')
data = pd.read_csv(PATH_DATA / 'Sequences_chr1_unique.csv')

# Keys are the DataFrame index labels, values the `Sequences` column entries.
sequences_by_id = dict(zip(data.index, data.Sequences))

In [4]:
# Preview the first five rows of the loaded table. The CSV carries an
# `Unnamed: 0` column next to `Sequences` and `Splice_Junctions`
# (presumably the index saved when the file was written -- TODO confirm).
data.head(n=5)

Unnamed: 0,Sequences,Splice_Junctions
0,TCTGTCCTGTAAACCAGGACCCAGGTTATGCCTCTGCGAGCAGAGC...,1
1,TTTTTCTTCCTTCACTCTGAAGCCTGTTTACTTTGTTCTTCTGGCC...,1
2,TCCCGCCAAATCCGAAGCCTTGCTTCCTCCGGGAAAAGAGTCTTTT...,1
3,ATAATGATATTACTTGTACATAGAGTATAACTCCAGATTCCTTGGT...,1
4,TTTTTTTCCCTAATCCATCAAAGACTCCATAGCATAGTCGTTAATA...,0


In [5]:
from gensim.test.utils import common_texts



In [6]:
# k-mer tokenisation settings.
L = 60  # number of leading characters of each sequence that are tokenised
k = 6   # k-mer length
s = 1   # stride: offset between the start positions of consecutive k-mers

# Number of k-mers extracted per sequence:
# https://www.nature.com/articles/s41598-018-33321-1 | eq (1) below fig 4
# Stored as an int because it is a count (and is usable with `range`).
N_kmer = int(np.ceil((L - k) / s)) + 1


def seq2kmers(sequence, kmer_size=None, stride=None, length=None):
    """Split a sequence into its list of overlapping k-mers.

    The i-th k-mer (i = 0 .. n-1) is ``sequence[i*stride : i*stride + kmer_size]``
    with ``n = ceil((length - kmer_size) / stride) + 1``.

    Parameters
    ----------
    sequence : str
        Sequence to tokenise. Only its first ``length`` characters are covered.
    kmer_size : int, optional
        k-mer length. Defaults to the notebook-level ``k``.
    stride : int, optional
        Offset between consecutive k-mer starts. Defaults to the
        notebook-level ``s``.
    length : int, optional
        Sequence length used in the k-mer count. Defaults to the
        notebook-level ``L``.

    Returns
    -------
    list of str
        The k-mers in order of position. If ``(length - kmer_size)`` is not a
        multiple of ``stride``, or the sequence is shorter than ``length``,
        trailing entries are shorter than ``kmer_size`` (plain string slicing).

    Raises
    ------
    ValueError
        If ``stride`` is not positive.
    """
    kmer_size = k if kmer_size is None else kmer_size
    stride = s if stride is None else stride
    length = L if length is None else length
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    n_kmers = int(np.ceil((length - kmer_size) / stride)) + 1
    # Iterate over the k-mer *index* and derive the start position from it.
    # Filtering start positions with `position < n_kmers` mixes a position
    # with a count and drops most k-mers as soon as stride > 1.
    return [
        sequence[i * stride: i * stride + kmer_size]
        for i in range(n_kmers)
    ]

In [7]:
# Tokenise every entry of the `Sequences` column into its list of k-mers;
# these lists are the "sentences" word2vec is trained on.
seq_as_kmers = data["Sequences"].apply(seq2kmers)

### Train word2vec

In [8]:
from gensim.models import Word2Vec
# Word2vec hyper-parameters, passed to gensim below.
dim = 20       # vector_size: length of each k-mer embedding vector
window = 10    # context window size
min_count = 1  # minimum frequency threshold for a k-mer to be kept

# Fit word2vec on the k-mer "sentences" with 4 worker threads.
model = Word2Vec(
    sentences=seq_as_kmers,
    vector_size=dim,
    window=window,
    min_count=min_count,
    workers=4,
)

In [11]:
# Save the trained k-mer vectors (only the KeyedVectors, not the full model).
word_vectors = model.wv

# The file name encodes the k-mer length and the embedding dimension.
wordvectors_path = Path(f"embeddings/word2vec/{k}-mer_{dim}-emb.wordvectors")
# Create the output directory first: it is not created anywhere else in the
# notebook, so on a fresh checkout the save would presumably fail without it.
wordvectors_path.parent.mkdir(parents=True, exist_ok=True)
word_vectors.save(str(wordvectors_path))

In [9]:
# Get embedding for a k-mer
# NOTE: "AAGT" is a 4-mer; with k = 6 look up a 6-mer instead, e.g. "AAGTGG".
# model.wv["AAGT"]

In [10]:
# Find the most similar k-mers based on embedding
# NOTE: "GCC" is a 3-mer; with k = 6 query a 6-mer instead, e.g. "AAGTGG".
# model.wv.most_similar("GCC")

In [12]:
from gensim.models import KeyedVectors

In [13]:
# Reload the saved k-mer vectors from disk, memory-mapped in read mode.
wordvectors_file = f"embeddings/word2vec/{k}-mer_{dim}-emb.wordvectors"
wv = KeyedVectors.load(wordvectors_file, mmap='r')

In [14]:
# Display the embedding stored for the 6-mer "AAGTGG".
wv.get_vector("AAGTGG")

array([  7.372363  ,   1.255034  ,   3.0460577 ,  -0.32891965,
         4.110647  ,  -5.7864475 ,   3.233467  ,  -0.9224984 ,
         3.0546496 , -10.5233    ,  -1.1514121 , -10.732493  ,
         3.1258113 ,   3.5156765 ,  -2.777804  ,   1.2995012 ,
        -4.867206  ,  -3.5837922 ,   2.0511017 ,   1.6243751 ],
      dtype=float32)