In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

class GloVeEmbedding(nn.Module):
    """GloVe-style embedding model trained on co-occurrence counts.

    Holds two embedding tables (``w`` and ``w_tilde``) and two bias tables
    (``b`` and ``b_tilde``), each with ``vocab_size`` rows. ``forward``
    returns the mean weighted squared error between
    ``w_i . w_tilde_j + b_i + b_tilde_j`` and ``log(X_ij)``.

    Args:
        vocab_size: Number of rows in every embedding and bias table.
        embedding_dim: Width of each embedding vector.
        x_max: Count at which the weighting function saturates at 1.
        alpha: Exponent applied by the weighting function.
    """

    def __init__(self, vocab_size=4384, embedding_dim=128, x_max=4384, alpha=0.75):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.x_max = x_max
        self.alpha = alpha

        # Center-word vectors/biases (w, b) and context-word vectors/biases
        # (w_tilde, b_tilde).
        self.w = nn.Embedding(vocab_size, embedding_dim)
        self.w_tilde = nn.Embedding(vocab_size, embedding_dim)
        self.b = nn.Embedding(vocab_size, 1)
        self.b_tilde = nn.Embedding(vocab_size, 1)

        # Small symmetric uniform initialisation for every table, biases included.
        for param in self.parameters():
            nn.init.uniform_(param, -0.5 / embedding_dim, 0.5 / embedding_dim)

    def forward(self, i_idx, j_idx, X_ij):
        """Return the mean weighted squared-error loss for the given pairs.

        Args:
            i_idx: Integer tensor of center-word indices, any shape.
            j_idx: Integer tensor of context-word indices, same shape as ``i_idx``.
            X_ij: Tensor of co-occurrence counts, same shape as ``i_idx``.

        Returns:
            A 0-dim tensor holding the mean loss over all pairs.
        """
        w_i = self.w(i_idx)
        w_j = self.w_tilde(j_idx)
        # Drop only the trailing size-1 bias dimension. A bare squeeze() would
        # also collapse a batch dimension of size 1.
        b_i = self.b(i_idx).squeeze(-1)
        b_j = self.b_tilde(j_idx).squeeze(-1)

        # Reduce over the embedding axis (always the last one) so that 0-D,
        # 1-D and multi-dimensional index tensors are all handled.
        dot = torch.sum(w_i * w_j, dim=-1)
        # The epsilon keeps log() finite if a zero count is passed in.
        log_X = torch.log(X_ij + 1e-8)

        # Weighting function f(X_ij) = min(X_ij / x_max, 1) ** alpha
        weight = (X_ij / self.x_max).clamp(max=1.0).pow(self.alpha)

        loss = weight * (dot + b_i + b_j - log_X).pow(2)
        return loss.mean()

    def get_vectors(self):
        """Return the final embeddings: the average of ``w`` and ``w_tilde``.

        The result is detached from the autograd graph.
        """
        return (self.w.weight.detach() + self.w_tilde.weight.detach()) / 2


In [2]:
# Assume co_matrix is [V, V] numpy or torch tensor
def train_glove_from_cooccurrence(co_matrix, embedding_dim=128, epochs=100, lr=0.05, device='cpu'):
    """Train GloVe embeddings from a square co-occurrence matrix.

    Every non-zero entry of ``co_matrix`` becomes one training pair, and all
    pairs are optimised together as a single full batch with Adam.

    Args:
        co_matrix: Square ``[V, V]`` matrix of co-occurrence counts. May be a
            NumPy array, a torch tensor, or any object exposing ``to_numpy()``
            (for example a pandas DataFrame).
        embedding_dim: Width of the learned embedding vectors.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.
        device: Torch device on which the model and training tensors live.

    Returns:
        Tensor of shape ``[V, embedding_dim]``: the averaged center/context
        embeddings from ``GloVeEmbedding.get_vectors()``.

    Raises:
        ValueError: If ``co_matrix`` is not a square 2-D matrix or has no
            non-zero entries.
    """
    # 1. Normalise the input to a torch tensor (DataFrame -> NumPy -> tensor).
    if hasattr(co_matrix, "to_numpy"):
        co_matrix = co_matrix.to_numpy()
    co_tensor = torch.as_tensor(co_matrix)

    if co_tensor.dim() != 2 or co_tensor.shape[0] != co_tensor.shape[1]:
        raise ValueError(
            f"co_matrix must be a square 2-D matrix, got shape {tuple(co_tensor.shape)}"
        )
    # Size the embedding tables from the data instead of a hard-coded constant.
    vocab_size = co_tensor.shape[0]

    # 2. Collect the (row, column) index of every non-zero count.
    i_idx, j_idx = co_tensor.nonzero(as_tuple=True)
    if i_idx.numel() == 0:
        raise ValueError("co_matrix has no non-zero entries to train on")
    X_ij = co_tensor[i_idx, j_idx].float()

    # 3. Keep the training tensors on the same device as the model.
    i_idx = i_idx.to(device)
    j_idx = j_idx.to(device)
    X_ij = X_ij.to(device)

    # x_max and alpha keep the GloVeEmbedding defaults.
    glove = GloVeEmbedding(vocab_size=vocab_size, embedding_dim=embedding_dim).to(device)
    optimizer = optim.Adam(glove.parameters(), lr=lr)

    for epoch in range(epochs):
        optimizer.zero_grad()
        loss = glove(i_idx, j_idx, X_ij)
        loss.backward()
        optimizer.step()

        # Report progress every 10th epoch.
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

    return glove.get_vectors()



In [3]:
#cancer
import pandas as pd
import numpy as np

# Read the training table from disk.
data = pd.read_csv(filepath_or_buffer='./data/train(1).csv')

# Show just the first row as a quick look at the columns.
data.head(n=1)

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TRAIN_0000,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [4]:
# Keep the columns at positional index 2 and beyond (the gene columns).
df = data.iloc[:, 2:].copy()

# Map each gene column name to its positional index and keep the mapping.
gene_map = dict(zip(df.columns, range(len(df.columns))))



In [5]:
# Binary mutation matrix: 1 where a cell is anything other than "WT", else 0.
# A vectorised comparison replaces the per-element DataFrame.applymap call,
# which is deprecated in recent pandas and slow on a large table.
mutation_matrix = (df != "WT").astype("int64")

print(mutation_matrix)

# Co-occurrence matrix: transpose of the mutation matrix dotted with the
# mutation matrix itself, giving one row and one column per gene.
co_occurrence_matrix = mutation_matrix.T.dot(mutation_matrix)

# Zero the diagonal (a gene's co-occurrence with itself). The fill is done on
# an explicit copy and the frame is rebuilt, because to_numpy() is not
# guaranteed to return a writable view of the DataFrame's data.
co_values = co_occurrence_matrix.to_numpy(copy=True)
np.fill_diagonal(co_values, 0)
co_occurrence_matrix = pd.DataFrame(
    co_values,
    index=co_occurrence_matrix.index,
    columns=co_occurrence_matrix.columns,
)

print(co_occurrence_matrix)

# Inverse lookup: positional index back to gene name.
reverse_gene_map = {v: k for k, v in gene_map.items()}

      A2M  AAAS  AADAT  AARS1  ABAT  ABCA1  ABCA2  ABCA3  ABCA4  ABCA5  ...  \
0       0     0      0      0     0      0      0      0      0      0  ...   
1       0     0      0      0     0      0      0      0      0      0  ...   
2       1     0      0      0     0      0      0      0      0      0  ...   
3       0     0      0      0     0      0      0      0      0      0  ...   
4       0     0      0      0     0      0      0      0      0      0  ...   
...   ...   ...    ...    ...   ...    ...    ...    ...    ...    ...  ...   
6196    0     0      0      0     0      0      0      0      0      0  ...   
6197    0     0      0      0     0      0      0      0      0      0  ...   
6198    0     0      0      0     0      0      0      0      0      0  ...   
6199    0     0      0      0     0      0      0      0      0      0  ...   
6200    0     0      0      0     0      0      0      0      0      0  ...   

      ZNF292  ZNF365  ZNF639  ZNF707  ZNFX1  ZNRF4 

In [None]:
# Convert the co-occurrence DataFrame to a NumPy array; this array is what gets
# passed to train_glove_from_cooccurrence in a later cell.
occurrence_matrix = co_occurrence_matrix.to_numpy()

: 

In [None]:
# Seed PyTorch's RNG so the random embedding initialisation, and with it the
# training run, is repeatable when the notebook is re-executed.
torch.manual_seed(42)

embedding_vectors = train_glove_from_cooccurrence(occurrence_matrix, embedding_dim=128)
print(embedding_vectors.shape)  # [vocab_size, 128]