In [2]:
import torch
import torch.nn as nn
import torch.optim as optim



In [3]:
#cancer
import pandas as pd
import numpy as np

# Read the training table from disk; the path is relative to the working directory.
train_csv_path = './data/train.csv'
data = pd.read_csv(train_csv_path)

# Show only the first row as a quick look at the column layout.
data.iloc[:1]

Unnamed: 0,ID,SUBCLASS,A2M,AAAS,AADAT,AARS1,ABAT,ABCA1,ABCA2,ABCA3,...,ZNF292,ZNF365,ZNF639,ZNF707,ZNFX1,ZNRF4,ZPBP,ZW10,ZWINT,ZYX
0,TRAIN_0000,KIPAN,WT,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT


In [4]:
# Keep every column from positional index 2 onward as an independent copy
# (presumably the per-gene columns that follow ID and SUBCLASS -- confirm against the CSV).
df = data.iloc[:, 2:].copy()

# Map each gene column name to its positional index within df.
gene_map = dict(zip(df.columns, range(len(df.columns))))



In [8]:
# "WT"가 아닌 값들을 0/1 변이 행렬로 변환
# Vectorised comparison instead of the deprecated, per-element DataFrame.applymap.
# It mirrors the original `x != "WT"` test: 1 where the cell is not "WT", 0 where it is.
mutation_matrix = df.ne("WT").astype(np.int64)

print(mutation_matrix)

# Gene-by-gene co-occurrence counts: entry (i, j) is the number of samples in which
# gene i and gene j are both flagged as mutated (X^T X for the 0/1 matrix X).
co_counts = mutation_matrix.T.dot(mutation_matrix)

# Zero the diagonal (a gene co-occurring with itself) on an explicit, writable copy.
# DataFrame.to_numpy() does not guarantee a view of the frame's data, so filling its
# return value in place can leave the frame untouched, or fail on a read-only array
# when pandas Copy-on-Write is enabled.
co_values = co_counts.to_numpy(copy=True)
np.fill_diagonal(co_values, 0)
co_occurrence_matrix = pd.DataFrame(
    co_values, index=co_counts.index, columns=co_counts.columns
)

print(co_occurrence_matrix)


      A2M  AAAS  AADAT  AARS1  ABAT  ABCA1  ABCA2  ABCA3  ABCA4  ABCA5  ...  \
0       0     0      0      0     0      0      0      0      0      0  ...   
1       0     0      0      0     0      0      0      0      0      0  ...   
2       1     0      0      0     0      0      0      0      0      0  ...   
3       0     0      0      0     0      0      0      0      0      0  ...   
4       0     0      0      0     0      0      0      0      0      0  ...   
...   ...   ...    ...    ...   ...    ...    ...    ...    ...    ...  ...   
6196    0     0      0      0     0      0      0      0      0      0  ...   
6197    0     0      0      0     0      0      0      0      0      0  ...   
6198    0     0      0      0     0      0      0      0      0      0  ...   
6199    0     0      0      0     0      0      0      0      0      0  ...   
6200    0     0      0      0     0      0      0      0      0      0  ...   

      ZNF292  ZNF365  ZNF639  ZNF707  ZNFX1  ZNRF4 

In [5]:
# "WT"가 아닌 값들을 0/1 변이 행렬로 변환
# Vectorised comparison instead of the deprecated, per-element DataFrame.applymap.
# It mirrors the original `x != "WT"` test: 1 where the cell is not "WT", 0 where it is.
mutation_matrix = df.ne("WT").astype(np.int64)




In [6]:
# Boolean mask over samples: True where the row contains at least one mutated gene.
nonzero_samples = mutation_matrix.sum(axis=1).gt(0)
filtered_mutation_matrix = mutation_matrix.loc[nonzero_samples]

# Pull the matching rows of the raw table by index label
# (assumes data and mutation_matrix share the same index).
filtered_data = data.loc[filtered_mutation_matrix.index]

# Attach the class label to a copy of the filtered mutation matrix;
# the assignment aligns the label column on the shared index.
combined = filtered_mutation_matrix.assign(SUBCLASS=filtered_data['SUBCLASS'])

In [7]:
# Step 4: build one gene-by-gene co-occurrence matrix per SUBCLASS, weight it,
# and accumulate the weighted matrices into a single total.
genes = combined.columns.drop('SUBCLASS')
co_matrix_total = np.zeros((len(genes), len(genes)))
gene_idx = {gene: i for i, gene in enumerate(genes)}

# Inverse-frequency weights: each SUBCLASS is scaled by 1 / (its sample count),
# so a class with many samples does not dominate the accumulated matrix.
subclass_counts = combined['SUBCLASS'].value_counts()
weight_dict = {cls: 1 / count for cls, count in subclass_counts.items()}

for subclass, group in combined.groupby('SUBCLASS'):
    mat = group[genes]
    weight = weight_dict[subclass]
    # Multiply as a float64 ndarray: NumPy's integer matmul is not BLAS-backed and is
    # much slower at this size, while sums of 0/1 products stay exactly representable
    # in float64, so the values match the integer computation.
    counts = mat.to_numpy(dtype=np.float64)
    co_matrix = (counts.T @ counts) * weight
    co_matrix_total += co_matrix

# Step 5: zero the diagonal (a gene co-occurring with itself).
np.fill_diagonal(co_matrix_total, 0)

In [8]:
# Sanity check: the accumulated matrix should be square, one row/column per gene.
print(co_matrix_total.shape)

(4384, 4384)


In [9]:
# Plain alias, not a copy: both names refer to the same ndarray object.
occurrence_matrix = co_matrix_total

In [10]:
import numpy as np
from sklearn.preprocessing import normalize

# 3. Full singular value decomposition of the weighted co-occurrence matrix.
U, S, Vt = np.linalg.svd(occurrence_matrix)

# 4. Keep the leading k components, scaling each left singular vector by the
#    square root of its singular value.
k = 150
top_vectors = U[:, :k]
top_scales = np.sqrt(S[:k])
gene_embeddings = np.multiply(top_vectors, top_scales)  # shape: [num_genes, k]

# 5. Optional: rescale every row (one gene's embedding) with sklearn's default norm (L2).
gene_embeddings = normalize(gene_embeddings, axis=1)

In [13]:
import os

In [11]:
# Print the row-normalised embedding matrix (NumPy abbreviates large arrays with "...").
print(gene_embeddings)

[[-0.73160907  0.15940814  0.03827838 ... -0.10741644  0.05716845
  -0.03970308]
 [-0.55258616 -0.14163557 -0.27812404 ... -0.02078061  0.03043746
  -0.02195163]
 [-0.41857478 -0.09393661 -0.03545576 ... -0.02737577  0.03998154
   0.06319594]
 ...
 [-0.54302092 -0.12375312 -0.13935219 ... -0.00435396  0.01452151
   0.02298121]
 [-0.30531149 -0.06177987 -0.15816947 ...  0.00303714 -0.03073044
   0.00118343]
 [-0.48643672 -0.11259937 -0.19320324 ...  0.07371807 -0.06166273
  -0.05816168]]


In [14]:
# Persist the embedding matrix as a .npy file inside the data directory.
embedding_path = os.path.join("data", "gene_embedSVDwithWeight.npy")
np.save(embedding_path, gene_embeddings)