<a href="https://colab.research.google.com/github/jyryu3161/lec_bioai/blob/main/featurization_protein.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 입력 파일 설정

In [1]:
input_file = 'training_data.csv' # 입력 파일명만 입력

## 환경 구성

In [2]:
!pip install fair-esm

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m92.2/93.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0


# 프로그램 실행

In [3]:
import torch
import esm
import pandas as pd
import os
import tqdm

# 1단계: ESM 모델 로드
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
model = model.eval()
if torch.cuda.is_available():
    model = model.cuda()

# 2단계: 데이터 불러오기
data = pd.read_csv(input_file)  # 여기서 'your_protein_data.csv'는 데이터 파일 이름입니다.
basename = os.path.basename(input_file).split('.')[0].strip()

# 3단계: 특징 추출
def extract_features(sequence):
    data = torch.tensor([alphabet.encode(sequence)])
    if torch.cuda.is_available():
        data = data.cuda()

    with torch.no_grad():
        results = model(data, repr_layers=[33])
        token_representations = results['representations'][33]
        return token_representations[0].mean(0).cpu().numpy()

# 4단계: 각 시퀀스에 대한 특징 추출
features = data['Seq'].apply(extract_features)

# 5단계: 특징을 별도의 컬럼으로 저장
feature_columns = [f'feature_{i+1}' for i in tqdm.tqdm(range(features.iloc[0].shape[0]))]
feature_df = pd.DataFrame(features.tolist(), columns=feature_columns)

# 6단계: 원본 데이터프레임과 특징 데이터프레임 병합
final_df = pd.concat([data, feature_df], axis=1)
final_df.to_csv('./output_%s.csv'%(basename), index=False)


Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1b_t33_650M_UR50S.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm1b_t33_650M_UR50S-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S-contact-regression.pt
100%|██████████| 1280/1280 [00:00<00:00, 1948714.74it/s]
