# Random Sequence Generation
- sequence의 일부분을 random한 amino-acid로 변환해 sequence 생성
- csv file로 export
- sequence에 대해 structure prediction 진행

In [5]:
import torch
import random
import csv
import os
import multiprocessing

## sequence 입력
- 변환할 부분 지정

In [65]:
# Read the base sequence and the window of residues to randomize.
sequence = input('sequence를 입력하세요: ')
# The prompt is 1-based; subtract 1 because string indices start at 0.
n_aa = int(input('변경 시작할 amino acid 번호를 입력하세요: ')) - 1
n_change = int(input('변경할 amino acid 수를 입력하세요: '))

In [66]:
# Display the residue at the chosen 0-based start index.
sequence[n_aa]

'V'

## Random AA로 변환

In [67]:
def random_amino_acid():
    """Pick one amino acid at random.

    Returns:
        str: A single one-letter amino-acid code drawn with
        ``random.choice`` from the 20-letter alphabet below.
    """
    # One-letter codes of the 20 amino acids.
    residue_codes = 'ACDEFGHIKLMNPQRSTVWY'
    picked = random.choice(residue_codes)
    return picked

In [68]:
def new_protein_sequence(sequence, start_index, num_mutations):
    """Return a copy of ``sequence`` with a window of residues randomized.

    Args:
        sequence: Original sequence as a string.
        start_index: 0-based index of the first position to replace.
        num_mutations: Number of consecutive positions to replace.

    Returns:
        str: A new string of the same length. Positions at or beyond the
        end of ``sequence`` are skipped, so a window that runs past the
        end is truncated. Each replacement comes from
        ``random_amino_acid()`` and may equal the original residue.
    """
    residues = list(sequence)  # strings are immutable; edit a list copy
    # Stop at the end of the sequence rather than indexing past it.
    window_end = min(start_index + num_mutations, len(sequence))
    for position in range(start_index, window_end):
        residues[position] = random_amino_acid()
    return ''.join(residues)

In [69]:
# Build one variant from the inputs above and display it.
new_sequence = new_protein_sequence(
    sequence, start_index=n_aa, num_mutations=n_change
)
new_sequence

'QVQLVESGGGSVQAGGSLRLSCAASGSISSITYLGWFRQAPGKEREGVAALSTSSGTTYYADSVKGRFTVSLDNAKNTVYLQMNSLKPEDTALYYCAAATQVDDLPWRVGTKCDWGQGTQVTVS'

## CSV 파일로 저장

In [70]:
def generate_unique_sequences(num_sequences, sequence, start_index, num_mutations):
    """Generate ``num_sequences`` distinct randomized variants of ``sequence``.

    Each variant is produced by ``new_protein_sequence`` applied to the
    original ``sequence``; duplicates are discarded and redrawn.

    Args:
        num_sequences: Number of distinct variants to return.
        sequence: Base sequence as a string.
        start_index: 0-based index of the first position to randomize.
        num_mutations: Number of consecutive positions to randomize.

    Returns:
        list[str]: Distinct variants in the order they were first drawn.
        Empty when ``num_sequences`` is zero or negative.

    Raises:
        ValueError: If more distinct variants are requested than the
            mutation window can produce. Without this check the sampling
            loop below would never terminate.
    """
    # Size of the alphabet that random_amino_acid() draws from.
    alphabet_size = 20

    seq_len = len(sequence)
    window_end = min(start_index + num_mutations, seq_len)
    # Positions that actually get rewritten. Negative indices wrap around
    # like list indexing does, so normalise them before counting; indices
    # below -seq_len are left for new_protein_sequence to reject.
    if seq_len:
        mutable_positions = {
            i % seq_len for i in range(max(start_index, -seq_len), window_end)
        }
    else:
        mutable_positions = set()

    max_variants = alphabet_size ** len(mutable_positions)
    if num_sequences > max_variants:
        raise ValueError(
            f"Requested {num_sequences} unique sequences, but only "
            f"{max_variants} distinct variants are possible with "
            f"{len(mutable_positions)} mutable position(s)."
        )

    # A dict keeps insertion order, so results are reproducible under a
    # fixed random seed (a set's iteration order is arbitrary).
    variants = {}
    while len(variants) < num_sequences:
        # Always mutate the original input rather than the previous variant.
        candidate = new_protein_sequence(sequence, start_index, num_mutations)
        variants[candidate] = None

    return list(variants)

def save_sequences_to_csv(sequences, filename):
    """Write sequences to a single-column CSV file.

    The file is opened in write mode, so an existing file with the same
    name is overwritten. The first row is the header ``Sequence``; each
    following row holds one sequence.

    Args:
        sequences: Iterable of sequence strings.
        filename: Path of the CSV file to write.
    """
    header = ['Sequence']
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(header)
        csv_writer.writerows([item] for item in sequences)

In [71]:
# Generate 1000 distinct variants and preview the first five.
unique_sequences = generate_unique_sequences(
    1000, sequence, start_index=n_aa, num_mutations=n_change
)
unique_sequences[:5]

['QVQLVESGGGSVQAGGSLRLSCAASGSISSITYLGWFRQAPGKEREGVAALSTSSGTTYYADSVKGRFTVSLDNAKNTVYLQMNSLKPEDTALYYCAAALGQFVREWVNIKNYLWGQGTQVTVS',
 'QVQLVESGGGSVQAGGSLRLSCAASGSISSITYLGWFRQAPGKEREGVAALSTSSGTTYYADSVKGRFTVSLDNAKNTVYLQMNSLKPEDTALYYCAAANMEMIYNAWEAYLIFWGQGTQVTVS',
 'QVQLVESGGGSVQAGGSLRLSCAASGSISSITYLGWFRQAPGKEREGVAALSTSSGTTYYADSVKGRFTVSLDNAKNTVYLQMNSLKPEDTALYYCAAACLISQVRCWWTEGTMWGQGTQVTVS',
 'QVQLVESGGGSVQAGGSLRLSCAASGSISSITYLGWFRQAPGKEREGVAALSTSSGTTYYADSVKGRFTVSLDNAKNTVYLQMNSLKPEDTALYYCAAAKVKQKNVVTSDSQWAWGQGTQVTVS',
 'QVQLVESGGGSVQAGGSLRLSCAASGSISSITYLGWFRQAPGKEREGVAALSTSSGTTYYADSVKGRFTVSLDNAKNTVYLQMNSLKPEDTALYYCAAAQYEVPTAECAEPQGCWGQGTQVTVS']

In [72]:
# Ask for a file-name prefix and export the variants to CSV.
name = input('파일 이름을 입력하세요: ')
csv_filename = name + '_replaced_unique_sequences.csv'
save_sequences_to_csv(unique_sequences, filename=csv_filename)

## Structure Prediction

In [7]:
from ImmuneBuilder import NanoBodyBuilder2

In [20]:
def Run_NanoBodyBuilder2(csv_path):
    """Predict a structure for every sequence in a CSV file and save PDBs.

    Prompts for a name, creates a directory with that name under the
    current working directory, and writes one ``<name>_<idx>.pdb`` file
    per CSV row. ``idx`` is the 0-based row number after the header;
    empty rows are skipped but still consume an index.

    Args:
        csv_path: Path to a CSV file whose first row is a header and whose
            first column holds the sequences.

    Returns:
        None. If ``csv_path`` does not exist, a message is printed and the
        function returns without predicting anything.
    """
    name = input('파일 이름을 입력하세요: ')
    output_dir = os.path.join(os.getcwd(), name)
    os.makedirs(output_dir, exist_ok=True)

    # Only the open() call is guarded, so a FileNotFoundError raised later
    # during prediction or saving is not misreported as a missing CSV.
    try:
        csvfile = open(csv_path, 'r', newline='')
    except FileNotFoundError:
        print(f"Error: 파일 '{csv_path}'을 찾을 수 없습니다.")
        return

    with csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header; tolerate a completely empty file

        # Build the predictor once, on first use, and reuse it for every
        # row instead of constructing a new one per sequence.
        predictor = None
        for idx, row in enumerate(reader):
            if not row:
                continue
            if predictor is None:
                predictor = NanoBodyBuilder2()
            sequence = {'H': row[0]}
            output_file = os.path.join(output_dir, f'{name}_{idx}.pdb')
            nanobody = predictor.predict(sequence)
            nanobody.save(output_file)

In [22]:
# Run structure prediction on the sequences listed in this CSV file.
Run_NanoBodyBuilder2(csv_path='Nb51_replaced_unique_sequences.csv')