In [8]:
!pip install biopython
!pip install huggingface_hub
!huggingface-cli login # login and get the token, make sure to paste your token
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from Bio import SeqIO
from scipy.cluster.hierarchy import linkage, fcluster
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import ClustalOmegaCommandline
import subprocess
import os

# Load Youshan Transformer Model
class YoushanAligner:
    """Embed protein sequences with a pretrained Hugging Face protein model.

    The tokenizer and model are loaded with ``AutoTokenizer`` / ``AutoModel``
    (default checkpoint: ``Rostlab/prot_bert_bfd``). Each sequence is turned
    into a single vector by averaging ``last_hidden_state`` over dim 1.
    """

    # ProtBert-style vocabularies contain one token per residue letter and no
    # entries for the rare/ambiguous residues U, Z, O and B; map those to X.
    _RARE_RESIDUES = str.maketrans("UZOB", "XXXX")

    def __init__(self, model_name="Rostlab/prot_bert_bfd", token=None):
        """Load the tokenizer and the model.

        Args:
            model_name: Hugging Face model identifier to load.
            token: Hugging Face access token. When ``None`` the token is read
                from the ``HF_TOKEN`` environment variable, falling back to
                ``HUGGING_FACE_HUB_TOKEN``.
        """
        # Resolve the token at call time: a default of ``os.environ.get(...)``
        # in the signature would be evaluated only once, when the class is
        # defined, and would miss variables exported later in the session.
        if token is None:
            token = (os.environ.get("HF_TOKEN")
                     or os.environ.get("HUGGING_FACE_HUB_TOKEN"))
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        self.model = AutoModel.from_pretrained(model_name, token=token)

    @classmethod
    def _prepare(cls, sequence):
        """Return ``sequence`` as upper-case, space-separated residues."""
        residues = "".join(sequence.split()).upper()
        return " ".join(residues.translate(cls._RARE_RESIDUES))

    def get_embedding(self, sequence):
        """Return the mean-pooled embedding of ``sequence`` as a NumPy array.

        Args:
            sequence: A protein sequence string (or a list of such strings,
                which is tokenized as one padded batch).

        Returns:
            ``last_hidden_state`` averaged over dim 1, squeezed and converted
            to a NumPy array.
        """
        # The residues must be separated by spaces: a contiguous string such
        # as "MKT..." is seen as one unknown word by a per-residue vocabulary,
        # which would give every protein practically the same embedding.
        if isinstance(sequence, str):
            text = self._prepare(sequence)
        else:
            text = [self._prepare(item) for item in sequence]
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            # NOTE: the mean runs over every position the tokenizer emitted,
            # including any special tokens it added.
            embedding = self.model(**inputs).last_hidden_state.mean(dim=1)
        return embedding.squeeze().numpy()

    def compute_similarity_matrix(self, sequences):
        """Return the matrix of pairwise embedding dot products.

        Args:
            sequences: Iterable of protein sequence strings.

        Returns:
            A square array whose ``[i, j]`` entry is the dot product of the
            embeddings of sequences ``i`` and ``j``. The values are NOT
            normalized, so they are not bounded by 1. An empty input yields
            an array of shape ``(0, 0)``.
        """
        embeddings = [self.get_embedding(seq) for seq in sequences]
        if not embeddings:
            return np.zeros((0, 0))
        stacked = np.stack(embeddings)
        return np.dot(stacked, stacked.T)


# Hierarchical Clustering to Construct Guide Tree
def construct_guide_tree(similarity_matrix):
    """Build an average-linkage (UPGMA) guide tree from pairwise similarities.

    Args:
        similarity_matrix: Square, symmetric array of pairwise similarities.
            Both cosine-style matrices (unit diagonal) and raw dot-product
            (Gram) matrices are accepted.

    Returns:
        The SciPy linkage matrix with ``n - 1`` rows for ``n`` sequences.

    Raises:
        ValueError: If the matrix is not square or describes fewer than two
            sequences.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    from scipy.spatial.distance import squareform

    sim = np.asarray(similarity_matrix, dtype=float)
    if sim.ndim != 2 or sim.shape[0] != sim.shape[1]:
        raise ValueError("similarity_matrix must be a square 2-D array")
    if sim.shape[0] < 2:
        raise ValueError("at least two sequences are required to build a guide tree")

    # Rescale to a unit diagonal so that "1 - similarity" is a meaningful
    # distance even when the input holds unnormalized dot products. A matrix
    # that already has ones on the diagonal is left unchanged.
    diagonal = np.diag(sim)
    if np.all(diagonal > 0):
        norms = np.sqrt(diagonal)
        sim = sim / np.outer(norms, norms)

    distance = 1.0 - sim
    distance = (distance + distance.T) / 2.0  # remove round-off asymmetry
    np.clip(distance, 0.0, None, out=distance)
    np.fill_diagonal(distance, 0.0)

    # linkage() interprets a 2-D array as observation vectors, not as a
    # distance matrix; it must be given the condensed (1-D) form instead.
    return linkage(squareform(distance, checks=False), method='average')

# Pairwise Alignment using Youshan Scoring
def pairwise_align(seq1, seq2, model):
    """Score a pair of sequences by the dot product of their embeddings.

    Despite the name, no alignment is produced: the return value is the
    dot product of ``model.get_embedding(seq1)`` and
    ``model.get_embedding(seq2)``.
    """
    first_vector = model.get_embedding(seq1)
    second_vector = model.get_embedding(seq2)
    return first_vector.dot(second_vector)

# Progressive MSA
def progressive_msa(sequences, model):
    """Write the sequences to a FASTA file in guide-tree leaf order.

    The sequences are NOT aligned here; they are only reordered so that
    sequences joined early in the guide tree end up next to each other. The
    resulting file is meant to be fed to Clustal Omega.

    Args:
        sequences: Iterable of protein sequence strings.
        model: Object exposing ``compute_similarity_matrix(sequences)``.

    Returns:
        The path of the written file, ``"aligned_sequences.fasta"``.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    from scipy.cluster.hierarchy import leaves_list

    sequences = list(sequences)
    if len(sequences) < 2:
        # A tree needs at least two leaves; there is nothing to reorder.
        ordered_sequences = sequences
    else:
        similarity_matrix = model.compute_similarity_matrix(sequences)
        guide_tree = construct_guide_tree(similarity_matrix)
        # The linkage matrix has n - 1 rows (one per merge), so sorting its
        # rows can never yield a permutation of the n sequences.
        # leaves_list() returns the actual sequence indices in tree order.
        ordered_sequences = [sequences[i] for i in leaves_list(guide_tree)]

    # Save the reordered sequences to a FASTA file for Clustal Omega.
    # NOTE: the ">seq{i}" headers number the sequences by their position in
    # the written file, not by their position in the input.
    with open("aligned_sequences.fasta", "w") as f:
        f.writelines(f">seq{i}\n{seq}\n" for i, seq in enumerate(ordered_sequences))

    return "aligned_sequences.fasta"

# Run Clustal Omega
def run_msa(input_fasta, output_fasta):
    """Align ``input_fasta`` with Clustal Omega and write ``output_fasta``.

    Args:
        input_fasta: Path of the FASTA file holding the unaligned sequences.
        output_fasta: Path the alignment is written to. An existing file is
            overwritten.

    Returns:
        ``output_fasta``.

    Raises:
        FileNotFoundError: If the ``clustalo`` executable is not on ``PATH``.
        subprocess.CalledProcessError: If ``clustalo`` exits with an error.
    """
    # Pass an argument list instead of a shell string so that file names
    # containing spaces or shell metacharacters are handed over verbatim.
    command = [
        "clustalo",
        "-i", str(input_fasta),
        "-o", str(output_fasta),
        "--auto",
        "-v",
        # clustalo refuses to overwrite an existing output file without this
        # flag, which would make every re-run fail.
        "--force",
    ]
    # check=True: a failed run must not be reported as a saved alignment.
    subprocess.run(command, check=True)
    return output_fasta

# Main Execution Pipeline
def main(fasta_file):
    """Run the full pipeline on ``fasta_file`` and report the output path.

    Reads every record of the FASTA file, orders the sequences with a
    ``YoushanAligner``, runs the external aligner on the result and prints
    where the final alignment was written.
    """
    sequences = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences.append(str(record.seq))

    aligner = YoushanAligner()
    ordered_path = progressive_msa(sequences, aligner)
    msa_path = run_msa(ordered_path, "final_msa.fasta")
    print(f"MSA saved to: {msa_path}")

# Example Usage
if __name__ == "__main__":
    # Run the pipeline on the example data set. The input path is the
    # location of the uploaded file in the notebook environment.
    main("/content/KERATIN100.fasta")

# Hierarchical Clustering to Construct Guide Tree
def construct_guide_tree(similarity_matrix):
    """Build an average-linkage (UPGMA) guide tree from pairwise similarities.

    Args:
        similarity_matrix: Square, symmetric array of pairwise similarities.
            Both cosine-style matrices (unit diagonal) and raw dot-product
            (Gram) matrices are accepted.

    Returns:
        The SciPy linkage matrix with ``n - 1`` rows for ``n`` sequences.

    Raises:
        ValueError: If the matrix is not square or describes fewer than two
            sequences.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    from scipy.spatial.distance import squareform

    sim = np.asarray(similarity_matrix, dtype=float)
    if sim.ndim != 2 or sim.shape[0] != sim.shape[1]:
        raise ValueError("similarity_matrix must be a square 2-D array")
    if sim.shape[0] < 2:
        raise ValueError("at least two sequences are required to build a guide tree")

    # Rescale to a unit diagonal so that "1 - similarity" is a meaningful
    # distance even when the input holds unnormalized dot products. A matrix
    # that already has ones on the diagonal is left unchanged.
    diagonal = np.diag(sim)
    if np.all(diagonal > 0):
        norms = np.sqrt(diagonal)
        sim = sim / np.outer(norms, norms)

    distance = 1.0 - sim
    distance = (distance + distance.T) / 2.0  # remove round-off asymmetry
    np.clip(distance, 0.0, None, out=distance)
    np.fill_diagonal(distance, 0.0)

    # linkage() interprets a 2-D array as observation vectors, not as a
    # distance matrix; it must be given the condensed (1-D) form instead.
    return linkage(squareform(distance, checks=False), method='average')

# Pairwise Alignment using Youshan Scoring
def pairwise_align(seq1, seq2, model):
    """Score a pair of sequences by the dot product of their embeddings.

    Despite the name, no alignment is produced: the return value is the
    dot product of ``model.get_embedding(seq1)`` and
    ``model.get_embedding(seq2)``.
    """
    first_vector = model.get_embedding(seq1)
    second_vector = model.get_embedding(seq2)
    return first_vector.dot(second_vector)

# Progressive MSA
def progressive_msa(sequences, model):
    """Write the sequences to a FASTA file in guide-tree leaf order.

    The sequences are NOT aligned here; they are only reordered so that
    sequences joined early in the guide tree end up next to each other. The
    resulting file is meant to be fed to Clustal Omega.

    Args:
        sequences: Iterable of protein sequence strings.
        model: Object exposing ``compute_similarity_matrix(sequences)``.

    Returns:
        The path of the written file, ``"aligned_sequences.fasta"``.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    from scipy.cluster.hierarchy import leaves_list

    sequences = list(sequences)
    if len(sequences) < 2:
        # A tree needs at least two leaves; there is nothing to reorder.
        ordered_sequences = sequences
    else:
        similarity_matrix = model.compute_similarity_matrix(sequences)
        guide_tree = construct_guide_tree(similarity_matrix)
        # The linkage matrix has n - 1 rows (one per merge), so sorting its
        # rows can never yield a permutation of the n sequences.
        # leaves_list() returns the actual sequence indices in tree order.
        ordered_sequences = [sequences[i] for i in leaves_list(guide_tree)]

    # Save the reordered sequences to a FASTA file for Clustal Omega.
    # NOTE: the ">seq{i}" headers number the sequences by their position in
    # the written file, not by their position in the input.
    with open("aligned_sequences.fasta", "w") as f:
        f.writelines(f">seq{i}\n{seq}\n" for i, seq in enumerate(ordered_sequences))

    return "aligned_sequences.fasta"

# Run Clustal Omega
def run_msa(input_fasta, output_fasta):
    """Align ``input_fasta`` with Clustal Omega and write ``output_fasta``.

    Args:
        input_fasta: Path of the FASTA file holding the unaligned sequences.
        output_fasta: Path the alignment is written to. An existing file is
            overwritten.

    Returns:
        ``output_fasta``.

    Raises:
        FileNotFoundError: If the ``clustalo`` executable is not on ``PATH``.
        subprocess.CalledProcessError: If ``clustalo`` exits with an error.
    """
    # Pass an argument list instead of a shell string so that file names
    # containing spaces or shell metacharacters are handed over verbatim.
    command = [
        "clustalo",
        "-i", str(input_fasta),
        "-o", str(output_fasta),
        "--auto",
        "-v",
        # clustalo refuses to overwrite an existing output file without this
        # flag, which would make every re-run fail.
        "--force",
    ]
    # check=True: a failed run must not be reported as a saved alignment.
    subprocess.run(command, check=True)
    return output_fasta

# Main Execution Pipeline
def main(fasta_file):
    """Run the full pipeline on ``fasta_file`` and report the output path.

    Reads every record of the FASTA file, orders the sequences with a
    ``YoushanAligner``, runs the external aligner on the result and prints
    where the final alignment was written.
    """
    sequences = []
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences.append(str(record.seq))

    aligner = YoushanAligner()
    ordered_path = progressive_msa(sequences, aligner)
    msa_path = run_msa(ordered_path, "final_msa.fasta")
    print(f"MSA saved to: {msa_path}")

# Example Usage
if __name__ == "__main__":
    # Run the pipeline on the example data set.
    # NOTE(review): this is the second entry-point guard in the file; the
    # functions above it repeat earlier definitions, so the whole pipeline
    # (including the model load) is executed twice per run.
    main("/content/KERATIN100.fasta")


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).

tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


model.safetensors:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

MSA saved to: final_msa.fasta


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


MSA saved to: final_msa.fasta
