<a href="https://colab.research.google.com/github/hoony6134/project/blob/main/sorting_fasta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DNA Sequence Classification Using FASTA Files and an SVM

In [2]:
%pip install biopython

Collecting biopython
  Downloading biopython-1.82-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.82


In [17]:
from Bio import SeqIO
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [18]:
# Load every sequence stored in a FASTA file
def read_fasta(file_path):
    """Return the sequences contained in a FASTA file as plain strings.

    Args:
        file_path: Path of the FASTA file to open (text mode).

    Returns:
        list[str]: ``str(record.seq)`` for each record yielded by
        ``SeqIO.parse``, in file order. Empty when the parser yields nothing.
    """
    with open(file_path, "r") as handle:
        # The comprehension is evaluated while the handle is still open,
        # because SeqIO.parse reads lazily.
        return [str(entry.seq) for entry in SeqIO.parse(handle, "fasta")]

# Function to train SVM model
def train_svm(positive_files, negative_files, max_iter=100, kmer_size=3):
    """Train a linear SVM that separates positive from negative FASTA sequences.

    Each sequence is converted to a vector of overlapping k-mer counts
    (character n-grams of length ``kmer_size``). 20% of the sequences are
    held out, a linear-kernel ``SVC`` is fitted on the remainder, and its
    accuracy on the held-out part is reported.

    Args:
        positive_files: Iterable of FASTA paths; their sequences get label 1.
        negative_files: Iterable of FASTA paths; their sequences get label 0.
        max_iter: Passed to ``svm.SVC(max_iter=...)``. This is a hard cap on
            solver iterations, not a number of training epochs.
        kmer_size: Length of the k-mers counted as features (default 3).

    Returns:
        tuple: ``(clf, vectorizer, accuracy)`` where ``clf`` is the fitted
        ``SVC``, ``vectorizer`` is the fitted ``CountVectorizer`` (needed to
        encode new sequences the same way), and ``accuracy`` is the accuracy
        on the held-out split expressed as a percentage (0-100).

    Raises:
        ValueError: If ``kmer_size`` is smaller than 1, or if either class
            ends up with no sequences.
    """
    if kmer_size < 1:
        raise ValueError("kmer_size must be a positive integer")

    # Gather the sequences of every file, class by class
    positive_sequences = [seq for path in positive_files for seq in read_fasta(path)]
    negative_sequences = [seq for path in negative_files for seq in read_fasta(path)]

    # A binary classifier cannot be fitted without examples of both classes;
    # fail here with a clear message instead of deep inside scikit-learn.
    if not positive_sequences or not negative_sequences:
        raise ValueError(
            "Need at least one positive and one negative sequence to train "
            f"(got {len(positive_sequences)} positive, "
            f"{len(negative_sequences)} negative)."
        )

    # Combine sequences and labels (1 = positive, 0 = negative)
    all_sequences = positive_sequences + negative_sequences
    all_labels = [1] * len(positive_sequences) + [0] * len(negative_sequences)

    # Split data into training and testing sets. Stratify so that both classes
    # stay represented on each side of the split; with only a handful of
    # sequences an unstratified split can put a whole class in the test set.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            all_sequences,
            all_labels,
            test_size=0.2,
            random_state=42,
            stratify=all_labels,
        )
    except ValueError:
        # Stratification is impossible (e.g. a class with a single sequence);
        # fall back to the plain random split.
        X_train, X_test, y_train, y_test = train_test_split(
            all_sequences, all_labels, test_size=0.2, random_state=42
        )

    # Vectorize sequences as k-mer counts. The default word-level analyzer
    # would treat each whitespace-free sequence as ONE token, so any sequence
    # not seen during fitting would become an all-zero vector.
    vectorizer = CountVectorizer(
        analyzer="char", ngram_range=(kmer_size, kmer_size)
    )
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

    # Train SVM model
    clf = svm.SVC(kernel='linear', max_iter=max_iter)
    clf.fit(X_train_vectorized, y_train)

    # Evaluate on the held-out sequences
    y_pred = clf.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, y_pred) * 100

    return clf, vectorizer, accuracy

# Function to predict using trained model
def predict(model, vectorizer, input_file):
    """Classify the first sequence of a FASTA file with a trained model.

    Args:
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``; it should be the
            one that was fitted together with ``model``.
        input_file: Path of the FASTA file holding the sequence to classify.

    Returns:
        The first element of ``model.predict(...)`` for that sequence.

    Raises:
        IndexError: If the file contains no FASTA records.
    """
    sequences = read_fasta(input_file)
    if not sequences:
        # Same exception type an empty list would raise on [0], but with a
        # message that names the offending file.
        raise IndexError(
            f"No FASTA records found in {input_file!r}; nothing to classify."
        )

    # Only the first record is classified; any further records are ignored.
    input_sequence_vectorized = vectorizer.transform([sequences[0]])
    return model.predict(input_sequence_vectorized)[0]


In [19]:
# Example usage
# FASTA inputs: a1-a4 form the positive class, n1-n4 the negative class
positive_files = [
    "a1.fasta",
    "a2.fasta",
    "a3.fasta",
    "a4.fasta",
]
negative_files = [
    "n1.fasta",
    "n2.fasta",
    "n3.fasta",
    "n4.fasta",
]

# Fit the classifier; max_iter=100 caps the SVC solver's iterations.
# The third return value is the accuracy (in percent) on the held-out split;
# it is kept but not printed here.
trained_model, trained_vectorizer, training_accuracy = train_svm(
    positive_files=positive_files,
    negative_files=negative_files,
    max_iter=100,
)

# FASTA file holding the sequence to classify
example_file = "example.fasta"

# Classify it with the fitted model and its matching vectorizer
prediction = predict(trained_model, trained_vectorizer, example_file)

# Report the predicted class (label 1 = positive class)
print(
    "Prediction: air purification plant"
    if prediction == 1
    else "Prediction: non-air purification plant"
)

Prediction: air purification plant


**Positive class — air-purification plants (label 1)**

a1.fasta: 고무나무  
https://www.ncbi.nlm.nih.gov/nuccore/OQ646782.1?report=fasta  
a2.fasta: 크리소카디움    
https://www.ncbi.nlm.nih.gov/nuccore/KU598186.1?report=fasta  
a3.fasta: 금전수  
https://www.ncbi.nlm.nih.gov/nuccore/ON962335.1?report=fasta  
a4.fasta: 디시디아  
https://www.ncbi.nlm.nih.gov/nuccore/NC_069567.1?report=fasta  


**Negative class — non-air-purification plants (label 0)**

n1.fasta: 장미  
https://www.ncbi.nlm.nih.gov/nuccore/OR539742.1?report=fasta  

n2.fasta: 민들레  
https://www.ncbi.nlm.nih.gov/nuccore/NC_031395.1?report=fasta  

n3.fasta: 애기똥풀  
https://www.ncbi.nlm.nih.gov/nuccore/NC_046829.1?report=fasta  

n4.fasta: 벼  
https://www.ncbi.nlm.nih.gov/nuccore/NC_066488.1?report=fasta  


**Query sequence to classify**

example.fasta: 시계꽃  
https://www.ncbi.nlm.nih.gov/nuccore/NC_053697.1?report=fasta