## Import packages

In [None]:
import Bio 
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Show the current working directory: every path used below ("raw/...",
# "processed/...") is relative, so it is resolved against this directory.
print(os.getcwd())


## Create reverse complement of old sequences

In [None]:
# Reverse-complement every sequence of the NEXUS file and save the result
# as a new NEXUS file next to the original.
input_file = "raw/Rasit.nex"
output_file = "raw/Rasit_revcomp.nex"

# Build one new record per input record. The id is kept, the description is
# tagged so the origin of the sequence stays traceable, and the molecule
# type is declared up front for the NEXUS writer.
revcomp_records = [
    SeqRecord(
        original.seq.reverse_complement(),
        id=original.id,
        description=original.description + " reverse_complemented",
        annotations={"molecule_type": "DNA"},
    )
    for original in SeqIO.parse(input_file, "nexus")
]

# NEXUS output has to be written in a single call with all records.
with open(output_file, "w") as output_handle:
    SeqIO.write(revcomp_records, output_handle, "nexus")


## Join all sequences together and trim to overlapping region

In [23]:
# List your input NEXUS files
input_files = ["raw/Rasit_revcomp.nex", "raw/Sardinia.nexus", "raw/Megan.nexus"]
# input_files = ["raw/Rasit.nex", "raw/Sardinia.nexus"]

# Output FASTA file. It is written under processed/ because the trimming
# step reads "processed/combined_CO1.fasta"; writing to the working
# directory would leave that step without its input.
output_file = "processed/combined_CO1.fasta"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Collect all sequences, keeping the order of input_files
all_records = []
for f in input_files:
    all_records.extend(SeqIO.parse(f, "nexus"))

# Write all sequences (unaligned) to one FASTA.
# SeqIO.write returns the number of records written, which the notebook
# displays as the cell output.
SeqIO.write(all_records, output_file, "fasta")


316

Sites 106-588 (1-based, inclusive) are common to both the new and the old sequences.

In [None]:
## Trim every sequence down to the region shared by all datasets
input_file = "processed/combined_CO1.fasta"
output_file = "processed/combined_CO1_trimmed.fasta"

start = 106  # 1-based inclusive
end = 588    # 1-based inclusive

# Translate the 1-based inclusive range into a 0-based, end-exclusive slice
# once, and build the description tag once, instead of per record.
window = slice(start - 1, end)
suffix = f" trimmed_{start}_{end}"

trimmed_records = []
for rec in SeqIO.parse(input_file, "fasta"):
    # Records are modified in place: only the sequence and description change.
    rec.seq = rec.seq[window]
    rec.description = rec.description + suffix
    trimmed_records.append(rec)

SeqIO.write(trimmed_records, output_file, "fasta")

## Align/trim sequences and write nexus

In [None]:
## Align with MAFFT (shell command, not executed by this notebook)
# mafft --auto processed/input.fasta > processed/output.fasta


In [None]:
from Bio.Nexus import Nexus
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment

# Merge the per-dataset NEXUS alignments into one NEXUS alignment.
file_list = ["raw/Megan.nexus", "raw/Rasit_revcomp.nex", "raw/Sardinia.nexus"]
all_records = []
lengths_by_file = {}  # file name -> number of alignment columns
for f in file_list:
    alignment = AlignIO.read(f, "nexus")
    lengths_by_file[f] = alignment.get_alignment_length()
    all_records.extend(alignment)

# MultipleSeqAlignment only accepts rows of identical length. Check this
# up front so a mismatch is reported with the offending files and lengths.
# Shorter sequences are deliberately NOT padded with gaps here: that would
# make the call succeed but produce a misaligned matrix.
if len(set(lengths_by_file.values())) > 1:
    details = ", ".join(
        f"{name} ({n_cols} columns)" for name, n_cols in lengths_by_file.items()
    )
    raise ValueError(
        "Cannot combine alignments of different lengths: "
        + details
        + ". Align/trim the sequences to a common region first."
    )

combined = MultipleSeqAlignment(all_records)
AlignIO.write(combined, "processed/combined_CO1.nex", "nexus")
