In [1]:
## Loading FASTA files containing DNA sequence records of two pathogens

In [2]:
%%script true
from Bio import SeqIO

filename_acinetobacter = 'acinetobacter.fna'
filename_pseudomonas = 'pseudomonas.fna'

# Report how many FASTA records each genome file contains. The same tally is
# needed for both files, so it is done in a single loop over the two paths.
for fasta_path in (filename_acinetobacter, filename_pseudomonas):
    count = sum(1 for _ in SeqIO.parse(fasta_path, "fasta"))
    print("There were " + str(count) + " records in file " + fasta_path)

In [3]:
%%script true
# Take the first record of each genome file, save it as a single-record FASTA
# file, and keep its sequence in a variable of its own.
print("Choosing first record from each file and writing to disk")
record_acinetobacter = next(SeqIO.parse(filename_acinetobacter, "fasta"))
SeqIO.write(record_acinetobacter, "acinetobacter_record.fasta", "fasta")
record_pseudomonas = next(SeqIO.parse(filename_pseudomonas, "fasta"))
SeqIO.write(record_pseudomonas, "pseudomonas_record.fasta", "fasta")
# Reuse the records read above rather than parsing each file a second time.
acinetobacter_seq = record_acinetobacter.seq
pseudomonas_seq = record_pseudomonas.seq

## Run MUMmer on these sequences to find matching segments

Download MUMmer from [GitHub](https://github.com/mummer4/mummer). MUMmer is a versatile alignment tool for DNA and protein sequences.

```bash
./mummer -maxmatch -s ../acinetobacter_record.fasta ../pseudomonas_record.fasta > ../acinetobacter_pseudomonas_matches.txt
```

Now load the MUMmer output and keep only the unique matching segments.

In [4]:
%%script true
def load_filtered_file(filepath):
    """Return the nucleotide-only lines of a text file, joined by newlines.

    Every line is stripped of surrounding whitespace and lower-cased. A line is
    kept only if it is non-empty and consists exclusively of the characters
    a, g, t and c.

    Args:
        filepath: Path of the text file to read.

    Returns:
        One string containing the kept (lower-cased) lines separated by a
        newline character; an empty string if no line qualifies.
    """
    nucleotides = frozenset("agtc")

    def _is_sequence(text):
        # The empty string would pass the superset test, so reject it first.
        return bool(text) and nucleotides.issuperset(text)

    with open(filepath, 'r') as handle:
        normalized = (raw.strip().lower() for raw in handle)
        kept = [text for text in normalized if _is_sequence(text)]
    return "\n".join(kept)

# Specify the file path accordingly
filepath = "acinetobacter_pseudomonas_matches.txt"
# load_filtered_file returns lower-case lines; convert them to upper case.
# upper() states that intent directly, whereas swapcase() only produced upper
# case because its input happened to be entirely lower case.
matching_segments = load_filtered_file(filepath).upper()
matches = matching_segments.splitlines()
# Collapse repeated segments so that each distinct sequence is kept once.
unique_matches = set(matches)
print("Unique matches: " + str(len(unique_matches)))

Write the unique matches to a FASTA file and build a PDB structure for each one using PyMOL's Python bindings.

Tools used:
1. [BioPython](https://biopython.org/)
2. [PyMOL fnab](https://pymolwiki.org/index.php/Fnab)



In [5]:
%%script true
# Write every unique match as its own FASTA record.
# The matches are sorted first: iterating a set of strings directly can yield a
# different order in each Python process, which would make the position of a
# sequence in the file (and any index derived from it) non-reproducible.
# Each record also gets a non-empty identifier ("match_<index>") instead of a
# bare ">" header line.
fasta_str = "".join(
    f">match_{index}\n{match}\n"
    for index, match in enumerate(sorted(unique_matches))
)
with open("unique_matches.fasta", "w") as f:
    f.write(fasta_str)

In [6]:
%%script true
from Bio import SeqIO
import numpy as np
import pymol
import os
from multiprocessing import Pool
pdb_dir = "dna_strand_pdb_files/"
os.makedirs(pdb_dir, exist_ok=True)
fasta_file = "unique_matches.fasta"
nprocs = 12

# Read all records up front so the file is closed before the worker pool starts.
with open(fasta_file, 'r') as handle:
    records = list(SeqIO.parse(handle, 'fasta'))


def process_record(args):
    """Build a B-form double-helix model of one sequence and save it as a PDB file.

    Args:
        args: ``(count, record)`` pair. ``count`` is the index used in the
            output file name; ``record`` supplies the sequence via ``.seq``.

    Returns:
        The ``count`` value that was passed in.
    """
    count, record = args
    print(f"Writing sequence {count} to disk")
    dna_sequence = record.seq
    # The PyMOL object is named after the sequence itself, so the same string
    # is used as the selection when saving.
    pymol.cmd.do(f'fnab {dna_sequence}, name={dna_sequence}, mode=DNA, form=B, dbl_helix=1')
    # os.path.join avoids the doubled "/" that plain concatenation produced,
    # because pdb_dir already ends with a separator.
    pymol.cmd.save(os.path.join(pdb_dir, f"dna_sequence_{count}.pdb"), f"{dna_sequence}")
    return count


with Pool(processes=nprocs) as pool:
    counts = pool.map(process_record, enumerate(records))

# `count` only exists inside process_record, so report the number of results
# returned by the pool instead.
print(f"Total {len(counts)} Sequences")

In [7]:
import os
import subprocess
import numpy as np
from multiprocessing import Pool

# Directory holding one PDB file per DNA strand.
dna_strands_dir = "dna_strand_pdb_files"

# Peptide structure that every DNA strand is docked against.
peptide_models_dir = "peptide_models"
peptide_model_file = "fold_2024_12_28_00_29_model_3.pdb"
peptide_pdb = os.path.join(peptide_models_dir, peptide_model_file)

# Location of the hdock executable, relative to the user's home directory.
home_dir = os.environ.get("HOME", "")
path_to_hdock = os.path.join(home_dir, ".local", "HDOCKlite-v1.1")
hdock_bin = os.path.join(path_to_hdock, "hdock")

# Base names (without extension) of the strand files. Only ".pdb" files are
# accepted because process_score re-appends ".pdb" to each name, so any other
# file in the directory would yield a path that does not exist. Sorting makes
# the processing order independent of the arbitrary order of os.listdir.
filenames = sorted(
    os.path.splitext(f)[0]
    for f in os.listdir(dna_strands_dir)
    if f.endswith(".pdb") and os.path.isfile(os.path.join(dna_strands_dir, f))
)
nstrands = len(filenames)
print(f"Total {nstrands} DNA strands")

def hdock_score_best(dna_pdb, peptide_pdb):
    """Dock a peptide against a DNA structure with hdock and return one score.

    Runs ``hdock_bin`` with ``peptide_pdb`` as the first and ``dna_pdb`` as the
    second structure argument. The result is written to
    ``<dna base name>_<peptide base name>.out`` in the current working
    directory and then parsed.

    Args:
        dna_pdb: Path to the DNA PDB file.
        peptide_pdb: Path to the peptide PDB file.

    Returns:
        float: The value in the 7th whitespace-separated column of the 6th
        line of the output file.

    Raises:
        subprocess.CalledProcessError: If hdock exits with a non-zero status.
    """
    dna_stem = os.path.splitext(os.path.basename(dna_pdb))[0]
    peptide_stem = os.path.splitext(os.path.basename(peptide_pdb))[0]
    fname = dna_stem + "_" + peptide_stem
    output_file = f"{fname}.out"
    # Pass the arguments as a list and skip the shell, so paths containing
    # spaces or shell metacharacters reach hdock unchanged.
    command = [hdock_bin, peptide_pdb, dna_pdb, "-out", output_file]
    subprocess.run(command, stdout=subprocess.DEVNULL, check=True)
    with open(output_file, "r") as f:
        lines = f.readlines()
    # NOTE(review): assumes line 6 is the best-ranked solution and column 7 its
    # docking score - confirm against the HDOCKlite output format.
    score = float(lines[5].split()[6])
    return score

def compute_confidence_score(docking_score):
    """Convert a docking score into a confidence score.

    Formula (taken from http://hdock.phys.hust.edu.cn/help.php):

        Confidence_score = 1.0 / (1.0 + np.exp(0.02 * (docking_score + 150)))

    Args:
        docking_score: Docking score to convert.

    Returns:
        The confidence score; lower (more negative) docking scores give
        higher confidence.
    """
    exponent = 0.02 * (docking_score + 150)
    denominator = 1.0 + np.exp(exponent)
    return 1.0 / denominator

def process_score(args):
    """Dock one DNA strand against the peptide model and score the result.

    Args:
        args: ``(index, name)`` pair. Only ``name`` is used: the base name
            (without extension) of a PDB file inside ``dna_strands_dir``.

    Returns:
        Tuple ``(name, docking_score, confidence_score)``.
    """
    _, strand_name = args
    strand_path = f"{dna_strands_dir}/{strand_name}.pdb"
    raw_score = hdock_score_best(strand_path, peptide_pdb)
    return strand_name, raw_score, compute_confidence_score(raw_score)

# Worker count comes from the PBS_NCPUS environment variable, defaulting to 12
# when it is not set.
nprocs = int(os.getenv('PBS_NCPUS', '12'))

# Score every strand in parallel; pool.map returns the results in input order.
with Pool(processes=nprocs) as pool:
    results = pool.map(process_score, enumerate(filenames))

print(results)

Total 169 DNA strands
[('dna_sequence_0', -165.95, 0.5790805251725788), ('dna_sequence_1', -167.38, 0.5860354644374585), ('dna_sequence_10', -161.13, 0.5554213413049133), ('dna_sequence_100', -162.26, 0.5609947068545766), ('dna_sequence_101', -183.88, 0.6632028319474459), ('dna_sequence_102', -170.48, 0.6009919624749556), ('dna_sequence_103', -164.62, 0.5725835908786807), ('dna_sequence_104', -170.34, 0.6003203312457148), ('dna_sequence_105', -164.94, 0.5741491406652448), ('dna_sequence_106', -160.07, 0.5501804962832794), ('dna_sequence_107', -167.29, 0.5855987207016227), ('dna_sequence_108', -169.29, 0.5952712272615408), ('dna_sequence_109', -167.94, 0.5887499168908743), ('dna_sequence_11', -161.41, 0.5568037081212369), ('dna_sequence_110', -178.42, 0.6383939015911498), ('dna_sequence_111', -157.02, 0.5350424553622494), ('dna_sequence_112', -163.49, 0.5670438045983401), ('dna_sequence_113', -155.83, 0.5291170189576817), ('dna_sequence_114', -176.07, 0.6274750748203015), ('dna_sequence

In [8]:
import pandas as pd

# Tabulate the docking results, tag every row with the peptide model that was
# used, put the columns in reporting order, and sort the rows from the lowest
# to the highest docking score.
df = (
    pd.DataFrame(results, columns=["DNA Sequence", "Docking Score", "Confidence Score"])
    .assign(**{"Peptide Model": peptide_pdb})
    .loc[:, ["DNA Sequence", "Peptide Model", "Docking Score", "Confidence Score"]]
    .sort_values(by="Docking Score", ascending=True)
)
print(df)

# Persist the sorted table as a JSON array with one object per row.
df.to_json("best_docked_scores.json", orient='records', indent=4)

         DNA Sequence                                     Peptide Model  \
4    dna_sequence_101  peptide_models/fold_2024_12_28_00_29_model_0.pdb   
139   dna_sequence_72  peptide_models/fold_2024_12_28_00_29_model_0.pdb   
153   dna_sequence_85  peptide_models/fold_2024_12_28_00_29_model_0.pdb   
14   dna_sequence_110  peptide_models/fold_2024_12_28_00_29_model_0.pdb   
140   dna_sequence_73  peptide_models/fold_2024_12_28_00_29_model_0.pdb   
..                ...                                               ...   
132   dna_sequence_66  peptide_models/fold_2024_12_28_00_29_model_0.pdb   
15   dna_sequence_111  peptide_models/fold_2024_12_28_00_29_model_0.pdb   
154   dna_sequence_86  peptide_models/fold_2024_12_28_00_29_model_0.pdb   
17   dna_sequence_113  peptide_models/fold_2024_12_28_00_29_model_0.pdb   
117   dna_sequence_52  peptide_models/fold_2024_12_28_00_29_model_0.pdb   

     Docking Score  Confidence Score  
4          -183.88          0.663203  
139        -182.87   