In [1]:
import pandas as pd
from Bio import Restriction, SeqIO
import os
import pandas as pd

# Prepare Ref Accessions

In [2]:
# Load the primary reference accessions table from CSV and preview the first rows.
main_accessions = pd.read_csv(filepath_or_buffer='data/ref_query_accessions_seq.csv')
main_accessions.head(n=5)

Unnamed: 0,accession,species,sequence
0,NR_025227.1,Pseudomonas umsongensis,AACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGATGAAGAG...
1,NR_157609.1,Bacillus haynesii,AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCCTAATA...
2,NR_116064.1,Curtobacterium oceanosedimentum,GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGATGAT...
3,NR_104839.1,Curtobacterium oceanosedimentum,TCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGA...
4,NR_115988.1,Acinetobacter rudis,CCAGGGATTTGATTATGGCTCAGATTGAACGCTGGCGGCAGGCTTA...


In [3]:
# Fetch similar accessions from BLAST search on refDB accessions.
# Every FASTA file under ./data/similar_ref_seq contributes one row per record;
# the file name without its extension is stored as `strain_ref`.
similar_records = []
for root, dirs, files in os.walk("./data/similar_ref_seq", topdown=False):
    for file in files:
        # Build the path from `root` (the directory os.walk is currently visiting)
        # so files located in nested directories resolve to their real location.
        filepath = os.path.join(root, file)
        if file.endswith('.fasta') and os.path.isfile(filepath):
            strain_ref = os.path.splitext(file)[0]
            for r in SeqIO.parse(filepath, 'fasta'):
                similar_records.append({
                    "accession": r.id,
                    "strain_ref": strain_ref,
                    "sequence": str(r.seq)
                })

# Build the frame once from the collected rows: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic. Passing `columns`
# keeps the expected schema even when no FASTA record was found.
similar_accessions = pd.DataFrame(similar_records, columns=['accession', 'strain_ref', 'sequence'])
similar_accessions.head()

Unnamed: 0,accession,strain_ref,sequence
0,MN513225.1,NR_025357.1,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...
1,JF710959.1,NR_025357.1,TACACATGCAAGTCGAACGGCAGCACGAGAGAGCTTGCTCTCTTGG...
2,KT988067.1,NR_025357.1,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...
3,KP224304.1,NR_025357.1,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...
4,KF534470.1,NR_025357.1,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...


In [4]:
# Index the primary reference table by accession so rows can be looked up by label.
main_indexed_accessions = main_accessions.set_index("accession")


def _species_for_hit(hit):
    """Return the species stored for the reference accession named in `hit.strain_ref`."""
    return main_indexed_accessions.loc[hit["strain_ref"], "species"]


# Assign Species: each similar accession takes the species of its `strain_ref`.
similar_accessions['species'] = similar_accessions.apply(_species_for_hit, axis='columns')
similar_accessions.head()

Unnamed: 0,accession,strain_ref,sequence,species
0,MN513225.1,NR_025357.1,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...,Alcaligenes faecalis
1,JF710959.1,NR_025357.1,TACACATGCAAGTCGAACGGCAGCACGAGAGAGCTTGCTCTCTTGG...,Alcaligenes faecalis
2,KT988067.1,NR_025357.1,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...,Alcaligenes faecalis
3,KP224304.1,NR_025357.1,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...,Alcaligenes faecalis
4,KF534470.1,NR_025357.1,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...,Alcaligenes faecalis


In [5]:
# Collect all ref accessions together: the similar accessions first, then the
# primary reference accessions, restricted to the columns both tables share.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
shared_columns = ['accession', 'species', 'sequence']
ref_accessions = pd.concat(
    [
        similar_accessions[shared_columns],
        main_indexed_accessions.reset_index()[shared_columns],
    ],
    ignore_index=True,  # renumber rows 0..n-1 across both sources
)
ref_accessions.head()

Unnamed: 0,accession,species,sequence
0,MN513225.1,Alcaligenes faecalis,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...
1,JF710959.1,Alcaligenes faecalis,TACACATGCAAGTCGAACGGCAGCACGAGAGAGCTTGCTCTCTTGG...
2,KT988067.1,Alcaligenes faecalis,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...
3,KP224304.1,Alcaligenes faecalis,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...
4,KF534470.1,Alcaligenes faecalis,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...


In [6]:
# All available species among the similar accessions, in order of first appearance.
pd.unique(similar_accessions['species'])

array(['Alcaligenes faecalis', 'Janibacter melonis',
       'Acinetobacter lwoffii', 'Bacillus subtilis',
       'Stenotrophomonas rhizophilia', 'Sporosarcina globispora',
       'Stenotrophomonas maltophilia', 'Brevundimonas diminuta',
       'Lysinibacillus fusiformis', 'Bacillus aerius',
       'Ochrobactrum pseudogrignonense', 'Pseudomonas putida',
       'Sporosarcina koreensis', 'Rhodococcus qingshengii',
       'Paenibacillus taichungensis', 'Ochrobactrum anthropi',
       'Bacillus cereus', 'Advenella kashmirensis',
       'Curtobacterium oceanosedimentum', 'Exiguobacterium aquaticum',
       'Brevibacillus agri', 'Pseudomonas parafulva', 'Bacillus safensis',
       'Serratia marcescens', 'Acinetobacter radioresistens',
       'Bacillus pseudomycoides', 'Paenibacillus lautus',
       'Bacillus aryabhattai', 'Acinetobacter rudis',
       'Achromobacter marplatensis', 'Bacillus velezensis',
       'Brevundimonas naejangsanensis', 'Bacillus kochii',
       'Sporosarcina psychrophi

In [8]:
# Write the collected reference accessions to CSV.
# Create the output directory first: to_csv does not create missing parent
# directories, so a fresh checkout without ./outputs would fail here.
os.makedirs('./outputs', exist_ok=True)
# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if downstream readers do not need it — confirm before changing.
ref_accessions.to_csv('./outputs/all_ref_accessions_seq.csv')

# Prepare Sample Accessions

In [9]:
# Load the sequenced isolates (sample sequences) from CSV and preview the first rows.
sample_sequences = pd.read_csv(filepath_or_buffer='./data/isolates_details_seq.csv')
sample_sequences.head(n=5)

Unnamed: 0,accession,isolate,sequence,species,genera,phylum
0,MN493874.1,K2,AAGGGGTGGCCTACACATGCAAGTCGAACGGCAGCACAGGAGAGCT...,Stenotrophomonas maltophilia,Stenotrophomonas,Proteobacteria
1,MN493875.1,K4,GGCGCAGGCCTACACATGCAAGTCGAACGAACTCTTCGGAGTTAGT...,Brevundimonas naejangsanensis,Brevundimonas,Proteobacteria
2,MN493876.1,K7,AATGCGGGGCCTACACATGCAAGTCGAACGGCAGCACAGGAGAGCT...,Stenotrophomonas pavanii,Stenotrophomonas,Proteobacteria
3,MN493877.1,K9,CCAAGGGCGGCCTTACCATGCAAGTCGAGCGCCCCGCAAGGGGAGC...,Ochrobactrum anthropi,Ochrobactrum,Proteobacteria
4,MN493878.1,K10,GCAGCTTACCATGCAAGTCGAGCGCCCCGCAAGGGGAGCGGCAGAC...,Ochrobactrum anthropi,Ochrobactrum,Proteobacteria


In [14]:
# Tag every row with its origin before the tables are combined:
# "S" marks sample sequences, "R" marks reference accessions.
for frame, origin_tag in ((sample_sequences, "S"), (ref_accessions, "R")):
    frame['remark'] = origin_tag

In [15]:
# Stack reference rows followed by sample rows into one table.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
# The original row labels of each source are kept (no ignore_index), exactly as
# append did, so index labels repeat across the two sources.
combined = pd.concat(
    [ref_accessions, sample_sequences[['accession', 'species', 'sequence', 'remark']]]
)
combined.head()

Unnamed: 0,accession,species,sequence,remark
0,MN513225.1,Alcaligenes faecalis,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...,R
1,JF710959.1,Alcaligenes faecalis,TACACATGCAAGTCGAACGGCAGCACGAGAGAGCTTGCTCTCTTGG...,R
2,KT988067.1,Alcaligenes faecalis,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...,R
3,KP224304.1,Alcaligenes faecalis,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...,R
4,KF534470.1,Alcaligenes faecalis,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...,R


In [17]:
def prepare_id_column(r):
    """Build a row identifier of the form ``<remark>_<accession>``.

    Args:
        r: A mapping-like row (e.g. a pandas Series) exposing the keys
            ``'remark'`` and ``'accession'``.

    Returns:
        The two values joined by an underscore, as a string.
    """
    origin_tag = r['remark']
    accession_id = r['accession']
    return "{}_{}".format(origin_tag, accession_id)

# Derive the `name` column row by row from each row's remark and accession.
combined['name'] = combined.apply(prepare_id_column, axis='columns')
combined.head(n=5)

Unnamed: 0,accession,species,sequence,remark,name
0,MN513225.1,Alcaligenes faecalis,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...,R,R_MN513225.1
1,JF710959.1,Alcaligenes faecalis,TACACATGCAAGTCGAACGGCAGCACGAGAGAGCTTGCTCTCTTGG...,R,R_JF710959.1
2,KT988067.1,Alcaligenes faecalis,ATTGAACGCTAGCGGGATGCTTTACACATGCAAGTCGAACGGCAGC...,R,R_KT988067.1
3,KP224304.1,Alcaligenes faecalis,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...,R,R_KP224304.1
4,KF534470.1,Alcaligenes faecalis,GAGTTTGATCCTGGCTCAGATTGAACGCTAGCGGGATGCTTTACAC...,R,R_KF534470.1


In [18]:
# Write the combined reference + sample table to CSV.
# Create the output directory first: to_csv does not create missing parent
# directories, so a fresh checkout without ./outputs would fail here.
os.makedirs('./outputs', exist_ok=True)
combined.to_csv('./outputs/combined_seq.csv')