In [1]:
import pandas as pd
import os
import shutil
import gzip
import glob
import csv
import numpy as np
from ete3 import Tree
from Bio import SeqIO

In [21]:
# Show the current working directory; the relative paths used in the cells below resolve against it.
pwd

'/Users/katharineshalvarjian/Documents/gtdb/1-pyl/pyl_users'

### 1. Generate hit list 
Make lists of locus IDs for the hits to each gene of interest (pylRS, comprising pylS, pylSn, and pylSc, and pylBCD). Note that we exclude pylT because it is a nucleotide sequence rather than an amino acid sequence.

In [23]:
# Read the strict pyl-user table (hits for all three of pylB, pylC and pylD)
# and keep its 'genome' column as a plain Python list.
df = pd.read_csv(
    '../../analysis_files/2024-10-28_pyl-users_STRICT.csv',
    sep=',',
)
pyl = df.loc[:, 'genome'].tolist()

In [8]:
# Pull individual gene sequences out of the bulk per-gene FASTA files: for each
# gene, write one FASTA file holding only the records whose 15-character id
# prefix is in `genomes`, with each header replaced by the mapped species name.
# NOTE(review): `genomes` is not assigned in any cell above this one; it is only
# built in a later cell. Presumably it should be the `pyl` list loaded from the
# strict CSV -- confirm, otherwise this cell fails on Restart & Run All.
genes = ['pylB', 'pylC', 'pylD']
df_reference = pd.read_csv('../../archaea_info/ar53_metadata_tax.csv', sep=',')
# Map each `mod_accession` value to the text after the last 's__' in its GTDB taxonomy string.
taxonomy_mapping = dict(zip(df_reference['mod_accession'], df_reference['gtdb_taxonomy'].apply(lambda x: x.split('s__')[-1])))

for gene in genes:
    # Parse the bulk file once and bucket its records by id prefix, instead of
    # re-reading the whole file for every genome.
    records_by_genome = {}
    for record in SeqIO.parse(f"./seq/{gene}_seq.fa", 'fasta'):
        records_by_genome.setdefault(record.id[:15], []).append(record)

    with open(f"./{gene}_parse.fa", 'w') as g:
        # Output order is unchanged: genomes in list order, then records in file order.
        for genome in genomes:
            for record in records_by_genome.get(genome, []):
                # Spaces in the species name become underscores so the header is a single token.
                common = '_'.join(taxonomy_mapping[record.id[:15]].split(' '))
                g.write(f">{common}\n{record.seq}\n")

In [16]:
# Gather the id of every record in the parsed pylB FASTA file, then report how
# many were found.
genomes = [
    entry.id
    for entry in SeqIO.parse('../concat_trees/plus_anme/pylB_parse.fa', 'fasta')
]

print(len(genomes))


102


### 2. Concatenate 
We first aligned all sequences using MAFFT and trimmed the alignments with trimAl to 90% column occupancy. Then, we concatenated the alignments for the biosynthetic machinery (pylBCD) and the incorporation machinery (pylRS) below.

In [18]:
# Concatenate the trimmed pylB, pylC and pylD alignments with each other,
# writing one FASTA entry per id in `genomes`.
genes = ['pylB', 'pylC', 'pylD']

# Read every alignment file once, keyed by record id, instead of re-parsing
# each file for every genome.
aligned_by_gene = {}
for gene in genes:
    seqs_by_id = {}
    for record in SeqIO.parse(f"../concat_trees/plus_anme/trimal-0.9_mafft_{gene}_parse.fa", 'fasta'):
        seqs_by_id.setdefault(record.id, []).append(record.seq)
    aligned_by_gene[gene] = seqs_by_id

with open('../concat_trees/plus_anme/trimal-0.9_pylBCD.fa', 'w') as f:
    for genome in genomes:
        f.write(f">{genome}\n")
        # Each gene's aligned sequence goes on its own line under the shared
        # header, in the order given by `genes`.
        # NOTE(review): a genome missing from one alignment gets no line for that
        # gene, so its concatenated row would be shorter than the others --
        # consider verifying that every genome is present in all three files.
        for gene in genes:
            for seq in aligned_by_gene[gene].get(genome, []):
                f.write(f"{seq}\n")
In [18]:
# Combine the pylS, pylSc and pylSnSc sequences into a single FASTA file,
# keeping only records whose 15-character id prefix is in `genomes` and using
# that prefix as the header.
# NOTE(review): this only matches if `genomes` holds ids in the same
# 15-character form as the prefixes taken here -- confirm.
genes = ['pylS', 'pylSc', 'pylSnSc']
wanted_ids = set(genomes)  # set lookup instead of scanning the list for every record
with open('./pylS_comp.fa', 'w') as f:
    for gene in genes:
        for record in SeqIO.parse(f"./seq/{gene}_seq.fa", 'fasta'):
            prefix = record.id[:15]
            if prefix in wanted_ids:
                f.write(f">{prefix}\n{record.seq}\n")

In [43]:
# define a sequence counting function to make sure that output is the same as the input
def seq_counter(file):
    """Count the FASTA records in ``file``, print the count and return it.

    A record is counted for every line that starts with '>' (a FASTA header
    line). Lines that merely contain '>' somewhere else are not counted.

    Parameters
    ----------
    file : str or path-like
        Path of the text file to scan.

    Returns
    -------
    int
        Number of header lines found (0 for an empty file).
    """
    with open(file, 'r') as f:
        count = sum(1 for line in f if line.startswith('>'))
    print(count)
    return count

### 3. Rename sequences 
Since we use Dendroscope to assemble the tanglegrams, we renamed all of the sequences to their species names using a taxonomy-converting dictionary (below).

In [49]:
# Rewrite the FASTA file so that every header is the species name mapped from
# the record id, with spaces replaced by underscores.
file = './pylBCD_seq.fa'
df_reference = pd.read_csv('../../archaea_info/ar53_metadata_tax.csv', sep=',')
# Map each `mod_accession` value to the text after the last 's__' in its GTDB taxonomy string.
taxonomy_mapping = dict(zip(df_reference['mod_accession'], df_reference['gtdb_taxonomy'].apply(lambda x: x.split('s__')[-1])))

# Name the output after the input's base name with its extension removed, so
# this works for any input directory rather than only paths shaped like './name.fa'.
# The output is written to the current working directory.
out_stem = os.path.splitext(os.path.basename(file.strip()))[0]
with open(f"{out_stem}_genomes2.fa", 'w') as f:
    for record in SeqIO.parse(file, 'fasta'):
        # A record id missing from the mapping raises KeyError here.
        genome = '_'.join(taxonomy_mapping[record.id].split(' '))
        f.write(f">{genome}\n{record.seq}\n")
        