### Load the Libraries

In [2]:
from Bio import SeqIO
import re
# import math

In [3]:
# Primary libraries
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Secondary libraries
from tqdm import tqdm
# from scipy import stats
import scipy
# from glob import glob
import os
# from matplotlib import dates as mpl_dates
# import datetime
# from datetime import date
# import matplotlib.patches as mpatches
# from matplotlib import cm
# from colorspacious import cspace_converter
# from collections import OrderedDict
# import matplotlib.ticker as ticker

### Readings for the analysis

We are dealing with .gb files with single records each.

From (https://warwick.ac.uk/fac/sci/moac/people/students/peter_cock/python/genbank/)-

    Depending on the type of GenBank file(s) you are interested in, they will either contain a single record, or multiple records. You can easily determine this by looking at the raw file - each record will start with a LOCUS line, followed by various other header lines, usually a list of features, the sequence data, and ends with a // line (slash slash).

Locations provided by **BioPython** are well suited to Python slicing:
- start and end provided by it are 1938 and 3075 respectively (0-based indexing and it assumes that end position is not included),
- while in our actual file it is 1939 and 3075 (1-based indexing and it assumes that both start and end positions are included).

This way we can directly slice seq string using locations provided to obtain the seq for the features of our interest.

1. **The DDBJ/ENA/GenBank Feature Table Definition**: 
    Documentation of the features used in GenBank files. A very good document; worth reading through at least once.
    
    Source:
    - (https://www.insdc.org/documents/feature_table.html)



2. **Locus tag:**
    Locus_tags are identifiers that are systematically applied to every gene in a genome.
    These tags have become surrogate gene names by the biological community. If two
    submitters of two different genomes use the same systematic names to describe two very
    different genes in two very different genomes, it can be very confusing. In order to
    prevent this from happening INSD has created a registry of locus_tag prefixes.
    Submitters of eukaryotic and prokaryotic genomes should register their prefix prior to
    submitting their genome. All components of a project (such as multiple chromosomes or
    plasmids, etc) should use the same locus_tag prefix. 
    
    Source:
    - (https://www.ncbi.nlm.nih.gov/genomes/locustag/Proposal.pdf)
    - (https://www.ncbi.nlm.nih.gov/genbank/genomesubmit_annotation/#locus_tag)

### Extracting desirable information

Useful videos for the analysis done later-
1. https://www.youtube.com/watch?v=LdQV3cbUwEE&list=PLe1-kjuYBZ05T9iHV_z60B9mpFt201ND5&index=8
2. https://www.youtube.com/watch?v=HP7ThAj_f1E

_Both videos are on Youtube @Bioinformatics Coach_

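KeyError resolution - replace the first form, which raises `KeyError` when a feature has no `gene` qualifier, with the second, which falls back to a default value: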

    gene_name = gene.qualifiers['gene'][0]
    gene_name = gene.qualifiers.get('gene',['unavailable'])[0]

_Source_: https://bioinformatics.stackexchange.com/questions/15454/keyerror-when-getting-features-from-a-genbank-file-with-biopython-with-some-acce/15456#15456

In [None]:
# Desired flank length (number of bases taken on each side of a gene).
# Read as a module-level constant by the flank-extraction and motif-mapping code.
FLANK_LENGTH = 200

In [33]:
def _gene_flanks(seq, start_pos, end_pos, strand_sense, flank_length, overlap=0):
    """Slice the upstream and downstream flanks of one gene out of ``seq``.

    Both flanks are returned as read 5'->3' on the gene's own strand: for a
    -ve sense gene the slices are reverse-complemented, and the region after
    ``end_pos`` is the upstream one.

    Args:
        seq: sequence of the whole record (0-based, end-exclusive slicing).
        start_pos: 0-based start of the gene on the record.
        end_pos: end of the gene on the record (not included in the gene).
        strand_sense: 1 for the +ve strand, -1 for the -ve strand.
        flank_length: number of bases wanted in each flank.
        overlap: extra bases appended to the 3' end of each flank (into the
            gene for the upstream flank, further away from the gene for the
            downstream flank), so that a motif starting on the last flank
            position can still be matched in full.

    Returns:
        ``(upstream, downstream)``. Flanks that run off either end of the
        record are truncated. Two empty strings are returned when
        ``strand_sense`` is neither 1 nor -1.
    """
    # Slicing past the end of the sequence is clamped by Python itself; only
    # the left bound has to be clamped by hand, because a negative start
    # index would wrap around to the end of the sequence.
    if strand_sense == 1:
        upstream = seq[max(0, start_pos - flank_length):start_pos + overlap]
        downstream = seq[end_pos:end_pos + flank_length + overlap]
    elif strand_sense == -1:
        upstream = seq[max(0, end_pos - overlap):end_pos + flank_length].reverse_complement()
        downstream = seq[max(0, start_pos - flank_length - overlap):start_pos].reverse_complement()
    else:
        # No usable strand: the flank orientation is undefined.
        return '', ''
    return upstream, downstream


def genbank_file_reader(file_name, pattern_length=4):
    """Extract every gene and its flanks from a single-record GenBank file.

    Reads ``./genbank_files/<file_name>``, collects one row per ``gene``
    feature and writes the table to
    ``./csv_files_for_promoters/consensus02_<file_name without its last 3 chars>.csv``.

    Args:
        file_name: name of the GenBank file inside ``./genbank_files/``.
        pattern_length: length of the motif that will later be searched in the
            ``*_adjusted`` flanks; those flanks carry ``pattern_length - 1``
            extra bases so motifs starting on the last flank positions are
            not lost.

    Returns:
        The ``pandas.DataFrame`` that was written to disk.
    """
    # Context manager so the file handle is closed once the record is parsed.
    with open(f"./genbank_files/{file_name}", 'r') as handle:
        gb_record = SeqIO.read(handle, 'genbank')
    print(f'Name {gb_record.name}, {len(gb_record.features)} features')

    genome = gb_record.seq
    overlap = pattern_length - 1
    data = []

    for gene in gb_record.features:
        if gene.type != 'gene':
            continue

        # Qualifiers may be missing, hence the defaults.
        gene_name = gene.qualifiers.get('gene', ['unavailable'])[0]
        # [7:] drops a leading 'GeneID:' prefix.
        # NOTE(review): assumes the first db_xref entry is the GeneID one - confirm.
        gene_ID = gene.qualifiers.get('db_xref', ['GeneID:unavailable'])[0][7:]
        locus_tag = gene.qualifiers.get('locus_tag', ['unavailable'])[0]

        # 0-based start, end not included (add 1 to start for 1-based indexing).
        start_pos = int(gene.location.start)
        end_pos = int(gene.location.end)
        strand_sense = gene.location.strand

        gene_seq = gene.extract(genome)

        # Plain flanks of (at most) FLANK_LENGTH bases.
        upstream_flank, downstream_flank = _gene_flanks(
            genome, start_pos, end_pos, strand_sense, FLANK_LENGTH)

        # Flanks of (at most) FLANK_LENGTH + (pattern_length - 1) bases, to
        # handle the 0 count for motifs at the flank boundary.
        upstream_flank_adjusted, downstream_flank_adjusted = _gene_flanks(
            genome, start_pos, end_pos, strand_sense, FLANK_LENGTH, overlap)

        data.append((locus_tag, gene_ID, gene_name, start_pos, end_pos, strand_sense,
                     str(gene_seq), str(upstream_flank), str(downstream_flank),
                     str(upstream_flank_adjusted), str(downstream_flank_adjusted)))

    df = pd.DataFrame(data, columns=['locus_tag',
                                     'gene_ID',
                                     'gene_name',
                                     'start_position',
                                     'end_position',
                                     'strand_sense',
                                     'gene_seq',
                                     'upstream_flank',
                                     'downstream_flank',
                                     'upstream_flank_adjusted',
                                     'downstream_flank_adjusted'])

    # file_name[:-3] strips the '.gb' extension.
    df.to_csv('./csv_files_for_promoters/consensus02_'+file_name[:-3]+'.csv', index=False)
    return df

How to iterate over a given directory:
https://stackoverflow.com/questions/10377998/how-can-i-iterate-over-files-in-a-given-directory

In [34]:
# Run the flank extraction on every GenBank (.gb) file in ./genbank_files/
directory = os.fsencode('./genbank_files/')

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    # Anything that is not a GenBank file is ignored
    if not filename.endswith('.gb'):
        continue
    print(filename)
    genbank_file_reader(filename)

escherichia_coli_BW25113.gb
Name CP009273, 9462 features
bacillus_subtilis.gb
Name NC_000964, 9074 features
streptococcus_pneumoniae.gb
Name NZ_CP020549, 4328 features
klebsiella_pneumoniae.gb
Name NC_016845, 10894 features


### Loading the csvs

In [8]:
# Lookup from a 1-based position inside a flank to a negative offset:
# 1 -> -FLANK_LENGTH, 2 -> -FLANK_LENGTH + 1, ..., FLANK_LENGTH -> -1.
mapping_keys_list = list(range(1, FLANK_LENGTH + 1))
mapping_values_list = list(range(-FLANK_LENGTH, 0))
mapping_dict = dict(zip(mapping_keys_list, mapping_values_list))

print(mapping_dict)

{1: -200, 2: -199, 3: -198, 4: -197, 5: -196, 6: -195, 7: -194, 8: -193, 9: -192, 10: -191, 11: -190, 12: -189, 13: -188, 14: -187, 15: -186, 16: -185, 17: -184, 18: -183, 19: -182, 20: -181, 21: -180, 22: -179, 23: -178, 24: -177, 25: -176, 26: -175, 27: -174, 28: -173, 29: -172, 30: -171, 31: -170, 32: -169, 33: -168, 34: -167, 35: -166, 36: -165, 37: -164, 38: -163, 39: -162, 40: -161, 41: -160, 42: -159, 43: -158, 44: -157, 45: -156, 46: -155, 47: -154, 48: -153, 49: -152, 50: -151, 51: -150, 52: -149, 53: -148, 54: -147, 55: -146, 56: -145, 57: -144, 58: -143, 59: -142, 60: -141, 61: -140, 62: -139, 63: -138, 64: -137, 65: -136, 66: -135, 67: -134, 68: -133, 69: -132, 70: -131, 71: -130, 72: -129, 73: -128, 74: -127, 75: -126, 76: -125, 77: -124, 78: -123, 79: -122, 80: -121, 81: -120, 82: -119, 83: -118, 84: -117, 85: -116, 86: -115, 87: -114, 88: -113, 89: -112, 90: -111, 91: -110, 92: -109, 93: -108, 94: -107, 95: -106, 96: -105, 97: -104, 98: -103, 99: -102, 100: -101, 101: -1

How to iterate rows of a dataframe: https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas

In [10]:
def _count_motif_starts(flanks, motif, counts):
    """Add to ``counts`` the 1-based start of every ``motif`` match in ``flanks``."""
    for flank in flanks:
        # Boundary regions with no flank come back from the CSV as NaN; skip them.
        if pd.isnull(flank):
            continue
        for match in motif.finditer(flank):
            counts[match.start() + 1] += 1


def motif_analysis(df, file_name):
    """Count, per flank position, how many genes have the motif starting there.

    Uses the module-level ``pattern``, ``keys_list`` and ``mapping_dict``.

    Args:
        df: dataframe loaded from a flank CSV; its ``upstream_flank_adjusted``
            and ``downstream_flank_adjusted`` columns are scanned.
        file_name: name of that CSV; its first 12 characters are dropped and
            the remainder is appended to the output file names.

    Returns:
        ``(df_up, df_down)``: the upstream counts (indexed by the offsets from
        ``mapping_dict``) and the downstream counts (indexed by 1-based
        position). Both are also written to
        ``./csv_files_for_promoters_consensus_motifs/``.

    Raises:
        KeyError: if a motif starts at a position that is not in ``keys_list``.
    """
    # re.finditer only yields non-overlapping matches, so a motif that starts
    # inside the previous match (e.g. 'TTGTTGA') would never be counted.
    # A zero-width lookahead consumes nothing and reports every start position.
    motif = re.compile('(?=' + getattr(pattern, 'pattern', pattern) + ')',
                       getattr(pattern, 'flags', 0))

    upstream_dict = dict.fromkeys(keys_list, 0)
    downstream_dict = dict.fromkeys(keys_list, 0)

    # NOTE(review): positions are counted from the 5' end of each flank, so an
    # upstream flank that was truncated at the edge of the record is mapped as
    # if it were full length - confirm this is acceptable.
    _count_motif_starts(df['upstream_flank_adjusted'], motif, upstream_dict)
    _count_motif_starts(df['downstream_flank_adjusted'], motif, downstream_dict)

    # Relabel upstream positions with the offsets held in mapping_dict.
    mapped_upstream_dict = {mapping_dict[key]: count
                            for key, count in upstream_dict.items()}

    df_up = pd.DataFrame.from_dict(mapped_upstream_dict, orient='index')
    df_down = pd.DataFrame.from_dict(downstream_dict, orient='index')
    df_up.to_csv('./csv_files_for_promoters_consensus_motifs/consensus02_upstream_'+file_name[12:])
    df_down.to_csv('./csv_files_for_promoters_consensus_motifs/consensus02_downstream_'+file_name[12:])
    return df_up, df_down


**Always remember to update pattern_length using 'find and replace' when you change a pattern**

In [37]:
# 1-based motif start positions tracked inside each flank
keys_list = list(range(1, FLANK_LENGTH + 1))

# Consensus 01 alternatives (inactive):
#   pattern = re.compile(r'T[AT]AT')
#   pattern = re.compile(r'TATAAT')
# Consensus 02 (active); inactive alternative: re.compile(r'TTGACA')
pattern = re.compile(r'TTG[AT]')

# Run the motif analysis on every consensus02 flank CSV
directory = os.fsencode('./csv_files_for_promoters/')

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    # Only the consensus02 CSV files are analysed
    if not (filename.startswith('consensus02') and filename.endswith('.csv')):
        continue
    print(filename)
    df_temp = pd.read_csv('./csv_files_for_promoters/' + filename)
    print(len(df_temp.index))
    motif_analysis(df_temp, filename)

consensus01_streptococcus_pneumoniae.csv
2157
consensus01_escherichia_coli_BW25113.csv
4490
consensus01_bacillus_subtilis.csv
4536
consensus01_klebsiella_pneumoniae.csv
5404
