This notebook creates a .csv file from the NCBI annotations, adding the gene sequence for each record.

# 1. Load 
### 1.1 Load packages

In [184]:
import pandas as pd
from Bio import pairwise2
import rglonDB_parse as rp
from Bio import SeqIO
import numpy as np
import time

# Raise pandas' display limits so wide/long frames are not truncated in output.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 999)

### 1.2 Load NCBI files
### 1.2.1 NCBI annotation

In [192]:
# Load the NCBI annotation table; the first CSV row holds the column names and
# the first CSV column becomes the row index.
ncbi_gene_file = "HelpFiles/NCBI_rel606_annotation.csv"
ncbi_ann = pd.read_csv(
    ncbi_gene_file,
    index_col=0,
    header=0,
)
ncbi_ann.head(3)

Unnamed: 0,locus_tag,name,location,old_locus_tag,product,note,start,end
0,ECB_RS00005,thrL,[189:255](+),ECB_00001,thr operon leader peptide,Derived by automated computational analysis us...,189,255
1,ECB_RS00010,thrA,[335:2798](+),ECB_00002,bifunctional aspartate kinase/homoserine dehyd...,Derived by automated computational analysis us...,335,2798
2,ECB_RS00015,thrB,[2799:3732](+),ECB_00003,homoserine kinase,Derived by automated computational analysis us...,2799,3732


In [193]:
# Sanity check: (rows, columns) of the annotation table just loaded.
ncbi_ann.shape

(4488, 8)

### 1.2.2 NCBI sequence

In [194]:
# Collect the nucleotide sequence of every record whose id mentions NC_012967,
# keyed by the locus tag given in the record's FASTA description line.
ncbi_seq_file = "HelpFiles/NCBI_rel606/NCBI_EcoliBREL606_Fall2019.fa"
locus_tag_prefix = "[locus_tag="
gen_ids = []
seq_list = []
for record in SeqIO.parse(ncbi_seq_file, "fasta"):
    if "NC_012967" not in record.id:
        continue
    # Resolve the tag afresh for every record and only accept a token of the
    # exact form "[locus_tag=...]". This prevents a record without a tag from
    # silently inheriting the tag of the previous record.
    ecb = next(
        (
            word[len(locus_tag_prefix):-1]
            for word in record.description.split()
            if word.startswith(locus_tag_prefix) and word.endswith("]")
        ),
        None,
    )
    if ecb is None:
        # Fail loudly: a sequence without a locus tag cannot be joined to the
        # annotation table and must not be attributed to another gene.
        raise ValueError(
            f"No [locus_tag=...] field in FASTA description of record {record.id!r}"
        )
    gen_ids.append(ecb)
    seq_list.append(str(record.seq))

In [195]:
# Pair each locus tag with its sequence, one row per parsed FASTA record.
ncbi_seq = pd.DataFrame(
    list(zip(gen_ids, seq_list)),
    columns=["id", "sequence"],
)
ncbi_seq.head(3)

Unnamed: 0,id,sequence
0,ECB_RS00005,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...
1,ECB_RS00010,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...
2,ECB_RS00015,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...


In [196]:
# Sanity check: (rows, columns) of the sequence table built from the FASTA file.
ncbi_seq.shape

(4488, 2)

### 1.2.3 Merge NCBI annotation and sequence

In [197]:
# Attach each sequence to its annotation row by locus tag. The left join keeps
# every annotation row, including those with no matching sequence.
ncbi_info = ncbi_ann.merge(
    ncbi_seq,
    left_on="locus_tag",
    right_on="id",
    how="left",
)
ncbi_info.head(3)

Unnamed: 0,locus_tag,name,location,old_locus_tag,product,note,start,end,id,sequence
0,ECB_RS00005,thrL,[189:255](+),ECB_00001,thr operon leader peptide,Derived by automated computational analysis us...,189,255,ECB_RS00005,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...
1,ECB_RS00010,thrA,[335:2798](+),ECB_00002,bifunctional aspartate kinase/homoserine dehyd...,Derived by automated computational analysis us...,335,2798,ECB_RS00010,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...
2,ECB_RS00015,thrB,[2799:3732](+),ECB_00003,homoserine kinase,Derived by automated computational analysis us...,2799,3732,ECB_RS00015,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...


In [198]:
# "id" duplicates "locus_tag" after the merge, so drop it. Reassigning instead
# of mutating in place, together with errors="ignore", keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the missing column.
ncbi_info = ncbi_info.drop(columns=["id"], errors="ignore")
ncbi_info.head(3)

Unnamed: 0,locus_tag,name,location,old_locus_tag,product,note,start,end,sequence
0,ECB_RS00005,thrL,[189:255](+),ECB_00001,thr operon leader peptide,Derived by automated computational analysis us...,189,255,ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCA...
1,ECB_RS00010,thrA,[335:2798](+),ECB_00002,bifunctional aspartate kinase/homoserine dehyd...,Derived by automated computational analysis us...,335,2798,ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAAC...
2,ECB_RS00015,thrB,[2799:3732](+),ECB_00003,homoserine kinase,Derived by automated computational analysis us...,2799,3732,ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCG...


In [199]:
# Number of merged rows whose "name" column is missing.
ncbi_info["name"].isna().sum()

2151

In [200]:
# Sanity check: (rows, columns) of the merged annotation + sequence table.
ncbi_info.shape

(4488, 9)

In [201]:
# Persist the merged annotation + sequence table (row index included).
ncbi_annseq_file = "HelpFiles/NCBI_EcoliBREL606_annseq.csv"
ncbi_info.to_csv(ncbi_annseq_file)