In [1]:
import pandas as pd
import os
from Bio import SeqIO

In [2]:
infile = "MAPT_Morgan_Copy.LIB"
record_dict = SeqIO.to_dict(SeqIO.parse(infile, "fasta"))
print("Number of Records:", len(record_dict))
print("MAPTHU1 Sequence:")
print(record_dict["MAPTPHU1"].seq)
print("MAPTHU1 Desc:")
print(record_dict["MAPTPHU1"].description)
print("MAPTHU1 Length:", len(record_dict["MAPTPHU1"].seq))
print("")

Number of Records: 296
MAPTHU1 Sequence:
MAEQPQDTTMMEDHAPGQEKHFSSGYPLQIPVDDGSDEPVSETSDAKSTPTAEDATAPLVEEGDHEDQGGVEQHGEIPEGTTAEEAGVGATPSLEDHAAGDATQEEPSPPCATATLKEGSGGQERDEDRDIDETAEQGLPSPADQRVSLGPEEGSCPAVAKDAREECDGEDKSKGVLRDVPGGAVLAEAESRKAGEDQEEKPQLLGGEGGPDVSLSEPSESVSQNQAEPKDGEGSGPVLETAKLPAEAADDVKDKAAPLADAGGRRTPRRKPGGLAADKASRVPLLKGRVDKEGTEADEKKPKTSSPCSAKPPGSLPPLRHAAPPKPPCSPASACKRSASSGVQETKAKGPDARGGSKTGSARAGQAQRNSTNATRIPAKTPTAPKTPPSSGRKEQKKPPPAAAKTEKGEQPKSGDRSGYSSPGSPGTPGSRSRTPSLPTPPAREPKKVAVVRTPPKSPASAKTRVQPSAAPMPDLKNVKSKIGSTDNLKHQPGGGKVQIINKKLDFSSVQSKCGSKDNIKHIPGGGSVQIVYKPVDLSHVTSKCGSLGNIHHKPGGGQVEVKSEKLDFKDKVQSKIGSLDNISHVPGGGNKKIETHKLTFRENAKAKTDHGAEIVYKSPTISGDASPRRLSNVSSTGSINLVDSPQLATLADEVSASLAKQGL
MAPTHU1 Desc:
MAPTPHU1 C-664 aa ORF2 17-2008 664 aa/2058 bp MAPT Pseudopodoces humilis (Tibetan ground-tit) EXONS 2-15. 664 bp
MAPTHU1 Length: 664



In [3]:
# Add LIB data to df
ids = []
names = []
seqs = []
descs = []

for record in record_dict:
    ids.append(record_dict[record].id)
    names.append(record_dict[record].name)
    seqs.append(record_dict[record].seq)
    descs.append(record_dict[record].description)

mapt_morgan_df = pd.DataFrame({
    "ID": ids,
    "Name": names,
    "Sequence": seqs,
    "Description": descs
})
print("LIB as pandas df:")
print(mapt_morgan_df.head())
print("")
# Accessing a sequence
print("Sequence of First Entry (MAP2HSA4)")
print(mapt_morgan_df["Sequence"][0])
print("")

LIB as pandas df:
         ID      Name                                           Sequence  \
0  MAP2HSA4  MAP2HSA4  (M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...   
1  MAP2CJA2  MAP2CJA2  (M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...   
2  MAP2GGO2  MAP2GGO2  (M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...   
3  MAP2MFA2  MAP2MFA2  (M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...   
4  MAP2MML2  MAP2MML2  (M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...   

                                         Description  
0  MAP2HSA4 C-1915 aa ORF2 452-6196 1915 aa/6325 ...  
1  MAP2CJA2 C-1918 aa ORF1 1-5754 1918 aa/5917 bp...  
2  MAP2GGO2 C-1915 aa ORF1 1-5745 1915 aa/5748 bp...  
3  MAP2MFA2 C-1915 aa MAP2 Macaca fascicularis (c...  
4  MAP2MML2 C-1915 aa ORF3 30-5774 1915 aa/5903 b...  

Sequence of First Entry (MAP2HSA4)
MADERKDEAKAPHWTSAPLTEASAHSHPPEIKDQGGAGEGLVRSANGFPYREDEEGAFGEHGSQGTYSNTKENGINGELTSADRETAEEVSARIVQVVTAEAVAVLKGEQEKEAQHKDQTAALPLAAEETANLPPSPPPSPASEQTVTVEEDLLTASKM

In [7]:
len(mapt_morgan_df["Sequence"][0])

1915

In [12]:
# Add aa_cnt as a column
mapt_morgan_df["aa_cnt"] = mapt_morgan_df["Sequence"].str.len()
mapt_morgan_df.sample(20)

Unnamed: 0,ID,Name,Sequence,Description,aa_cnt
95,MAP4CHI1,MAP4CHI1,"(M, A, D, L, S, L, A, D, A, L, T, E, P, S, P, ...","MAP4CHI1 C-1045 aa MAP4 Capra hircus (goat), g...",1045
201,MAPTCLF1,MAPTCLF1,"(M, A, E, P, R, Q, E, F, T, V, M, E, D, H, A, ...",MAPTCLF1 C-783 aa ORF3 337-2627 783 aa/2627 bp...,783
262,MAPTPAD1,MAPTPAD1,"(M, A, E, Q, R, Q, D, V, T, V, M, E, D, H, A, ...",MAPTPAD1 C-681 aa ORF1 1-2043 681 aa/2043 bp M...,681
105,MAP4ECA2,MAP4ECA2,"(M, A, D, L, S, L, A, D, A, L, T, D, P, P, P, ...",MAP4ECA2 C-1101 aa ORF1 1-3303 1101 aa/3546 bp...,1101
125,MAP4MOC1,MAP4MOC1,"(M, A, D, L, S, L, V, D, A, L, T, E, P, P, P, ...",MAP4MOC1 1094+38 aa gi|532008429|ref|XP_0053...,1132
5,MAP2NLE2,MAP2NLE2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2NLE2 C-1915 aa ORF3 51-5895 1915 aa/5924 b...,1915
240,MAPTTBE1,MAPTTBE1,"(X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, ...",MAPTTBE1 P-634/770 aa ORF1 1-2310 770 aa/2313 ...,770
9,MAP2PPAN2,MAP2PPAN2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2PPAN2 C-1915 aa ORF3 30-5774 1915 aa/5903 ...,1915
36,MAP2LVE1,MAP2LVE1,"(M, A, D, D, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2LVE1 C-1926 aa ORF2 440-6217 1926 aa/6613 ...,1926
253,MAPTCPB1,MAPTCPB1,"(M, A, E, Q, R, Q, D, F, N, M, M, E, D, H, S, ...",MAPTCPB1 C-697 aa ORF3 18-2108 697 aa/2347 bp ...,697


In [21]:
mapt_morgan_df.head()

Unnamed: 0,ID,Name,Sequence,Description,aa_cnt
0,MAP2HSA4,MAP2HSA4,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2HSA4 C-1915 aa ORF2 452-6196 1915 aa/6325 ...,1915
1,MAP2CJA2,MAP2CJA2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2CJA2 C-1918 aa ORF1 1-5754 1918 aa/5917 bp...,1918
2,MAP2GGO2,MAP2GGO2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2GGO2 C-1915 aa ORF1 1-5745 1915 aa/5748 bp...,1915
3,MAP2MFA2,MAP2MFA2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2MFA2 C-1915 aa MAP2 Macaca fascicularis (c...,1915
4,MAP2MML2,MAP2MML2,"(M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...",MAP2MML2 C-1915 aa ORF3 30-5774 1915 aa/5903 b...,1915


In [20]:
# Iterate through the df, keeping track of the total aa cnt
# When total count exceeds 1000000, wait on this protein, write the previous set to a partial LIB file,
# and continue

tot_cnt = 0
df_chunk = pd.DataFrame()
chunk_num = 0
outfile_base = os.path.join("LIB_Chunks", "MAPT_Chunk_%s.LIB" % chunk_num)
# iterate through rows in LIB df
for index, row in mapt_morgan_df.iterrows():
    # Add the current rows aa_cnt to the total_cnt
    tot_cnt += row["aa_cnt"]
    # If under the limit, add the current protein to the current chunk
    if tot_cnt < 100000:
        df_chunk = df_chunk.append(row, ignore_index=True)
    # Otherwise, write current chunk to file, reset the total_cnt, advance the chunk_num
    else:
        # Writing all proteins in chunk to file - make record for each protein in chunk
        for chunk_idx, chunk_row in df_chunk.iterrrows():
            # Create SeqIO record 
    break

1915
                                         Description        ID      Name  \
0  MAP2HSA4 C-1915 aa ORF2 452-6196 1915 aa/6325 ...  MAP2HSA4  MAP2HSA4   

                                            Sequence  aa_cnt  
0  (M, A, D, E, R, K, D, E, A, K, A, P, H, W, T, ...  1915.0  
