## Implementing the File Parser-cum-Dataframe Generator

In [1]:
from Bio import SeqIO
import pandas as pd

def get_a_view(filename) :
    """Parse a FASTA file into a DataFrame with one row per record.

    Each record ID is treated as a "/"-separated path: the second-to-last
    component becomes the label, and the last component yields the accession
    ID (the text before the first "fq", minus the single character that
    precedes it). NOTE(review): the upstream naming scheme of the record IDs
    is not visible here -- confirm against the files that produce them.

    Parameters
    ----------
    filename : str or file handle
        Passed unchanged to ``SeqIO.parse(filename, "fasta")``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Accession_ID``, ``250bp_READ`` and ``LABEL``, in record order.

    Raises
    ------
    IndexError
        If a record ID contains no "/" (there is then no label component).
    """
    rows = []

    for record in SeqIO.parse(filename, "fasta") :
        # Split the ID once and reuse the parts for both accession and label.
        id_parts = record.id.split("/")

        # Keep the text before the first "fq" and drop the one character
        # that separates it from the accession.
        accession_id = id_parts[-1].split("fq")[0][:-1]
        label = id_parts[-2]

        rows.append([accession_id, str(record.seq), label])

    return pd.DataFrame(rows , columns = ["Accession_ID" , "250bp_READ" , "LABEL"])

In [2]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
tic = time.perf_counter()

dataframe = get_a_view("datasets/pathogenic_val.fasta")

toc = time.perf_counter()
print(toc-tic)

25.445895195007324


In [3]:
# Number of distinct accession IDs among the parsed records.
dataframe["Accession_ID"].nunique()

39

## Statistics for Validation_Data

In [4]:
# Parse the pathogenic validation file, then count its distinct accession IDs.
dataframe_path_val = get_a_view("datasets/pathogenic_val.fasta")
dataframe_path_val.loc[:, "Accession_ID"].nunique()

39

In [5]:
# Parse the non-pathogenic validation file, then count its distinct accession IDs.
dataframe_nonpath_val = get_a_view("datasets/nonpathogenic_val.fasta")
dataframe_nonpath_val.loc[:, "Accession_ID"].nunique()

6

In [11]:
# Combined row count of the two validation frames.
print("Length of Total_Val_Data :", dataframe_nonpath_val.shape[0] + dataframe_path_val.shape[0])

Length of Total_Val_Data : 2500025


In [12]:
# Row count of each validation frame, reported separately.
print("Length of Pathogenic_Val_Data :", dataframe_path_val.shape[0])
print("Length of NonPathogenic_Val_Data :", dataframe_nonpath_val.shape[0])

Length of Pathogenic_Val_Data : 1250020
Length of NonPathogenic_Val_Data : 1250005


In [13]:
# Stack the pathogenic and non-pathogenic validation frames; ignore_index=True
# discards the per-frame indices and numbers the rows 0..n-1.

concat_data_val = pd.concat([dataframe_path_val, dataframe_nonpath_val], ignore_index = True)

In [14]:
# Show the combined validation frame as this cell's output.
concat_data_val

Unnamed: 0,Accession_ID,250bp_READ,LABEL
0,GCA_000007705.1,GCCGACTTGCGCCGGCGGCAGCGGCCGGGTTTCCACGGTGCCGCCT...,pathogenic
1,GCA_000007705.1,TGCCGAAGCCCGCGCCTTCGCGTAGGCGCCGCCGCCCATCCCTGTT...,pathogenic
2,GCA_000007705.1,ACAACGAGCTGCTGTTCGACGACACCCCCGGCGAAGTGCGCGCCAA...,pathogenic
3,GCA_000007705.1,GACTGGGCGCGCCGCTGTTCCAGCCATTCGCGGCCATAGGCGGTGA...,pathogenic
4,GCA_000007705.1,CGGCGAAGCCCCATCCCGCTATTCGTAGCTGGCGCTGACGGCGACG...,pathogenic
5,GCA_000007705.1,CAAGATATTGCCATCGCTGATATCGCCATGCAGTATTCCCAGATCG...,pathogenic
6,GCA_000007705.1,CGAGTCCACGCTGCGCTTCGCCCAGATCGCGCTGGACAGCACCGAG...,pathogenic
7,GCA_000007705.1,GCCGCGCTGACCCGCGAATGGCTGAAGGCCCAGGTGCTGACGGTGC...,pathogenic
8,GCA_000007705.1,GTTGGCCACTGGAGCAGAAAGCCCAGCATGATGACGACAAAAGCGA...,pathogenic
9,GCA_000007705.1,GTCGAATTCTTCCTGGTTGATCAGGCCGGTCTGCAAGGCCTCCGCC...,pathogenic


In [15]:
# Number of distinct accession IDs in the combined validation frame.
concat_data_val["Accession_ID"].nunique()

45

---

## Combining Data from Pathogenic Test Files 1 & 2

In [16]:
# Parse pathogenic test file 1, then count its distinct accession IDs.
dataframe_path_test_1 = get_a_view("datasets/pathogenic_test_1.fasta")
dataframe_path_test_1.loc[:, "Accession_ID"].nunique()

39

In [17]:
# Parse pathogenic test file 2, then count its distinct accession IDs.
dataframe_path_test_2 = get_a_view("datasets/pathogenic_test_2.fasta")
dataframe_path_test_2.loc[:, "Accession_ID"].nunique()

39

In [18]:
# Row count of each pathogenic test frame, reported separately.
print("Length of Pathogenic_Test_1_Data :", dataframe_path_test_1.shape[0])
print("Length of Pathogenic_Test_2_Data :", dataframe_path_test_2.shape[0])

Length of Pathogenic_Test_1_Data : 625019
Length of Pathogenic_Test_2_Data : 625019


In [19]:
# NOTE(review): the printed label says "Test_1", but the value is the combined
# row count of pathogenic test files 1 and 2.
print("Length of Total_Test_1_Data :", len(dataframe_path_test_1)+len(dataframe_path_test_2))

Length of Total_Test_1_Data : 1250038


In [20]:
# Stack pathogenic test files 1 and 2 into one frame; ignore_index=True
# discards the per-frame indices and numbers the rows 0..n-1.

concat_data_path_test = pd.concat([dataframe_path_test_1, dataframe_path_test_2], ignore_index = True)

In [21]:
# First rows of the combined pathogenic test frame.
concat_data_path_test.head()

Unnamed: 0,Accession_ID,250bp_READ,LABEL
0,GCA_000019345.1,ACTAAAATAAATTTTCTTTCGGCGTTTCGCTTTCATTTTTAAAGCT...,pathogenic
1,GCA_000019345.1,TTGTCAAGTATCTGTGTGTGAAAATTATTTATTAGTCTTATCTCTT...,pathogenic
2,GCA_000019345.1,TTCGGAGTATAACCCATTGTTTTGCTTCATATATTTATTAAAGATT...,pathogenic
3,GCA_000019345.1,ACGACGAGGGGATGACCTGTGGATAGTGGTGAAATTCCAATCGAAC...,pathogenic
4,GCA_000019345.1,GACCTATAAAAGAGGCTTTGCTTTTTAGCAAATAACGATAGACATT...,pathogenic


In [27]:
# Last rows of the combined pathogenic test frame.
concat_data_path_test.tail()

Unnamed: 0,Accession_ID,250bp_READ,LABEL
1250033,GCA_001272835.1,ATCCCACGGCTTTCATAATCCTTTTCCTAAACAAGAAGTAAACAAT...,pathogenic
1250034,GCA_001272835.1,ATGGGTTATCGGCTAAACTACCTTACGTTAACACTAACGGTAATTA...,pathogenic
1250035,GCA_001272835.1,CCAGCACCAGAATAGACAATGTTGACACCCTTACCACAGGTAACAG...,pathogenic
1250036,GCA_001272835.1,AGCTGTACTGGTTGGGTTTCACTAAGTTGAGGTGGTACTTTAACTC...,pathogenic
1250037,GCA_001272835.1,GCTTTGAGTTTGGTTAAATCTCATTGACTTTGCTCCACTAGCTTTT...,pathogenic


In [22]:
# Number of distinct accession IDs in the combined pathogenic test frame.
concat_data_path_test["Accession_ID"].nunique()

39

In [23]:
# Number of distinct labels in the combined pathogenic test frame.
concat_data_path_test["LABEL"].nunique()

1

---

## Combining Data from Non-Pathogenic Test Files 1 & 2

In [24]:
# Parse non-pathogenic test file 1, then count its distinct accession IDs.
dataframe_nonpath_test_1 = get_a_view("datasets/nonpathogenic_test_1.fasta")
dataframe_nonpath_test_1.loc[:, "Accession_ID"].nunique()

6

In [25]:
# Parse non-pathogenic test file 2, then count its distinct accession IDs.
dataframe_nonpath_test_2 = get_a_view("datasets/nonpathogenic_test_2.fasta")
dataframe_nonpath_test_2.loc[:, "Accession_ID"].nunique()

6

In [26]:
# Row count of each non-pathogenic test frame, reported separately.
print("Length of NonPathogenic_Test_1_Data :", dataframe_nonpath_test_1.shape[0])
print("Length of NonPathogenic_Test_2_Data :", dataframe_nonpath_test_2.shape[0])

Length of NonPathogenic_Test_1_Data : 625003
Length of NonPathogenic_Test_2_Data : 625003


In [28]:
# Combined row count of non-pathogenic test files 1 and 2.
print("Total Length of NonPathogenic_Test_Data :", dataframe_nonpath_test_1.shape[0] + dataframe_nonpath_test_2.shape[0])

Total Length of NonPathogenic_Test_Data : 1250006


In [29]:
# Stack non-pathogenic test files 1 and 2 into one frame; ignore_index=True
# discards the per-frame indices and numbers the rows 0..n-1.

concat_data_nonpath_test = pd.concat([dataframe_nonpath_test_1, dataframe_nonpath_test_2], ignore_index = True)

In [30]:
# Show the combined non-pathogenic test frame as this cell's output.
concat_data_nonpath_test

Unnamed: 0,Accession_ID,250bp_READ,LABEL
0,GCA_000017585.1,ATAACACTAGTATAGGTATAAAACTAAGTAGAGAATTAAAAGAAGA...,nonpathogenic
1,GCA_000017585.1,ATAAATNGAACAATTACACGACGTAAAAATTTATTGTTTTAAAATG...,nonpathogenic
2,GCA_000017585.1,GTTTGAAAACGATTTAAAGGTATTTTATCCAAATACAATGCTTATT...,nonpathogenic
3,GCA_000017585.1,ACAATCATACGAATCGCTCAATATCGTTTATCGTTCTGCTAATCAG...,nonpathogenic
4,GCA_000017585.1,TTGGTTTTAAGGGTAAAAAATATTTTTGGTTTAAGTATAACTTTAA...,nonpathogenic
5,GCA_000017585.1,ACTCGGCGGTAAGTGAAATTCATCCGTTTGTATTTTTAAATTTTAC...,nonpathogenic
6,GCA_000017585.1,AAAATGACGGCCGGCATTTGCAAAATACTTTGCAAACTTGCTCCAA...,nonpathogenic
7,GCA_000017585.1,TCTCCTGTTGCTGAAGTATCTTTAGCAACTGTTGTGTTTTCTCCAT...,nonpathogenic
8,GCA_000017585.1,AAAGTATGGTAAAAAGATTACTTGAAGCTTTCNGAGAATCTTAATG...,nonpathogenic
9,GCA_000017585.1,AGCATACAGAACGTCACAAGTATAAAAGAGATGATTCTAGGAATTT...,nonpathogenic


In [31]:
# Number of distinct accession IDs in the combined non-pathogenic test frame.
concat_data_nonpath_test["Accession_ID"].nunique()

6

In [32]:
# Number of distinct labels in the combined non-pathogenic test frame.
concat_data_nonpath_test["LABEL"].nunique()

1

---

## Combining the Pathogenic and Non-Pathogenic Test Data

In [33]:
# Row count of the combined pathogenic test frame.
concat_data_path_test.shape[0]

1250038

In [34]:
# Row count of the combined non-pathogenic test frame.
concat_data_nonpath_test.shape[0]

1250006

In [35]:
# Stack the pathogenic and non-pathogenic test frames (each already covering
# files 1 & 2); ignore_index=True numbers the rows 0..n-1.
concat_data_test = pd.concat([concat_data_path_test, concat_data_nonpath_test], ignore_index=True)

In [36]:
# Show the combined test frame as this cell's output.
concat_data_test

Unnamed: 0,Accession_ID,250bp_READ,LABEL
0,GCA_000019345.1,ACTAAAATAAATTTTCTTTCGGCGTTTCGCTTTCATTTTTAAAGCT...,pathogenic
1,GCA_000019345.1,TTGTCAAGTATCTGTGTGTGAAAATTATTTATTAGTCTTATCTCTT...,pathogenic
2,GCA_000019345.1,TTCGGAGTATAACCCATTGTTTTGCTTCATATATTTATTAAAGATT...,pathogenic
3,GCA_000019345.1,ACGACGAGGGGATGACCTGTGGATAGTGGTGAAATTCCAATCGAAC...,pathogenic
4,GCA_000019345.1,GACCTATAAAAGAGGCTTTGCTTTTTAGCAAATAACGATAGACATT...,pathogenic
5,GCA_000019345.1,ATTATTGATTCTTTGAGCAATAATAATCGAATTCTTAAATTTAATT...,pathogenic
6,GCA_000019345.1,CAACTCAAATGTTGAAGCACTAAAATATCAGAGATTTGTAAAAAAT...,pathogenic
7,GCA_000019345.1,ACAGTTGCAATTGTTACAATTGCTGTTAAAACTGTCACGACTCAAA...,pathogenic
8,GCA_000019345.1,TCTTTAGTTCTAAAAGATGACAAGCACAATTAATGTTTAGTGAATT...,pathogenic
9,GCA_000019345.1,ATTAGCCGATAAACTAGATGAAGCATAAACAAGAATTAGCACAAAT...,pathogenic


In [37]:
# Row count of the combined test frame.
concat_data_test.shape[0]

2500044

---
## Combining both val_data + test_data

In [39]:
# Row count of the combined validation frame.
print("Length of Total_Val_Data ", concat_data_val.shape[0])

Length of Total_Val_Data  2500025


In [40]:
# Row count of the combined test frame.
print("Length of Total_Test_Data ", concat_data_test.shape[0])

Length of Total_Test_Data  2500044


In [41]:
# Stack the validation and test frames into one frame with rows numbered 0..n-1.
total_data = pd.concat([concat_data_val, concat_data_test], ignore_index=True)

In [42]:
# Show the combined validation + test frame as this cell's output.
total_data

Unnamed: 0,Accession_ID,250bp_READ,LABEL
0,GCA_000007705.1,GCCGACTTGCGCCGGCGGCAGCGGCCGGGTTTCCACGGTGCCGCCT...,pathogenic
1,GCA_000007705.1,TGCCGAAGCCCGCGCCTTCGCGTAGGCGCCGCCGCCCATCCCTGTT...,pathogenic
2,GCA_000007705.1,ACAACGAGCTGCTGTTCGACGACACCCCCGGCGAAGTGCGCGCCAA...,pathogenic
3,GCA_000007705.1,GACTGGGCGCGCCGCTGTTCCAGCCATTCGCGGCCATAGGCGGTGA...,pathogenic
4,GCA_000007705.1,CGGCGAAGCCCCATCCCGCTATTCGTAGCTGGCGCTGACGGCGACG...,pathogenic
5,GCA_000007705.1,CAAGATATTGCCATCGCTGATATCGCCATGCAGTATTCCCAGATCG...,pathogenic
6,GCA_000007705.1,CGAGTCCACGCTGCGCTTCGCCCAGATCGCGCTGGACAGCACCGAG...,pathogenic
7,GCA_000007705.1,GCCGCGCTGACCCGCGAATGGCTGAAGGCCCAGGTGCTGACGGTGC...,pathogenic
8,GCA_000007705.1,GTTGGCCACTGGAGCAGAAAGCCCAGCATGATGACGACAAAAGCGA...,pathogenic
9,GCA_000007705.1,GTCGAATTCTTCCTGGTTGATCAGGCCGGTCTGCAAGGCCTCCGCC...,pathogenic


In [44]:
# Row count of the combined validation + test frame.
print("Length of Total_Data :", total_data.shape[0])

Length of Total_Data : 5000069


## Shuffling the Data before Storing

In [45]:
import sklearn

In [50]:
from sklearn.utils import shuffle

# A fixed random_state makes the shuffled row order reproducible, so re-running
# the notebook from a fresh kernel stores the same dataset every time.
total_data = shuffle(total_data, random_state=42).reset_index(drop=True)

In [51]:
# Show the shuffled frame as this cell's output.
total_data

Unnamed: 0,Accession_ID,250bp_READ,LABEL
0,GCA_000154485.1,AGCATCCGTATTACCAGTTCCACCATTTCTAACTTGAACCGCTTGA...,pathogenic
1,GCA_000154205.1,CTATTTCCGCAGCCATGCCCCCGCTACCTGCAAGACTTCTCCTGGT...,nonpathogenic
2,GCA_000167995.1,GGTGATGCGCACCTATACCTTGCGCGCGCTGCGTGCAGAGCAAAAC...,pathogenic
3,GCA_000512375.1,ATGGGGATAGCCAAGAGATAACATTCTTATCCTCACAACTATGGCC...,pathogenic
4,GCA_000154305.1,TGAAAAGATACTGAGAGAAGATCCTGCAATCACTCATGCGGCTATG...,nonpathogenic
5,GCA_000157955.1,TTCGCCCAAAAGCACACAAAAAAGCCACACAATGATCCAAATCAGA...,nonpathogenic
6,GCA_000238635.1,CCGCACAGAGAAAGGATGCCGGATATGAGCGAGACACATTTCCCCC...,nonpathogenic
7,GCA_000156195.1,TATGACGAAGGAGGCATCATTGCAGCTCCCGGTTCGGCAATAGAAT...,nonpathogenic
8,GCA_000022745.1,TTTAGGGTACGGTCTATATGCAGGAGCTATTTCCTGGAANCGCTTC...,pathogenic
9,GCA_000153925.1,GTTTACATACATGGCAAGACCCTGATCTTTATCCTGCACCCATGTA...,nonpathogenic


## Storing the DataFrame for later use

In [52]:
# Time the pickle write with the monotonic perf_counter() clock; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
tic = time.perf_counter()
total_data.to_pickle("datasets/total_data_SHUFFLED_without_training")
toc = time.perf_counter()
toc-tic

39.01052951812744

## Checking to see if the Total_Shuffled_Data can be parsed back

In [53]:
# NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load pickles from a trusted source (here: the file written by this notebook).
# Timed with the monotonic perf_counter() clock.
tic = time.perf_counter()
check_data = pd.read_pickle("datasets/total_data_SHUFFLED_without_training")
toc = time.perf_counter()
toc-tic

8.108045816421509

In [54]:
# Show the frame loaded back from the pickle as this cell's output.
check_data

Unnamed: 0,Accession_ID,250bp_READ,LABEL
0,GCA_000154485.1,AGCATCCGTATTACCAGTTCCACCATTTCTAACTTGAACCGCTTGA...,pathogenic
1,GCA_000154205.1,CTATTTCCGCAGCCATGCCCCCGCTACCTGCAAGACTTCTCCTGGT...,nonpathogenic
2,GCA_000167995.1,GGTGATGCGCACCTATACCTTGCGCGCGCTGCGTGCAGAGCAAAAC...,pathogenic
3,GCA_000512375.1,ATGGGGATAGCCAAGAGATAACATTCTTATCCTCACAACTATGGCC...,pathogenic
4,GCA_000154305.1,TGAAAAGATACTGAGAGAAGATCCTGCAATCACTCATGCGGCTATG...,nonpathogenic
5,GCA_000157955.1,TTCGCCCAAAAGCACACAAAAAAGCCACACAATGATCCAAATCAGA...,nonpathogenic
6,GCA_000238635.1,CCGCACAGAGAAAGGATGCCGGATATGAGCGAGACACATTTCCCCC...,nonpathogenic
7,GCA_000156195.1,TATGACGAAGGAGGCATCATTGCAGCTCCCGGTTCGGCAATAGAAT...,nonpathogenic
8,GCA_000022745.1,TTTAGGGTACGGTCTATATGCAGGAGCTATTTCCTGGAANCGCTTC...,pathogenic
9,GCA_000153925.1,GTTTACATACATGGCAAGACCCTGATCTTTATCCTGCACCCATGTA...,nonpathogenic


In [55]:
# The displayed element-wise comparison only shows a handful of rows, so first
# assert that the entire round-tripped frame matches the in-memory one.
assert check_data.equals(total_data), "Frame loaded from pickle differs from total_data"
check_data == total_data

Unnamed: 0,Accession_ID,250bp_READ,LABEL
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,True
4,True,True,True
5,True,True,True
6,True,True,True
7,True,True,True
8,True,True,True
9,True,True,True
