## Creating DataFrame

In [23]:
from Bio import SeqIO
import pandas as pd

def generate_dataframe(filename) :

    parsed_data = SeqIO.parse(filename, "fasta")
    data = []
    
    for each_data in parsed_data :
        
        id = each_data.id.split("/")[-1].split('fq')[0][:-1]
        seq = str(each_data.seq)
        
        data.append([id,seq])
        
        
    df = pd.DataFrame(data , columns = ["Accession_ID" , "250bp_READ"])

    return df

## Implementing K-mer Genrator

In [24]:
import itertools

def k_mer_generator(size) :
    
    k_mers = []
    bases = "ACGT"

    for size in range(1,size+1) :
        perm = ["".join(x) for x in itertools.product("ACGT" , repeat = size)]
        k_mers += perm
    
    return(k_mers)
    

In [28]:
k_mer_generator(3)

['A',
 'C',
 'G',
 'T',
 'AA',
 'AC',
 'AG',
 'AT',
 'CA',
 'CC',
 'CG',
 'CT',
 'GA',
 'GC',
 'GG',
 'GT',
 'TA',
 'TC',
 'TG',
 'TT',
 'AAA',
 'AAC',
 'AAG',
 'AAT',
 'ACA',
 'ACC',
 'ACG',
 'ACT',
 'AGA',
 'AGC',
 'AGG',
 'AGT',
 'ATA',
 'ATC',
 'ATG',
 'ATT',
 'CAA',
 'CAC',
 'CAG',
 'CAT',
 'CCA',
 'CCC',
 'CCG',
 'CCT',
 'CGA',
 'CGC',
 'CGG',
 'CGT',
 'CTA',
 'CTC',
 'CTG',
 'CTT',
 'GAA',
 'GAC',
 'GAG',
 'GAT',
 'GCA',
 'GCC',
 'GCG',
 'GCT',
 'GGA',
 'GGC',
 'GGG',
 'GGT',
 'GTA',
 'GTC',
 'GTG',
 'GTT',
 'TAA',
 'TAC',
 'TAG',
 'TAT',
 'TCA',
 'TCC',
 'TCG',
 'TCT',
 'TGA',
 'TGC',
 'TGG',
 'TGT',
 'TTA',
 'TTC',
 'TTG',
 'TTT']

## Frequency Builder

In [25]:
import regex as re
def get_frequency (text , search_for) :
    
    return len(re.findall(search_for, text, overlapped = True))

In [31]:
df = generate_dataframe("datasets/pathogenic_test_1.fasta")

In [32]:
df

Unnamed: 0,Accession_ID,250bp_READ
0,GCA_000019345.1,ACTAAAATAAATTTTCTTTCGGCGTTTCGCTTTCATTTTTAAAGCT...
1,GCA_000019345.1,TTGTCAAGTATCTGTGTGTGAAAATTATTTATTAGTCTTATCTCTT...
2,GCA_000019345.1,TTCGGAGTATAACCCATTGTTTTGCTTCATATATTTATTAAAGATT...
3,GCA_000019345.1,ACGACGAGGGGATGACCTGTGGATAGTGGTGAAATTCCAATCGAAC...
4,GCA_000019345.1,GACCTATAAAAGAGGCTTTGCTTTTTAGCAAATAACGATAGACATT...
5,GCA_000019345.1,ATTATTGATTCTTTGAGCAATAATAATCGAATTCTTAAATTTAATT...
6,GCA_000019345.1,CAACTCAAATGTTGAAGCACTAAAATATCAGAGATTTGTAAAAAAT...
7,GCA_000019345.1,ACAGTTGCAATTGTTACAATTGCTGTTAAAACTGTCACGACTCAAA...
8,GCA_000019345.1,TCTTTAGTTCTAAAAGATGACAAGCACAATTAATGTTTAGTGAATT...
9,GCA_000019345.1,ATTAGCCGATAAACTAGATGAAGCATAAACAAGAATTAGCACAAAT...


## Generating Frequency Based DataFrame

In [49]:
def get_frequency_dataframe (filename, max_kmer_size) :

    parsed_data = SeqIO.parse(filename, "fasta")
    data = []
    kmers = k_mer_generator(max_kmer_size)
#     count = 0
    for each_data in parsed_data :
        
        id = each_data.id.split("/")[-1].split('fq')[0][:-1]
        seq = str(each_data.seq)
        
        kmer_counts = []
        for each_kmer in kmers :
            kmer_counts.append(get_frequency(seq, each_kmer))
            
        data.append([id,seq]+kmer_counts)
        
#         count += 1
#         if(count > 100) :
#             break
        
    df = pd.DataFrame(data , columns = ["Accession_ID" , "250bp_READ"]+kmers)

    return df

In [50]:
import time
tic = time.time()
df = get_frequency_dataframe("datasets/pathogenic_test_1.fasta" , 3)
toc = time.time()
toc-tic

662.2532227039337

In [63]:
df.head(15)

Unnamed: 0,Accession_ID,250bp_READ,A,C,G,T,AA,AC,AG,AT,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
0,GCA_000019345.1,ACTAAAATAAATTTTCTTTCGGCGTTTCGCTTTCATTTTTAAAGCT...,75,44,28,103,32,11,4,28,...,5,8,5,2,2,0,10,14,5,22
1,GCA_000019345.1,TTGTCAAGTATCTGTGTGTGAAAATTATTTATTAGTCTTATCTCTT...,87,23,39,99,38,10,16,23,...,2,4,4,0,3,6,18,0,3,23
2,GCA_000019345.1,TTCGGAGTATAACCCATTGTTTTGCTTCATATATTTATTAAAGATT...,79,35,22,114,21,7,8,43,...,2,6,4,1,0,3,19,5,7,22
3,GCA_000019345.1,ACGACGAGGGGATGACCTGTGGATAGTGGTGAAATTCCAATCGAAC...,72,47,65,66,18,12,16,25,...,5,4,6,0,6,4,5,4,2,3
4,GCA_000019345.1,GACCTATAAAAGAGGCTTTGCTTTTTAGCAAATAACGATAGACATT...,97,21,18,113,49,6,6,34,...,0,7,3,2,0,4,17,6,4,35
5,GCA_000019345.1,ATTATTGATTCTTTGAGCAATAATAATCGAATTCTTAAATTTAATT...,66,42,40,101,21,6,15,24,...,2,7,5,7,0,6,7,8,11,19
6,GCA_000019345.1,CAACTCAAATGTTGAAGCACTAAAATATCAGAGATTTGTAAAAAAT...,91,38,41,80,42,10,9,29,...,4,2,8,4,1,6,8,5,10,8
7,GCA_000019345.1,ACAGTTGCAATTGTTACAATTGCTGTTAAAACTGTCACGACTCAAA...,101,36,32,81,52,14,7,28,...,3,0,4,4,3,4,8,3,11,13
8,GCA_000019345.1,TCTTTAGTTCTAAAAGATGACAAGCACAATTAATGTTTAGTGAATT...,107,26,28,89,46,10,13,38,...,1,4,9,0,1,2,18,4,3,12
9,GCA_000019345.1,ATTAGCCGATAAACTAGATGAAGCATAAACAAGAATTAGCACAAAT...,96,34,41,79,43,12,15,26,...,2,0,8,3,2,4,12,3,8,11


In [62]:
df.groupby(["Accession_ID"]).mean()

Unnamed: 0_level_0,A,C,G,T,AA,AC,AG,AT,CA,CC,...,TCG,TCT,TGA,TGC,TGG,TGT,TTA,TTC,TTG,TTT
Accession_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCA_000019345.1,93.088627,31.961085,31.953302,92.686166,40.215667,10.502134,9.961838,31.913131,13.877479,4.398443,...,1.173989,3.119006,5.372835,2.288225,2.115742,3.989455,12.621642,4.549586,6.062516,16.487572
GCA_000023005.1,84.078497,40.556059,40.595673,84.475808,31.075428,11.160064,14.004678,27.412805,12.451542,7.148224,...,1.752083,4.309019,3.985821,3.468791,1.867417,3.140038,9.764216,4.430054,4.874726,12.092677
GCA_000046685.1,76.250627,48.480248,48.648884,76.328559,28.343,11.889844,13.448622,22.178542,17.144528,9.459721,...,2.150376,4.353383,5.00728,4.358754,3.570354,4.155627,5.575964,5.25182,6.283327,11.172455
GCA_000143845.1,44.129138,80.699195,80.61981,44.252515,6.752607,13.596488,14.556704,8.990763,15.153924,22.580849,...,7.078288,2.936528,3.020669,5.041613,4.916408,2.098592,0.416682,2.767057,2.601884,0.943113
GCA_000146835.1,85.527891,39.263175,39.307845,85.602707,36.469689,9.051161,14.506546,25.057872,13.648036,6.796457,...,1.280559,4.411046,4.457916,3.603697,2.516118,3.00154,8.746397,4.71372,5.848828,17.04775
GCA_000146855.1,84.320807,40.497796,40.688756,84.191408,30.058722,11.328651,15.043407,27.444079,14.404632,7.008965,...,1.108853,5.201121,4.943668,2.998282,2.30579,4.132163,7.333433,4.767949,5.555099,12.168771
GCA_000154485.1,85.580864,39.33314,39.378255,85.40951,32.05938,11.167643,12.377617,29.527051,15.063903,6.95129,...,1.623137,3.82076,5.374717,2.844535,2.933952,3.831487,9.324732,4.917831,5.677704,11.81757
GCA_000157255.1,73.874407,50.953607,50.844388,74.027527,26.174715,12.094269,13.742169,21.487186,16.901459,10.039511,...,3.277943,4.448208,5.455268,3.877432,3.730838,3.756407,4.169613,5.773968,6.353346,9.817691
GCA_000157875.1,53.470497,71.353735,71.466453,53.410897,14.742386,10.665358,12.301987,15.484,16.493219,17.926005,...,5.467999,2.777659,3.909826,5.756365,4.421128,2.374138,1.278313,5.072567,4.094337,4.162562
GCA_000160815.2,79.231736,45.705737,45.698324,79.067361,28.207671,13.480125,12.00462,25.137086,16.513859,8.408144,...,2.771272,3.949076,5.548023,3.192415,3.232381,4.406854,6.78223,5.638483,6.041362,9.567254
