In [28]:
import pandas as pd
import sklearn
import time

## Implementing K-mer Genrator

In [29]:
import itertools

def k_mer_generator(sizes) :
    """
    Input :
        sizes = a list containing all the sizes of which k-mers are required to be genrated
    Output :
        returns a list of all possible kmers which are required to be generated
        """
    k_mers = []
    bases = "ACGT"

    for size in (sizes) :
        perm = ["".join(x) for x in itertools.product("ACGT" , repeat = size)]
        k_mers += perm
    
    return(k_mers)
    

In [30]:
k_mer_generator([1,2])

['A',
 'C',
 'G',
 'T',
 'AA',
 'AC',
 'AG',
 'AT',
 'CA',
 'CC',
 'CG',
 'CT',
 'GA',
 'GC',
 'GG',
 'GT',
 'TA',
 'TC',
 'TG',
 'TT']

## Frequency Builder

In [31]:
import regex as re
def get_frequency (text , search_for) :
    
    return len(re.findall(search_for, text, overlapped = True))

## Loading the total_dataset

In [32]:
import time
tic = time.time()
data = pd.read_pickle("datasets/total_data_SHUFFLED_without_training")
toc = time.time()
toc-tic

13.310059785842896

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000069 entries, 0 to 5000068
Data columns (total 3 columns):
Accession_ID    object
250bp_READ      object
LABEL           object
dtypes: object(3)
memory usage: 114.4+ MB


In [34]:
data

Unnamed: 0,Accession_ID,250bp_READ,LABEL
0,GCA_000154485.1,AGCATCCGTATTACCAGTTCCACCATTTCTAACTTGAACCGCTTGA...,pathogenic
1,GCA_000154205.1,CTATTTCCGCAGCCATGCCCCCGCTACCTGCAAGACTTCTCCTGGT...,nonpathogenic
2,GCA_000167995.1,GGTGATGCGCACCTATACCTTGCGCGCGCTGCGTGCAGAGCAAAAC...,pathogenic
3,GCA_000512375.1,ATGGGGATAGCCAAGAGATAACATTCTTATCCTCACAACTATGGCC...,pathogenic
4,GCA_000154305.1,TGAAAAGATACTGAGAGAAGATCCTGCAATCACTCATGCGGCTATG...,nonpathogenic
5,GCA_000157955.1,TTCGCCCAAAAGCACACAAAAAAGCCACACAATGATCCAAATCAGA...,nonpathogenic
6,GCA_000238635.1,CCGCACAGAGAAAGGATGCCGGATATGAGCGAGACACATTTCCCCC...,nonpathogenic
7,GCA_000156195.1,TATGACGAAGGAGGCATCATTGCAGCTCCCGGTTCGGCAATAGAAT...,nonpathogenic
8,GCA_000022745.1,TTTAGGGTACGGTCTATATGCAGGAGCTATTTCCTGGAANCGCTTC...,pathogenic
9,GCA_000153925.1,GTTTACATACATGGCAAGACCCTGATCTTTATCCTGCACCCATGTA...,nonpathogenic


## Generating Frequency Based DataFrame

In [35]:
def build_frequency_dataframe(df , kmer_sizes) :
    """
    Input :
        df = dataframe for which the frequency of kmers is to be generated
        kmer_sizes = list containing the sizes of kmers to be considered for building
        
    Output :
        df = dataframe with new added features having frequency count of kmers"""
    
    kmers = k_mer_generator(kmer_sizes)
    for each_kmer in kmers :
        
        df[each_kmer] = df.apply(lambda x: get_frequency(x["250bp_READ"] , each_kmer) , axis = 1)
        
    return df
    

# Testing on Small Datasets

# Test_Dataset_1

In [36]:
import time
test_dataset_1 = data[0:10000]
tic = time.time()
build_frequency_dataframe(test_dataset_1,[1,2,3,4])
toc = time.time()
print(toc-tic)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


124.85884737968445


In [37]:
test_dataset_1.to_pickle("Generated_Datasets/Frequency_Data_0_9999")

In [38]:
test_dataset_1

Unnamed: 0,Accession_ID,250bp_READ,LABEL,A,C,G,T,AA,AC,AG,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,GCA_000154485.1,AGCATCCGTATTACCAGTTCCACCATTTCTAACTTGAACCGCTTGA...,pathogenic,67,59,31,93,18,18,6,...,0,1,4,2,1,3,4,2,4,3
1,GCA_000154205.1,CTATTTCCGCAGCCATGCCCCCGCTACCTGCAAGACTTCTCCTGGT...,nonpathogenic,64,78,56,51,16,23,12,...,0,3,0,1,0,0,1,2,1,0
2,GCA_000167995.1,GGTGATGCGCACCTATACCTTGCGCGCGCTGCGTGCAGAGCAAAAC...,pathogenic,48,67,89,46,9,15,11,...,0,0,0,3,1,1,0,0,1,0
3,GCA_000512375.1,ATGGGGATAGCCAAGAGATAACATTCTTATCCTCACAACTATGGCC...,pathogenic,70,57,51,72,21,13,13,...,3,3,1,1,2,0,1,2,0,4
4,GCA_000154305.1,TGAAAAGATACTGAGAGAAGATCCTGCAATCACTCATGCGGCTATG...,nonpathogenic,69,45,64,72,18,9,17,...,0,4,2,1,1,0,1,2,1,4
5,GCA_000157955.1,TTCGCCCAAAAGCACACAAAAAAGCCACACAATGATCCAAATCAGA...,nonpathogenic,78,59,59,54,35,15,13,...,1,1,1,0,2,1,2,0,1,4
6,GCA_000238635.1,CCGCACAGAGAAAGGATGCCGGATATGAGCGAGACACATTTCCCCC...,nonpathogenic,63,72,79,36,18,16,15,...,0,0,0,0,0,0,1,2,0,1
7,GCA_000156195.1,TATGACGAAGGAGGCATCATTGCAGCTCCCGGTTCGGCAATAGAAT...,nonpathogenic,73,51,68,58,21,8,22,...,3,3,0,2,2,0,0,2,0,2
8,GCA_000022745.1,TTTAGGGTACGGTCTATATGCAGGAGCTATTTCCTGGAANCGCTTC...,pathogenic,57,74,52,65,13,17,11,...,3,0,0,0,1,0,3,3,0,0
9,GCA_000153925.1,GTTTACATACATGGCAAGACCCTGATCTTTATCCTGCACCCATGTA...,nonpathogenic,70,62,37,80,20,17,7,...,0,3,0,0,0,2,4,3,1,2


# Test_Dataset_2

In [39]:
import time
test_dataset_2 = data[10000:100000]
tic = time.time()
build_frequency_dataframe(test_dataset_2,[1,2,3,4])
toc = time.time()
print(toc-tic)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


974.5343379974365


In [40]:
test_dataset_2.to_pickle("Generated_Datasets/Frequency_Data_10000_99999")

In [41]:
test_dataset_2

Unnamed: 0,Accession_ID,250bp_READ,LABEL,A,C,G,T,AA,AC,AG,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
10000,GCA_000153925.1,ACCGATATCATAGCTTCCTTCGATGGCAGATGTCATACCTGTCGTT...,nonpathogenic,59,65,45,81,10,18,9,...,2,4,1,0,1,1,0,2,2,3
10001,GCA_000174855.1,CAATCCCTCCACATAGACAAAGTCCCTTGCCAGAGGGAGTAAAATA...,pathogenic,94,42,59,55,37,15,28,...,0,0,4,1,0,1,0,0,4,4
10002,GCA_000153885.1,TAGCCTATTGACGTATACGATCAAGCAAGTACAATAAAAACACTTA...,nonpathogenic,102,27,51,70,39,10,22,...,0,1,3,1,0,2,1,0,3,1
10003,GCA_000156495.1,ATAACAATACAATTCCGTTGTGCGATACGGAAGTCACTCTTTCCGG...,nonpathogenic,65,53,69,63,14,16,10,...,1,0,0,1,1,1,1,3,2,2
10004,GCA_000221825.2,ATGATTTAGATTCTGGTAAATGTATATTTTGTGATGAAATAATGTA...,pathogenic,72,34,58,86,25,9,10,...,1,2,1,1,2,3,3,1,2,7
10005,GCA_000238635.1,ATGGCGGTCTGCGTTATTTTTTTGAACCGTTTCCCGCTAATCTCGT...,nonpathogenic,78,51,58,63,31,9,17,...,1,0,3,0,0,0,2,2,1,9
10006,GCA_000157955.1,GGAAATCGCGCTGCAGATCAACGCCCTCAAGGGCATGAAGGAGATG...,nonpathogenic,57,81,71,41,13,13,16,...,1,0,0,0,0,0,0,0,0,0
10007,GCA_000238635.1,GCTCAGACGGTCTTGGCTTTGGAGGAAAATCCGGAGCAAGATTTTC...,nonpathogenic,69,52,68,61,21,14,14,...,0,1,1,1,2,0,2,2,1,3
10008,GCA_000969835.1,TCCCCCCAATAATACATTTCTGATTATGATATTCATATTGTTATTT...,nonpathogenic,71,49,39,91,21,12,8,...,2,3,1,0,0,4,2,5,2,3
10009,GCA_000153885.1,CTCCTATATAGTAGTTTATTAAAAAAATACTATATAGGAGCTTTAT...,nonpathogenic,75,46,27,102,21,11,11,...,0,3,2,0,0,1,3,6,2,5


In [47]:
# checking to see if there are any errors by comparing the accession ids in data and test_dataset_2

(test_dataset_2["Accession_ID"] == data[10000:100000]["Accession_ID"]).sum()

90000

# Test_Dataset_3

In [48]:
import time
test_dataset_3 = data[100000:200000]
tic = time.time()
build_frequency_dataframe(test_dataset_3,[1,2,3,4])
toc = time.time()
print(toc-tic)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


747.6057674884796


In [49]:
test_dataset_3.to_pickle("Generated_Datasets/Frequency_Data_100000_199999")

In [50]:
test_dataset_3

Unnamed: 0,Accession_ID,250bp_READ,LABEL,A,C,G,T,AA,AC,AG,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
100000,GCA_000157175.1,GAGAAAACGATGAAATTAAGAGAGTTATTATCGATAGGTGACTTAA...,pathogenic,90,28,58,74,37,8,22,...,2,2,4,1,3,0,2,2,2,5
100001,GCA_000168015.1,ACGACAACCGCCTCTACGGACGTGATATTTGCCATTGTGACCAGCC...,pathogenic,65,68,64,52,13,22,11,...,1,0,0,2,1,1,0,0,1,0
100002,GCA_000017585.1,TTTTAAAAAATATAAAAACGATAATGCTTTTTTAAGAAAAATTCTG...,nonpathogenic,95,38,30,87,50,7,9,...,1,1,0,2,0,1,7,1,1,12
100003,GCA_000174855.1,TCTCTGTTGAGATATTGTTTTTAAAATATTGTACTAAATGATTGAT...,pathogenic,86,23,34,107,35,11,7,...,1,2,4,0,0,7,4,2,4,13
100004,GCA_000242215.1,TTTGGAAGCGGATAATTTTTCATGTTCATGAATCACATTATTCTTA...,pathogenic,73,59,37,81,22,18,14,...,1,2,1,0,2,1,3,4,2,11
100005,GCA_000154525.1,ATGGGTCTGTTGGCCAAGAACGCCATCCTGATCGTAGAGTTCGCCC...,nonpathogenic,44,68,72,66,10,9,7,...,2,2,1,1,1,1,0,0,1,0
100006,GCA_000154205.1,CCATAGACTTTATTGCTTTTTTATCCATCTGAGATTATCAGCCAGC...,nonpathogenic,55,69,57,68,6,12,12,...,1,1,0,5,1,1,2,0,2,3
100007,GCA_000154525.1,CTCTTTCCCATCTCACGCAGCATCCAGCCCACCGCTTTCCGCATCA...,nonpathogenic,63,78,55,54,19,12,15,...,0,1,1,1,0,1,0,3,2,2
100008,GCA_000969835.1,TATTGAAAGAGGCACTGTATTATTATCCGAATGGTCGTCGTTCACT...,nonpathogenic,66,49,58,76,20,9,12,...,3,2,1,1,0,0,2,4,0,4
100009,GCA_000156195.1,CTATATCAGAATACGTACAAAAAGCCCCCCAACTGGCTTCGTATGC...,nonpathogenic,89,65,40,56,32,22,12,...,1,2,1,1,0,0,0,2,0,0


In [51]:
# checking to see if there are any errors by comparing the accession ids in data and test_dataset_3

(test_dataset_3["Accession_ID"] == data[100000:200000]["Accession_ID"]).sum()

100000

# Test_Dataset_4

In [52]:
import time
test_dataset_4 = data[200000:300000]
tic = time.time()
build_frequency_dataframe(test_dataset_4,[1,2,3,4])
toc = time.time()
print(toc-tic)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


765.397216796875


In [53]:
test_dataset_4.to_pickle("Generated_Datasets/Frequency_Data_200000_299999")

In [54]:
test_dataset_4

Unnamed: 0,Accession_ID,250bp_READ,LABEL,A,C,G,T,AA,AC,AG,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
200000,GCA_000154205.1,GGCATGGTTTCGCTCAGTTGGCATGCGCGCAATCCCAAGACGGGCG...,nonpathogenic,48,60,86,55,12,12,11,...,2,0,1,2,2,0,0,2,1,0
200001,GCA_000182255.1,GGTCCCGACACAGAACGTCGGCAGGTACACCATCATCGACGGCAGC...,pathogenic,50,77,78,45,10,17,12,...,2,2,0,0,1,0,0,1,0,0
200002,GCA_000318255.2,CACTTTGCGGTCTGGGCGGATTTGCATAGGAATTTGGAAGGTAGCA...,pathogenic,66,62,49,73,20,22,9,...,1,3,0,4,2,1,2,2,4,4
200003,GCA_000318255.2,GTTAATTTTCTCTTGGGTTTTGCGCTCTTCGTTGTTCATTGCCTCC...,pathogenic,26,47,56,121,5,3,6,...,1,3,1,3,5,14,5,1,7,12
200004,GCA_000305715.2,GGGAATTGGGTTTGTGTTGCTTCATGCCTTGGTCAACCCACTGACG...,pathogenic,53,54,70,73,9,15,10,...,0,1,0,3,6,3,2,0,2,2
200005,GCA_000208525.2,TCGGGCAGCAGTTGGTTAACGCCGTCAGATGCTCCGGGGTTAACTT...,pathogenic,55,77,69,49,12,15,18,...,0,1,0,0,1,0,0,3,0,1
200006,GCA_000156195.1,TTGATTGGAAACGCTGCCGACGGCACCTTGTACAGATTCTCAAAAA...,nonpathogenic,73,60,44,72,24,17,9,...,1,2,2,0,1,3,1,0,2,3
200007,GCA_000156195.1,GATCGTTAACGGTCAAGCTATCCGTATCACAGCTGAAAGAAATCCG...,nonpathogenic,80,47,56,67,34,13,15,...,2,2,2,1,0,3,0,0,2,0
200008,GCA_000168015.1,AAGGGCTAGAATGTGTCACAGGTAAGTATTTGTTTATTGTCTGAAG...,pathogenic,45,56,84,65,10,8,14,...,0,1,1,0,3,4,1,1,5,1
200009,GCA_000156195.1,CATCAGTAAAATGACAAGTTTATGTTAAAATTAAACTTAGCCAATA...,nonpathogenic,98,45,31,76,38,17,11,...,0,2,2,0,2,0,2,4,0,2


# Test_Dataset_5

In [55]:
import time
test_dataset_5 = data[300000:400000]
tic = time.time()
build_frequency_dataframe(test_dataset_5,[1,2,3,4])
toc = time.time()
print(toc-tic)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


769.2150673866272


In [56]:
test_dataset_5.to_pickle("Generated_Datasets/Frequency_Data_300000_399999")

In [57]:
test_dataset_5

Unnamed: 0,Accession_ID,250bp_READ,LABEL,A,C,G,T,AA,AC,AG,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
300000,GCA_000146855.1,AATTGCTATATTTTTATTCATCTTATTCTCCATATTATTAGATTTG...,pathogenic,101,33,18,98,44,11,8,...,0,2,1,2,0,2,6,1,2,6
300001,GCA_000735345.1,TGGGCTGGTGGGGGAGTCGGGGAGTGGCAAAACCACCGTCGGCAGA...,pathogenic,57,65,84,44,22,10,16,...,2,0,1,0,1,0,1,3,0,0
300002,GCA_000154305.1,GCTGATTCCGGTTTGTGCCGTCTGTGTGGCCAGCGCAGGAAATATC...,nonpathogenic,54,52,71,73,13,8,13,...,1,2,0,0,2,3,1,2,2,0
300003,GCA_000156195.1,ATGAGGCTTGTTCTTAATGGAGCAGTTCCTAATGGCTATTTATATG...,nonpathogenic,74,40,51,85,26,5,13,...,0,2,2,1,3,4,1,0,3,3
300004,GCA_000238635.1,CGCTGTTCTTCCCGGCGTCTCGAAGGCCGGCCGCCGGGTCATAACG...,nonpathogenic,50,82,80,38,15,10,15,...,1,2,1,0,0,0,0,1,0,1
300005,GCA_000169015.1,AAAAGATAGAAGAAAGTACTTTCGTACTAAAAGACTATTTATCAAA...,pathogenic,92,44,51,63,28,19,19,...,2,1,1,0,2,0,2,2,0,0
300006,GCA_000216615.2,AGTCTCTGCGTCAAGTCCTTCTCTCGGCGGATCCAATACTACAACA...,pathogenic,71,60,40,79,24,15,12,...,4,2,1,1,0,1,3,4,1,6
300007,GCA_000019405.1,TGCGTTTATGTGCCAATCATCCTATGCCAACATGCTATACAAACAG...,pathogenic,79,37,54,79,32,9,7,...,0,0,2,2,3,1,4,1,2,1
300008,GCA_000182255.1,CGCGCCTGTCGTTTCGTCTCGAGGCTCGCGGCGCTCGTGCCGACGA...,pathogenic,34,85,87,44,7,9,13,...,3,1,0,0,0,0,0,1,0,0
300009,GCA_000156195.1,CGCTTCCGGATATGCGAAGATGCTGGAAGAGAGCAGCGGTATGCTG...,nonpathogenic,74,58,72,46,22,21,15,...,0,0,0,0,0,1,0,1,0,0
