In [2]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

## Loading Sample Datasets

In [3]:
dataset_1 = pd.read_pickle("Generated_Datasets/Frequency_Data_0_9999")
dataset_2 = pd.read_pickle("Generated_Datasets/Frequency_Data_10000_99999")
dataset_3 = pd.read_pickle("Generated_Datasets/Frequency_Data_100000_199999")
dataset_4 = pd.read_pickle("Generated_Datasets/Frequency_Data_200000_299999")
dataset_5 = pd.read_pickle("Generated_Datasets/Frequency_Data_300000_399999")

In [4]:
raw_data = pd.concat([dataset_1, dataset_2, dataset_3, dataset_4, dataset_5])

In [5]:
raw_data

Unnamed: 0,Accession_ID,250bp_READ,LABEL,A,C,G,T,AA,AC,AG,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,GCA_000154485.1,AGCATCCGTATTACCAGTTCCACCATTTCTAACTTGAACCGCTTGA...,pathogenic,67,59,31,93,18,18,6,...,0,1,4,2,1,3,4,2,4,3
1,GCA_000154205.1,CTATTTCCGCAGCCATGCCCCCGCTACCTGCAAGACTTCTCCTGGT...,nonpathogenic,64,78,56,51,16,23,12,...,0,3,0,1,0,0,1,2,1,0
2,GCA_000167995.1,GGTGATGCGCACCTATACCTTGCGCGCGCTGCGTGCAGAGCAAAAC...,pathogenic,48,67,89,46,9,15,11,...,0,0,0,3,1,1,0,0,1,0
3,GCA_000512375.1,ATGGGGATAGCCAAGAGATAACATTCTTATCCTCACAACTATGGCC...,pathogenic,70,57,51,72,21,13,13,...,3,3,1,1,2,0,1,2,0,4
4,GCA_000154305.1,TGAAAAGATACTGAGAGAAGATCCTGCAATCACTCATGCGGCTATG...,nonpathogenic,69,45,64,72,18,9,17,...,0,4,2,1,1,0,1,2,1,4
5,GCA_000157955.1,TTCGCCCAAAAGCACACAAAAAAGCCACACAATGATCCAAATCAGA...,nonpathogenic,78,59,59,54,35,15,13,...,1,1,1,0,2,1,2,0,1,4
6,GCA_000238635.1,CCGCACAGAGAAAGGATGCCGGATATGAGCGAGACACATTTCCCCC...,nonpathogenic,63,72,79,36,18,16,15,...,0,0,0,0,0,0,1,2,0,1
7,GCA_000156195.1,TATGACGAAGGAGGCATCATTGCAGCTCCCGGTTCGGCAATAGAAT...,nonpathogenic,73,51,68,58,21,8,22,...,3,3,0,2,2,0,0,2,0,2
8,GCA_000022745.1,TTTAGGGTACGGTCTATATGCAGGAGCTATTTCCTGGAANCGCTTC...,pathogenic,57,74,52,65,13,17,11,...,3,0,0,0,1,0,3,3,0,0
9,GCA_000153925.1,GTTTACATACATGGCAAGACCCTGATCTTTATCCTGCACCCATGTA...,nonpathogenic,70,62,37,80,20,17,7,...,0,3,0,0,0,2,4,3,1,2


## Dropping the Accession_ID and 250bp_READ

In [6]:
columns = ["Accession_ID" , "250bp_READ"]
data = raw_data.drop(columns, axis=1)

In [7]:
data.head(7)

Unnamed: 0,LABEL,A,C,G,T,AA,AC,AG,AT,CA,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,pathogenic,67,59,31,93,18,18,6,25,18,...,0,1,4,2,1,3,4,2,4,3
1,nonpathogenic,64,78,56,51,16,23,12,12,16,...,0,3,0,1,0,0,1,2,1,0
2,pathogenic,48,67,89,46,9,15,11,13,16,...,0,0,0,3,1,1,0,0,1,0
3,pathogenic,70,57,51,72,21,13,13,23,16,...,3,3,1,1,2,0,1,2,0,4
4,nonpathogenic,69,45,64,72,18,9,17,25,18,...,0,4,2,1,1,0,1,2,1,4
5,nonpathogenic,78,59,59,54,35,15,13,15,21,...,1,1,1,0,2,1,2,0,1,4
6,nonpathogenic,63,72,79,36,18,16,15,13,15,...,0,0,0,0,0,0,1,2,0,1


# Generating Binary Labels for pathogenic = 1 , nonpathogenic = 0

In [8]:
changes = {"pathogenic" : 1 , "nonpathogenic" : 0}  # changes to be made for binary conversion
data["LABEL"] = data["LABEL"].replace(changes)

In [9]:
data.head()

Unnamed: 0,LABEL,A,C,G,T,AA,AC,AG,AT,CA,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,1,67,59,31,93,18,18,6,25,18,...,0,1,4,2,1,3,4,2,4,3
1,0,64,78,56,51,16,23,12,12,16,...,0,3,0,1,0,0,1,2,1,0
2,1,48,67,89,46,9,15,11,13,16,...,0,0,0,3,1,1,0,0,1,0
3,1,70,57,51,72,21,13,13,23,16,...,3,3,1,1,2,0,1,2,0,4
4,0,69,45,64,72,18,9,17,25,18,...,0,4,2,1,1,0,1,2,1,4


In [10]:
len(data)

400000

In [11]:
print("Distribution of Pathogenic Data :",  (data["LABEL"].sum())/len(data))

Distribution of Pathogenic Data : 0.50115


# Separating the Target and Features in data

In [12]:
data_target = data["LABEL"]
data_feat = data.drop("LABEL" , axis=1)

In [13]:
data_target

0         1
1         0
2         1
3         1
4         0
         ..
399995    0
399996    0
399997    1
399998    1
399999    0
Name: LABEL, Length: 400000, dtype: int64

In [14]:
data_feat.head(5)

Unnamed: 0,A,C,G,T,AA,AC,AG,AT,CA,CC,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
0,67,59,31,93,18,18,6,25,18,16,...,0,1,4,2,1,3,4,2,4,3
1,64,78,56,51,16,23,12,12,16,28,...,0,3,0,1,0,0,1,2,1,0
2,48,67,89,46,9,15,11,13,16,15,...,0,0,0,3,1,1,0,0,1,0
3,70,57,51,72,21,13,13,23,16,11,...,3,3,1,1,2,0,1,2,0,4
4,69,45,64,72,18,9,17,25,18,4,...,0,4,2,1,1,0,1,2,1,4


# Performing the train-test split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
train_data , test_data , train_target , test_target = train_test_split(data_feat, data_target, test_size = 0.1)

In [17]:
print("Distribution of Pathogenic Train Data :",  (train_target.sum())/len(train_data))

Distribution of Pathogenic Train Data : 0.5008055555555555


In [18]:
print("Distribution of Pathogenic Test Data :",  (test_target.sum())/len(test_data))

Distribution of Pathogenic Test Data : 0.50425


In [19]:
train_data.shape

(360000, 340)

In [20]:
test_data.shape

(40000, 340)

In [21]:
train_data.head(5)

Unnamed: 0,A,C,G,T,AA,AC,AG,AT,CA,CC,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
369297,56,70,60,64,11,15,16,14,20,17,...,1,4,1,1,1,2,1,1,1,0
68927,55,71,75,49,14,13,15,13,16,21,...,0,0,1,0,0,2,0,2,1,1
169096,79,41,64,66,25,12,22,20,12,4,...,1,2,3,3,1,1,2,0,3,1
352493,64,44,56,86,22,8,9,25,15,10,...,0,1,2,4,4,0,2,3,3,9
31996,75,35,32,108,31,7,7,30,10,7,...,2,2,3,1,0,4,6,7,1,17


# Implementing the models

# Logistic Regression , Data Size : 3.6e5

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
log_reg_clf = LogisticRegression()

In [24]:
import time
tic = time.time()
log_reg_clf.fit(train_data, train_target)
toc = time.time()
print("Time Taken for Fiiting" , toc-tic)



Time Taken for Fiiting 134.27339482307434


In [25]:
log_reg_clf.score(train_data, train_target)

0.771425

In [26]:
log_reg_clf.score(test_data, test_target)

0.77415

In [None]:
log_

#### Implementing the Confusion Matrix for Train_Data

In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix(log_reg_clf.predict(train_data) , train_target)

array([[139191,  41768],
       [ 40519, 138522]], dtype=int64)

#### Implementing the Confusion Matrix for Test_Data

In [28]:
confusion_matrix(log_reg_clf.predict(test_data) , test_target)

array([[15509,  4713],
       [ 4321, 15457]], dtype=int64)

#   
## Random Forest Classifier , Data Size : 3.6e5

In [29]:
from sklearn.ensemble import RandomForestClassifier
rand_for_clf = RandomForestClassifier()

In [30]:
import time
tic = time.time()
rand_for_clf.fit(train_data, train_target)
toc = time.time()
print("Time taken for fitting : ", toc-tic)



Time taken for fitting :  24.98577618598938


In [31]:
rand_for_clf.score(train_data,train_target)

0.9904111111111111

In [32]:
rand_for_clf.score(test_data,test_target)

0.746075

In [33]:
confusion_matrix(rand_for_clf.predict(train_data) , train_target)

array([[179144,   2886],
       [   566, 177404]], dtype=int64)

In [34]:
confusion_matrix(rand_for_clf.predict(test_data) , test_target)

array([[16186,  6513],
       [ 3644, 13657]], dtype=int64)

#   
## Linear Support Vector Classifier , Data Size : 3.6e5

In [49]:
from sklearn.svm import LinearSVC

linear_svc = LinearSVC(max_iter = 10000)

In [None]:
import time
tic = time.time()
linear_svc.fit(train_data, train_target)
toc = time.time()
print("Time taken for training :", toc-tic)

In [None]:
linear_svc.score(train_data,train_target)

In [None]:
linear_svc.score(test_data,test_target)

In [46]:
confusion_matrix(linear_svc.predict(train_data) , train_target)

array([[ 22400,   2605],
       [157310, 177685]], dtype=int64)

In [47]:
confusion_matrix(linear_svc.predict(test_data) , test_target)

array([[ 2511,   276],
       [17319, 19894]], dtype=int64)

#   
## Decision Tree Based Classifier , Data Size : 3.6e5

In [35]:
from sklearn.tree import DecisionTreeClassifier

decision_tree_clf = DecisionTreeClassifier()

In [36]:
import time
tic = time.time()
decision_tree_clf.fit(train_data, train_target)
toc = time.time()
print("Time taken to train :" ,toc-tic)

Time taken to train : 70.18243026733398


In [37]:
decision_tree_clf.score(train_data,train_target)

1.0

In [38]:
decision_tree_clf.score(test_data,test_target)

0.684125

In [39]:
confusion_matrix(decision_tree_clf.predict(train_data) , train_target)

array([[179710,      0],
       [     0, 180290]], dtype=int64)

In [41]:
confusion_matrix(decision_tree_clf.predict(test_data) , test_target)

array([[13735,  6540],
       [ 6095, 13630]], dtype=int64)

#   
## Support Vector Classifier , Data Size : 2e5
##### takes too much time ~1hr

In [22]:
from sklearn.svm import SVC

model_svc = SVC(gamma = "auto")

In [None]:
import time
tic = time.time()
model_svc.fit(train_data, train_target)
toc = time.time()
print("Time taken for fitting :",toc-tic)

In [None]:
model_svc.score(train_data,train_target)

In [None]:
model_svc.score(test_data,test_target)

In [None]:
confusion_matrix(model_svc.predict(train_data) , train_target)

In [None]:
confusion_matrix(model_svc.predict(test_data) , test_target)