# DNA Sequence Classification for Detecting E. coli Promoters

In [1]:
# import libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import OneHotEncoder
import pickle 
from sklearn.neural_network import MLPClassifier 
from sklearn.metrics import classification_report, accuracy_score 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay 

In [2]:
# Load the UCI promoter gene sequences. The first line of the file is already
# a data row, so the column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
names = ['Class', 'id', 'Sequence']
data = pd.read_csv(url, header=None, names=names)
data.head()

Unnamed: 0,Class,id,Sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [3]:
# (rows, columns) of the raw dataset
data.shape

(106, 3)

In [4]:
# all three columns are loaded with dtype object
data.dtypes

Class       object
id          object
Sequence    object
dtype: object

In [5]:
# select the label column with .loc (all rows, "Class" column)
data.loc[:, "Class"]

0      +
1      +
2      +
3      +
4      +
      ..
101    -
102    -
103    -
104    -
105    -
Name: Class, Length: 106, dtype: object

In [6]:
# the same label column, selected with plain column indexing
data["Class"]

0      +
1      +
2      +
3      +
4      +
      ..
101    -
102    -
103    -
104    -
105    -
Name: Class, Length: 106, dtype: object

In [7]:
# Split out the label column and count how many sequences fall in each class
classes = data.loc[:, "Class"]
classes.value_counts()

+    53
-    53
Name: Class, dtype: int64

In [8]:
# Pull the raw DNA sequences into a plain Python list and peek at the last one
sequence = data["Sequence"].tolist()
sequence[-1]

'\t\ttaacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact'

## Data Preprocessing

We will start by removing the tab characters (`"\t"`) from the beginning of each DNA sequence.

In [9]:
# Strip the tab padding from every sequence and append its class label.
# dic maps row position -> list of single-character nucleotides + label.
dic = {}

# zip pairs each sequence with its label by position, rather than classes[i],
# which is a label-based Series lookup that is only correct while the index
# is the default 0..n-1 range.
for i, (seq, label) in enumerate(zip(sequence, classes)):
    # iterating a string yields its characters, so no list(seq) copy is needed
    nucleotides = [char for char in seq if char != '\t']
    # append class assignment
    nucleotides.append(label)

    dic[i] = nucleotides

In [13]:
# inspect the first cleaned sequence: its nucleotides followed by the class label
dic[0]

['t',
 'a',
 'c',
 't',
 'a',
 'g',
 'c',
 'a',
 'a',
 't',
 'a',
 'c',
 'g',
 'c',
 't',
 't',
 'g',
 'c',
 'g',
 't',
 't',
 'c',
 'g',
 'g',
 't',
 'g',
 'g',
 't',
 't',
 'a',
 'a',
 'g',
 't',
 'a',
 't',
 'g',
 't',
 'a',
 't',
 'a',
 'a',
 't',
 'g',
 'c',
 'g',
 'c',
 'g',
 'g',
 'g',
 'c',
 't',
 't',
 'g',
 't',
 'c',
 'g',
 't',
 '+']

In [14]:
# Build a dataframe from the dict: each key becomes a column, so at this
# point every sequence is laid out column-wise.
df = pd.DataFrame(data=dic)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,t,t,g,a,t,a,c,t,c,t,...,c,c,t,a,g,c,g,c,c,t
1,a,g,t,a,c,g,a,t,g,t,...,c,g,a,g,a,c,t,g,t,a
2,c,c,a,t,g,g,g,t,a,t,...,g,c,t,a,g,t,a,c,c,a
3,t,t,c,t,a,g,g,c,c,t,...,a,t,g,g,a,c,t,g,g,c
4,a,a,t,g,t,g,g,t,t,a,...,g,a,a,g,g,a,t,a,t,a


In [15]:
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
53,t,c,g,a,g,a,g,a,g,g,...,c,c,t,a,t,a,a,t,g,a
54,c,c,g,c,t,g,a,a,t,a,...,t,t,a,t,a,t,t,t,a,a
55,g,a,c,t,a,g,a,c,t,c,...,t,t,t,g,c,a,t,c,a,c
56,t,a,g,c,g,t,t,a,t,a,...,g,t,t,a,g,t,g,c,c,t
57,+,+,+,+,+,+,+,+,+,+,...,-,-,-,-,-,-,-,-,-,-


As we can see from the dataframe, the class of each sequence is stored in the last row, so we need to transpose the dataframe so that each sequence becomes a row and the class becomes the last column.

In [16]:
# Flip rows and columns so each sequence is a row and the class label is the
# last column. NOTE: re-running this cell transposes the frame back again.
df = df.transpose()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [17]:
# Rename the last column (label 57 after the transpose), which holds the
# class labels. The column is looked up by position instead of hard-coding
# 57, and the result is reassigned rather than mutated with inplace=True so
# the cell's effect is explicit and safe to re-run.
df = df.rename(columns={df.columns[-1]: "Class"})
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [18]:
# Feature frame: an independent deep copy of df with the target column removed
temp = df.copy(deep=True).drop(columns=["Class"])
temp.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,t,a,c,t,a,g,c,a,a,t,...,g,g,c,t,t,g,t,c,g,t
1,t,g,c,t,a,t,c,c,t,g,...,g,c,a,t,c,g,c,c,a,a
2,g,t,a,c,t,a,g,a,g,a,...,c,c,a,c,c,c,g,g,c,g
3,a,a,t,t,g,t,g,a,t,g,...,t,a,a,c,a,a,a,c,t,c
4,t,c,g,a,t,a,a,t,t,a,...,t,c,c,g,t,g,g,t,a,g


The independent variables, the DNA sequences, are in string format. We need to convert them to numbers so that the data can be fed to the model. We will use `OneHotEncoder`.

In [21]:
# One-hot encode every sequence position.
# NOTE(review): handle_unknown="ignore" is presumably meant to tolerate symbols
# not seen during fit at prediction time - confirm against the scikit-learn docs.
enc = OneHotEncoder(handle_unknown="ignore").fit(temp)
enc

In [25]:
# view the categories learned for each input column (one array per column)
enc.categories_

[array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dtype=object),
 array(['a', 'c', 'g', 't'], dty

In [26]:
# number of category arrays, i.e. one per encoded input column
len(enc.categories_)

57

In [29]:
# Apply the fitted encoder to the features, then convert the result to an
# array with toarray()
encoded = enc.transform(temp)
df1 = encoded.toarray()
df1

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [37]:
# number of one-hot columns per sequence (57 positions x 4 nucleotides)
df1.shape[1]

228

We will save the fitted `OneHotEncoder` so that we can use it when deploying our model.

In [36]:
# Persist the fitted encoder to disk so it can be reloaded later
with open("EColi-encoder.pickle", mode="wb") as encoder_file:
    pickle.dump(enc, encoder_file)

In [38]:
# Wrap the encoded array in a dataframe (columns are numbered 0..n-1)
df_new = pd.DataFrame(data=df1)
df_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,218,219,220,221,222,223,224,225,226,227
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [39]:
# (sequences, one-hot columns) of the encoded frame
df_new.shape

(106, 228)

Alternative way to encode the string data using pandas `get_dummies()` method