# DNA Sequence Classification for Detecting E. coli Promoters

In [1]:
# import libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import OneHotEncoder
import pickle 
from sklearn.neural_network import MLPClassifier 
from sklearn.metrics import classification_report, accuracy_score 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay 

In [2]:
# Read the UCI promoter gene sequences file straight from its URL.
# Column labels are supplied through `names`, so the first line of the file
# is parsed as a data row rather than as a header.
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'molecular-biology/promoter-gene-sequences/promoters.data'
)
names = ['Class', 'id', 'Sequence']
data = pd.read_csv(url, names=names)
data.head()

Unnamed: 0,Class,id,Sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [8]:
# Dimensions of the loaded table as (rows, columns)
data.shape

(106, 3)

In [9]:
# Inspect the dtype pandas assigned to each column
data.dtypes

Class       object
id          object
Sequence    object
dtype: object

In [10]:
# Select the Class column with label-based .loc indexing (all rows, one column)
data.loc[:, "Class"]

0      +
1      +
2      +
3      +
4      +
      ..
101    -
102    -
103    -
104    -
105    -
Name: Class, Length: 106, dtype: object

In [11]:
# Select the Class column with plain bracket indexing
data["Class"]

0      +
1      +
2      +
3      +
4      +
      ..
101    -
102    -
103    -
104    -
105    -
Name: Class, Length: 106, dtype: object

In [12]:
# Pull the label column out of the table and count the samples per class
classes = data.loc[:, "Class"]
classes.value_counts()

+    53
-    53
Name: Class, dtype: int64

In [13]:
# Collect the raw DNA sequence strings into a plain Python list,
# then peek at the last entry
sequence = data["Sequence"].tolist()
sequence[-1]

'\t\ttaacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact'

## Data Preprocessing

We will start by removing the tab characters (`"\t"`) that precede each DNA sequence. The code below drops every tab it finds in a sequence.

In [24]:
# Remove tab characters from each sequence and attach its class label.
# `dic` maps the position of a sample to a list holding one nucleotide
# character per element, followed by the class symbol as the last element.
dic = {}

# Pair each sequence with its label by position. Looking the label up with
# `classes[i]` is label-based on a Series and only matches the list position
# while the index is the default 0..n-1 range.
for i, (seq, label) in enumerate(zip(sequence, classes)):
    # Iterating a string yields its characters; keep everything except tabs
    nucleotides = [char for char in seq if char != '\t']
    # append class assignment
    nucleotides.append(label)

    dic[i] = nucleotides

In [32]:
# Show how list() splits a raw sequence string into single characters,
# including the two leading tab characters
list('\t\ttaacattaataaataaggaggctctaatggcactcattagccaatcaatcaagaact')

['\t',
 '\t',
 't',
 'a',
 'a',
 'c',
 'a',
 't',
 't',
 'a',
 'a',
 't',
 'a',
 'a',
 'a',
 't',
 'a',
 'a',
 'g',
 'g',
 'a',
 'g',
 'g',
 'c',
 't',
 'c',
 't',
 'a',
 'a',
 't',
 'g',
 'g',
 'c',
 'a',
 'c',
 't',
 'c',
 'a',
 't',
 't',
 'a',
 'g',
 'c',
 'c',
 'a',
 'a',
 't',
 'c',
 'a',
 'a',
 't',
 'c',
 'a',
 'a',
 'g',
 'a',
 'a',
 'c',
 't']