In [2]:
# Import libraries under their conventional aliases
import numpy as np
import pandas as pd

# Load the UCI Molecular Biology (Promoter Gene Sequences) data set.
# The first line of the file is already a data row, so no header is read and
# the column labels are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'
names = ['Class', 'id', 'Sequence']
df = pd.read_csv(url, header=None, names=names)

In [3]:
# Preview the first five rows of the raw table
df.head()

Unnamed: 0,Class,id,Sequence
0,+,S10,\t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1,+,AMPC,\t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2,+,AROH,\t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3,+,DEOP2,\taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4,+,LEU1_TRNA,\ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


In [5]:
# Count the distinct class labels
df['Class'].nunique()

2

In [6]:
# (rows, columns) of the raw table
df.shape

(106, 3)

In [7]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106 entries, 0 to 105
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Class     106 non-null    object
 1   id        106 non-null    object
 2   Sequence  106 non-null    object
dtypes: object(3)
memory usage: 2.6+ KB


In [8]:
# Summary of the raw columns (count / unique / top / freq)
df.describe()

Unnamed: 0,Class,id,Sequence
count,106,106,106
unique,2,106,106
top,-,93,\ttttatatttttcgcttgtcaggccggaataactccctataatgc...
freq,53,1,1


In [11]:
# Keep the class labels ('+' / '-') as their own Series
classes = df['Class']

In [12]:
# Display the label Series
classes

0      +
1      +
2      +
3      +
4      +
      ..
101    -
102    -
103    -
104    -
105    -
Name: Class, Length: 106, dtype: object

In [14]:
# Turn every raw sequence string into a list of single-character nucleotides
# with the row's class label appended as the final element.
sequences = df.loc[:, 'Sequence']
dataset = {}

# Sequences and labels are paired positionally with zip(). Looking the label up
# with `classes[i]` is a *label-based* access and only matches the i-th row
# while the index happens to be 0..n-1.
for i, (seq, label) in enumerate(zip(sequences, classes)):
    # the raw field carries tab characters; keep only the nucleotides
    nucleotides = [x for x in seq if x != '\t']
    nucleotides.append(label)
    dataset[i] = nucleotides

print(dataset)

{0: ['t', 'a', 'c', 't', 'a', 'g', 'c', 'a', 'a', 't', 'a', 'c', 'g', 'c', 't', 't', 'g', 'c', 'g', 't', 't', 'c', 'g', 'g', 't', 'g', 'g', 't', 't', 'a', 'a', 'g', 't', 'a', 't', 'g', 't', 'a', 't', 'a', 'a', 't', 'g', 'c', 'g', 'c', 'g', 'g', 'g', 'c', 't', 't', 'g', 't', 'c', 'g', 't', '+'], 1: ['t', 'g', 'c', 't', 'a', 't', 'c', 'c', 't', 'g', 'a', 'c', 'a', 'g', 't', 't', 'g', 't', 'c', 'a', 'c', 'g', 'c', 't', 'g', 'a', 't', 't', 'g', 'g', 't', 'g', 't', 'c', 'g', 't', 't', 'a', 'c', 'a', 'a', 't', 'c', 't', 'a', 'a', 'c', 'g', 'c', 'a', 't', 'c', 'g', 'c', 'c', 'a', 'a', '+'], 2: ['g', 't', 'a', 'c', 't', 'a', 'g', 'a', 'g', 'a', 'a', 'c', 't', 'a', 'g', 't', 'g', 'c', 'a', 't', 't', 'a', 'g', 'c', 't', 't', 'a', 't', 't', 't', 't', 't', 't', 't', 'g', 't', 't', 'a', 't', 'c', 'a', 't', 'g', 'c', 't', 'a', 'a', 'c', 'c', 'a', 'c', 'c', 'c', 'g', 'g', 'c', 'g', '+'], 3: ['a', 'a', 't', 't', 'g', 't', 'g', 'a', 't', 'g', 't', 'g', 't', 'a', 't', 'c', 'g', 'a', 'a', 'g', 't', 'g', 

In [15]:
# Build a frame from the dict: each key (sequence number) becomes a column
dframe = pd.DataFrame(dataset)

In [16]:
# Flip the frame so each row is one sequence and each column one position
df_new = dframe.T

In [17]:
# Preview the first five rows of the per-nucleotide table
df_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+


In [18]:
# (rows, columns) of the per-nucleotide table
df_new.shape

(106, 58)

In [23]:
# The label was appended last, so it sits in column 57; give it a real name
df_new.rename({57: 'Class'}, axis='columns', inplace=True)

In [24]:
# Display the frame to check the last column is now labelled 'Class'
df_new


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
0,t,a,c,t,a,g,c,a,a,t,...,g,c,t,t,g,t,c,g,t,+
1,t,g,c,t,a,t,c,c,t,g,...,c,a,t,c,g,c,c,a,a,+
2,g,t,a,c,t,a,g,a,g,a,...,c,a,c,c,c,g,g,c,g,+
3,a,a,t,t,g,t,g,a,t,g,...,a,a,c,a,a,a,c,t,c,+
4,t,c,g,a,t,a,a,t,t,a,...,c,c,g,t,g,g,t,a,g,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,c,c,t,c,a,a,t,g,g,c,...,g,a,a,c,t,a,t,a,t,-
102,g,t,a,t,t,c,t,c,a,a,...,t,c,a,a,c,a,t,t,g,-
103,c,g,c,g,a,c,t,a,c,g,...,a,a,g,g,c,t,t,c,c,-
104,c,t,c,g,t,c,c,t,c,a,...,a,g,g,a,g,g,a,a,c,-


In [25]:
# Per-column summary (count / unique / top / freq)
df_new.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,Class
count,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
unique,4,4,4,4,4,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,2
top,t,a,a,c,a,a,a,a,a,a,...,c,c,c,t,t,c,c,t,t,-
freq,38,34,30,30,36,42,38,34,33,36,...,36,42,31,33,35,32,29,29,34,53


In [34]:
# Keep the column labels for iteration
column = df_new.columns

In [37]:
# Tally how often each value occurs in every column, keeping the column
# label next to its counts.
series = [(name, df_new[name].value_counts()) for name in column]

In [38]:
# Display the (column label, value counts) pairs
series

[(0,
  t    38
  c    27
  a    26
  g    15
  Name: 0, dtype: int64),
 (1,
  a    34
  t    26
  g    24
  c    22
  Name: 1, dtype: int64),
 (2,
  a    30
  g    28
  t    27
  c    21
  Name: 2, dtype: int64),
 (3,
  c    30
  g    28
  t    26
  a    22
  Name: 3, dtype: int64),
 (4,
  a    36
  g    29
  t    22
  c    19
  Name: 4, dtype: int64),
 (5,
  a    42
  t    24
  g    22
  c    18
  Name: 5, dtype: int64),
 (6,
  a    38
  t    30
  c    21
  g    17
  Name: 6, dtype: int64),
 (7,
  a    34
  t    32
  g    20
  c    20
  Name: 7, dtype: int64),
 (8,
  a    33
  t    32
  c    22
  g    19
  Name: 8, dtype: int64),
 (9,
  a    36
  t    28
  c    22
  g    20
  Name: 9, dtype: int64),
 (10,
  a    38
  t    31
  c    22
  g    15
  Name: 10, dtype: int64),
 (11,
  c    31
  t    29
  g    25
  a    21
  Name: 11, dtype: int64),
 (12,
  t    34
  g    29
  a    29
  c    14
  Name: 12, dtype: int64),
 (13,
  c    38
  a    24
  g    23
  t    21
  Name: 13, dtype: int64)

In [39]:
# One-hot encode every column: each position becomes <position>_<base>
# indicator columns and the label becomes Class_+ / Class_-.
# The dtype is pinned because the get_dummies default changed from uint8 to
# bool in pandas 2.0; uint8 keeps 0/1 integer indicators on every version.
df_new = pd.get_dummies(df_new, dtype=np.uint8)

In [40]:
# Inspect the one-hot encoded frame
df_new

Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,Class_+,Class_-
0,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,1,1,0
1,0,0,0,1,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,0,1,1,0,...,0,1,0,0,0,0,1,0,1,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,1,0,0,1,0
4,0,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
102,0,0,1,0,0,0,0,1,1,0,...,0,0,0,1,0,0,1,0,0,1
103,0,1,0,0,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
104,0,1,0,0,0,0,0,1,0,1,...,1,0,0,0,0,1,0,0,0,1


In [46]:
# Class_+ and Class_- are complementary indicators, so one of them is redundant
df_new.drop(columns='Class_-', inplace=True)

In [47]:
# The remaining indicator is 1 for '+' rows and 0 for '-' rows; call it Class
df_new.rename({'Class_+': 'Class'}, axis='columns', inplace=True)

In [48]:
# Confirm that a single binary 'Class' column remains
df_new

Unnamed: 0,0_a,0_c,0_g,0_t,1_a,1_c,1_g,1_t,2_a,2_c,...,54_t,55_a,55_c,55_g,55_t,56_a,56_c,56_g,56_t,Class
0,0,0,0,1,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,1
1,0,0,0,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,1
2,0,0,1,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,1,0,1
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
4,0,0,0,1,0,1,0,0,0,0,...,1,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,0,1,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,0,1,0
102,0,0,1,0,0,0,0,1,1,0,...,1,0,0,0,1,0,0,1,0,0
103,0,1,0,0,0,0,1,0,0,1,...,1,0,1,0,0,0,1,0,0,0
104,0,1,0,0,0,0,0,1,0,1,...,0,1,0,0,0,0,1,0,0,0


In [49]:
# Use the model_selection module to separate training and testing datasets
from sklearn import model_selection

# Create X and Y datasets for training.
# The label column is removed with `columns=` rather than a positional axis
# argument: passing the axis positionally to DataFrame.drop was deprecated in
# the pandas 1.x series and raises TypeError under pandas 2.0+.
X = np.array(df_new.drop(columns=['Class']))
y = np.array(df_new['Class'])

# define seed for reproducibility
seed = 1

# split data into training and testing datasets (25% held out for testing)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=seed)


In [50]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# define scoring method
scoring = 'accuracy'

# Define models to train
names = ["Nearest Neighbors", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "SVM Linear", "SVM RBF", "SVM Sigmoid"]

# Estimators with internal randomness are seeded so that repeated runs of the
# notebook produce the same scores.
classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=seed),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=seed),
    MLPClassifier(alpha=1, random_state=seed),
    AdaBoostClassifier(random_state=seed),
    GaussianNB(),
    SVC(kernel = 'linear'),
    SVC(kernel = 'rbf'),
    SVC(kernel = 'sigmoid')
]

# Materialise the (name, estimator) pairs. A bare zip() is a one-shot iterator
# in Python 3 and would be empty for any loop that runs after the first one.
models = list(zip(names, classifiers))

# 10 contiguous folds, shared by every model. random_state only has an effect
# together with shuffle=True; recent scikit-learn raises ValueError when it is
# passed without shuffling, so it is omitted (the folds are unchanged).
kfold = model_selection.KFold(n_splits=10)

# evaluate each model in turn
results = []
names = []

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # report mean accuracy across the folds with its standard deviation
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    



Nearest Neighbors: 0.823214 (0.113908)
Gaussian Process: 0.873214 (0.056158)




Decision Tree: 0.673214 (0.230357)




Random Forest: 0.700000 (0.195256)




Neural Net: 0.862500 (0.117925)
AdaBoost: 0.912500 (0.112500)
Naive Bayes: 0.837500 (0.137500)
SVM Linear: 0.850000 (0.108972)
SVM RBF: 0.887500 (0.067315)




SVM Sigmoid: 0.900000 (0.093541)


In [55]:
# Fit every model on the training split and report its held-out performance.
# `models` was built with zip(), which in Python 3 is a one-shot iterator that
# the cross-validation loop may already have consumed, leaving nothing to
# iterate here. Pair the names and estimators afresh instead.
for name, model in zip(names, classifiers):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))