In [1]:
import pandas as pd
from aerobot.io import load_training_data, load_validation_data
from aerobot.utls import process_data
from aerobot.models import LogisticClassifier, GeneralClassifier
import seaborn as sns

# A model based only on chemical features of the genome 

The nominal oxidation state of carbon (ZC) of double-stranded DNA (dsDNA) is an analytic function of its GC content. For proteins and RNA, the oxidation states have to be calculated from the sequence content, so we load a reference table. 

The code below loads the amino acid and DNA sequence content for each genome, calculates the nominal oxidation states of each, and then uses them as features in a three-class predictor of oxygen utilization (Aerobe, Anaerobe, Facultative). 

TODO: we need access to the coding sequences in order to calculate the ZC of RNA coding sequences. Josh is working on this. 

In [2]:
# Load the training and validation sets with the 'chemical' feature type.
# Both results are indexed further down by "features" and "labels".
training_data, validation_data = (
    load_training_data(feature_type='chemical'),
    load_validation_data(feature_type='chemical'),
)

In [3]:
# Prepare the classifier inputs. process_data receives the training features and
# physiology labels followed by the validation features and physiology labels;
# its result is read below through the "X_train", "y_train", "X_test" and
# "y_test" keys (presumably training -> train, validation -> test; confirm
# against process_data).
cleaned_data = process_data(
    training_data["features"],
    training_data["labels"]["physiology"],
    validation_data["features"],
    validation_data["labels"]["physiology"],
)

In [4]:
# Inspect the chemical feature table used for training: one row per genome
# accession, with columns such as gc_content, number_of_genes, cds_aa_zc and
# cds_nt_zc (see the rendered output below).
training_data["features"]

Unnamed: 0,gc_content,number_of_genes,cds_aa_zc,cds_aa_NC,cds_aa_NN,cds_aa_NO,cds_aa_NS,cds_nt_zc,cds_nt_NC,cds_nt_NN,cds_nt_NO
GCA_018364455.1,0.397108,2027,-0.181580,5.049418,1.322511,2.503527,0.028954,0.898374,9.526472,3.765716,8.166180
GCA_018382595.1,0.658769,1982,-0.120669,4.739994,1.331036,2.444457,0.029963,0.903893,9.480398,3.790845,8.355247
GCA_018500005.1,0.559004,2933,-0.139910,4.820259,1.327489,2.438841,0.039145,0.908269,9.508829,3.795532,8.275603
GCA_018260395.1,0.384381,2164,-0.157605,4.952402,1.334554,2.485141,0.035042,0.895817,9.519444,3.746664,8.178853
GCA_018260445.1,0.454419,2309,-0.151177,4.908725,1.335421,2.470430,0.034993,0.901295,9.516261,3.775879,8.216181
...,...,...,...,...,...,...,...,...,...,...,...
GCF_900182615.1,0.567127,4072,-0.140769,4.852109,1.339824,2.439453,0.038207,0.910334,9.510909,3.804856,8.278082
GCF_900182695.1,0.631108,4240,-0.138523,4.809742,1.359516,2.409570,0.036675,0.913165,9.503230,3.820903,8.318638
GCF_900182565.1,0.463988,3301,-0.167159,5.033637,1.369696,2.466247,0.037186,0.911401,9.536851,3.816866,8.191596
GCF_900182545.1,0.750419,5194,-0.115992,4.673057,1.357393,2.399747,0.021950,0.916100,9.485631,3.841545,8.394653


In [4]:
# Fit a logistic classifier on the prepared training split. The labels still
# carry all three physiology classes at this point (see the confusion matrix
# output below).
print("Fitting model...")
model = LogisticClassifier(max_iter=100000, normalize=True, random_state=None)
model.fit(cleaned_data["X_train"], cleaned_data["y_train"])

# Scores on the same data the model was fitted on.
accuracy = model.score(cleaned_data["X_train"], cleaned_data["y_train"])
balanced_accuracy = model.balanced_accuracy(cleaned_data["X_train"], cleaned_data["y_train"])
print(f"Accuracy: {accuracy!s}")
print(f"Balanced Accuracy: {balanced_accuracy!s}")

# Scores on the test split.
test_accuracy = model.score(cleaned_data["X_test"], cleaned_data["y_test"])
test_balanced_accuracy = model.balanced_accuracy(cleaned_data["X_test"], cleaned_data["y_test"])
print(f"Test Accuracy: {test_accuracy!s}")
print(f"Test Balanced Accuracy: {test_balanced_accuracy!s}")

# Confusion matrix on the test split, labelled on both axes with the classes
# of the fitted classifier.
C = pd.DataFrame(
    model.confusion_matrix(cleaned_data["X_test"], cleaned_data["y_test"]),
    index=model.classifier.classes_,
    columns=model.classifier.classes_,
)
# Cell output: every row divided by its own row total, i.e. per-row fractions
# (rows presumably correspond to the true class; confirm against
# model.confusion_matrix).
C.div(C.sum(axis=1), axis=0)


Fitting model...
Accuracy: 0.7656716417910447
Balanced Accuracy: 0.5826862710457275
Test Accuracy: 0.6183206106870229
Test Balanced Accuracy: 0.5401324029343353


Unnamed: 0,Aerobe,Anaerobe,Facultative
Aerobe,0.716049,0.271605,0.012346
Anaerobe,0.095652,0.904348,0.0
Facultative,0.575758,0.424242,0.0


In [5]:
# Collapse the task to two classes and run again: every "Facultative" label is
# relabelled as "Aerobe", leaving Aerobe vs. Anaerobe.
#
# The relabelled labels are held in their own variables instead of being
# written back into training_data / validation_data. Overwriting the loaded
# labels would silently turn the earlier three-class data-preparation cell into
# a two-class one if it were re-run after this cell. .replace() returns a new
# object (assuming the labels are pandas Series; confirm against the loaders),
# so the loaded data stays untouched.
training_labels_two_class = training_data["labels"]["physiology"].replace("Facultative", "Aerobe")
validation_labels_two_class = validation_data["labels"]["physiology"].replace("Facultative", "Aerobe")

# Same preparation step as for the three-class model, but with the collapsed labels.
cleaned_data = process_data(
    training_data["features"], training_labels_two_class,
    validation_data["features"], validation_labels_two_class)


print("Fitting two-class model...")
model = LogisticClassifier(max_iter=100000, normalize=True, random_state=None)
model.fit(cleaned_data["X_train"], cleaned_data["y_train"])

# Scores on the same data the model was fitted on.
accuracy = model.score(cleaned_data["X_train"], cleaned_data["y_train"])
balanced_accuracy = model.balanced_accuracy(cleaned_data["X_train"], cleaned_data["y_train"])

print("Accuracy: " + str(accuracy))
print("Balanced Accuracy: " + str(balanced_accuracy))

# compute accuracy and balanced accuracy on test set
test_accuracy = model.score(cleaned_data["X_test"], cleaned_data["y_test"])
test_balanced_accuracy = model.balanced_accuracy(cleaned_data["X_test"], cleaned_data["y_test"])
print("Test Accuracy: " + str(test_accuracy))
print("Test Balanced Accuracy: " + str(test_balanced_accuracy))

# Confusion matrix on the test split, labelled on both axes with the classes
# of the fitted classifier.
C = model.confusion_matrix(cleaned_data["X_test"], cleaned_data["y_test"])
C = pd.DataFrame(C, index=model.classifier.classes_, columns=model.classifier.classes_)
# Cell output: every row divided by its own row total, i.e. per-row fractions.
C.div(C.sum(axis=1), axis=0)

Fitting two-class model...
Accuracy: 0.8597014925373134
Balanced Accuracy: 0.8427810415291708
Test Accuracy: 0.7595419847328244
Test Balanced Accuracy: 0.7629991126885537


Unnamed: 0,Aerobe,Anaerobe
Aerobe,0.734694,0.265306
Anaerobe,0.208696,0.791304
