In [1]:
import pandas as pd
from aerobot.io import load_training_data, load_validation_data
from aerobot.utls import process_data
from aerobot.models import LogisticClassifier
import seaborn as sns

# A model based only on oxidation state features

The oxidation state of double-stranded DNA (dsDNA) is an analytic function of its GC content. For proteins and RNA, we need to calculate these values explicitly, so we load a reference table.

The code below loads the amino acid and DNA sequence content of each genome, calculates their nominal oxidation states, and then uses those as features in a three-class predictor of oxygen utilization.

TODO: we need access to the coding sequences in order to calculate the ZC of RNA coding sequences. Josh is working on this.

In [2]:
# Column labels used by calc_gc: the four canonical bases and the G/C subset.
CANONICAL_NTS = ['A', 'C', 'G', 'T']
GC_NTS = ['G', 'C']

def calc_gc(nt1_features_df):
    """Compute the G+C fraction among canonical nucleotides for each row.

    Columns other than A, C, G and T are ignored in both the numerator and
    the denominator.

    Args:
        nt1_features_df (pd.DataFrame): Single-nucleotide counts, one column
            per nucleotide. Must contain the columns A, C, G and T.

    Returns:
        pd.Series: G+C fraction per row, named 'gc_content' and sharing the
            index of the input.
    """
    acgt_counts = nt1_features_df.loc[:, CANONICAL_NTS]
    gc_total = acgt_counts.loc[:, GC_NTS].sum(axis=1)
    acgt_total = acgt_counts.sum(axis=1)
    return (gc_total / acgt_total).rename('gc_content')

# Per-amino-acid reference table; its index supplies the amino acid labels
# that calc_aa_zc matches against feature columns.
AA_NOSC_DF = pd.read_csv('aa_nosc.csv', index_col=0)
# 'NC' column: used as the per-residue carbon weight in calc_aa_zc.
AA_NC = AA_NOSC_DF['NC']
# 'NOSC' column: used as the per-residue carbon oxidation state in calc_aa_zc.
AA_ZC = AA_NOSC_DF['NOSC']
CANONICAL_AAS = AA_NOSC_DF.index.tolist()

def calc_aa_zc(aa1_features_df):
    """Compute the carbon-weighted mean oxidation state (ZC) for each row.

    For every row, the result is
    sum(count * ZC * NC) / sum(count * NC) over the amino acids that appear
    both in the input columns and in the reference table. Columns that are
    not in the reference table are ignored.

    Args:
        aa1_features_df (pd.DataFrame): Single-amino-acid counts, one column
            per amino acid.

    Returns:
        pd.Series: Carbon-weighted mean ZC per row, named 'cds_aa_zc' and
            sharing the index of the input.
    """
    # Sort so the column order is deterministic regardless of set ordering.
    shared_aas = sorted(set(CANONICAL_AAS) & set(aa1_features_df.columns))
    counts = aa1_features_df[shared_aas]

    carbons_per_aa = AA_NC[shared_aas]
    electrons_per_aa = AA_ZC[shared_aas] * carbons_per_aa

    total_carbons = counts @ carbons_per_aa
    total_electrons = counts @ electrons_per_aa
    return (total_electrons / total_carbons).rename('cds_aa_zc')

In [3]:
# Load single-nucleotide counts and derive GC content for both splits.
print("Loading nt data...")

feature_type = "nt_1mer"
training_data_nt = load_training_data(feature_type=feature_type)
validation_data_nt = load_validation_data(feature_type=feature_type)

print("Calculating GC content...")
train_gc = calc_gc(training_data_nt["features"])
test_gc = calc_gc(validation_data_nt["features"])

# Load single-amino-acid counts and derive the protein ZC for both splits.
print("Loading aa data...")
feature_type = 'aa_1mer'
training_data_aa = load_training_data(feature_type=feature_type)
validation_data_aa = load_validation_data(feature_type=feature_type)

print("Calculating protein ZC...")
train_aa_zc = calc_aa_zc(training_data_aa["features"])
test_aa_zc = calc_aa_zc(validation_data_aa["features"])

# aa features have a different index than nt features: the aa labels carry a
# leading "RS_". Remove exactly that prefix with an anchored replace.
# (str.strip("RS_") treats its argument as a character set and strips any run
# of 'R', 'S' or '_' from BOTH ends, so e.g. "RS_SRR123" would become "123".)
train_aa_zc.index = train_aa_zc.index.str.replace(r"^RS_", "", regex=True)
test_aa_zc.index = test_aa_zc.index.str.replace(r"^RS_", "", regex=True)

print("Merging features...")
# Inner join on the genome index: only genomes present in both tables remain.
training_features = pd.merge(train_gc, train_aa_zc, left_index=True, right_index=True)
validation_features = pd.merge(test_gc, test_aa_zc, left_index=True, right_index=True)

# An empty merge means the nt and aa indices did not line up at all.
if training_features['gc_content'].size == 0:
    print("No features found!")

# NOTE(review): the merged GC + ZC frames built in this cell are only used for
# the emptiness check; the model is trained on the protein ZC alone.
# Confirm whether GC content was meant to be included as a feature.
# These names alias the aa dicts, so the assignments below also replace
# training_data_aa["features"] / validation_data_aa["features"].
training_data = training_data_aa
validation_data = validation_data_aa
training_data["features"] = train_aa_zc
validation_data["features"] = test_aa_zc

cleaned_data = process_data(
    training_data["features"], training_data["labels"]["physiology"],
    validation_data["features"], validation_data["labels"]["physiology"])

Loading nt data...
Calculating GC content...
Loading aa data...
Calculating protein ZC...
Merging features...


In [4]:
# Fit the classifier on the training split.
print("Fitting model...")
model = LogisticClassifier(max_iter=100000, normalize=True, random_state=None)
model.fit(cleaned_data["X_train"], cleaned_data["y_train"])

# Scores on the data the model was fitted to.
accuracy = model.score(cleaned_data["X_train"], cleaned_data["y_train"])
balanced_accuracy = model.balanced_accuracy(cleaned_data["X_train"], cleaned_data["y_train"])

print(f"Accuracy: {accuracy!s}")
print(f"Balanced Accuracy: {balanced_accuracy!s}")

# Scores on the held-out split.
test_accuracy = model.score(cleaned_data["X_test"], cleaned_data["y_test"])
test_balanced_accuracy = model.balanced_accuracy(cleaned_data["X_test"], cleaned_data["y_test"])
print(f"Test Accuracy: {test_accuracy!s}")
print(f"Test Balanced Accuracy: {test_balanced_accuracy!s}")

# Training-set confusion matrix, labelled with the classifier's class names.
C = pd.DataFrame(
    model.confusion_matrix(cleaned_data["X_train"], cleaned_data["y_train"]),
    index=model.classifier.classes_,
    columns=model.classifier.classes_,
)
# Divide every row by its own total so each row sums to 1.
C.div(C.sum(axis=1), axis=0)


Fitting model...
Accuracy: 0.6247848537005164
Balanced Accuracy: 0.40396950578338586
Test Accuracy: 0.5073529411764706
Test Balanced Accuracy: 0.4044339498884953


Unnamed: 0,Aerobe,Anaerobe,Facultative
Aerobe,0.275,0.725,0.0
Anaerobe,0.063091,0.936909,0.0
Facultative,0.25,0.75,0.0
