In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler



In [22]:
def loadseq(file):
    lines = []
    with open(file, 'r') as file:
        for line in file:
            lines.append(line.strip())
    return lines


glutfile = 'glutoutputnoheader.fasta'
sgltfile = 'sgltoutputnoheader.fasta'
hxtfile = 'hxtoutputnoheader.fasta'
chbfile = 'chboutputnoheader.fasta'


glut = loadseq(glutfile)
sglt = loadseq(sgltfile)
hxt = loadseq(hxtfile)
chb = loadseq(chbfile)

glut = glut[:25]
hxt = hxt[:25]
# chb = chb[:5]

glutlen = len(glut)
sgltlen = len(sglt)
hxtlen = len(hxt)
chblen = len(chb)

len(hxt),len(glut),len(sglt),len(chb)

(25, 25, 5, 25)

In [23]:
amino_acids = "ACDEFGHIKLMNPQRSTVWY"  # List of amino acids
n_amino_acids = len(amino_acids)  # Number of amino acids

In [24]:
def calculate_aac_features(sequences, amino_acids):
    n_amino_acids = len(amino_acids)
    features = []
    for fasta_sequence in sequences:
        aac = np.zeros(n_amino_acids)
        for aa in fasta_sequence:
            if aa in amino_acids:
                index = amino_acids.index(aa)
                aac[index] += 1

        # Normalize the amino acid composition to obtain relative frequencies
        aac /= len(fasta_sequence)

        # Append the AAC feature vector to the feature list
        features.append(aac)

    return np.array(features)

In [25]:
hxt_ = calculate_aac_features(hxt,amino_acids)
glut_ = calculate_aac_features(glut,amino_acids)
sglt_ = calculate_aac_features(sglt,amino_acids)
chb_ = calculate_aac_features(chb,amino_acids)

In [26]:
x_train = np.concatenate((glut_[:glutlen-1], sglt_[:sgltlen-1], hxt_[:hxtlen-1], chb_[:chblen-1]), axis=0)
x_test = np.concatenate((glut_[glutlen-1:], sglt_[sgltlen-1:], hxt_[hxtlen-1:], chb_[chblen-1:]), axis=0)

# Create the corresponding labels
y_train = np.array(["GLUT"] * (glutlen-1) + ["SGLT"] * (sgltlen-1) + ["HXT"] * (hxtlen-1) + ["CHB"] * (chblen-1))
y_test = np.array(["GLUT"] * 1 + ["SGLT"] * 1 + ["HXT"] * 1 + ["CHB"] * 1)


In [27]:
classifier = RandomForestClassifier()
classifier.fit(x_train, y_train)

RandomForestClassifier()

In [28]:
y_pred_train = classifier.predict(x_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)

y_pred_test = classifier.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Accuracy:", test_accuracy)

y_test, y_pred_test

Training Accuracy: 1.0
Testing Accuracy: 0.25


(array(['GLUT', 'SGLT', 'HXT', 'CHB'], dtype='<U4'),
 array(['CHB', 'GLUT', 'GLUT', 'CHB'], dtype='<U4'))

In [29]:
from sklearn import svm

clf = svm.SVC()
clf.fit(x_train, y_train)

y_pred_train = clf.predict(x_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)

y_pred_test = clf.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Accuracy:", test_accuracy)

y_test, y_pred_test

Training Accuracy: 0.7894736842105263
Testing Accuracy: 0.25


(array(['GLUT', 'SGLT', 'HXT', 'CHB'], dtype='<U4'),
 array(['CHB', 'GLUT', 'GLUT', 'CHB'], dtype='<U4'))

In [30]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(x_train, y_train)

y_pred_train = knn.predict(x_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)

y_pred_test = knn.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Accuracy:", test_accuracy)

y_test, y_pred_test

Training Accuracy: 0.7631578947368421
Testing Accuracy: 0.25


(array(['GLUT', 'SGLT', 'HXT', 'CHB'], dtype='<U4'),
 array(['GLUT', 'GLUT', 'GLUT', 'SGLT'], dtype='<U4'))

In [31]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)
model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(y_train))
model.fit(x_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_class=76,
              num_parallel_tree=None, objective='multi:softmax', ...)

In [32]:
y_pred_train = model.predict(x_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)

y_pred_test = model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Accuracy:", test_accuracy)

y_test = encoder.inverse_transform(y_test)
y_pred_test = encoder.inverse_transform(y_pred_test)
y_test, y_pred_test

Training Accuracy: 1.0
Testing Accuracy: 0.75


(array(['GLUT', 'SGLT', 'HXT', 'CHB'], dtype='<U4'),
 array(['CHB', 'SGLT', 'HXT', 'CHB'], dtype='<U4'))