In [74]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [75]:
def loadseq(file):
    """Read a text file and return its lines with surrounding whitespace removed.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read; it is opened in text mode.

    Returns
    -------
    list of str
        One entry per line, in file order. A blank line becomes ''.
    """
    # Use a separate name for the handle so the path argument is not shadowed.
    with open(file, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]


# Input files: one per transporter family, read one sequence per line.
glutfile, sgltfile, sweetfile = (
    'glutoutputnoheader.fasta',
    'newsgltoutputnoheader.fasta',
    'newsweetoutputnoheader.fasta',
)
# Families currently disabled:
# hxtfile = 'hxtoutputnoheader.fasta'
# chbfile = 'chboutputnoheader.fasta'

# Load the families in the same order as the file names above.
glut, sglt, sweet = (
    loadseq(path) for path in (glutfile, sgltfile, sweetfile)
)
# hxt = loadseq(hxtfile)
# chb = loadseq(chbfile)

# Optional sub-sampling, currently disabled:
# glut = glut[:25]
# hxt = hxt[:25]
# chb = chb[:5]

# Per-family sequence counts, reused when the train/test split is built.
glutlen, sgltlen, sweetlen = len(glut), len(sglt), len(sweet)
# hxtlen = len(hxt)
# chblen = len(chb)

glutlen, sgltlen, sweetlen

(396, 24, 675)

In [76]:
# One-letter codes of the amino acids counted by the composition features.
amino_acids = "ACDEFGHIKLMNPQRSTVWY"  # List of amino acids
# Alphabet size (20 for the string above).
n_amino_acids = len(amino_acids)  # Number of amino acids

In [77]:
def calculate_aac_features(sequences, amino_acids):
    """Compute amino-acid-composition (AAC) feature vectors.

    Each sequence becomes a vector with one entry per symbol in
    ``amino_acids``: the number of occurrences of that symbol divided by the
    full sequence length.

    Parameters
    ----------
    sequences : iterable of str
        Sequences to featurise.
    amino_acids : str or sequence of str
        Alphabet of single-character residue codes; position in the alphabet
        is the column index of the feature.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_sequences, len(amino_acids))``. An empty
        sequence yields an all-zero row, and an empty ``sequences`` input
        yields an array of shape ``(0, len(amino_acids))``.

    Notes
    -----
    Characters not in ``amino_acids`` are not counted but still contribute
    to the sequence length used for normalisation, so a row sums to less
    than 1 when such characters are present.
    """
    n_amino_acids = len(amino_acids)

    # Precompute residue -> column once instead of scanning the alphabet for
    # every residue. setdefault keeps the first position of a repeated
    # symbol, matching what ``amino_acids.index`` would return.
    column_of = {}
    for position, residue in enumerate(amino_acids):
        column_of.setdefault(residue, position)

    features = []
    for fasta_sequence in sequences:
        aac = np.zeros(n_amino_acids)
        for aa in fasta_sequence:
            index = column_of.get(aa)
            if index is not None:
                aac[index] += 1

        # Normalize the counts to relative frequencies. Skip empty sequences
        # (e.g. a blank line in the input file): dividing by zero would fill
        # the row with NaN and break downstream estimators.
        length = len(fasta_sequence)
        if length:
            aac /= length

        features.append(aac)

    if not features:
        # Keep the result 2-D so it can still be concatenated with others.
        return np.zeros((0, n_amino_acids))
    return np.array(features)

In [78]:
# Turn each family's sequences into an AAC feature matrix (one row per sequence).
glut_, sglt_, sweet_ = (
    calculate_aac_features(family, amino_acids)
    for family in (glut, sglt, sweet)
)
# hxt_ = calculate_aac_features(hxt,amino_acids)
# chb_ = calculate_aac_features(chb,amino_acids)

In [79]:
# Train/test split: for every family the LAST `n_test` rows (in file order,
# no shuffling) are held out for testing and the rest is used for training.
# The table below is the single source of truth for the hold-out sizes, so
# the feature slices and the label counts cannot drift apart.
# To add a family, append a (label, feature_matrix, n_test) entry.
family_splits = [
    ("GLUT", glut_, 20),
    ("SGLT", sglt_, 5),
    ("SWEET", sweet_, 20),
]

train_parts, test_parts = [], []
train_labels, test_labels = [], []
for family_label, family_features, n_test in family_splits:
    # A hold-out at least as large as the family would make the slices below
    # wrap around or leave no training data.
    assert 0 < n_test < len(family_features), (
        "hold-out size %d is invalid for %s (%d sequences)"
        % (n_test, family_label, len(family_features))
    )
    split_at = len(family_features) - n_test
    train_parts.append(family_features[:split_at])
    test_parts.append(family_features[split_at:])
    # Derive the labels from the actual split sizes.
    train_labels.extend([family_label] * split_at)
    test_labels.extend([family_label] * n_test)

x_train = np.concatenate(train_parts, axis=0)
x_test = np.concatenate(test_parts, axis=0)

# Create the corresponding labels
y_train = np.array(train_labels)
y_test = np.array(test_labels)

# Features and labels must line up row for row.
assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)

x_train.shape

(1050, 20)

In [80]:
# Random forest baseline with default hyper-parameters.
# random_state is fixed because the forest is built with random bootstrap
# samples and feature subsets; without a seed the accuracies printed below
# change on every run and cannot be reproduced.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(x_train, y_train)

# Accuracy on the data the model was fitted on (optimistic by construction).
y_pred_train = classifier.predict(x_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)

# Accuracy on the held-out sequences.
y_pred_test = classifier.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Accuracy:", test_accuracy)

# Show expected vs. predicted test labels.
y_test, y_pred_test

Training Accuracy: 0.9980952380952381
Testing Accuracy: 0.8


(array(['GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'SGLT', 'SGLT', 'SGLT', 'SGLT',
        'SGLT', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET'],
       dtype='<U5'),
 array(['GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'SWEET',
        'GLUT', 'SWEET', 'SWEET', 'GLUT', 'SGLT', 'SWEET', 'GLUT', 'GLUT',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'GLUT', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'GLUT', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET'],
       dtype='<U5'))

In [81]:
from sklearn import svm

# Support-vector classifier with scikit-learn's default settings.
clf = svm.SVC().fit(x_train, y_train)

# Predict on both splits first, then score and report them.
y_pred_train = clf.predict(x_train)
y_pred_test = clf.predict(x_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Show expected vs. predicted test labels.
y_test, y_pred_test

Training Accuracy: 0.8961904761904762
Testing Accuracy: 0.7777777777777778


(array(['GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'SGLT', 'SGLT', 'SGLT', 'SGLT',
        'SGLT', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET'],
       dtype='<U5'),
 array(['SWEET', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'SWEET',
        'GLUT', 'SWEET', 'SWEET', 'GLUT', 'GLUT', 'SWEET', 'GLUT', 'GLUT',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'GLUT', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET'],
       dtype='<U5'))

In [82]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 4 closest training samples.
knn = KNeighborsClassifier(n_neighbors=4).fit(x_train, y_train)

# Predict on both splits first, then score and report them.
y_pred_train = knn.predict(x_train)
y_pred_test = knn.predict(x_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)

# Show expected vs. predicted test labels.
y_test, y_pred_test

Training Accuracy: 0.9180952380952381
Testing Accuracy: 0.8444444444444444


(array(['GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'SGLT', 'SGLT', 'SGLT', 'SGLT',
        'SGLT', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET'],
       dtype='<U5'),
 array(['GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'SWEET', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'SWEET', 'SWEET', 'GLUT', 'SGLT', 'SWEET', 'SGLT', 'SGLT',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'GLUT', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SGLT', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET'],
       dtype='<U5'))

In [85]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# The XGBoost model is trained on integer class ids. The encoded labels live
# in their own variables so y_train / y_test keep their string labels: the
# cell can then be re-run, and the other classifier cells still see the
# original labels.
encoder = LabelEncoder()
y_train_enc = encoder.fit_transform(y_train)
# Reuse the mapping learned on the training labels. Refitting on the test
# labels would give different ids whenever the test set lacks a class.
y_test_enc = encoder.transform(y_test)

# num_class is the number of distinct classes, not the number of samples.
model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(encoder.classes_))
model.fit(x_train, y_train_enc)

# Decode predictions back to string labels before scoring and display.
y_pred_train = encoder.inverse_transform(model.predict(x_train))
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)

y_pred_test = encoder.inverse_transform(model.predict(x_test))
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Accuracy:", test_accuracy)

# Show expected vs. predicted test labels.
y_test, y_pred_test

Training Accuracy: 0.9980952380952381
Testing Accuracy: 0.8444444444444444


(array(['GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'SGLT', 'SGLT', 'SGLT', 'SGLT',
        'SGLT', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET'],
       dtype='<U5'),
 array(['SWEET', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT',
        'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'GLUT', 'SWEET',
        'GLUT', 'SWEET', 'SWEET', 'GLUT', 'SGLT', 'SWEET', 'SGLT', 'SGLT',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'GLUT', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET',
        'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET', 'SWEET'],
       dtype='<U5'))