In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
np.random.seed(42)

# Pre-processing data

## Read data

In [None]:
#load data
def read_data(file_path):
    """Load a whitespace-delimited text file as a NumPy array of strings.

    Args:
        file_path: path of the file to read.

    Returns:
        The array produced by ``np.genfromtxt`` with every field kept as text.
    """
    return np.genfromtxt(file_path, dtype='str')

## Label transform

In [None]:
# Label transform: -1 -> 0, 1 -> 1
def transform_labels(labels):
    """Convert labels to integers and map -1 to 0 and 1 to 1.

    Args:
        labels: sequence of labels convertible to ``int`` (e.g. '-1' / '1').

    Returns:
        Integer NumPy array holding ``(label + 1) // 2`` for each label.
    """
    shifted = np.array(labels, dtype=int) + 1  # -1 -> 0, 1 -> 2
    return shifted // 2

## Extracting k-mer features

In [None]:
# Extract features for the labelled data (later split into train/val) and the test data.
# Features are k-mer counts: one column per distinct k-mer seen in the labelled data.
def k_mer(train_peptides, test_peptides, k=3):
    """Encode peptide sequences as k-mer count vectors.

    The vocabulary is the sorted set of distinct k-mers found in
    ``train_peptides``. Both inputs are encoded against that same vocabulary,
    so k-mers that occur only in ``test_peptides`` are ignored.

    Args:
        train_peptides: sequence of peptide strings; defines the vocabulary.
        test_peptides: sequence of peptide strings encoded with that vocabulary.
        k: length of each k-mer (default 3).

    Returns:
        Tuple ``(train_kmer, test_kmer)`` of float arrays with shape
        ``(number of sequences, vocabulary size)``; entry ``[i, j]`` is the
        number of times vocabulary k-mer ``j`` occurs in sequence ``i``.
    """
    def generate_kmers(peptides, k):
        # All overlapping windows of length k; empty if the sequence is shorter than k.
        return [peptides[i:i + k] for i in range(len(peptides) - k + 1)]

    def create_vocabulary(data, k):
        kmers = set()
        for sequence in data:
            kmers.update(generate_kmers(sequence, k))
        return sorted(kmers)

    def kmer_encoded_peptide(data, vocabulary, k):
        # Build the k-mer -> column map once so each lookup is a constant-time
        # dict access instead of a linear scan of the vocabulary list.
        column_of = {kmer: col for col, kmer in enumerate(vocabulary)}
        kmer_rep = np.zeros((len(data), len(vocabulary)))
        for i, sequence in enumerate(data):
            for kmer in generate_kmers(sequence, k):
                col = column_of.get(kmer)
                if col is not None:  # skip k-mers outside the vocabulary
                    kmer_rep[i, col] += 1
        return kmer_rep

    # Create k-mer vocabulary from the training set
    kmer_vocabulary = create_vocabulary(train_peptides, k)

    # Generate k-mer representation for the training and test sets
    train_kmer = kmer_encoded_peptide(train_peptides, kmer_vocabulary, k)
    test_kmer = kmer_encoded_peptide(test_peptides, kmer_vocabulary, k)

    return train_kmer, test_kmer


## Get the final train data, val data and test data

In [None]:
# Read the labelled training file and the test file.
train = read_data('train.dat')
test = read_data('test.dat')
# Column 0 of the training array is the label, column 1 the peptide sequence.
y, X = train[:, 0], train[:, 1]
print(X.shape, y.shape)

(1566,) (1566,)


In [None]:
# Map the raw labels (-1 / 1) to class indices (0 / 1) for all labelled data,
# i.e. before it is split into training and validation sets.
y_labels = transform_labels(y)

In [None]:
# Sanity check: sequences and transformed labels should have the same length.
print(X.shape, y_labels.shape)

(1566,) (1566,)


In [None]:
# Encode the labelled peptides and the test peptides as k-mer count vectors
# (default k=3); the vocabulary is built from the labelled peptides only.
train_kmer, test_kmer = k_mer(X , test)

In [None]:
# Both feature matrices share the same number of columns (the vocabulary size).
print(train_kmer.shape, y_labels.shape, test_kmer.shape)

(1566, 6837) (1566,) (392, 6837)


In [None]:
# Split the labelled data into training and validation sets
from sklearn.model_selection import train_test_split
# Hold out 20% for validation; random_state fixes the shuffle.
# NOTE(review): the split is not stratified; since SMOTE is applied afterwards to
# balance the classes, consider stratify=y_labels here (confirm before changing).
X_train, X_val, y_train, y_val  = train_test_split(train_kmer, y_labels, test_size=0.2, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(1252, 6837) (314, 6837) (1252,) (314,)


In [None]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE on the training split only; the validation split is
# passed to the model unchanged, so it keeps its original class distribution.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print(X_resampled.shape, y_resampled.shape)

(2264, 6837) (2264,)


In [None]:
# Shapes of the arrays used for model training (resampled) and validation.
print(X_resampled.shape, y_resampled.shape,X_val.shape, y_val.shape)


(2264, 6837) (2264,) (314, 6837) (314,)


# Model

In [None]:
# Build, compile and train a fully connected classifier on the SMOTE-resampled
# k-mer count features, validating on the untouched validation split.
tf.keras.utils.set_random_seed(42)  # seed the RNGs so runs are repeatable

model = Sequential()
model.add(Dense(256, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(2, activation='softmax'))  # Output layer: one probability per class (0 and 1)

# Compile the model.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
# is rejected (or only accepted with a warning) by newer Keras releases.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
model.fit(X_resampled, y_resampled, validation_data=(X_val, y_val), epochs=30, batch_size=16)




Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7f92446a11d0>

In [None]:

# Take the most probable class for each test sample, then map class 0 back to
# the original -1 label (class 1 stays 1).
pred = np.argmax(model.predict(test_kmer), axis=1)
pred = np.where(pred == 0, -1, pred)



In [None]:
# Show the predicted labels (-1 / 1) for the test peptides.
print(pred)

[-1 -1  1 -1 -1 -1  1 -1 -1  1 -1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1  1
 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1  1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
  1 -1 -1 -1  1 -1  1 -1 -1 -1 -1 -1 -1  1 -1  1 -1 -1 -1 -1  1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1

In [None]:
# Write one predicted label per line. The context manager closes (and flushes)
# the file even if a write raises, which the manual open()/close() pair did not.
with open('results_dp_kmer.txt', 'w') as f:
    for i in pred:
        f.write(str(i)+'\n')