In [None]:
import numpy as np

# Pre-processing data

## read data

In [None]:
def read_data(file_path):
    """Read a whitespace-delimited text file into a NumPy array of strings.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    numpy.ndarray
        Whatever ``np.genfromtxt`` parses from the file, with every field
        kept as a string (no numeric conversion).
    """
    return np.genfromtxt(file_path, dtype='str')

## split data

In [None]:
def train_test_split(X, y, test_ratio, random_seed=None):
    """Randomly partition ``X`` and ``y`` into a training part and a held-out part.

    Parameters
    ----------
    X, y : numpy.ndarray
        Samples and their labels; both are indexed with the same shuffled
        integer indices, so they must support NumPy fancy indexing.
    test_ratio : float
        Fraction of the samples placed in the held-out part
        (``int(test_ratio * len(X))`` samples, i.e. rounded towards zero).
    random_seed : int, optional
        When given, NumPy's global random generator is seeded with it
        before shuffling.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    total = len(X)
    order = np.random.permutation(total)

    # The first `cut` shuffled positions are held out, the rest are kept.
    cut = int(test_ratio * total)
    held_out, kept = order[:cut], order[cut:]

    return X[kept], X[held_out], y[kept], y[held_out]

## Oversampling (random duplication of the minority class, not SMOTE)

In [None]:
def oversample_data(X, y):
    """Balance a dataset by duplicating samples of the least frequent class.

    Existing minority-class samples are repeated (no synthetic samples are
    generated) so that the minority class approaches the size of the
    largest class, then the whole dataset is shuffled.

    Parameters
    ----------
    X : array-like
        Samples, indexable along the first axis.
    y : array-like
        Class label of each sample, same length as ``X``.

    Returns
    -------
    tuple
        ``(oversampled_X, oversampled_y)``: the original samples plus
        ``floor(largest_count / smallest_count) - 1`` extra copies of every
        minority sample, in shuffled order. With fewer than two classes, or
        when the classes are already balanced, nothing is duplicated and the
        data is only shuffled.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    unique_classes, class_counts = np.unique(y, return_counts=True)

    # No duplicates by default (single class, empty input, balanced data).
    oversampled_indices = np.empty(0, dtype=int)

    if class_counts.size >= 2:
        minority_class = unique_classes[np.argmin(class_counts)]
        minority_indices = np.where(y == minority_class)[0]

        # Use the real largest/smallest counts. Relying on the sort order of
        # the labels yields a negative factor when the first label happens
        # to be the minority class.
        oversample_factor = int(class_counts.max() // class_counts.min()) - 1
        oversampled_indices = np.tile(minority_indices, oversample_factor)

    oversampled_X = np.concatenate([X, X[oversampled_indices]])
    oversampled_y = np.concatenate([y, y[oversampled_indices]])

    # Shuffle so the duplicated samples are not clustered at the end.
    shuffle_indices = np.random.permutation(len(oversampled_X))
    return oversampled_X[shuffle_indices], oversampled_y[shuffle_indices]


## Label transform

In [None]:
def transform_labels(labels):
    """Convert -1/1 class labels to 0/1.

    The labels are cast to integers, shifted up by one and floor-divided by
    two, so -1 becomes 0 and 1 becomes 1.

    Parameters
    ----------
    labels : array-like
        Labels convertible to ``int`` (numbers or numeric strings).

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``labels``.
    """
    as_int = np.array(labels, dtype=int)
    shifted = as_int + 1
    return shifted // 2

## Extracting features using K-mer

In [None]:
def k_mer(train_peptides, val_peptides, test_peptides, k=3):
    """Encode sequences as k-mer count vectors.

    The vocabulary is built from ``train_peptides`` only. K-mers that occur
    in the validation or test sequences but not in the training sequences
    are ignored, so all three matrices share the same columns.

    Parameters
    ----------
    train_peptides, val_peptides, test_peptides : iterable of str
        Sequences for each split (each must support ``len``).
    k : int, default 3
        Length of the substrings to count.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_kmer, val_kmer, test_kmer)``; each has shape
        ``(n_sequences, vocabulary_size)`` and holds, per sequence, how many
        times every vocabulary k-mer occurs. Columns follow the sorted
        vocabulary. Sequences shorter than ``k`` give an all-zero row.
    """
    def generate_kmers(sequence, k):
        # Every contiguous window of length k, in order of appearance.
        return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]

    def create_vocabulary(data, k):
        kmers = set()
        for sequence in data:
            kmers.update(generate_kmers(sequence, k))
        return sorted(kmers)

    def kmer_encoded_peptide(data, kmer_to_column, k):
        kmer_rep = np.zeros((len(data), len(kmer_to_column)))
        for row, sequence in enumerate(data):
            for kmer in generate_kmers(sequence, k):
                # Dict lookup replaces a linear `in` + `.index()` scan of the
                # vocabulary list for every k-mer; the counts are unchanged.
                column = kmer_to_column.get(kmer)
                if column is not None:
                    kmer_rep[row, column] += 1
        return kmer_rep

    # Create the k-mer vocabulary from the training set only.
    kmer_vocabulary = create_vocabulary(train_peptides, k)
    kmer_to_column = {kmer: column for column, kmer in enumerate(kmer_vocabulary)}

    # Encode the training, validation and test sets with that vocabulary.
    train_kmer = kmer_encoded_peptide(train_peptides, kmer_to_column, k)
    val_kmer = kmer_encoded_peptide(val_peptides, kmer_to_column, k)
    test_kmer = kmer_encoded_peptide(test_peptides, kmer_to_column, k)

    return train_kmer, val_kmer, test_kmer


## Get the final train data, val data and test data

In [None]:
# Load train.dat and test.dat as arrays of strings.
train = read_data('train.dat')
test = read_data('test.dat')
# Column 1 of the training file is used as the model input and column 0 as
# the label (presumably the peptide sequence and a -1/1 class label --
# confirm against the data files).
X = train[:, 1]
y = train[:, 0]

In [None]:
# Split the labelled data into a training part and a validation part.
# A fixed seed makes the split repeatable on Restart & Run All. Because
# train_test_split seeds NumPy's global generator, the later shuffles and the
# random weight initialisation become repeatable as well.
RANDOM_SEED = 42
test_ratio = 0.2
X_train, X_val, y_train, y_val = train_test_split(X, y, test_ratio, random_seed=RANDOM_SEED)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(1253,) (313,) (1253,) (313,)


In [None]:
# Oversample the training split only; the validation split is not passed
# to oversample_data and keeps its original class distribution.
oversampled_X, oversampled_y = oversample_data(X_train, y_train)

In [None]:
# Convert the training and validation labels from -1/1 to 0/1
# (the output layer is a sigmoid, so targets must be 0 or 1).
oversampled_y_labels = transform_labels(oversampled_y)
y_val_labels = transform_labels(y_val)

In [None]:
# Extract k-mer count features for the training, validation and test data.
# The vocabulary comes from the oversampled training sequences; k uses the
# default of k_mer. (A commented-out bag_of_words call used to sit here.)
X_train_kmer, X_val_kmer, X_test_kmer = k_mer(oversampled_X, X_val, test)

In [None]:
# Sanity check: print the shapes of the feature matrices and label vectors.
# All three feature matrices should have the same number of columns.
print(X_train_kmer.shape, X_val_kmer.shape, oversampled_y_labels.shape,y_val_labels.shape ,X_test_kmer.shape)

(2181, 6517) (313, 6517) (2181,) (313,) (392, 6517)


In [None]:
# Transpose the training features so that each column is one sample:
# (n_samples, n_features) -> (n_features, n_samples).
X_train_features = X_train_kmer.T
# Add a leading axis so the labels form a row vector:
# (n_samples,) -> (1, n_samples).
y_train_labels = oversampled_y_labels[np.newaxis]

In [None]:
# Confirm the network inputs are (n_features, n_samples) and (1, n_samples).
print(X_train_features.shape, y_train_labels.shape)

(6517, 2181) (1, 2181)


# Model
One input layer → one hidden layer → one output layer.

# Activation Functions

In [None]:
def tanh(x):
    """Hyperbolic-tangent activation, applied element-wise to ``x``."""
    activated = np.tanh(x)
    return activated

def relu(x):
    """Rectified linear activation: element-wise maximum of ``x`` and 0."""
    floor = 0
    return np.maximum(x, floor)

def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, applied element-wise.

    For large negative ``x`` the intermediate ``exp(-x)`` overflows to
    ``inf``. The final value ``1 / (1 + inf) == 0`` is still correct, so the
    overflow warning is suppressed here instead of being emitted on every
    saturated activation.
    """
    with np.errstate(over="ignore"):
        return 1/(1+np.exp(-x))

In [None]:
def derivative_tanh(x):
    """Derivative of tanh evaluated at ``x``: ``1 - tanh(x)**2``."""
    tanh_squared = np.power(np.tanh(x), 2)
    return 1 - tanh_squared

def derivative_relu(x):
    """Derivative of ReLU: 1.0 where ``x`` is positive, 0.0 elsewhere.

    The result is a ``float32`` array.
    """
    is_positive = x > 0
    return np.asarray(is_positive, dtype=np.float32)

# Initialize Parameters Randomly

$ W_1 = \text{np.random.randn}(n_1, n_0) $

$ b_1 = \text{np.zeros}((n_1, 1)) $

$ W_2 = \text{np.random.randn}(n_2, n_1) $

$ b_2 = \text{np.zeros}((n_2, 1)) $

In [None]:
def initialize_parameters(n_x, n_h, n_y):
    """Create the initial weights and biases of the two-layer network.

    Weights are drawn from a standard normal distribution and scaled by
    0.1; biases start at zero.

    Parameters
    ----------
    n_x : int
        Number of input features.
    n_h : int
        Number of hidden units.
    n_y : int
        Number of output units.

    Returns
    -------
    dict
        ``"w1"`` of shape ``(n_h, n_x)``, ``"b1"`` of shape ``(n_h, 1)``,
        ``"w2"`` of shape ``(n_y, n_h)`` and ``"b2"`` of shape ``(n_y, 1)``.
    """
    scale = 0.1

    # Draw the hidden-layer weights before the output-layer weights so the
    # random stream is consumed in a fixed order.
    hidden_weights = np.random.randn(n_h, n_x)*scale
    output_weights = np.random.randn(n_y, n_h)*scale

    return {
        "w1" : hidden_weights,
        "b1" : np.zeros((n_h, 1)),
        "w2" : output_weights,
        "b2" : np.zeros((n_y, 1)),
    }

# Forward Propagation

$ Z_1 = W_1 * X + B_1 $

$ A_1 = f ( Z_1 ) $  

$ Z_2 = W_2 * A_1 + B_2 $

$ A_2 = sigmoid( Z_2 ) $





In [None]:
def forward_propagation(x, parameters):
    """Run one forward pass through the network.

    The hidden layer uses ReLU and the output layer uses a sigmoid.

    Parameters
    ----------
    x : numpy.ndarray
        Input matrix that is left-multiplied by ``w1`` (one sample per column).
    parameters : dict
        Must contain ``"w1"``, ``"b1"``, ``"w2"`` and ``"b2"``.

    Returns
    -------
    dict
        Pre-activations ``"z1"``/``"z2"`` and activations ``"a1"``/``"a2"``;
        ``"a2"`` is the network output.
    """
    w1, b1, w2, b2 = (parameters[key] for key in ('w1', 'b1', 'w2', 'b2'))

    # Hidden layer
    hidden_pre = np.dot(w1, x) + b1
    hidden_act = relu(hidden_pre)

    # Output layer
    output_pre = np.dot(w2, hidden_act) + b2
    output_act = sigmoid(output_pre)

    return {
        "z1" : hidden_pre,
        "a1" : hidden_act,
        "z2" : output_pre,
        "a2" : output_act,
    }

## Cost Function

$ Cost = - \frac{1}{m} \sum_{i=1}^{m} [ y_i \log(a_i) + (1-y_i) \log(1 - a_i) ] $

In [None]:
def cost_function(a2, y, eps=1e-15):
    """Mean binary cross-entropy between predictions and targets.

    Parameters
    ----------
    a2 : numpy.ndarray
        Predicted probabilities, shape ``(1, m)``.
    y : numpy.ndarray
        Targets (0 or 1), shape ``(1, m)``.
    eps : float, default 1e-15
        Predictions are clipped to ``[eps, 1 - eps]`` before taking
        logarithms. Without this, a saturated sigmoid output of exactly 0 or
        1 produces ``log(0)`` and turns the cost into ``inf`` or ``nan``.

    Returns
    -------
    numpy.ndarray
        The cost as a 0-dimensional array.
    """
    m = y.shape[1]

    a2 = np.clip(a2, eps, 1 - eps)

    cost = (1./m) * (-np.dot(y,np.log(a2).T) - np.dot(1-y, np.log(1-a2).T))
    cost = np.squeeze(cost)

    return cost

# Backward Propagation
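$ dZ_2 = A_2 - Y $

$ dW_2 = \frac{1}{m} \, dZ_2 \cdot A_1^T $

$ dB_2 = \frac{1}{m} \, sum(dZ_2, 1) $

$ dZ_1 = W_2^T \cdot dZ_2 * f'(Z_1) $

$ dW_1 = \frac{1}{m} \, dZ_1 \cdot X^T $

$ dB_1 = \frac{1}{m} \, sum(dZ_1, 1) $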

In [None]:
def backward_prop(x, y, parameters, forward_cache):
    """Compute the cross-entropy gradients for all weights and biases.

    Parameters
    ----------
    x : numpy.ndarray
        Network input, one sample per column (``m = x.shape[1]`` samples).
    y : numpy.ndarray
        Targets, broadcastable against the network output ``a2``.
    parameters : dict
        Current parameters; only ``"w2"`` is needed here.
    forward_cache : dict
        Result of the forward pass; ``"a1"`` and ``"a2"`` are used.

    Returns
    -------
    dict
        ``"dw1"``, ``"db1"``, ``"dw2"`` and ``"db2"``, each averaged over the
        ``m`` samples.
    """
    w2 = parameters['w2']

    a1 = forward_cache['a1']
    a2 = forward_cache['a2']

    m = x.shape[1]

    # Output layer (sigmoid + cross-entropy): the per-sample error is a2 - y.
    dz2 = (a2 - y)
    dw2 = (1/m)*np.dot(dz2, a1.T)
    db2 = (1/m)*np.sum(dz2, axis = 1, keepdims = True)

    # Hidden layer. The 1/m averaging is applied once, in dw1/db1 below.
    # Scaling dz1 by 1/m as well would shrink the first-layer gradients by
    # an extra factor of m. a1 > 0 exactly where z1 > 0, so the ReLU
    # derivative can be taken from the activation.
    dz1 = np.dot(w2.T, dz2)*derivative_relu(a1)
    dw1 = (1/m)*np.dot(dz1, x.T)
    db1 = (1/m)*np.sum(dz1, axis = 1, keepdims = True)

    gradients = {
        "dw1" : dw1,
        "db1" : db1,
        "dw2" : dw2,
        "db2" : db2
    }

    return gradients

# Updating Parameters

$ W_2 = W_2 -  \alpha * \frac{\partial Cost }{\partial W_2}$

$ B_2 = B_2 -  \alpha * \frac{\partial Cost }{\partial B_2}$

$ W_1 = W_1 -  \alpha * \frac{\partial Cost }{\partial W_1}$

$ B_1 = B_1 -  \alpha * \frac{\partial Cost }{\partial B_1}$


In [None]:
def update_parameters(parameters, gradients, learning_rate):
    """Apply one gradient-descent step and return the new parameters.

    Each parameter ``p`` becomes ``p - learning_rate * dp``, where ``dp`` is
    the matching entry of ``gradients``. The input dictionaries are not
    modified.

    Parameters
    ----------
    parameters : dict
        Current ``"w1"``, ``"b1"``, ``"w2"`` and ``"b2"``.
    gradients : dict
        Matching ``"dw1"``, ``"db1"``, ``"dw2"`` and ``"db2"``.
    learning_rate : float
        Step size.

    Returns
    -------
    dict
        New dictionary with the updated ``"w1"``, ``"b1"``, ``"w2"``, ``"b2"``.
    """
    current = [parameters[name] for name in ('w1', 'b1', 'w2', 'b2')]
    slopes = [gradients[name] for name in ('dw1', 'db1', 'dw2', 'db2')]

    stepped = [value - learning_rate*slope for value, slope in zip(current, slopes)]

    return {
        "w1" : stepped[0],
        "b1" : stepped[1],
        "w2" : stepped[2],
        "b2" : stepped[3],
    }

# Complete Model

In [None]:
def model(x, y, n_h, learning_rate, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Parameters
    ----------
    x : numpy.ndarray
        Training inputs; ``x.shape[0]`` is the number of input features.
    y : numpy.ndarray
        Training targets; ``y.shape[0]`` is the number of output units.
    n_h : int
        Number of hidden units.
    learning_rate : float
        Gradient-descent step size.
    iterations : int
        Number of parameter updates to perform.

    Returns
    -------
    tuple
        ``(parameters, cost_list)``: the trained parameters and the cost
        recorded at each iteration (measured before that iteration's update).
    """
    parameters = initialize_parameters(x.shape[0], n_h, y.shape[0])
    cost_list = []

    # Progress is printed whenever the step index is a multiple of this.
    report_every = iterations/10

    for step in range(iterations):
        cache = forward_propagation(x, parameters)
        cost = cost_function(cache['a2'], y)
        cost_list.append(cost)

        grads = backward_prop(x, y, parameters, cache)
        parameters = update_parameters(parameters, grads, learning_rate)

        if step % report_every == 0:
            print("Cost after", step, "iterations is :", cost)

    return parameters, cost_list

In [None]:
# Training hyperparameters.
iterations = 100       # number of full-batch gradient-descent updates
n_h = 1000             # number of hidden units
learning_rate = 0.5    # gradient-descent step size
# Train on the oversampled training features; Parameters holds the learned
# weights and Cost_list the cost recorded at every iteration.
Parameters, Cost_list = model(X_train_features, y_train_labels, n_h = n_h, learning_rate = learning_rate, iterations = iterations)

Cost after 0 iterations is : 0.799879512413404
Cost after 10 iterations is : 0.9469356824635891
Cost after 20 iterations is : 0.3407856158093043
Cost after 30 iterations is : 0.22240137461748474
Cost after 40 iterations is : 0.19080481293154122
Cost after 50 iterations is : 0.16856141006344774
Cost after 60 iterations is : 0.15166894249461255
Cost after 70 iterations is : 0.1382862128896168
Cost after 80 iterations is : 0.12735361599865988
Cost after 90 iterations is : 0.11821309978324872


In [None]:
def calculate_mcc(predicted_labels, true_labels):
    """Matthews correlation coefficient for binary 0/1 labels.

    Inputs of any shape are accepted and flattened, so ``(1, m)`` row
    vectors work as well as flat lists. Pairs in which either label is
    neither 0 nor 1 are not counted.

    Parameters
    ----------
    predicted_labels, true_labels : array-like
        Predicted and reference labels with the same number of elements.

    Returns
    -------
    float
        MCC in ``[-1, 1]``, or 0 when any marginal count is zero (the
        coefficient is undefined in that case).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    predicted = np.ravel(predicted_labels)
    actual = np.ravel(true_labels)

    if predicted.size != actual.size:
        raise ValueError("Lengths of predicted_labels and true_labels must be the same.")

    # Convert to Python ints so the four-way product below cannot overflow.
    TP = int(np.sum((predicted == 1) & (actual == 1)))
    TN = int(np.sum((predicted == 0) & (actual == 0)))
    FP = int(np.sum((predicted == 1) & (actual == 0)))
    FN = int(np.sum((predicted == 0) & (actual == 1)))

    numerator = TP * TN - FP * FN
    denominator = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)

    if denominator == 0:
        return 0  # Handle division by zero
    return numerator / (denominator ** 0.5)

In [None]:
def accuracy(X, y, parameters):
    """Percentage of samples classified correctly at a 0.5 threshold.

    Parameters
    ----------
    X : numpy.ndarray
        Inputs in the layout expected by ``forward_propagation``
        (one sample per column).
    y : numpy.ndarray
        True 0/1 labels, one per sample, in any shape with the same number
        of elements as the network output (e.g. ``(m,)``, ``(1, m)`` or
        ``(m, 1)``).
    parameters : dict
        Network parameters passed to ``forward_propagation``.

    Returns
    -------
    float
        Accuracy in percent (0-100).
    """
    forward_cache = forward_propagation(X, parameters)
    a_out = forward_cache['a2']
    y_pred = np.array(a_out > 0.5, dtype = 'float')

    # Align the labels with the prediction shape explicitly. Relying on
    # broadcasting would silently compare every prediction with every label
    # if y were a column vector.
    y_true = np.reshape(y, y_pred.shape)

    acc = np.mean(y_pred == y_true)*100
    return acc

In [None]:
# Report accuracy on the (oversampled) training data and on the validation
# data. The validation features are transposed to the (n_features, n_samples)
# layout that the network expects.
print("Accuracy of Train Dataset:", accuracy(X_train_features, y_train_labels, Parameters))
print("Accuracy of Val Dataset:", accuracy(X_val_kmer.T, y_val_labels, Parameters))

Accuracy of Train Dataset: 99.40394314534618
Accuracy of Val Dataset: 97.76357827476039


In [None]:
def predict(X, parameters=None):
    """Predict -1/1 class labels for a feature matrix.

    Parameters
    ----------
    X : numpy.ndarray
        Features with one sample per row; the matrix is transposed before
        being fed to the network.
    parameters : dict, optional
        Network parameters to use. Defaults to the notebook-level
        ``Parameters`` produced by training, so existing ``predict(X)``
        calls keep working.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, n_samples)`` holding 1.0 where the network
        output exceeds 0.5 and -1.0 elsewhere.
    """
    if parameters is None:
        parameters = Parameters

    forward_cache = forward_propagation(X.T, parameters)
    a_out = forward_cache['a2']

    # Threshold at 0.5 and map back to the original -1/1 label encoding.
    return np.where(a_out > 0.5, 1.0, -1.0)

In [None]:
# Predict -1/1 labels for the test features with the trained parameters.
y_test_predict = predict(X_test_kmer)

In [None]:
# Write one predicted label (-1 or 1) per line.
# The context manager closes the file even if a write fails. Flattening the
# predictions yields scalars, which avoids calling int() on 1-element arrays
# (deprecated in recent NumPy releases).
with open('results_kmer.txt', 'w') as f:
    for label in np.ravel(y_test_predict):
        f.write(str(int(label)) + '\n')