### 0. Prepare Data in Each Fold

### 1. DNN model

In [8]:
import numpy as np
gene_x = np.load('./data/post_data/gene_x.npy', allow_pickle=True)
pheno_x = np.load('./data/post_data/pheno_x.npy', allow_pickle=True)
geno_pheno_x = np.hstack((gene_x, pheno_x))

In [13]:
import pandas as pd
fold_n = 1
subfeature_dict_df = pd.read_csv('./data/filtered_data/subfeature_dict_df.csv')
num_subfeature = subfeature_dict_df.shape[0]
train_idx = np.load('./data/post_data/train_idx_' + str(fold_n) + '.npy', allow_pickle=True)
test_idx = np.load('./data/post_data/test_idx_' + str(fold_n) + '.npy', allow_pickle=True)
train_x = geno_pheno_x[train_idx - num_subfeature]
test_x = geno_pheno_x[test_idx - num_subfeature]
print(train_x.shape)
print(test_x.shape)
train_label = np.load('./data/post_data/train_label_' + str(fold_n) + '.npy', allow_pickle=True)
test_label = np.load('./data/post_data/test_label_' + str(fold_n) + '.npy', allow_pickle=True)
print(train_label.shape)
print(test_label.shape)

(651, 8382)
(162, 8382)
(651, 3)
(162, 3)


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [16]:
train_x = train_x.astype(np.float32)
train_x_tensor = torch.FloatTensor(train_x)
train_label_tensor = torch.LongTensor(train_label)
test_x = test_x.astype(np.float32)
test_x_tensor = torch.FloatTensor(test_x)
test_label_tensor = torch.LongTensor(test_label)

In [17]:
batch_size = 64

train_dataset = TensorDataset(train_x_tensor, train_label_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_x_tensor, test_label_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [18]:
class DNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(DNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

input_dim = train_x.shape[1]
hidden_dim = 1024
output_dim = 3

model = DNN(input_dim, hidden_dim, output_dim)

In [19]:
from tqdm import tqdm
# Training Loop with tqdm
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    train_correct = 0
    train_total = 0
    pbar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}")
    
    for i, (batch_x, batch_label) in pbar:
        optimizer.zero_grad()
        outputs = model(batch_x)
        _, batch_targets = batch_label.max(dim=1)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
        _, predicted = torch.max(outputs, 1)
        train_total += batch_label.size(0)
        train_correct += (predicted == batch_targets).sum().item()
        
        train_accuracy = 100 * train_correct / train_total
        pbar.set_postfix(Loss=loss.item(), Training_Accuracy=train_accuracy)
    avg_loss = total_loss / len(train_loader)
    print(f"\nEpoch [{epoch+1}/{epochs}], Avg Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%")

    # Evaluate the Model with tqdm
    model.eval()
    correct = 0
    total = 0
    pbar = tqdm(test_loader, desc="Evaluating")

    with torch.no_grad():
        for batch_x, batch_label in pbar:
            outputs = model(batch_x)
            _, predicted = torch.max(outputs, 1)
            _, batch_targets = batch_label.max(dim=1)
            total += batch_label.size(0)
            correct += (predicted == batch_targets).sum().item()
    accuracy = 100 * correct / total
    print(f'Accuracy on test set: {accuracy:.2f}%')
    print('--------------------------------------------------------------')
    print('\n')
    



Epoch 1/20: 100%|██████████| 11/11 [00:00<00:00, 19.66it/s, Loss=0.379, Training_Accuracy=73.1]



Epoch [1/20], Avg Loss: 1.2018, Training Accuracy: 73.12%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 187.50it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 2/20: 100%|██████████| 11/11 [00:00<00:00, 26.80it/s, Loss=0.388, Training_Accuracy=79.1]



Epoch [2/20], Avg Loss: 0.7198, Training Accuracy: 79.11%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 214.24it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 3/20: 100%|██████████| 11/11 [00:00<00:00, 32.63it/s, Loss=0.438, Training_Accuracy=79.1]



Epoch [3/20], Avg Loss: 0.6537, Training Accuracy: 79.11%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 214.27it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 4/20: 100%|██████████| 11/11 [00:00<00:00, 29.56it/s, Loss=0.352, Training_Accuracy=79.1]



Epoch [4/20], Avg Loss: 0.6196, Training Accuracy: 79.11%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 218.44it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 5/20: 100%|██████████| 11/11 [00:00<00:00, 31.53it/s, Loss=0.636, Training_Accuracy=79.1]



Epoch [5/20], Avg Loss: 0.6282, Training Accuracy: 79.11%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 199.98it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 6/20: 100%|██████████| 11/11 [00:00<00:00, 31.43it/s, Loss=0.308, Training_Accuracy=79.1]



Epoch [6/20], Avg Loss: 0.5994, Training Accuracy: 79.11%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 238.41it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 7/20: 100%|██████████| 11/11 [00:00<00:00, 31.18it/s, Loss=0.556, Training_Accuracy=79.1]



Epoch [7/20], Avg Loss: 0.5671, Training Accuracy: 79.11%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 230.72it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 8/20: 100%|██████████| 11/11 [00:00<00:00, 26.76it/s, Loss=0.877, Training_Accuracy=79.9]



Epoch [8/20], Avg Loss: 0.5523, Training Accuracy: 79.88%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 230.71it/s]


Accuracy on test set: 72.22%
--------------------------------------------------------------




Epoch 9/20: 100%|██████████| 11/11 [00:00<00:00, 28.68it/s, Loss=0.787, Training_Accuracy=80.3]



Epoch [9/20], Avg Loss: 0.5490, Training Accuracy: 80.34%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 232.47it/s]


Accuracy on test set: 77.16%
--------------------------------------------------------------




Epoch 10/20: 100%|██████████| 11/11 [00:00<00:00, 31.85it/s, Loss=0.386, Training_Accuracy=80.6]



Epoch [10/20], Avg Loss: 0.5432, Training Accuracy: 80.65%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 205.08it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 11/20: 100%|██████████| 11/11 [00:00<00:00, 32.45it/s, Loss=0.175, Training_Accuracy=80.2]



Epoch [11/20], Avg Loss: 0.4918, Training Accuracy: 80.18%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 214.25it/s]


Accuracy on test set: 78.40%
--------------------------------------------------------------




Epoch 12/20: 100%|██████████| 11/11 [00:00<00:00, 31.81it/s, Loss=0.359, Training_Accuracy=82.8]



Epoch [12/20], Avg Loss: 0.4196, Training Accuracy: 82.80%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 230.65it/s]


Accuracy on test set: 71.60%
--------------------------------------------------------------




Epoch 13/20: 100%|██████████| 11/11 [00:00<00:00, 32.39it/s, Loss=0.499, Training_Accuracy=83.9]



Epoch [13/20], Avg Loss: 0.4217, Training Accuracy: 83.87%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 229.58it/s]


Accuracy on test set: 71.60%
--------------------------------------------------------------




Epoch 14/20: 100%|██████████| 11/11 [00:00<00:00, 31.98it/s, Loss=0.294, Training_Accuracy=84.3]



Epoch [14/20], Avg Loss: 0.3877, Training Accuracy: 84.33%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 214.25it/s]


Accuracy on test set: 76.54%
--------------------------------------------------------------




Epoch 15/20: 100%|██████████| 11/11 [00:00<00:00, 26.74it/s, Loss=0.481, Training_Accuracy=88] 



Epoch [15/20], Avg Loss: 0.3296, Training Accuracy: 88.02%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 230.71it/s]


Accuracy on test set: 68.52%
--------------------------------------------------------------




Epoch 16/20: 100%|██████████| 11/11 [00:00<00:00, 31.76it/s, Loss=0.189, Training_Accuracy=90.2]



Epoch [16/20], Avg Loss: 0.2616, Training Accuracy: 90.17%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 212.27it/s]


Accuracy on test set: 74.07%
--------------------------------------------------------------




Epoch 17/20: 100%|██████████| 11/11 [00:00<00:00, 32.32it/s, Loss=0.0472, Training_Accuracy=90.8]



Epoch [17/20], Avg Loss: 0.2189, Training Accuracy: 90.78%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 230.73it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 18/20: 100%|██████████| 11/11 [00:00<00:00, 31.35it/s, Loss=0.167, Training_Accuracy=90.2]



Epoch [18/20], Avg Loss: 0.2400, Training Accuracy: 90.17%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 199.96it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 19/20: 100%|██████████| 11/11 [00:00<00:00, 32.14it/s, Loss=0.0897, Training_Accuracy=90.5]



Epoch [19/20], Avg Loss: 0.2612, Training Accuracy: 90.48%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 210.69it/s]


Accuracy on test set: 77.78%
--------------------------------------------------------------




Epoch 20/20: 100%|██████████| 11/11 [00:00<00:00, 32.25it/s, Loss=0.602, Training_Accuracy=91.6]



Epoch [20/20], Avg Loss: 0.3029, Training Accuracy: 91.55%


Evaluating: 100%|██████████| 3/3 [00:00<00:00, 214.17it/s]

Accuracy on test set: 76.54%
--------------------------------------------------------------







### 2. Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier(n_estimators=100, random_state=42)  # using 100 trees
rf.fit(train_x, train_label)

In [21]:
predicted_labels = rf.predict(test_x)
accuracy = accuracy_score(test_label, predicted_labels)
print(f'Accuracy with Random Forest: {accuracy * 100:.2f}%')

Accuracy with Random Forest: 75.93%


### 3. Linear Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [23]:
# Create the model. Increase max_iter if the algorithm doesn't converge.
lr_classifier = LogisticRegression(max_iter=100)

# Fit the model to the training data
train_label_1d = np.argmax(train_label, axis=1)
lr_classifier.fit(train_x, train_label_1d)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
# Predict class labels directly
predicted_labels = lr_classifier.predict(test_x)
test_label_1d = np.argmax(test_label, axis=1)  # If test_label is one-hot encoded

# Calculate accuracy
accuracy = accuracy_score(test_label_1d, predicted_labels)
print(f'Accuracy with Logistic Regression: {accuracy * 100:.2f}%')


Accuracy with Logistic Regression: 77.78%
