In [1]:
from math import factorial
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import torch.nn.functional as F
# Report CUDA availability. The per-device queries are only issued when a GPU
# is actually usable, so this cell also runs on a CPU-only machine (the same
# guard the device-selection cell further down applies).
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))

True
1
0
NVIDIA GeForce RTX 3060 Laptop GPU


In [2]:
# Read the pre-split train / validation / test arrays from disk.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
train, valid, test = (
    np.load(npy_path)
    for npy_path in (
        'C:/Users/PHB/phylo/data/nogap50k_500/TRAIN.npy',
        'C:/Users/PHB/phylo/data/nogap50k_500/VALID.npy',
        'C:/Users/PHB/phylo/data/nogap50k_500/TEST.npy',
    )
)
# One shape per line, in train / valid / test order.
print(train.shape, valid.shape, test.shape, sep='\n')

#N unrooted trees given N taxa
def n_unroot(Ntaxa):
    """Return the number of distinct unrooted binary trees on ``Ntaxa`` taxa.

    Evaluates (2N-5)! / ((N-3)! * 2**(N-3)) with exact integer arithmetic.
    The quotient is always a whole number, so floor division loses nothing,
    whereas true division would round through a float and give a wrong
    count (or overflow) once the value exceeds float precision.

    Parameters
    ----------
    Ntaxa : int
        Number of taxa; must be at least 3.

    Returns
    -------
    int
        The exact tree count (e.g. 3 for 4 taxa, 15 for 5 taxa).

    Raises
    ------
    ValueError
        If ``Ntaxa`` is smaller than 3.
    """
    if Ntaxa < 3:
        raise ValueError("n_unroot requires at least 3 taxa, got %r" % (Ntaxa,))
    numerator = factorial(2 * Ntaxa - 5)
    denominator = factorial(Ntaxa - 3) * 2 ** (Ntaxa - 3)
    return numerator // denominator

def to_categorical(y, num_classes):
    """One-hot encode integer class indices.

    Each index in ``y`` selects the matching row of a ``num_classes`` x
    ``num_classes`` identity matrix, so the result has the shape of ``y``
    with one extra trailing axis of length ``num_classes`` (dtype uint8).
    """
    one_hot_rows = np.identity(num_classes, dtype=np.uint8)
    return one_hot_rows[y]
#Generate labels
# One class per unrooted topology on 4 taxa.
Nlabels=n_unroot(4)
# Labels are built as Nlabels equal-sized consecutive runs (0...0, 1...1, ...).
# NOTE(review): assumes each array stores its samples in equal contiguous
# blocks per class, in class order - confirm against the data generator.
# Integer division keeps the repeat count an int, and num_classes follows
# Nlabels instead of a separately hard-coded 3.
train_label=to_categorical(np.repeat(range(0,Nlabels),len(train)//Nlabels), num_classes=Nlabels)
valid_label=to_categorical(np.repeat(range(0,Nlabels),len(valid)//Nlabels), num_classes=Nlabels)
test_label=to_categorical(np.repeat(range(0,Nlabels),len(test)//Nlabels), num_classes=Nlabels)

(150000, 4, 500, 1)
(15000, 4, 500, 1)
(15000, 4, 500, 1)


In [3]:

class Autoencoder(nn.Module):
    """Convolutional autoencoder operating along dimension 2 of a 4-channel input.

    The encoder applies two (3, 1) convolutions, each followed by ReLU and a
    (2, 1) max-pool, so dimension 2 is halved twice and the channel count goes
    4 -> 16 -> 8. The decoder mirrors this with two stride-(2, 1) transposed
    convolutions (8 -> 16 -> 4) and ends in a sigmoid.

    The layer order inside each ``nn.Sequential`` fixes the parameter names
    in the state dict, so it must not be changed if saved weights are reused.
    """

    def __init__(self):
        super().__init__()

        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=16, kernel_size=(3, 1), padding=(1, 0)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),
            nn.Conv2d(in_channels=16, out_channels=8, kernel_size=(3, 1), padding=(1, 0)),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1)),
        )

        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(in_channels=8, out_channels=16, kernel_size=(2, 1), stride=(2, 1)),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=16, out_channels=4, kernel_size=(2, 1), stride=(2, 1)),
            nn.Sigmoid(),
        )

    def forward(self, x, return_embeddings=False):
        """Run the autoencoder.

        Returns the encoder output when ``return_embeddings`` is true,
        otherwise the decoder's reconstruction of that encoding.
        """
        latent = self.encoder(x)
        return latent if return_embeddings else self.decoder(latent)

In [4]:
# Choose the compute device: CUDA when it is usable, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# Extra GPU diagnostics, only meaningful on a CUDA device
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # Byte counts are converted to units of 1024**3 bytes and rounded to one decimal.
    for label, n_bytes in (
        ('Allocated:', torch.cuda.memory_allocated(0)),
        ('Cached:   ', torch.cuda.memory_reserved(0)),
    ):
        print(label, round(n_bytes/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce RTX 3060 Laptop GPU
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [5]:
# Restore the trained autoencoder weights from disk
# NOTE(review): absolute, machine-specific path; consider a configurable model directory.
model_path = 'C:/Users/PHB/phylo/model/model0.pth'
loaded_model = Autoencoder()
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint written on a GPU can still be loaded on a CPU-only machine.
loaded_model.load_state_dict(torch.load(model_path, map_location=device))
loaded_model.to(device)  # Move the model to the appropriate device
loaded_model.eval()  # Set the model to evaluation mode


Autoencoder(
  (encoder): Sequential(
    (0): Conv2d(4, 16, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 8, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
  )
  (decoder): Sequential(
    (0): ConvTranspose2d(8, 16, kernel_size=(2, 1), stride=(2, 1))
    (1): ReLU()
    (2): ConvTranspose2d(16, 4, kernel_size=(2, 1), stride=(2, 1))
    (3): Sigmoid()
  )
)

In [6]:
def get_embeddings(model, data_loader):
    """Encode every batch of ``data_loader`` and return the stacked embeddings.

    The model is switched to evaluation mode and called with
    ``return_embeddings=True`` under ``torch.no_grad()``. Inputs are moved to
    the device that holds the model's parameters, so the function does not
    depend on a module-level ``device`` variable; a model without parameters
    is fed CPU tensors.

    Parameters
    ----------
    model : torch.nn.Module
        Module whose ``forward`` accepts ``return_embeddings=True``.
    data_loader : iterable
        Yields batches whose first element is the input tensor.

    Returns
    -------
    numpy.ndarray
        Per-batch embeddings concatenated along axis 0, in loader order.

    Raises
    ------
    ValueError
        Raised by ``np.concatenate`` when the loader yields no batches.
    """
    model.eval()

    # Follow the model's own placement rather than a hidden global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    batch_embeddings = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = batch[0].to(target_device)
            encoded = model(inputs, return_embeddings=True)
            batch_embeddings.append(encoded.cpu().detach().numpy())

    return np.concatenate(batch_embeddings, axis=0)

In [7]:
# Mini-batch size shared by all three loaders
batch_size = 64

# numpy arrays -> float32 tensors
train_data, valid_data, test_data = (
    torch.tensor(split_array, dtype=torch.float32)
    for split_array in (train, valid, test)
)

# Wrap each tensor in a single-field dataset (batches come out as 1-element lists)
train_dataset, valid_dataset, test_dataset = (
    TensorDataset(split_tensor)
    for split_tensor in (train_data, valid_data, test_data)
)

# shuffle=False keeps the samples in their on-disk order
train_loader, valid_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (train_dataset, valid_dataset, test_dataset)
)

In [8]:
# Encode the train and test sets with the restored autoencoder (train first, then test)
train_embeddings, test_embeddings = (
    get_embeddings(loaded_model, split_loader)
    for split_loader in (train_loader, test_loader)
)

In [25]:
# Sanity check: inspect the shape of the encoder output for the training set
train_embeddings.shape

(150000, 8, 125, 1)

array([0, 0, 0, ..., 2, 2, 2])

In [9]:
# Flatten every embedding into one feature vector per sample. The row count is
# taken from the arrays themselves rather than hard-coded, so the cell keeps
# working when the dataset size changes.
X_train = train_embeddings.reshape(len(train_embeddings), -1)
X_test = test_embeddings.reshape(len(test_embeddings), -1)

# Integer class labels as Nlabels equal-sized consecutive runs; floor division
# keeps the repeat count an integer.
# NOTE(review): assumes samples are stored in equal contiguous blocks per
# class, in class order - confirm against the data generator.
y_train = np.repeat(np.arange(Nlabels), len(train) // Nlabels)
y_test = np.repeat(np.arange(Nlabels), len(test) // Nlabels)

In [12]:

# Project the test features with the fitted PCA.
# NOTE(review): `pca` is only created in a cell further down, so on a fresh
# top-to-bottom run this cell raises NameError (see the recorded output).
# It needs to run after the PCA has been fitted.
X_test_pca = pca.transform(X_test)

NameError: name 'pca' is not defined

In [11]:
from sklearn.decomposition import PCA

In [20]:
# Reduce the flattened embeddings to 60 principal components.
# random_state pins the result when scikit-learn's default 'auto' solver
# selects the randomized SVD, so reruns give the same projection.
pca = PCA(60, random_state=42)
X_train_pca = pca.fit_transform(X_train)

In [21]:
# Sanity check: one row per training sample, one column per retained component
X_train_pca.shape

(150000, 60)

In [44]:
# Gaussian Naive Bayes baseline on the full (non-PCA) embedding features
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predicted class index for every test sample
gnb_predictions = gnb.predict(X_test)

In [45]:
# Peek at the predicted class indices for the test set
gnb_predictions

array([0, 1, 2, ..., 0, 0, 1])

In [46]:

from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels versus the Naive Bayes predictions
cm = confusion_matrix(y_test, gnb_predictions)

# Mean accuracy of the Naive Bayes model on the test features
accuracy = gnb.score(X_test, y_test)

In [47]:
# Test accuracy of the Naive Bayes baseline. The labels are split evenly over
# Nlabels classes, so 1/Nlabels is the chance level to compare against.
print(accuracy)

0.3364666666666667


In [15]:
import scipy
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn import metrics

In [None]:
# Linear-kernel SVM on the full (non-PCA) embedding features
svm_model_linear = SVC(kernel='linear', C=1)
svm_model_linear.fit(X_train, y_train)

# Predicted class index for every test sample
svm_predictions = svm_model_linear.predict(X_test)

# Mean accuracy of the SVM on the test features
accuracy = svm_model_linear.score(X_test, y_test)
print(accuracy)

In [None]:
# pandas is needed for the cv_results_ table below but is not imported in any
# earlier visible cell; importing it before the fit means a missing
# dependency fails immediately instead of after the long search.
import pandas as pd

# Hyper-parameter grid for the SVM classifier: 3 values of C x 4 kernels.
# 'degree' is only used by the 'poly' kernel.
svm_grid = {
    'C' : [0.1, 1.0, 10.0],
    'kernel' :['linear', 'poly', 'rbf', 'sigmoid'],
    'degree' : [3],
    'gamma': ['scale'],
}

# Collected per-search results: one cv_results_ table and one fitted search each
result_list = []
grid_list = []

# Cross-validated search over the PCA-reduced training features, scored by accuracy
grid = GridSearchCV(
        estimator = SVC(),
        param_grid = svm_grid,
        scoring = 'accuracy',
        verbose = 1,
        n_jobs = -1 # use all available cores
    )
grid.fit(X_train_pca, y_train)
result_list.append(pd.DataFrame.from_dict(grid.cv_results_))
grid_list.append(grid)



Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


In [13]:
# Evaluate every fitted grid search on the held-out test set.
# The test features are projected here, right before use, so this cell does
# not depend on an earlier cell having produced X_test_pca.
X_test_pca = pca.transform(X_test)
for grid in grid_list:
    print(grid.best_estimator_)
    print()
    preds = grid.predict(X_test_pca)
    # The labels of this workflow are `y_test`, and the problem is multi-class,
    # so report accuracy (the metric the search optimised) instead of a binary
    # ROC curve computed from hard class predictions.
    print(f'{metrics.accuracy_score(y_test, preds)}')
    print('\n', '-' * 80, '\n')

NameError: name 'grid_list' is not defined

In [1]:
#!/usr/bin/env python
# coding: utf-8
import random
from collections import Counter
from tqdm import tqdm 
import argparse





# pandas is needed for the cv_results_ table below but is not imported by this
# cell or any earlier visible cell.
import pandas as pd

# NOTE(review): `Xs` and `ys` are not defined anywhere in the visible notebook,
# and train_test_split / PCA / GridSearchCV / SVR / metrics must already be
# imported - confirm where this cell's inputs come from.
train_size = 0.5
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, ys, train_size=train_size, random_state=42)

# A bare expression in the middle of a cell is discarded, so print the split
# sizes explicitly.
print(Xs_train.shape, Xs_test.shape, len(ys_train), len(ys_test))

# Reduce the training features to 60 principal components; random_state keeps
# the projection reproducible if the randomized SVD solver is selected.
pca = PCA(60, random_state=42)
Xs_train_pca = pca.fit_transform(Xs_train)

# Hyper-parameter grid: 3 values of C x 4 kernels ('degree' only affects 'poly')
svm_grid = {
    'C' : [0.1, 1.0, 10.0],
    'kernel' :['linear', 'poly', 'rbf', 'sigmoid'],
    'degree' : [3],
    'gamma': ['scale'],
}

# Collected per-search results: one cv_results_ table and one fitted search each
result_list = []
grid_list = []

# NOTE(review): a regressor (SVR) is tuned with the 'roc_auc' scorer, which
# presumably means `ys` holds binary labels - confirm.
grid = GridSearchCV(
        estimator = SVR(),
        param_grid = svm_grid,
        scoring = 'roc_auc',
        verbose = 1,
        n_jobs = -1 # use all available cores
    )
grid.fit(Xs_train_pca, ys_train)
result_list.append(pd.DataFrame.from_dict(grid.cv_results_))
grid_list.append(grid)

# Project the held-out features with the PCA fitted on the training split,
# then report the test ROC AUC of each search's best estimator.
Xs_test_pca = pca.transform(Xs_test)
for grid in grid_list:
    print(grid.best_estimator_)
    print()
    preds = grid.predict(Xs_test_pca)
    fpr, tpr, thresholds = metrics.roc_curve(ys_test, preds, pos_label=1)
    print(f'{metrics.auc(fpr, tpr)}')
    print('\n', '-' * 80, '\n')

True

In [2]:
# Shell escape: show the NVIDIA driver's view of the GPU (requires nvidia-smi on PATH)
!nvidia-smi

Sun Apr 16 14:21:04 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 528.49       Driver Version: 528.49       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   53C    P3    18W /  55W |      0MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces