# Cross-validation training of the three models

In [1]:
import pandas as pd
import numpy as np
import torch
import transformers
from sklearn.ensemble import GradientBoostingClassifier
from Bio import SeqIO
import os
import sys
import pickle
import random

sys.path.append("../../")


  from .autonotebook import tqdm as notebook_tqdm


### Load in the sequence data 

In [6]:
LTRs = [rec for rec in SeqIO.parse("/home/xhorvat9/LTR_classification_data/Sequence_files/train_LTRs.fasta", "fasta")]
nonLTRs = [rec for rec in SeqIO.parse("/home/xhorvat9/LTR_classification_data/Sequence_files/non_LTRs_training.fasta", "fasta")]



LTR_motifs = pd.read_csv("~/LTR_classification_data/TFBS/LTR_train_motifCounts.csv", sep="\t").set_index("ID")
non_LTR_motifs = pd.read_csv("~/LTR_classification_data/TFBS/non_LTR_train_motifCounts.csv", sep="\t").set_index("ID")


### Check that sequences match motifs

In [47]:
# LTR ordering is identical to its motif representation
print("Indices for LTR sequences and motifs are identical: ", all(LTR_motifs["ID"] == [s.id for s in LTRs]))

# subset nonLTRs sequences to match the order of the motifs
non_LTR_sequence_df = pd.DataFrame({"sequence": [str(rec.seq) for rec in nonLTRs], "ID": [rec.id for rec in nonLTRs]})
non_LTR_sequence_df.set_index("ID", inplace=True)
non_LTR_sequence_df = non_LTR_sequence_df[np.invert(non_LTR_sequence_df.index.duplicated(keep='first'))]
print("Indices for non-LTR sequences and motifs are identical after subsetting: ", all(non_LTR_sequence_df.index == non_LTR_motifs.index))


Indices for LTR sequences and motifs are identical:  True


Indices for non-LTR sequences and motifs are identical after subsetting:  True


In [49]:
X = np.array([str(s.seq) for s in LTRs] + non_LTR_sequence_df["sequence"].tolist())
y = np.array([1]*len(LTRs) + [0]*len(non_LTR_sequence_df))

X_motifs = pd.concat([LTR_motifs, non_LTR_motifs], axis=0)

### Load in the BERT_model

In [100]:
from utils.BERT_utils import tok_func
import tqdm
# Preprocess the data 
tokenizer = transformers.BertTokenizer.from_pretrained('zhihan1996/DNA_bert_6')

# cut up sequences longer than 512
#def custom_tok(seq, k=510, tokenizer):
X_tokenized = []
for seq in tqdm.tqdm(X):
    sequence_chunks = [seq[i:i+510] for i in range(0, len(seq), 510)]
    seq_tokenized = []
    for chunk in sequence_chunks:
        tokenized_chunk = tokenizer(tok_func(chunk), padding=True, max_length=512, truncation=True)
        seq_tokenized.append(tokenized_chunk)
    X_tokenized.append(seq_tokenized)
#dataset = Dataset(tokenizer([tok_func(x, int(kmer), STRIDE_SIZE) for x in X], padding=True, truncation=True, max_length=512), y)
X_tokenized = np.array(X_tokenized, dtype="object")

  0%|          | 17/346040 [00:00<34:35, 166.69it/s]

100%|██████████| 346040/346040 [38:36<00:00, 149.40it/s] 


In [101]:
import pickle
pickle.dump(X_tokenized, open("X_tokenized.pkl", "wb+"))

In [17]:
from BERT_model import LTRBERT

ltrbert_model = LTRBERT(2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/DNA_bert_6 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Load in the CNN

In [75]:
from utils.CNN_utils import onehote

X_OHE = [onehote(x) for x in X]
X_OHE = np.array(X_OHE, dtype="object")

In [135]:
# save the the OHE vector as .npy object 
#X_OH = np.asarray(X_OHE, dtype="object")
#np.save("X_OHE.npy", X_OH)

array([0., 0., 0., 0.])

### Run the K-fold CV

In [125]:
from sklearn.model_selection import KFold, StratifiedKFold
# TODO use StratifiedKFold instead for an even distribution of classes 
kf = StratifiedKFold(n_splits=5)
kf.get_n_splits(X_OHE)
split = kf.split(X_OHE, y)

In [143]:
from torch.utils.data import DataLoader
split = kf.split(X_OHE, y)
for i, (train_index, test_index) in enumerate(split):
    # Train the BERT model
    # TODO fix the subsetting 
    BERT_train_dataset = DataLoader(X_tokenized[train_index], y[train_index])
    BERT_test_dataset = DataLoader(X_tokenized[test_index], y[test_index])
    # TODO 


    # Train the CNN
    OHE_train_X = X_OHE[train_index]
    OHE_test_X = X_OHE[test_index]
    # TODO 


    # Train the GBC
    # TODO 

ValueError: batch_size should be a positive integer value, but got batch_size=[1 1 1 ... 0 0 0]

In [128]:
OHE_train_X

NameError: name 'OHE_train_X' is not defined

In [11]:
train_index

array([0, 1, 2, 3, 4, 5, 6, 7])

In [None]:
https://saturncloud.io/blog/how-to-use-kfold-cross-validation-with-dataloaders-in-pytorch/