In [1]:
import os
import hickle as hkl
import numpy as np
import pybedtools
from pkg_resources import resource_filename
from torch import nn
import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader, Dataset
from janggu.data import Bioseq, Cover, ReduceDim, SqueezeDim, Transpose
import torch.optim as optim
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from IPython.display import Image

Using TensorFlow backend.


In [2]:
os.environ['JANGGU_OUTPUT'] = '/home/wangccy/janggu_out' 

In [3]:
order = 3

In [4]:
!shuf -n 50 /home/wangccy/anaconda3/envs/final-proj/lib/python3.9/site-packages/janggu/roi_train.bed > /home/wangccy/anaconda3/envs/final-proj/lib/python3.9/site-packages/janggu/roi_train_rand50.bed
#Set the random number of gene segments taking for training, you can change 1 to any number and change file name in the output

In [5]:
!shuf -n 10 /home/wangccy/anaconda3/envs/final-proj/lib/python3.9/site-packages/janggu/roi_test.bed > /home/wangccy/anaconda3/envs/final-proj/lib/python3.9/site-packages/janggu/roi_test_rand10.bed 
#Set the random number of gene segments taking for testing, you can change 1 to any number and change file name in the output

In [4]:
REFGENOME = resource_filename('janggu', 'hg19.fa')


ROI_TRAIN_FILE = resource_filename('janggu', 'roi_train_rand1.bed') # change file name here accordingly


ROI_TEST_FILE = resource_filename('janggu', 'roi_test_rand1.bed') # change file name here accordingly

PEAK_FILE = '/home/wangccy/sites_all.bed'


In [7]:
#Prepare data for Sklearn Training
DNA = SqueezeDim(ReduceDim(Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                                       roi=ROI_TRAIN_FILE,
                                                       binsize=50,
                                                       order=3,
                                                       flank=150,
                                                       cache=False,
                                                       verbose=True)))

DNA_test = SqueezeDim(ReduceDim(Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                                       roi=ROI_TEST_FILE,
                                                       binsize=50,
                                                       order=3,
                                                       flank=150,
                                                       cache=False,
                                                       verbose=True)))

In [8]:
#Prepare data for Sklearn Testing
LABELS = SqueezeDim(ReduceDim(Cover.create_from_bed('sites_all', roi=ROI_TRAIN_FILE,
                                                   bedfiles=PEAK_FILE,
                                                   binsize=50,
                                                   resolution=1,
                                                   flank=150,
                                                   mode='name_category',
                                                   conditions=['start'])))

LABELS_test = SqueezeDim(ReduceDim(Cover.create_from_bed('sites_all', roi=ROI_TEST_FILE,
                                                   bedfiles=PEAK_FILE,
                                                   binsize=50,
                                                   resolution=1,
                                                   flank=150,
                                                   mode='name_category',
                                                   conditions=['start'])))

In [9]:
#Logistic Regression Model
logreg = LogisticRegression(max_iter=1e6)
logreg.fit(DNA,LABELS)
logregpred = logreg.predict_proba(DNA_test)[:,1]

In [10]:
#Random Forest Model
rf = RandomForestClassifier()
rf.fit(DNA, LABELS)
rfpred = rf.predict_proba(DNA_test)[:,1]

In [11]:
#Testing for Logistic Regression Model
roc_auc_score(LABELS_test[:], logregpred)

0.7771885287604101

In [12]:
#Testing for Random Forest Model
roc_auc_score(LABELS_test[:], rfpred)

0.7629927821441724