In [1]:
import os
import hickle as hkl
import numpy as np
import pybedtools

from pkg_resources import resource_filename

from janggu.data import Bioseq, Cover, ReduceDim, SqueezeDim

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from IPython.display import Image

Using TensorFlow backend.


In [2]:
# Point janggu's JANGGU_OUTPUT setting at `~/janggu_out`.
# `setdefault` keeps a value that is already exported in the environment, and the
# home-relative default avoids a hard-coded, user-specific absolute path so the
# notebook can run unchanged on another account or machine.
os.environ.setdefault('JANGGU_OUTPUT', os.path.join(os.path.expanduser('~'), 'janggu_out'))

In [3]:
# NOTE(review): `order` is not referenced by any later cell visible in this
# notebook; the Bioseq.create_from_refgenome calls pass the literal `order=3`.
# Confirm which value is intended before relying on this variable.
order = 5

In [32]:
!shuf -n 1 /home/wangccy/anaconda3/envs/final-proj/lib/python3.9/site-packages/janggu/roi_train.bed > /home/wangccy/anaconda3/envs/final-proj/lib/python3.9/site-packages/janggu/roi_train_rand1.bed
#set the random number of gene taking for training, you can change 1 to any number change file name in the output

In [33]:
!shuf -n 1 /home/wangccy/anaconda3/envs/final-proj/lib/python3.9/site-packages/janggu/roi_test.bed > /home/wangccy/anaconda3/envs/final-proj/lib/python3.9/site-packages/janggu/roi_test_rand1.bed 
#set the random number of gene taking for testing, you can change 1 to any number change file name in the output

In [34]:
# Resolve the files that live inside the installed janggu package in one pass:
# the reference genome and the randomly sampled train/test ROI files.
# If the sampling cells are changed to write different file names, update the two
# `roi_*_rand1.bed` names here accordingly. (The full, unsampled training set is
# 'roi_train.bed'.)
REFGENOME, ROI_TRAIN_FILE, ROI_TEST_FILE = (
    resource_filename('janggu', fname)
    for fname in ('hg19.fa', 'roi_train_rand1.bed', 'roi_test_rand1.bed')
)

# BED file passed as `bedfiles` to Cover.create_from_bed when building the labels.
PEAK_FILE = '/home/wangccy/sites_all.bed'


In [35]:
def _load_dna(roi):
    """Build the DNA feature dataset for the regions listed in the BED file `roi`.

    Calls `Bioseq.create_from_refgenome` on `REFGENOME` with binsize=50, order=3,
    flank=50, caching disabled and verbose output, then wraps the result in
    `ReduceDim` followed by `SqueezeDim`.
    """
    sequences = Bioseq.create_from_refgenome('dna', refgenome=REFGENOME,
                                             roi=roi,
                                             binsize=50,
                                             order=3,
                                             flank=50,
                                             cache=False,
                                             verbose=True)
    return SqueezeDim(ReduceDim(sequences))


# Train and test features use identical settings; only the ROI file differs.
DNA = _load_dna(ROI_TRAIN_FILE)
DNA_test = _load_dna(ROI_TEST_FILE)

In [38]:
# Display the dimensions of the training feature dataset.
DNA.shape

(387, 64)

In [14]:
# Save both feature datasets with hickle so the feature-extraction step does not
# have to be rerun.
for dataset, out_path in ((DNA, 'DNA_Train.hkl'), (DNA_test, 'DNA_Test.hkl')):
    hkl.dump(dataset, out_path, mode='w')

In [15]:
# Reload the train and test feature datasets from the hickle files on disk.
DNA, DNA_test = hkl.load('DNA_Train.hkl'), hkl.load('DNA_Test.hkl')

In [16]:
def _load_labels(roi):
    """Build the label dataset for the regions listed in the BED file `roi`.

    Calls `Cover.create_from_bed` on `PEAK_FILE` with binsize=50, resolution=1,
    flank=50, mode='name_category' and a single condition named 'start', then
    wraps the result in `ReduceDim` followed by `SqueezeDim`.
    """
    coverage = Cover.create_from_bed('sites_all', roi=roi,
                                     bedfiles=PEAK_FILE,
                                     binsize=50,
                                     resolution=1,
                                     flank=50,
                                     mode='name_category',
                                     conditions=['start'])
    return SqueezeDim(ReduceDim(coverage))


# Train and test labels use identical settings; only the ROI file differs.
LABELS = _load_labels(ROI_TRAIN_FILE)
LABELS_test = _load_labels(ROI_TEST_FILE)

In [17]:
# Display the dimensions of the test label dataset.
LABELS_test.shape

(4281,)

In [22]:
# Logistic-regression baseline trained on the DNA features.
# `max_iter` has to be an integer: a float such as 1e6 is rejected by the
# constructor-parameter validation in recent scikit-learn releases, so the same
# limit is written as an int literal.
logreg = LogisticRegression(max_iter=1_000_000)
logreg.fit(DNA, LABELS)
# Keep column 1 of predict_proba as the score (presumably the positive-class
# probability for binary labels; confirm against `logreg.classes_`).
logregpred = logreg.predict_proba(DNA_test)[:, 1]

In [23]:
# Support-vector classifier baseline trained on the DNA features.
# probability=True makes scikit-learn calibrate probabilities with an internal,
# randomly shuffled cross-validation; a fixed random_state makes the predicted
# probabilities (and any metric computed from them) reproducible across runs.
svc = SVC(probability=True, random_state=0)
svc.fit(DNA, LABELS)
# Keep column 1 of predict_proba as the score (presumably the positive-class
# probability for binary labels; confirm against `svc.classes_`).
svcpred = svc.predict_proba(DNA_test)[:, 1]

In [24]:
# Random-forest baseline trained on the DNA features.
# Forest construction is randomised (bootstrap samples and feature subsets), so
# random_state is fixed to make the fitted model and its scores reproducible.
rf = RandomForestClassifier(random_state=0)
rf.fit(DNA, LABELS)
# Keep column 1 of predict_proba as the score (presumably the positive-class
# probability for binary labels; confirm against `rf.classes_`).
rfpred = rf.predict_proba(DNA_test)[:, 1]

In [25]:
# ROC AUC of the logistic-regression scores against the test labels.
roc_auc_score(y_true=LABELS_test[:], y_score=logregpred)

0.5732404565529494

In [26]:
# ROC AUC of the support-vector classifier scores against the test labels.
roc_auc_score(y_true=LABELS_test[:], y_score=svcpred)

0.5616434895734657

In [27]:
# ROC AUC of the random-forest scores against the test labels.
roc_auc_score(y_true=LABELS_test[:], y_score=rfpred)

0.5445932746432918