In [1]:
import numpy as np
from collections import OrderedDict

# auROC calculation given motif and importance scores

### Load importance scores

In [2]:
#Scores for sequence number 6000 - 8000
impscore = np.load("/users/eprakash/newlsgkm/lsgkm/mytests/talgata/runs/talgata_t3_l6_k5_d1_g2.0_c10/ISM_importance_scores_TALGATA_task0.test.2000.npy")
impscore.shape

(2000, 200, 4)

###  Load sequences and motif info

In [3]:
import simdna.synthetic as synthetic
import gzip

data_filename = "sequences.simdata.gz"
data = synthetic.read_simdata_file(data_filename)
l = data.embeddings[6781]
for i in l:
    print(i.startPos, i.what.__dict__, len(i.what), i.what.getDescription())
    print(data.sequences[215])

(71, {'string': 'CTAGATAAGG', 'stringDescription': 'GATA_disc1'}, 10, 'GATA_disc1')
CGCCAACAGATGGTAACCGGCCTCATAGACCCTAGACATTGGTCATAGTGGGGCGCGACTGATTGTGAGGCTATGTTTAAACGGTCCCTCGGCCGCTACAGAGGCTAAGTAGGACAACTCTCGGAGGGAATACATAAACCAATTGGTGGCCGTATTCATTCACCCCAACAACCCAGGTGGCAACAGCTGGTACCCGACTG
(46, {'string': 'GGTGATAAGG', 'stringDescription': 'GATA_disc1'}, 10, 'GATA_disc1')
CGCCAACAGATGGTAACCGGCCTCATAGACCCTAGACATTGGTCATAGTGGGGCGCGACTGATTGTGAGGCTATGTTTAAACGGTCCCTCGGCCGCTACAGAGGCTAAGTAGGACAACTCTCGGAGGGAATACATAAACCAATTGGTGGCCGTATTCATTCACCCCAACAACCCAGGTGGCAACAGCTGGTACCCGACTG
(184, {'string': 'CCGAACAGATGGATGT', 'stringDescription': 'TAL1_known1'}, 16, 'TAL1_known1')
CGCCAACAGATGGTAACCGGCCTCATAGACCCTAGACATTGGTCATAGTGGGGCGCGACTGATTGTGAGGCTATGTTTAAACGGTCCCTCGGCCGCTACAGAGGCTAAGTAGGACAACTCTCGGAGGGAATACATAAACCAATTGGTGGCCGTATTCATTCACCCCAACAACCCAGGTGGCAACAGCTGGTACCCGACTG
(167, {'string': 'ACGAACAGATGGCCAG', 'stringDescription': 'TAL1_known1'}, 16, 'TAL1_known1')
CGCCAACAGATGGTAACCGGCCTCATAGACCCTAGACATTGGT

### Create ordered dictionary of sequence names to motif list 

In [4]:
seq2motiflist = OrderedDict()
i = 6000
seqnamelist = list()
for motiflist in data.embeddings[6000:8000]:
    seqname = "seq_" + str(i)
    seqnamelist.append(seqname)
    seq2motiflist[seqname] = list()
    for el in motiflist:
        d = dict()
        d['begin'] = el.startPos
        d['end'] = el.startPos + len(el.what)
        d['motif'] = el.what.getDescription()
        seq2motiflist[seqname].append(d)
    i = i+1
#print(seq2motiflist)

### Use importance score evaluator to get motif-sized-window scores and corresponding labels for auROC calc

In [5]:
from ssvmimp.impevaluator.motif_importance_score_evaluator import MotifImportanceScoreEvaluator

#print(seqnamelist)
seqnames = np.array(seqnamelist)
#print(seqnames)
print(seqnames.shape)
impscoreeval = MotifImportanceScoreEvaluator(seq2motiflist, seqnames, impscore)
print("Known motifs" + str(impscoreeval.get_known_motifs()))

print("Motif size for TAL1_known1  " + str(impscoreeval.get_motif_size('TAL1_known1')))
print("Motif size for GATA_disc1  " + str(impscoreeval.get_motif_size('GATA_disc1')))

(2000,)
Known motifs['GATA_disc1', 'TAL1_known1']
Motif size for TAL1_known1  16
Motif size for GATA_disc1  10


In [6]:
(scores, labels) = impscoreeval.get_motif_scores('GATA_disc1')
print("Scores shape: ", scores.shape)
print("Labels shape: ", labels.shape)

('Scores shape: ', (311182,))
('Labels shape: ', (311182,))


### Perform auROC calculation using scores and labels

In [7]:
from sklearn import metrics

auroc = metrics.roc_auc_score(labels, scores)
print("auROC is: " + str(auroc))

auROC is: 0.9695125757648693
