In [1]:
import numpy as np

# Importance score generation for ISM (In Silico Mutagenesis)

In [2]:
def get_masked_imp_scores(unmasked_scores, selected_sequence_string):
    """Keep only the score of the base actually present at each position.

    For every position of ``selected_sequence_string`` the score found in the
    column of the observed base (A, C, G, T -> columns 0, 1, 2, 3, upper or
    lower case) is copied from ``unmasked_scores``; all other entries are zero.
    Positions holding any other character are left as an all-zero row.

    Returns a float array of shape ``(len(selected_sequence_string), 4)``.
    """
    column_of_base = {
        "A": 0, "a": 0,
        "C": 1, "c": 1,
        "G": 2, "g": 2,
        "T": 3, "t": 3,
    }
    masked_scores = np.zeros((len(selected_sequence_string), 4), dtype=float)
    for position, base in enumerate(selected_sequence_string):
        column = column_of_base.get(base)
        if column is None:
            # Not one of the four bases: keep the row of zeros.
            continue
        masked_scores[position][column] = unmasked_scores[position][column]
    return masked_scores

In [3]:
def execcmd(cmd):
    """Echo a command line and then run it through IPython's ``!`` shell escape.

    ``cmd`` is converted with ``str`` for the echo. Nothing is returned.
    """
    print("Executing command: " + str(cmd))
    # IPython-only syntax: ``$cmd`` is replaced by the value of ``cmd`` and the
    # result is executed as a shell command, so this function only works inside
    # an IPython/Jupyter session.
    !$cmd

## Set up paths for gkmtrain, gkmpredict, the trained model used for prediction, the scratch directory, etc.

In [4]:
# Root of the lsgkm checkout and the scratch directory used for temporary files.
lsgkmdir = "/users/eprakash/newlsgkm/lsgkm"
scratch = "/users/eprakash/scratch"

# TALGATA test directory and the run directory holding the trained model.
talgatadir = lsgkmdir + "/mytests/talgata"
model_l6_k5_d1_dir = talgatadir + "/runs/talgata_t3_l6_k5_d1_g2.0_c10"

# Model file and prediction files located inside the run directory.
ism_model = model_l6_k5_d1_dir + "/TALGATAtrained.model.txt"
ism_pos_predict = model_l6_k5_d1_dir + "/TALGATAtestpos_predict.txt"
ism_neg_predict = model_l6_k5_d1_dir + "/TALGATAtestneg_predict.txt"

# lsgkm executables.
gkmtrain = lsgkmdir + "/bin/gkmtrain"
gkmpredict = lsgkmdir + "/bin/gkmpredict"

### Run gkmpredict for a labelled dictionary of sequences

In [5]:
from  collections import OrderedDict

def get_lsgkm_decisions(seqdict):
    """Score a dictionary of labelled sequences with gkmpredict.

    The sequences are written in FASTA form to ``<scratch>/seq.fa``, gkmpredict
    is run on that file with the model at ``ism_model``, and the resulting
    ``<scratch>/output.txt`` is parsed.

    Args:
        seqdict: mapping of sequence label -> sequence string.

    Returns:
        OrderedDict mapping the first tab-separated field of every output line
        to the second field converted to ``float``, in file order.

    Raises:
        IndexError / ValueError: if a non-blank output line does not contain a
        label and a numeric value separated by a tab.
    """
    fasta_path = "%s/seq.fa" % (scratch)
    output_path = "%s/output.txt" % (scratch)

    # Remove the files of a previous call before writing new ones.
    execcmd("rm -rf %s %s" % (fasta_path, output_path))
    with open(fasta_path, "w") as fw:
        for label, sequence in seqdict.items():
            fw.write(">%s\n%s\n" % (label, sequence))

    execcmd("%s %s %s %s" % (gkmpredict, fasta_path, ism_model, output_path))

    retdict = OrderedDict()
    with open(output_path) as fr:
        for line in fr:
            if not line.strip():
                # A blank line carries no prediction; skip it instead of
                # failing on the missing second field.
                continue
            fields = line.rstrip().split('\t')
            retdict[fields[0]] = float(fields[1])
    return retdict

### Function to read a set of fasta sequences from file

In [6]:
import sys
import re
from  collections import OrderedDict

def get_sequences(fname, doprint=False):
    """Read a FASTA file with one label line followed by one sequence line.

    A label line starts with ``>``; the label is everything after it. The line
    following a label must consist only of word characters (``\\w``) and is
    stored as that label's sequence. Lines that do not fit the expected kind
    are reported with ``print`` and skipped; after a sequence line (valid or
    not) the parser expects a label again.

    Args:
        fname: path of the FASTA file to read.
        doprint: when true, print the number of sequences that were read.

    Returns:
        OrderedDict mapping label -> sequence in file order. A label that
        occurs more than once keeps the last sequence read for it.
    """
    lines = OrderedDict()
    expecting_label = True
    label = ''
    # The with-statement closes the file even if parsing raises.
    with open(fname, "r") as fp:
        for line in fp:
            if expecting_label:
                match = re.match(r">(.*)$", line)
                if match:
                    label = match.group(1)
                    expecting_label = False
                else:
                    print("Expecting LABEL but found (!!): " + line)
                continue

            match = re.match(r"(\w+)$", line)
            if match:
                lines[label] = match.group(1)
            else:
                print("Expecting SEQUENCE but found (!!): " + line)
            # Whether or not the sequence was usable, the next line must be a label.
            expecting_label = True
            label = ''
    if doprint:
        print("Number of sequences in " + fname + " is " + str(len(lines)))
    return lines

## Read the test sequences for which importance scores are desired and get the scores using ISM

In [7]:
# FASTA file holding the test sequences to be scored.
sequences_file = talgatadir + "/task0.test.2000.fa"
allseqs = get_sequences(fname=sequences_file, doprint=True)

Number of sequences in /users/eprakash/newlsgkm/lsgkm/mytests/talgata/task0.test.2000.fa is 2000


In [None]:
import time
from  collections import OrderedDict

# Compute in silico mutagenesis (ISM) importance scores for every sequence in
# ``allseqs`` and save them to disk, checkpointing every 10 sequences.
# np.save appends ".npy" to a file name that does not already end with it.
ism_scores_file = "ISM_importance_scores_TALGATA_task0.test.2000"

start = time.time()
ism_scores_array = None
tracker = 0
for seqkey in allseqs.keys():
    print("Starting with sequence " + str(tracker))
    seqval = allseqs[seqkey]
    seq_len = len(seqval)
    # dec_func[i][j] holds the decision value obtained when position j of the
    # sequence is replaced by base i.
    dec_func = np.zeros((4, seq_len), dtype=float)
    # Base order A, C, G, T matches the column order of get_masked_imp_scores.
    for i, bp in enumerate(('A', 'C', 'G', 'T')):
        # One mutated copy per position: copy j carries ``bp`` at position j.
        replicatedseqdict = OrderedDict()
        for j in range(seq_len):
            replicatedseqdict["seqkey_%s_%s" % (bp, str(j))] = seqval[:j] + bp + seqval[j+1:]
        # Find out the decision function value for each mutated sequence.
        predictions = get_lsgkm_decisions(replicatedseqdict)
        # list() is needed: on Python 3 ``values()`` is a view object, which
        # np.array would wrap in a 0-d object array instead of a float vector.
        # NOTE(review): assumes the predictions come back one per mutated
        # sequence, in the order they were submitted - confirm.
        dec_func[i] = np.array(list(predictions.values()))
    # Centre the scores: subtract, at every position, the mean over the 4 bases.
    avg_dec_func = np.average(dec_func, axis=0)
    in_silico_imp_scores = dec_func - avg_dec_func
    # Transpose to (seq_len, 4), the layout get_masked_imp_scores indexes.
    in_silico_imp_scores = np.transpose(in_silico_imp_scores)
    print(in_silico_imp_scores.shape)
    print(in_silico_imp_scores)
    masked_scores = get_masked_imp_scores(in_silico_imp_scores, seqval)
    # Stack the per-sequence score matrices along a new leading axis.
    if ism_scores_array is None:
        ism_scores_array = masked_scores[None, :]
    else:
        ism_scores_array = np.append(ism_scores_array, masked_scores[None, :], axis=0)
    if (tracker % 10 == 0):
        # Periodic checkpoint so partial results are on disk during the run.
        np.save(ism_scores_file, ism_scores_array)
    print("Done with sequence " + str(tracker))
    tracker = tracker + 1
np.save(ism_scores_file, ism_scores_array)
print("In silico importance scores for " + str(len(allseqs)) + " sequences computed in: ", round(time.time() - start, 2), "s")

Starting with sequence 0
Executing command: rm -rf /users/eprakash/scratch/seq.fa /users/eprakash/scratch/output.txt
Executing command: /users/eprakash/newlsgkm/lsgkm/bin/gkmpredict /users/eprakash/scratch/seq.fa /users/eprakash/newlsgkm/lsgkm/mytests/talgata/runs/talgata_t3_l6_k5_d1_g2.0_c10/TALGATAtrained.model.txt /users/eprakash/scratch/output.txt
INFO 2018-10-07 22:53:24: Number of threads is set to 1
INFO 2018-10-07 22:53:24: test log
INFO 2018-10-07 22:53:24: load model /users/eprakash/newlsgkm/lsgkm/mytests/talgata/runs/talgata_t3_l6_k5_d1_g2.0_c10/TALGATAtrained.model.txt
INFO 2018-10-07 22:53:24: reading... 1000/4229
INFO 2018-10-07 22:53:24: reading... 2000/4229
INFO 2018-10-07 22:53:25: reading... 3000/4229
INFO 2018-10-07 22:53:25: reading... 4000/4229
INFO 2018-10-07 22:53:25: write prediction result to /users/eprakash/scratch/output.txt
INFO 2018-10-07 22:53:25: 100 scored
INFO 2018-10-07 22:53:25: 200 scored
Executing command: rm -rf /users/eprakash/scratch/seq.fa /user

In [None]:
import ssvmimp.viz_sequence
%matplotlib inline

# Plot the masked ISM scores held in ``masked_scores``, i.e. those of the last
# sequence processed by the ISM loop, not of the whole ``ism_scores_array``.
ssvmimp.viz_sequence.plot_weights(masked_scores)