**MAKING PICTURES**

In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
%matplotlib inline

import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import numpy as np
from models.resnet import ResNet
from models.unet import UNet
from models.skip import skip
import torch
import torch.optim

from utils.inpainting_utils import *

def inpainting(container, cuda = False, iterations = 100):
    """Fit a skip network to the unmasked part of a one-hot sequence.

    The network is trained so that its output matches ``container.seq_np``
    wherever ``container.mask_np`` is non-zero; the loss ignores positions
    where the mask is zero.

    Parameters
    ----------
    container : object exposing ``seq_np`` and ``mask_np`` arrays
        ``seq_np.shape[0]`` is used as the number of output channels and
        ``seq_np.shape[1]`` as the spatial size of the noise input.
    cuda : bool
        When true, cuDNN is enabled and CUDA float tensors are used;
        otherwise cuDNN is disabled and CPU float tensors are used.
    iterations : int
        Number of optimisation steps passed to ``optimize``.

    Returns
    -------
    (out_np, loss)
        ``out_np`` is ``torch_to_np`` applied to the network output for the
        noise-free input. ``loss`` is whatever ``optimize`` returns
        (presumably the per-iteration loss history -- confirm in utils).

    Side effects
    ------------
    Sets the module-level ``description`` string (read later by the
    notebook) and resets/advances the module-level iteration counter ``i``.
    """
    global description, i

    seq_np = container.seq_np
    mask_np = container.mask_np

    use_cuda = bool(cuda)
    torch.backends.cudnn.enabled = use_cuda
    torch.backends.cudnn.benchmark = use_cuda
    dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

    # Hyper-parameters; the ones listed in `things` below end up in `description`.
    pad = 'reflection' # 'zero'
    OPT_OVER = 'net'
    OPTIMIZER = 'adam'
    NET_TYPE = 'skip_depth6'
    INPUT = 'noise'
    input_depth = 32
    LR = 0.01
    num_iter = iterations
    reg_noise_std = 0.03  # std of the noise added to the input at every step

    net = skip(input_depth, seq_np.shape[0], #change skip function in models/skip.py
               num_channels_down = [128] * 3,
               num_channels_up =   [128] * 3,
               num_channels_skip =    [128] * 3,
               filter_size_up = 3, filter_size_down = 3,
               upsample_mode='nearest', filter_skip_size=1,
               need_sigmoid=True, need_bias=True, pad=pad, act_fun='LeakyReLU').type(dtype)

    things = [NET_TYPE, pad, OPT_OVER, OPTIMIZER, INPUT, input_depth, LR, num_iter]
    description = "_".join([str(x) for x in things])

    net_input = get_noise(input_depth, INPUT, seq_np.shape[1]).type(dtype) #tensor

    n_params = sum(p.numel() for p in net.parameters())
    print ('Number of params: %d' % n_params)

    # Loss
    mse = torch.nn.MSELoss().type(dtype)

    img_var = np_to_torch(seq_np).type(dtype)
    mask_var = np_to_torch(mask_np).type(dtype)

    # Fixed copy of the input plus a scratch tensor that is re-sampled in place.
    net_input_saved = net_input.detach().clone()
    noise = net_input.detach().clone()

    # Start counting from zero on every call so the function does not depend
    # on a previously executed notebook cell having defined `i`.
    i = 0

    def closure():
        """Run one forward/backward pass and return the masked MSE loss."""
        global i

        net_input = net_input_saved
        if reg_noise_std > 0:
            net_input = net_input_saved + (noise.normal_() * reg_noise_std)
        out = net(net_input)
        total_loss = mse(out * mask_var, img_var * mask_var)
        total_loss.backward()
        print ('Iteration %05d    Loss %f' % (i, total_loss.item()), '\r', end='')

        i += 1

        return total_loss

    p = get_params(OPT_OVER, net, net_input) # list of tensors to optimize over !! in optimize
    loss = optimize(OPTIMIZER, p, closure, LR, num_iter) # optimize is in utils/common.utils

    # No gradients are needed for the final prediction.
    with torch.no_grad():
        out_np = torch_to_np(net(net_input))

    return out_np, loss

In [2]:
from PIL import Image
import PIL
import numpy as np
import IPython.display
import random 

from Bio import SeqIO
import math
from collections import Counter
import datetime
import time

# Input file locations. Relative paths are resolved against the notebook's working directory.
# `fasta_file` is not referenced by any of the visible cells.
fasta_file = "data/myco_genome.fasta"
# NOTE(review): absolute, machine-specific path; it is not referenced by the visible cells either.
local_genome = "/Users/pochtalionizm/Projects/neuro/data/GCF_000195955.2_ASM19595v2_genomic.gbff"
# GenBank file that Container.read_seq() reads by default.
remote_genome = "data/myco_genome.gbff"

In [3]:
class Container:
    """Genome fragment, its one-hot encoding, a random mask and the statistics
    of how well an inpainted prediction recovers the masked positions.

    Typical call order (as used in this notebook):
    ``read_seq`` -> ``generate_seq`` -> ``generate_mask`` -> ``inpaint`` ->
    ``baseline`` -> ``generate_out`` -> ``generate_counter`` -> ``print_results``.
    """

    def __init__(self):
        self.record = None #Seq??
        self.length = None #int
        self.seq = None #np.array of chars
        self.seq_np = None #np.array of [1, 0, 0, 0]
        self.out_seq = None #np.array of chars
        self.out_array = None #np.array of [1, 0, 0, 0]

        # Filled in by the later pipeline steps.
        self.mask_np = None      # (4, length) array, 0 where a position is hidden
        self.mask = None         # (length,) array, 1 where a position is hidden
        self.length_mask = None  # number of distinct hidden positions
        self.out_np = None       # raw network output
        self.loss = None         # second value returned by inpainting()
        self.coding = None       # (length,) array, 1 inside CDS features
        self.diff = None         # (length,) array, 1 where prediction != input
        self.title = None        # plot title built by print_results()

        self.bases_dict     = {"A": 0, "T": 1, "C": 2, "G": 3}
        self.bases_list = ["A", "T", "C", "G"]
        self.bases_np        = {"A": np.array([1, 0, 0, 0], dtype = np.float32),
                             "T": np.array([0, 1, 0, 0], dtype = np.float32),
                             "C": np.array([0, 0, 1, 0], dtype = np.float32),
                             "G": np.array([0, 0, 0, 1], dtype = np.float32)
                            }
        self.freqs = None # dict {'A':0.34, ...}
        self.counter = {}
        print("container created")

    def read_seq(self, genome_file = None, genome_file_type = "genbank"):
        """Load the first record of ``genome_file`` into ``record`` and ``seq``.

        ``genome_file=None`` (the default) uses the module-level
        ``remote_genome`` path, looked up when the method is called.
        """
        if genome_file is None:
            genome_file = remote_genome
        iterator = SeqIO.parse(genome_file, genome_file_type)
        self.record = next(iterator)
        self.seq = np.asarray(self.record.seq)

        print("read seq from file {}, length = {}".format(genome_file, len(self.record.seq)))

    def generate_seq(self, length = None):
        """One-hot encode the first ``length`` bases of ``seq`` into ``seq_np``.

        ``length=None`` encodes the whole record. Raises ``KeyError`` for a
        base that is not one of A/T/C/G.
        """
        if length is None:
            length = len(self.record.seq)
        seq_np = np.zeros((4, length), dtype = np.float32)
        for index in range(length):
            channel = self.bases_dict[self.seq[index]]
            seq_np[channel, index] = 1

        self.length = length
        self.seq_np = seq_np

        print("generated seq for analysis, length = {}".format(self.length))

    def generate_mask(self, seed=False):
        """Hide random single-base spots; draws ceil(10% of length) positions.

        Draws may repeat, so the number of distinct hidden positions
        (``length_mask``) can be smaller than the number of draws.
        ``seed=True`` seeds the ``random`` module with 7 first.
        """
        length = self.length
        spot = 1  # width of one hidden spot in bp; defined up front so it exists even with zero draws
        draws = math.ceil(self.length * 0.1)

        if seed:
            random.seed(7)
        mask_np = np.ones((4, length), dtype=np.float32)
        mask = np.zeros(length)

        for _ in range(draws):
            index = random.randint(0, length-spot)
            mask_np[:, index:index + spot] = 0
            mask[index:index + spot] = 1

        self.mask_np = mask_np
        self.length_mask = int(mask.sum()) # true mask length!!
        self.mask = mask

        print("generated mask with {} spots of {} bp, seed {}".format(self.length_mask, spot, seed))

    def inpaint(self, out_np = None):
        """Store a network prediction in ``out_np``.

        With no argument the module-level ``inpainting`` function is run on
        this container; it returns ``(out_np, loss)``, which is unpacked into
        ``self.out_np`` and ``self.loss``. Otherwise the given array is stored.
        """
        if out_np is None:
            self.out_np, self.loss = inpainting(self)
        else:
            self.out_np = out_np
            print("assigned inpainted array")

    def _get_freqs(self):
        """Compute the relative frequency of every symbol in the analysed fragment."""
        counter = Counter(self.seq[0:self.length])
        self.freqs = {letter : value / self.length for (letter, value) in counter.items()}

    def _baseline(self): #count mistakes under mask if using random predictor with frequences
        """Return the number of wrong guesses of one frequency-weighted random run."""
        # Bases that never occur in the fragment get weight 0 instead of raising KeyError.
        weights = [self.freqs.get(base, 0.0) for base in self.bases_list]
        mistakes = 0
        for position in range(self.length):
            if self.mask[position] == 1: #if its under mask
                letter = random.choices(self.bases_list, weights=weights)[0]
                if letter != self.seq[position]:
                    mistakes += 1
        return mistakes

    def baseline(self, n_runs = 100):
        """Store mean and standard deviation of ``n_runs`` random-predictor runs."""
        self._get_freqs()
        baselines = [self._baseline() for _ in range(n_runs)]
        self.counter["baseline_mean"] = np.mean(baselines)
        self.counter["baseline_sd"] = np.std(baselines)
        print("got baseline")

    def generate_out(self):
        """Decode ``out_np`` and derive ``out_seq``, ``out_array``, ``coding`` and ``diff``."""
        # Channel with the highest score at every position.
        channels = np.argmax(np.asarray(self.out_np)[:, :self.length], axis=0)

        self.out_seq = np.array(self.bases_list, dtype="U8")[channels]

        out_array = np.zeros((4,self.length)) #array analog to seq_np
        out_array[channels, np.arange(self.length)] = 1
        self.out_array = out_array

        self.coding = np.zeros(self.length)
        for f in self.record.features:
            if f.type == "CDS":
                # Location bounds are 0-based with an exclusive end, so they
                # can be used as a slice directly.
                self.coding[int(f.location.start): int(f.location.end)] = 1

        # 1 wherever the predicted one-hot column differs from the input column.
        self.diff = (self.out_array != self.seq_np).any(axis=0).astype(np.float64)

        print("generated out arrays")

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or NaN when the denominator is zero."""
        if denominator == 0:
            return float("nan")
        return numerator / denominator

    def generate_counter(self):
        """Fill ``counter`` with mistake counts and error rates.

        Rates whose denominator is zero are NaN. When ``baseline()`` has not
        been run the baseline rates are NaN as well, and when ``length`` is
        unset the length of ``diff`` is used instead.
        """
        c = self.counter
        diff = np.asarray(self.diff)
        mask = np.asarray(self.mask)
        coding = np.asarray(self.coding)
        total_length = self.length if self.length is not None else len(diff)
        masked_diff = diff * mask

        c["all_mist"] = diff.sum()
        c["mask_mist"] = masked_diff.sum()
        c["free_mist"] = c["all_mist"] - c["mask_mist"]

        c["coding_mask"] = (mask * coding).sum()
        c["noncoding_mask"] = mask.sum() - c["coding_mask"]

        # The stored mask length must agree with the mask array itself.
        if c["coding_mask"]+ c["noncoding_mask"] != self.length_mask:
            print("error in counter")

        c["coding_mask_mist"] = (masked_diff * coding).sum()
        c["noncoding_mask_mist"] = c["mask_mist"] - c["coding_mask_mist"]

        c["mask_part"] = self._ratio(c["mask_mist"], self.length_mask)
        c["coding_part"] = self._ratio(c["coding_mask_mist"], c["coding_mask"])
        c["noncoding_part"] = self._ratio(c["noncoding_mask_mist"], c["noncoding_mask"])

        if "baseline_mean" in c and "baseline_sd" in c:
            c["baseline_part"] = self._ratio(c["baseline_mean"], self.length_mask)
            c["baseline_part_sd"] = self._ratio(c["baseline_sd"], self.length_mask)
        else:
            print("baseline() was not run, baseline rates set to nan")
            c["baseline_part"] = float("nan")
            c["baseline_part_sd"] = float("nan")

        c["free_part"] = self._ratio(c["free_mist"], total_length - self.length_mask)

        print("generated counter")

    def print_results(self, description = "", results_path = "data/nnet_results.txt",
                      log_path = "data/nnet_log.txt"):
        """Print the error rates, append them to two text files and build ``title``.

        ``results_path`` receives a header and one result line per call;
        ``log_path`` additionally receives the full ``counter`` dict.
        """
        if description == "":
            print("no description!")

        interpretation = ("mask\tcoding\tnoncod\tfreqs\t+-\tfree")
        c = self.counter
        results = "{:.3}\t{:.3}\t{:.3}\t{:.3}\t{:.3}\t{:.3}".format(c["mask_part"],  c["coding_part"], c["noncoding_part"],
                                                             c["baseline_part"], c["baseline_part_sd"], c["free_part"])
        print(interpretation)
        print(results)

        # One timestamp for both files and the title, so they always agree.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        result_line = "{}\t{}\t{}\t{}\n".format(results, self.length, self.length_mask, description)

        with open(results_path, "a") as file:
            file.write("#{}\tlength\tmask\t{}\n".format(interpretation, timestamp))
            file.write(result_line)

        with open(log_path, "a") as file:
            file.write("#{}\n".format(timestamp))
            file.write("{}\tlength\tmask\n".format(interpretation))
            file.write(result_line)
            file.write("{}\n".format(self.counter))

        self.title = "{}\n{}\t{}\n{}".format(timestamp, self.length, self.length_mask, description)

In [4]:
# Build the working container: load the GenBank record, one-hot encode its
# first 100000 bases and hide a reproducible (seeded) set of positions.
container = Container()
container.read_seq(genome_file=remote_genome)
container.generate_seq(length=100000)
container.generate_mask(seed=True)

container created
read seq from file data/myco_genome.gbff, length = 4411532
generated seq for analysis, length = 100000
generated mask with 9532 spots of 1 bp, seed True


In [None]:
# Run the inpainting optimisation and report the wall-clock time it took.
start_time = time.time()

# Iteration counter that inpainting()'s closure prints and increments via `global i`.
i = 0
out_np, loss = inpainting(container, cuda=False, iterations = 500)

elapsed_time = time.time() - start_time

# Fixed-point format: a bare "{:.3}" means 3 significant digits and switches to
# scientific notation (e.g. "1.23e+02s") once the run takes 100 s or more.
print("\ntime: {:.3f}s".format(elapsed_time))

Number of params: 646788
Starting optimization with ADAM




Iteration 00029    Loss 0.165206 

In [None]:
# Post-process the prediction. The order matters: generate_counter() reads the
# baseline statistics stored by baseline() and the `diff`/`coding` arrays
# built by generate_out().
container.inpaint(out_np)  # store the already computed network output
container.baseline()  # frequency-weighted random predictor, 100 runs
container.generate_out()
container.generate_counter()


In [None]:
# `description` is the module-level string set by inpainting() through
# `global description`, so this cell only works after inpainting() has run.
container.print_results(description)

In [None]:
# Dump every raw count and rate collected by baseline() and generate_counter().
print(container.counter)

In [None]:
import matplotlib.pyplot as plt

# Plot the values returned by inpainting() as `loss` and save the figure.
# `container.title` is built by print_results(), so that cell has to run first.
fig, ax = plt.subplots()
ax.plot(loss)
ax.set_ylabel('loss')
ax.set_xlabel('iteration')
ax.set_title(container.title)
# The time part uses dashes because ":" is not allowed in Windows file names.
fig.savefig("data/loss_{}.png".format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))
plt.show()

In [None]:
# Scratch cell: re-read the first GenBank record to inspect it and print the
# length of its sequence.
from Bio import SeqIO
iterator = SeqIO.parse(remote_genome, "genbank")
record = next(iterator)
    
# Exploration snippets kept for reference (annotations, feature types, locations):
# print(record.annotations.keys())
# print(record.features[0].type)
# print(dir(record.features[5].location))
# print(record.features[5].location.start)
# for f in record.features:
#     print(f.type)
#     print(f.location.start)

print(len(record.seq))


In [None]:
# Scratch check of slice assignment: the stop index is exclusive, so only
# positions 1 and 2 are set.
x = np.zeros(10)
x[slice(1, 3)] = 1
print(x)

In [None]:
# Hand-made smoke test of generate_counter()/print_results() on a toy example
# with five positions. Note that print_results() appends to the result files.
a = Container()
a.diff   = np.array([1, 1, 1, 0, 0])
a.mask   = np.array([1, 1, 1, 1, 1])
a.coding = np.array([1, 0, 0, 0, 0])
# generate_counter() divides by (length - length_mask); every position is
# masked here, so free_part evaluates to nan.
a.length = len(a.diff)
a.length_mask = sum(a.mask)
# generate_counter() reads these two keys; baseline() cannot produce them here
# because no sequence is loaded into this container.
a.counter["baseline_mean"] = 0.0
a.counter["baseline_sd"] = 0.0
a.generate_counter()
a.print_results()

In [None]:
# Scratch check: `b = c` binds a second name to the same dict object, so
# updates made through either name are visible through both.
c = {'1': 'a'}
b = c
b.update({'2': 'b'})
c.update({'3': 'c'})
b.update({'4': 'd'})

print(c)