**MAKING PICTURES

In [1]:
from PIL import Image
import PIL
import numpy as np
import IPython.display
import random 

from Bio import SeqIO
import math
from collections import Counter
import datetime

fasta_file = "myco_genome.fasta"



In [143]:
class Container:
    def __init__(self):
        self.record = None #SeqRecord object
        self.length = None #int
        self.seq = None #np.array of chars
        self.seq_np = None #np.array of [1, 0, 0, 0]
        self.out_seq = None #np.array of chars
        self.out_array = None #np.array of [1, 0, 0, 0]
        
        
        self.bases_dict     = {"A": 0, "T": 1, "C": 2, "G": 3}
        self.bases_list = ["A", "T", "C", "G"]
        self.bases_np        = {"A": np.array([1, 0, 0, 0], dtype = np.float32),
                             "T": np.array([0, 1, 0, 0], dtype = np.float32),
                             "C": np.array([0, 0, 1, 0], dtype = np.float32),
                             "G": np.array([0, 0, 0, 1], dtype = np.float32)
                            }
        self.freqs = None # dict {'A':0.34, ...}
        self.counter =  np.zeros((2, 2, 2)) # (no mist/mist, no mask/mask, noncoding/cds)

        self.genome_file   = "/Users/pochtalionizm/Projects/neuro/data/GCF_000195955.2_ASM19595v2_genomic.gbff"
        self.genome_file_type = "genbank"
        
    def read_seq(self):
        iterator = SeqIO.parse(self.genome_file, self.genome_file_type)
        self.record = next(iterator)
        self.seq = np.asarray(self.record.seq) 
    
    def generate_seq(self):
        length = 2 ** 16
        seq_np = np.zeros((4, length), dtype = np.float32)
        for index in range(length):
            base = self.seq[index]
            channel = self.bases_dict[base]
            seq_np[channel][index] = 1
            
            
        self.length = length
        self.seq_np = seq_np
        
    def generate_mask(self, seed=False):
        length = self.length
        length_mask = math.ceil(self.length * 0.1)
        
        if seed:
            random.seed(7)
        mask_np = np.zeros((4, length), dtype=np.float32)
        mask = np.zeros(length)
        
        mask_np.fill(1)
        for n in range(length_mask):
            spot = 1
            index = random.randint(0, length-spot)
            for i in range(index, index+spot):
                mask_np[:, i] = [0,0,0,0]
                mask[i] = 1
                
        self.mask_np = mask_np
        self.length_mask = length_mask
        self.mask = mask

        
    def inpaint(self, out_np = None):
        if out_np is None:
            self.out_np = inpainting(self)
        else:
            self.out_np = out_np
            
    
    def get_freqs(self):
        counter = Counter(self.seq[0:self.length])
        self.freqs = {letter : value / self.length for (letter, value) in counter.items()}
    
    def _baseline(self):
        counter = 0
        for i in range(self.length):
            if self.mask_np[1, i] == 0: #its under mask
                w = [self.freqs[x] for x in self.bases_list]
                letter = random.choices(self.bases_list, weights=w)[0]
                if letter != self.seq[i]:
                        counter +=1
        return counter
    
    
    def baseline(self):
        baselines = []
        for i in range(100):
            baselines.append(self._baseline())
        mean = np.mean(baselines)
        sd = np.std(baselines)
#         self.counter["baseline_mean"] = mean
#         self.counter["baseline_sd"] = sd
        print("got baseline")

    def generate_out(self): 
        
        
        out_seq = np.zeros(self.length, dtype= "U8")
        for i in range(self.length):
            channel = np.argmax(self.out_np[:, i])
            out_seq[i] = self.bases_list[channel] 
            
        out_array = np.zeros((4,self.length)) #array analog to seq_np
        for i in range(self.length):
            n = np.argmax(self.out_np[:, i])
            out_array[n, i] = 1
            
        self.out_seq = out_seq
        self.out_array = out_array
        
        self.coding = np.zeros(self.length)
        for f in self.record.features:
            if f.type == "CDS":
                start = f.location.start
                end = f.location.end
                self.coding[start: end + 1] = 1 
                
        
        
        diff = np.zeros(self.length)
        for i in range(self.length):
            if not np.array_equal(self.out_array[:, i], self.seq_np[:, i]):
                diff[i] = 1
                

        self.diff = diff
        
    
    
    def print_results(self, description):
            print("mask\tfree\tbaseline")
            res = "{:f}\t{:f}\t{:f}\t± {:f}\t{:f}\t{:f}".format()
            print(res)

            file = open("nnet_results.txt", "+a")
            file.write("{}\t{}\t{}\n".format( datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), res, description))
            file.close()

In [144]:
container = Container()
container.read_seq()
container.generate_seq()
container.generate_mask(seed=True)

In [145]:
i = 0
out_np = inpainting(container)


Number of params: 646788
Starting optimization with ADAM
Iteration 00009    Loss 0.166331 

In [146]:
container.out_np = out_np

In [147]:
container.inpaint(out_np)
container.get_freqs()
container.baseline()
container.generate_out()

got baseline


In [148]:
print(sum(container.diff)/container.length)
print(sum(container.mask)/container.length)
print(sum(container.coding)/container.length)
print(sum(container.diff * container.mask))
print(sum(container.diff * container.mask * container.coding)/ sum(container.mask * container.coding))

0.6170806884765625
0.0952301025390625
0.8859405517578125
4264.0
0.6834298118668596


In [149]:
print(len(container.record.seq))

4411532


In [43]:
from __future__ import print_function
import matplotlib.pyplot as plt
%matplotlib inline

import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import numpy as np
from models.resnet import ResNet
from models.unet import UNet
from models.skip import skip
import torch
import torch.optim

from utils.inpainting_utils import *

def inpainting(container):
    
    seq_np = container.seq_np
    mask_np = container.mask_np

    # torch.backends.cudnn.enabled = True
    # torch.backends.cudnn.benchmark =True
    # dtype = torch.cuda.FloatTensor

    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    dtype = torch.FloatTensor

    PLOT = False
    imsize = -1
    dim_div_by = 64


    pad = 'reflection' # 'zero'
    OPT_OVER = 'net'
    OPTIMIZER = 'adam'
    NET_TYPE = 'skip_depth6'
    
    INPUT = 'noise'
    input_depth = 32
    LR = 0.01 
    num_iter = 10
    param_noise = False
    show_every = 5
    figsize = 5 #????
#     reg_noise_std = 0.03

    net = skip(input_depth, seq_np.shape[0], #change skip function in models/skip.py
               num_channels_down = [128] * 3,
               num_channels_up =   [128] * 3,
               num_channels_skip =    [128] * 3,  
               filter_size_up = 3, filter_size_down = 3, 
               upsample_mode='nearest', filter_skip_size=1,
               need_sigmoid=True, need_bias=True, pad=pad, act_fun='LeakyReLU').type(dtype)
    
    global description
    things = [NET_TYPE, pad, OPT_OVER, OPTIMIZER, INPUT, input_depth, LR, num_iter]
    description=" ".join([str(x) for x in things])
    
    net = net.type(dtype) 
    net_input = get_noise(input_depth, INPUT, seq_np.shape[1]).type(dtype) #tensor 

    
    s  = sum(np.prod(list(p.size())) for p in net.parameters())
    print ('Number of params: %d' % s)

    # Loss
    mse = torch.nn.MSELoss().type(dtype)

    # img_var = np_to_torch(img_np).type(dtype)
    # mask_var = np_to_torch(img_mask_np).type(dtype)

    img_var = np_to_torch(seq_np).type(dtype)
    mask_var = np_to_torch(mask_np).type(dtype)
    
    i = 0
    def closure():

        global i
    #     if param_noise:
    #         for n in [x for x in net.parameters() if len(x.size()) == 4]:
    #             n = n + n.detach().clone().normal_() * n.std() / 50

    #     net_input = net_input_saved
    #     if reg_noise_std > 0:
    #         net_input = net_input_saved + (noise.normal_() * reg_noise_std)
        out = net(net_input)
        total_loss = mse(out * mask_var, img_var * mask_var)
        total_loss.backward()
        print ('Iteration %05d    Loss %f' % (i, total_loss.item()), '\r', end='')

        i += 1

        return total_loss


    net_input_saved = net_input.detach().clone()
    noise = net_input.detach().clone()
    p = get_params(OPT_OVER, net, net_input) # list of tensors to optimize over !! in optimize
    optimize(OPTIMIZER, p, closure, LR, num_iter) # optimize is in utils/common.utils
    
    out_np = torch_to_np(net(net_input))
    
    return out_np

In [34]:
from Bio import SeqIO
iterator = SeqIO.parse("/Users/pochtalionizm/Projects/neuro/data/GCF_000195955.2_ASM19595v2_genomic.gbff", "genbank")
record = next(iterator)
    
# print(record.annotations.keys())
# print(record.features[0].type)
# print(dir(record.features[5].location))
# print(record.features[5].location.start)
for f in record.features:
    print(f.type)
    print(f.location.start)



source
0
gene
0
CDS
0
gene
2051
CDS
2051
gene
3279
CDS
3279
gene
4433
CDS
4433
gene
5239
CDS
5239
gene
7301
CDS
7301
gene
9913
CDS
9913
gene
10886
tRNA
10886
gene
11111
tRNA
11111
gene
11873
CDS
11873
gene
12467
CDS
12467
gene
13132
CDS
13132
gene
13713
CDS
13713
gene
14088
CDS
14088
gene
14913
CDS
14913
gene
15589
CDS
15589
gene
17466
CDS
17466
gene
18758
CDS
18758
gene
20230
CDS
20230
gene
21636
CDS
21636
repeat_region
23172
gene
23269
CDS
23269
gene
23860
CDS
23860
gene
25643
tRNA
25643
gene
25912
CDS
25912
gene
27022
CDS
27022
gene
27594
CDS
27594
gene
28361
CDS
28361
gene
29244
CDS
29244
gene
29721
CDS
29721
gene
31188
CDS
31188
gene
31513
CDS
31513
gene
32056
CDS
32056
gene
33223
CDS
33223
gene
34294
CDS
34294
gene
36606
CDS
36606
gene
36866
CDS
36866
gene
37258
CDS
37258
gene
39055
CDS
39055
gene
39876
CDS
39876
gene
41303
CDS
41303
gene
42003
CDS
42003
gene
42432
CDS
42432
gene
43561
CDS
43561
gene
46580
CDS
46580
gene
47365
CDS
47365
gene
48232
CDS
48232
gene
49042
CDS
49042
g

542141
CDS
542141
gene
543173
CDS
543173
gene
545374
CDS
545374
gene
545888
CDS
545888
gene
547075
CDS
547075
gene
547343
CDS
547343
repeat_region
547487
gene
547585
CDS
547585
gene
549674
CDS
549674
gene
551197
CDS
551197
gene
551748
CDS
551748
gene
552025
CDS
552025
gene
552613
CDS
552613
gene
554015
CDS
554015
gene
554312
CDS
554312
gene
554881
CDS
554881
gene
556457
CDS
556457
gene
557526
CDS
557526
gene
558894
CDS
558894
gene
559887
CDS
559887
gene
560847
CDS
560847
gene
561853
CDS
561853
gene
562224
CDS
562224
gene
562722
CDS
562722
gene
563563
CDS
563563
gene
565020
CDS
565020
gene
565796
CDS
565796
gene
566507
CDS
566507
gene
566775
CDS
566775
gene
567221
CDS
567221
gene
567920
CDS
567920
gene
568963
CDS
568963
gene
569987
CDS
569987
gene
570538
CDS
570538
gene
571709
CDS
571709
gene
573045
CDS
573045
gene
573983
CDS
573983
misc_feature
575032
gene
575347
CDS
575347
gene
576786
CDS
576786
gene
577663
CDS
577663
gene
578425
CDS
578425
gene
579348
CDS
579348
repeat_region
580577


repeat_region
1029344
gene
1029512
CDS
1029512
gene
1030577
CDS
1030577
gene
1031895
CDS
1031895
gene
1032709
CDS
1032709
gene
1033839
CDS
1033839
gene
1034902
CDS
1034902
gene
1036027
CDS
1036027
gene
1036998
CDS
1036998
gene
1037919
CDS
1037919
gene
1039935
CDS
1039935
gene
1041263
CDS
1041263
gene
1042114
CDS
1042114
gene
1043298
CDS
1043298
gene
1044316
CDS
1044316
gene
1045198
CDS
1045198
gene
1046135
CDS
1046135
gene
1048411
CDS
1048411
gene
1050592
CDS
1050592
gene
1051543
CDS
1051543
gene
1052359
CDS
1052359
gene
1052695
CDS
1052695
gene
1053764
CDS
1053764
gene
1054246
CDS
1054246
gene
1055023
CDS
1055023
gene
1057299
misc_feature
1057299
gene
1057645
CDS
1057645
gene
1058259
CDS
1058259
gene
1060655
CDS
1060655
gene
1061963
CDS
1061963
gene
1063139
CDS
1063139
gene
1064113
CDS
1064113
gene
1065126
CDS
1065126
gene
1066077
CDS
1066077
gene
1067560
CDS
1067560
gene
1068204
CDS
1068204
gene
1069882
CDS
1069882
gene
1071254
CDS
1071254
gene
1073326
CDS
1073326
gene
1073544
CDS
10

CDS
1485861
gene
1487160
CDS
1487160
gene
1488153
CDS
1488153
gene
1490116
CDS
1490116
gene
1492319
CDS
1492319
gene
1494563
CDS
1494563
gene
1497194
CDS
1497194
gene
1499212
CDS
1499212
gene
1500660
CDS
1500660
gene
1500925
CDS
1500925
gene
1501598
CDS
1501598
gene
1502640
CDS
1502640
gene
1503102
CDS
1503102
gene
1503393
CDS
1503393
gene
1504355
CDS
1504355
gene
1505074
CDS
1505074
gene
1505916
CDS
1505916
gene
1506754
CDS
1506754
repeat_region
1507530
gene
1507572
CDS
1507572
gene
1508183
CDS
1508183
gene
1508542
CDS
1508542
gene
1508967
CDS
1508967
gene
1509280
CDS
1509280
gene
1510845
CDS
1510845
gene
1511972
CDS
1511972
gene
1512727
tRNA
1512727
gene
1513046
CDS
1513046
gene
1515622
CDS
1515622
gene
1517490
CDS
1517490
gene
1518230
CDS
1518230
gene
1518762
CDS
1518762
gene
1519199
CDS
1519199
gene
1520004
CDS
1520004
gene
1521884
CDS
1521884
gene
1524028
CDS
1524028
gene
1525292
CDS
1525292
gene
1526611
CDS
1526611
gene
1530172
CDS
1530172
gene
1531347
CDS
1531347
gene
1532442
CD

1840571
CDS
1840571
gene
1842450
CDS
1842450
gene
1842897
CDS
1842897
gene
1843740
CDS
1843740
gene
1846715
CDS
1846715
gene
1846988
CDS
1846988
gene
1848516
CDS
1848516
gene
1852272
CDS
1852272
gene
1852927
CDS
1852927
gene
1853183
CDS
1853183
gene
1853605
CDS
1853605
gene
1854398
CDS
1854398
gene
1855763
CDS
1855763
gene
1856773
CDS
1856773
gene
1857730
CDS
1857730
gene
1858732
CDS
1858732
gene
1859757
CDS
1859757
gene
1862346
CDS
1862346
gene
1865575
CDS
1865575
gene
1866630
CDS
1866630
gene
1867841
CDS
1867841
gene
1868722
CDS
1868722
gene
1869921
CDS
1869921
gene
1870841
CDS
1870841
gene
1871362
CDS
1871362
gene
1872638
CDS
1872638
gene
1874159
CDS
1874159
gene
1875303
CDS
1875303
gene
1881703
CDS
1881703
gene
1886511
CDS
1886511
gene
1888025
CDS
1888025
gene
1891225
CDS
1891225
gene
1892269
CDS
1892269
gene
1893576
gene
1895724
CDS
1895724
gene
1896119
CDS
1896119
gene
1896474
CDS
1896474
gene
1896875
CDS
1896875
gene
1898299
CDS
1898299
gene
1899259
CDS
1899259
gene
1900240
CDS


CDS
2374460
gene
2376570
CDS
2376570
gene
2377147
CDS
2377147
gene
2377469
CDS
2377469
gene
2378385
CDS
2378385
gene
2379244
CDS
2379244
gene
2379805
CDS
2379805
gene
2380662
CDS
2380662
gene
2381070
CDS
2381070
gene
2382488
CDS
2382488
gene
2386292
CDS
2386292
gene
2387201
CDS
2387201
gene
2388615
CDS
2388615
gene
2390084
CDS
2390084
gene
2390307
CDS
2390307
gene
2391214
CDS
2391214
gene
2392516
CDS
2392516
gene
2393410
CDS
2393410
gene
2393850
CDS
2393850
gene
2394649
CDS
2394649
gene
2395300
CDS
2395300
gene
2396007
CDS
2396007
gene
2396901
CDS
2396901
gene
2397329
CDS
2397329
gene
2398719
CDS
2398719
gene
2399797
CDS
2399797
gene
2400375
CDS
2400375
gene
2401986
tRNA
2401986
gene
2402192
CDS
2402192
gene
2402506
CDS
2402506
gene
2402976
CDS
2402976
gene
2404164
CDS
2404164
gene
2404615
CDS
2404615
gene
2405665
CDS
2405665
gene
2406117
CDS
2406117
gene
2406839
CDS
2406839
gene
2407621
CDS
2407621
gene
2408384
CDS
2408384
gene
2409696
CDS
2409696
gene
2410637
CDS
2410637
gene
2412118

gene
2900225
CDS
2900225
gene
2900917
CDS
2900917
gene
2902508
CDS
2902508
gene
2903638
CDS
2903638
gene
2904820
CDS
2904820
gene
2906088
CDS
2906088
gene
2906813
CDS
2906813
gene
2907825
CDS
2907825
gene
2910228
CDS
2910228
gene
2911003
CDS
2911003
gene
2912682
CDS
2912682
gene
2914014
CDS
2914014
gene
2915845
CDS
2915845
gene
2916359
CDS
2916359
gene
2917870
CDS
2917870
gene
2921550
CDS
2921550
gene
2923198
CDS
2923198
gene
2924229
CDS
2924229
gene
2924816
CDS
2924816
gene
2925491
CDS
2925491
gene
2925733
CDS
2925733
gene
2926354
CDS
2926354
gene
2926985
CDS
2926985
gene
2927476
CDS
2927476
gene
2927989
CDS
2927989
gene
2928387
CDS
2928387
gene
2930069
CDS
2930069
gene
2930343
CDS
2930343
gene
2930804
CDS
2930804
gene
2931692
CDS
2931692
gene
2932296
CDS
2932296
gene
2933170
CDS
2933170
gene
2934197
CDS
2934197
gene
2935045
CDS
2935045
gene
2936809
CDS
2936809
gene
2937864
CDS
2937864
gene
2939011
CDS
2939011
gene
2939958
CDS
2939958
gene
2940608
CDS
2940608
gene
2941188
CDS
2941188


3218338
gene
3219273
CDS
3219273
gene
3219862
CDS
3219862
gene
3221766
CDS
3221766
gene
3223567
CDS
3223567
gene
3224707
CDS
3224707
gene
3226362
CDS
3226362
gene
3228253
CDS
3228253
gene
3230737
CDS
3230737
gene
3231072
CDS
3231072
gene
3232870
CDS
3232870
gene
3234188
CDS
3234188
gene
3237817
CDS
3237817
gene
3238085
CDS
3238085
gene
3238600
CDS
3238600
gene
3239828
CDS
3239828
gene
3240547
CDS
3240547
gene
3241221
CDS
3241221
gene
3242197
CDS
3242197
gene
3242969
CDS
3242969
gene
3243696
CDS
3243696
gene
3245444
CDS
3245444
gene
3251071
CDS
3251071
gene
3255684
CDS
3255684
gene
3262247
CDS
3262247
gene
3267736
CDS
3267736
gene
3272213
CDS
3272213
gene
3273205
CDS
3273205
gene
3274071
CDS
3274071
gene
3274948
CDS
3274948
gene
3276379
CDS
3276379
gene
3283334
CDS
3283334
gene
3285069
CDS
3285069
mobile_element
3288462
gene
3288463
CDS
3288463
gene
3289704
CDS
3289704
gene
3289789
CDS
3289789
gene
3290623
CDS
3290623
gene
3291502
CDS
3291502
gene
3296349
CDS
3296349
gene
3297836
CDS
32

3788620
gene
3790155
CDS
3790155
gene
3790847
CDS
3790847
gene
3792357
CDS
3792357
gene
3793256
CDS
3793256
mobile_element
3795057
repeat_region
3795057
misc_feature
3795099
repeat_region
3796384
gene
3796447
CDS
3796447
gene
3797436
CDS
3797436
gene
3799242
CDS
3799242
gene
3799634
CDS
3799634
mobile_element
3799986
repeat_region
3799986
gene
3800091
CDS
3800091
gene
3800785
CDS
3800785
repeat_region
3801529
gene
3801652
CDS
3801652
gene
3803918
CDS
3803918
gene
3804864
CDS
3804864
gene
3805620
CDS
3805620
gene
3807573
CDS
3807573
gene
3808460
CDS
3808460
gene
3809441
CDS
3809441
gene
3811021
CDS
3811021
gene
3811718
CDS
3811718
gene
3812500
CDS
3812500
gene
3814089
CDS
3814089
gene
3815026
CDS
3815026
gene
3816128
CDS
3816128
gene
3817238
CDS
3817238
gene
3818041
CDS
3818041
gene
3820652
CDS
3820652
gene
3822261
CDS
3822261
gene
3823879
CDS
3823879
gene
3824701
CDS
3824701
gene
3825329
CDS
3825329
gene
3826251
CDS
3826251
gene
3826547
CDS
3826547
gene
3826990
CDS
3826990
gene
3828782

gene
4325073
CDS
4325073
gene
4325494
CDS
4325494
gene
4326003
CDS
4326003
gene
4327548
CDS
4327548
gene
4328400
CDS
4328400
gene
4329416
CDS
4329416
gene
4330038
CDS
4330038
gene
4331497
CDS
4331497
gene
4336776
CDS
4336776
gene
4337945
CDS
4337945
gene
4338170
CDS
4338170
gene
4338848
CDS
4338848
gene
4340269
CDS
4340269
gene
4341565
CDS
4341565
gene
4341879
CDS
4341879
gene
4342769
CDS
4342769
gene
4343313
CDS
4343313
gene
4345038
CDS
4345038
gene
4346480
CDS
4346480
repeat_region
4348720
repeat_region
4348773
gene
4348826
CDS
4348826
gene
4350744
CDS
4350744
gene
4351074
CDS
4351074
gene
4352273
CDS
4352273
gene
4352608
CDS
4352608
gene
4353009
CDS
4353009
repeat_region
4353279
repeat_region
4353330
repeat_region
4353381
gene
4355006
CDS
4355006
gene
4356692
CDS
4356692
gene
4357592
CDS
4357592
gene
4360198
CDS
4360198
gene
4360542
CDS
4360542
gene
4362031
CDS
4362031
gene
4363416
CDS
4363416
gene
4364978
CDS
4364978
gene
4366907
CDS
4366907
gene
4368517
CDS
4368517
gene
4370154
CD

In [39]:
x = np.zeros(10)
x[1:3] = 1
print(x)

[0. 1. 1. 0. 0. 0. 0. 0. 0. 0.]


In [57]:
counter =  np.zeros((2, 2, 2))
counter[1, 0, 1] += 1
print(counter)

[[[0. 0.]
  [0. 0.]]

 [[0. 1.]
  [0. 0.]]]
