# Effect of Gaussian noise addition on the prediction performance of the Dual Attention Recurrent Neural Network

06/2021 Jonathan Fiorentino & Michele Monti

In [None]:
import torch
import numpy as np
from matplotlib import pyplot as plt
import collections
import typing
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as tf
import re
import logging
import os

import typing
from typing import Tuple
import json

from torch import optim
from sklearn.preprocessing import StandardScaler
import joblib

import pandas as pd

from My_allFunctions import*

import time

import sklearn.metrics

In [None]:
%matplotlib inline

# Prediction performance on the original data

In [None]:
def list_protein_trace_files(folders):
    """Collect the protein time-trace text files found in *folders*.

    A file is selected when its name contains 'Protein' and its extension is
    exactly '.txt'.  The companion CSV path is obtained by swapping only the
    extension, so a 'txt' appearing elsewhere in the name is left untouched.

    Args:
        folders: iterable of directory paths to scan (not recursive).

    Returns:
        Three parallel lists ``(txt_paths, csv_paths, parent_folders)`` with
        one entry per selected file.
    """
    txt_paths, csv_paths, parent_folders = [], [], []
    for sub in folders:
        print('----------- %s ------------' % (sub))
        for file in os.listdir(sub):
            stem, ext = os.path.splitext(file)
            # Match on the real extension: a plain substring test would also
            # accept names such as 'Protein_notes.txtbak'.
            if 'Protein' in file and ext == '.txt':
                print(file)
                parent_folders.append(sub)
                txt_paths.append(sub + '/' + file)
                csv_paths.append(sub + '/' + stem + '.csv')
    return txt_paths, csv_paths, parent_folders


# Get the names of the subfolders in DATA and the file with the protein time traces.
# os.walk yields the root folder first; [1:] keeps only the folders below it.
# NOTE(review): every folder below ./DATA/ is scanned, so '*Protein*.txt' result
# files written under ./DATA/ by other cells would also be listed on a re-run
# -- confirm this is intended.
subfolders=[x[0] for x in os.walk('./DATA/')][1:]

FileListtxt, FileListcsv, subList = list_protein_trace_files(subfolders)

In [None]:
# DA-RNN settings; this dictionary is not read anywhere in this cell.
da_rnn_kwargs = dict(batch_size=128, T=50)

# Run the trained networks on every protein trace file, without added noise.
for i, (txtFile, csvFile, sub) in enumerate(zip(FileListtxt, FileListcsv, subList)):

    a = time.time()

    # First column of the trace file; it becomes column 0 of both matrices below.
    timelist = np.loadtxt(txtFile)[:, 0]
    dt = timelist[3] - timelist[2]

    _, cols = txt_to_csv(txtFile, csvFile, ncols=[0])

    # Column 0 holds timelist, columns 1.. hold one series per entry of cols.
    n_rows, n_cols = len(timelist), len(cols) + 1
    origmat = np.zeros((n_rows, n_cols))
    predmat = np.zeros((n_rows, n_cols))
    origmat[:, 0] = timelist
    predmat[:, 0] = timelist

    for j, name in enumerate(cols):
        print(name)
        a = time.time()
        target = [name]
        raw_data = pd.read_csv(csvFile, nrows=5000, usecols=cols)

        # Data preprocessing
        prep, scaler = my_preprocess_data(raw_data, target)

        # Folder holding the trained network of this target:
        # <trace file without '.txt'>/<target name>/
        netFolder = txtFile.replace('.txt', '') + '/' + name + '/'

        # Rebuild the encoder from its saved constructor arguments and weights
        with open(netFolder + "enc_kwargs.json", "r") as fi:
            enc_kwargs = json.load(fi)
        encoder = Encoder(**enc_kwargs)
        encoder.load_state_dict(
            torch.load(netFolder + "encoder.torch", map_location=device)
        )

        # Same for the decoder; out_feats is forced to 1 before construction
        with open(netFolder + "dec_kwargs.json", "r") as fi:
            dec_kwargs = json.load(fi)
        dec_kwargs['out_feats'] = 1
        decoder = Decoder(**dec_kwargs)
        decoder.load_state_dict(
            torch.load(netFolder + "decoder.torch", map_location=device)
        )

        # Noise amplitude 0. -> prediction on the unperturbed history
        y_pred_orig, mse_orig, mape_orig = my_noisy_predict(
            encoder, decoder, prep, 50, 1500, 0.
        )
        predmat[:, j + 1] = y_pred_orig[:, 0]
        origmat[:, j + 1] = prep.targs[:, 0]

# Saving of the two matrices is currently disabled:
#     np.savetxt('pred_'+re.sub(r'^.*?Protein', 'Protein', txtFile),np.c_[predmat],fmt='%f',delimiter='\t')
#     np.savetxt('orig_'+re.sub(r'^.*?Protein', 'Protein', txtFile),np.c_[origmat],fmt='%f',delimiter='\t')

# Addition of Gaussian noise

In [None]:
def my_noisy_predict(enc,dec, prep: TrainData, T: int, TimeFuture: int,sig):
    """Predict the targets step by step, feeding back a noisy prediction history.

    The first ``T`` rows of the prediction are copied from ``prep.targs``.
    Each later row is produced by ``dec`` from the third output of ``enc``
    (called on the previous ``T - 1`` rows of ``prep.feats``) and from the
    previous ``T - 1`` rows of the prediction itself.  Zero-mean Gaussian
    noise with standard deviation ``sig`` is added to that history right
    before it is handed to the decoder; the stored prediction is not altered.

    Args:
        enc: encoder network; called as ``enc(tensor)`` and expected to
            return three values.
        dec: decoder network; called as ``dec(input_encoded, y_history)``.
        prep: preprocessed data exposing the arrays ``feats`` and ``targs``.
        T: length of the window; ``T - 1`` past rows are used per step.
        TimeFuture: number of rows to predict after the first ``T``; it is
            reduced when ``prep.targs`` has fewer than ``T + TimeFuture`` rows.
        sig: standard deviation of the Gaussian noise (0 disables it).

    Returns:
        ``(y_pred, mse, mape)`` where ``y_pred`` has shape
        ``(T + TimeFuture, n_targets)``, ``mse`` is the mean squared error
        and ``mape`` the mean absolute percentage error (averaged over the
        target columns), both against the matching rows of ``prep.targs``.

    Raises:
        ValueError: if ``prep.targs`` has fewer than ``T`` rows.
    """
    n_samples = prep.targs.shape[0]
    if n_samples < T:
        raise ValueError(
            "prep.targs has %d rows, at least T=%d are required" % (n_samples, T)
        )

    # Do not predict beyond the available data
    if TimeFuture + T > n_samples:
        TimeFuture = n_samples - T
    horizon = T + TimeFuture

    batch_size = 1

    out_size = prep.targs.shape[1]
    y_pred = np.zeros((horizon, out_size))
    # Seed the autoregression with the true values of the first window
    y_pred[:T] = prep.targs[:T]

    for y_i in range(T, horizon, batch_size):

        y_slc = slice(y_i, y_i + batch_size)
        batch_idx = range(horizon)[y_slc]
        b_len = len(batch_idx)

        X = np.zeros((b_len, T - 1, prep.feats.shape[1]))
        y_history = np.zeros((b_len, T - 1, out_size))

        for b_i, b_idx in enumerate(batch_idx):
            # The T - 1 rows that precede the row being predicted
            window = slice(b_idx - T, b_idx - 1)
            X[b_i] = prep.feats[window, :]
            y_history[b_i] = y_pred[window]

        # Perturb only the history given to the decoder
        y_history = y_history + np.random.normal(0, sig, np.shape(y_history))
        y_history = numpy_to_tvar(y_history)
        _, _, input_encoded = enc(numpy_to_tvar(X))

        y_pred[y_slc] = dec(input_encoded, y_history).cpu().data.numpy()

    # Score against the rows that were actually predicted: prep.targs may be
    # longer than the horizon, in which case the full arrays do not align.
    y_true = prep.targs[:horizon]
    mse = np.mean((y_true - y_pred)**2)
    # With 2-D inputs sklearn averages the per-column MAPE, so every target
    # column contributes (for a single column this is that column's MAPE).
    mape = sklearn.metrics.mean_absolute_percentage_error(y_true, y_pred)

    return y_pred,mse,mape

In [None]:
# DA-RNN settings; this dictionary is not read anywhere in this cell.
da_rnn_kwargs = {"batch_size": 128, "T": 50}

# Noise amplitudes tested for every target. Defined once so that the width of
# the error matrices and the loop over the amplitudes cannot get out of sync.
sigmas = np.arange(.1, 1., .1)

# Folder collecting the error matrices of all trace files
att_matt_dir = './DATA/Noise_analysis/'

mseTotMatrices = []
mapeTotMatrices = []

for i, (txtFile, csvFile, sub) in enumerate(zip(FileListtxt, FileListcsv, subList)):

    print(i, txtFile)

    # Not used further in this cell; kept as notebook globals.
    timelist = np.loadtxt(txtFile)[:, 0]
    dt = timelist[3] - timelist[2]

    _, cols = txt_to_csv(txtFile, csvFile, ncols=[0])

    # One row per target; column 0 is the noise-free run, then one per sigma
    mseMat = np.zeros((len(cols), 1 + len(sigmas)))
    mapeMat = np.zeros((len(cols), 1 + len(sigmas)))

    for j, name in enumerate(cols):
        a = time.time()
        target = [name]
        raw_data = pd.read_csv(csvFile, nrows=5000, usecols=cols)

        # Data preprocessing
        prep, scaler = my_preprocess_data(raw_data, target)

        # Load the trained network
        netFolder = txtFile.replace('.txt', '') + '/' + name + '/'

        with open(netFolder + "enc_kwargs.json", "r") as fi:
            enc_kwargs = json.load(fi)
        encoder = Encoder(**enc_kwargs)
        encoder.load_state_dict(torch.load(netFolder + "encoder.torch", map_location=device))

        with open(netFolder + "dec_kwargs.json", "r") as fi:
            dec_kwargs = json.load(fi)
        dec_kwargs['out_feats'] = 1
        decoder = Decoder(**dec_kwargs)
        decoder.load_state_dict(torch.load(netFolder + "decoder.torch", map_location=device))

        # Reference run without noise
        y_pred_orig, mse_orig, mape_orig = my_noisy_predict(encoder, decoder, prep, 50, 1500, 0.)

        mse_vec = [mse_orig]
        mape_vec = [mape_orig]

        # Add noise with increasing amplitude and predict the gene expression
        for sigma in sigmas:
            y_pred, mse, mape = my_noisy_predict(encoder, decoder, prep, 50, 1500, sigma)
            mse_vec.append(mse)
            mape_vec.append(mape)

        mseMat[j, :] = mse_vec
        mapeMat[j, :] = mape_vec

        # Time spent on this target, in seconds
        b = time.time()
        print(b - a)

    # Create the output folder on first use; no error if it already exists
    os.makedirs(att_matt_dir, exist_ok=True)

    # <subfolder with '/' replaced by '_'> + <trace path from 'Protein' onwards>
    out_tag = sub.replace('/', '_') + re.sub(r'^.*?Protein', 'Protein', txtFile)
    np.savetxt(att_matt_dir + 'mse_mat' + out_tag, mseMat, fmt='%f', delimiter='\t')
    np.savetxt(att_matt_dir + 'mape_mat' + out_tag, mapeMat, fmt='%f', delimiter='\t')

    mseTotMatrices.append(mseMat)
    mapeTotMatrices.append(mapeMat)