# Transformer
Upload sim_seq_1_train_sequences2.txt to the working directory before running the cell below.

In [None]:
# from https://discuss.pytorch.org/t/using-transformer-on-timeseries/104759
import numpy
import math
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from scipy import stats


class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal positional signal to a tensor, then apply dropout.

    The table is precomputed once for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` with shape ``(max_len, 1, d_model)``. Even
    feature columns hold sines and odd feature columns hold cosines.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        dropout: Dropout probability applied after the signal is added.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For an odd d_model there is one fewer cosine column than sine
        # columns, so trim the frequencies to match; for an even d_model the
        # slice keeps every frequency and the table is unchanged.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so that the table
        # broadcasts over dimension 1 of the input.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        Positions are taken along dimension 0 of ``x``; its last dimension
        must equal ``d_model`` and ``x.size(0)`` must not exceed ``max_len``
        for the addition to broadcast.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class SeqTransformer(nn.Module):
    """Transformer encoder that regresses one scalar from a token sequence.

    Pipeline: embedding -> positional encoding -> transformer encoder ->
    flatten -> linear layer with a single output.

    Args:
        _max_seq_length: Sequence length the final linear layer is sized for
            (its input width is ``_feature_size * _max_seq_length``).
        _feature_size: Embedding / model width.
        _num_layers: Number of stacked encoder layers.
        _num_heads: Number of attention heads per encoder layer.
        _vocab_size: Number of rows in the embedding table, i.e. one more
            than the largest token id. Defaults to ``_max_seq_length`` to
            keep the original behaviour, although the table only needs to
            cover the token ids actually used.
    """

    def __init__(self, _max_seq_length, _feature_size, _num_layers, _num_heads,
                 _vocab_size=None):
        super().__init__()
        if _vocab_size is None:
            # Historical default: the table was sized by sequence length.
            _vocab_size = _max_seq_length
        self.pos_encoder = PositionalEncoding(_feature_size)
        self.embedding = nn.Embedding(_vocab_size, _feature_size)
        # NOTE(review): nn.TransformerEncoder appears to clone this layer, so
        # `self.layers` itself is likely never used in forward() - confirm
        # before removing it, since it is part of the state dict.
        self.layers = nn.TransformerEncoderLayer(d_model=_feature_size, nhead=_num_heads)
        self.transformer = nn.TransformerEncoder(self.layers, num_layers=_num_layers)
        self.decoder = nn.Linear(_feature_size * _max_seq_length, 1)

    def forward(self, x):
        """Return a ``(1, 1)`` prediction for one sequence of token ids.

        After embedding, ``x`` must hold exactly
        ``_feature_size * _max_seq_length`` values in total, because the
        whole encoder output is flattened into a single row for the decoder.
        """
        x = self.embedding(x)
        x = self.pos_encoder(x)
        x = self.transformer(x)
        # reshape (rather than view) also accepts a non-contiguous tensor.
        x = x.reshape(1, -1)
        x = self.decoder(x)
        return x


# Fix the RNG seed so that weight initialisation and dropout are repeatable.
torch.manual_seed(1)

# --- data ---
MAX_ROWS = 100              # number of rows read from the training file
HALF_ROWS = MAX_ROWS // 2   # half of the rows that are read
max_seq_length = 111        # sequence length the model's decoder is sized for

# --- model ---
feature_size = 10           # embedding / model width
num_heads = 5               # attention heads per encoder layer
num_layers = 2              # stacked encoder layers

# --- optimisation ---
NUM_EPOCHS = 1000           # passes over the training split

# Tab-separated training file: column 0 is the nucleotide sequence and
# column 1 is the expression value.
# Previous local path:
# "/home/jgburk/PycharmProjects/IntroductionToDeepLearning/TF_Data/sim_seq_1_train_sequences.txt"
training_data_fn = "sim_seq_1_train_sequences2.txt"

# Pass an explicit encoding: without it, dtype=None triggers NumPy's
# "reading unicode strings without specifying the encoding" deprecation
# warning, and whether the sequence column arrives as bytes or as str depends
# on the NumPy release. With encoding set, it is always str.
training_data = numpy.genfromtxt(training_data_fn, delimiter="\t", dtype=None,
                                 encoding="utf-8", skip_header=0, max_rows=MAX_ROWS)
training_sequences = list()
training_expression = list()
# Nucleotide code point -> token id. Id 0 is left free because pad_sequence
# below pads with zeros.
mapper = {ord('A'): 1,
          ord('T'): 2,
          ord('C'): 3,
          ord('G'): 4,
          ord('N'): 5}
for row in training_data:
    # One (seq_len, 1) int tensor per sequence; a character missing from
    # `mapper` raises KeyError.
    token_ids = [mapper[ord(base)] for base in row[0]]
    ts = torch.IntTensor(token_ids).unsqueeze(1)
    training_sequences.append(ts)
    training_expression.append(torch.FloatTensor([row[1]]))

# working through https://stackoverflow.com/questions/56783182/runtimeerror-the-size-of-tensor-a-133-must-match-the-size-of-tensor-b-10-at
# Zero-pad every sequence to the longest one read: (rows, longest_len, 1).
# NOTE(review): the model's decoder is sized for max_seq_length, so the
# longest sequence read must have exactly that length - confirm for new data.
t_s = torch.nn.utils.rnn.pad_sequence(training_sequences, batch_first=True)
t_e = torch.cat(training_expression)

model = SeqTransformer(max_seq_length, feature_size, num_layers, num_heads)

# Interleaved split: even-indexed rows train, odd-indexed rows test.
train_inputs = t_s[0:MAX_ROWS:2, :, :]
test_inputs = t_s[1:MAX_ROWS:2, :, :]
train_targets = t_e[0:MAX_ROWS:2]
test_targets = t_e[1:MAX_ROWS:2]
#print(f'inputs.shape: {train_inputs.shape}')

loss_fn = nn.L1Loss()
optimizer = torch.optim.AdamW(model.parameters())

# Make training mode (dropout active) explicit rather than relying on the
# module default.
model.train()
for epoch in range(NUM_EPOCHS):
    loss_sum = 0.0
    # Gradients are zeroed once per epoch and accumulated over every training
    # sequence, so each optimizer step uses the summed gradient of the split.
    optimizer.zero_grad()
    for idx, _input in enumerate(train_inputs):
        score = model(_input)
        loss = loss_fn(score.view(1, -1), train_targets[idx].view(1, -1))
        loss.backward()
        # Accumulate a detached value so the running total does not drag the
        # autograd graph of every sample along; L1 loss is already
        # non-negative, so no abs() is needed.
        loss_sum = loss_sum + loss.detach()
    optimizer.step()
    print(f'Epoch: {epoch}, Loss: {loss_sum / len(train_inputs)}')

# Score the held-out sequences, collecting predictions, targets and absolute
# errors for the statistics and plot that follow.
pred_l = list()
actual_l = list()
loss_sq_l = list()  # holds absolute errors (not squares), despite the name

# Switch to inference mode: eval() disables the dropout layers so predictions
# are deterministic, and no_grad() skips autograd bookkeeping. The model stays
# in eval mode afterwards; call model.train() before any further training.
model.eval()
with torch.no_grad():
    for idx, test_input in enumerate(test_inputs):
        pred = model(test_input)
        pred = pred.item()
        actual = test_targets[idx]
        actual = actual.item()
        # Signed error: positive means the model over-predicted.
        loss = pred - actual
        print(f'loss[{idx}]: {loss}')
        pred_l.append(pred)
        actual_l.append(actual)
        loss_sq_l.append(abs(loss))

# Correlation between predicted and actual expression on the test split.
psn = stats.pearsonr(x=pred_l, y=actual_l)
spn = stats.spearmanr(a=pred_l, b=actual_l)

# Use one shared range, padded by 1, for both axes so the plot is square.
_all_values = pred_l + actual_l
_axis_limits = [min(_all_values) - 1, max(_all_values) + 1]

# Averages of the two correlation coefficients and of their p-values.
_mean_corr = (psn[0] + spn.correlation) / 2.0
_mean_pvalue = (psn[1] + spn.pvalue) / 2.0

# Scatter of predicted vs. actual, coloured by absolute error.
plt.scatter(x=pred_l, y=actual_l, c=loss_sq_l)
plt.xlim(_axis_limits)
plt.ylim(_axis_limits)
plt.xlabel("Predicted Expression")
plt.ylabel("Actual Expression")
plt.suptitle(f'(Pearson + Spearman)/2 = {round(_mean_corr, ndigits=3)}')
plt.title(f'mean(P-Values) = {round(_mean_pvalue, ndigits=3)}')
plt.show()

# Largest absolute error, followed by the individual statistics.
print(max(loss_sq_l))
print(f'Pearson r={round(psn[0], ndigits=3)}, P-Value={round(psn[1], ndigits=3)}')
print(f'Spearman correlation={round(spn.correlation, ndigits=3)}, P-Value={round(spn.pvalue, ndigits=3)}')

  training_data = numpy.genfromtxt(training_data_fn, delimiter="\t", dtype=None,


Epoch: 0, Loss: 10.697022438049316
Epoch: 1, Loss: 9.228446006774902
Epoch: 2, Loss: 8.176187515258789
Epoch: 3, Loss: 7.170403480529785
Epoch: 4, Loss: 6.189487934112549
Epoch: 5, Loss: 5.233256816864014
Epoch: 6, Loss: 4.326925277709961
Epoch: 7, Loss: 3.3854193687438965
Epoch: 8, Loss: 2.5347325801849365
Epoch: 9, Loss: 1.972529411315918
Epoch: 10, Loss: 1.6806044578552246
Epoch: 11, Loss: 1.7128338813781738
Epoch: 12, Loss: 1.9110195636749268
Epoch: 13, Loss: 2.1989264488220215
Epoch: 14, Loss: 2.274672269821167
Epoch: 15, Loss: 2.08693790435791
Epoch: 16, Loss: 1.608951210975647
Epoch: 17, Loss: 1.9913123846054077
Epoch: 18, Loss: 1.960890531539917
Epoch: 19, Loss: 1.6551308631896973
Epoch: 20, Loss: 1.6070233583450317
Epoch: 21, Loss: 1.7863589525222778
Epoch: 22, Loss: 1.827773094177246
Epoch: 23, Loss: 1.7122563123703003
Epoch: 24, Loss: 1.6502354145050049
Epoch: 25, Loss: 1.6185617446899414
Epoch: 26, Loss: 1.68135666847229
Epoch: 27, Loss: 1.6948930025100708
Epoch: 28, Loss: 