In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import unicodedata
import string

# in general, we cannot have all the unicode characters encoded..

class TextDataset(Dataset):

    SPECIAL_CHARS = [None, "\n", "β"]
    def __len__(self):
        return len(self.total_examples)
        pass
     
    # Makes the vocab from the given dataset
    def make_vocab(self):
        
        self.char2index = {}
        self.index2char = {}
        index = 0
        with open(self.text_file) as file:
            for line in file:
                for char in line:
                    if char not in self.char2index:
                        self.char2index[char] = index
                        self.index2char[index]  = char
                        index+=1
        
        for char in self.SPECIAL_CHARS:
#             print("this is the char")
            if char not in self.char2index:
                self.char2index[char] = index
                self.index2char[index] = char
                index += 1
        
        print(self.char2index[None])
        self.vocab_size = len(self.char2index)
    
    # this allows the model to handle all possible ASCII (non-unicode) strings passed into it!!
    def make_full_vocab(self):
        self.char2index = {}
        self.index2char = {}
        index = 0
        all_letters = string.printable 
        for char in all_letters:
            if char not in self.char2index:
                self.char2index[char] = index
                self.index2char[index]  = char
                index+=1
                
        for char in self.SPECIAL_CHARS:
            if char not in self.char2index:
                self.char2index[char] = index
                self.index2char[index] = char
                index += 1
                
        self.vocab_size = len(self.char2index)   
        
    def generate_tensor_for_char(self, char):
        temp = torch.zeros(1, self.vocab_size)
        temp[0][self.char2index[char]] = 1 
        return temp

    def lineToTensor(self,line):
        tensor = torch.zeros(len(line), 1, self.vocab_size)
        for li, letter in enumerate(line):
            tensor[li][0][self.char2index[letter]] = 1
        return tensor
      
    def get_None_tensor(self):
        tensor = torch.zeros(1, 1, self.vocab_size)
        tensor[0][0][self.char2index[None]] = 1
        return tensor
    
    def list_to_tensor(self, input_list):
        tensor = torch.zeros(len(input_list), 1, self.vocab_size)
        for elt in input_list:
            tensor[li][0][self.char2index[letter]] = 1
    
    # Get the tensor representing a new line character        
    def get_new_line_tensor(self):
        tensor = torch.zeros(1, 1, self.vocab_size)
        tensor[0][0][self.char2index["\n"]] = 1
        return tensor

    def unicodeToAscii(self,s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
            and c in self.char2index
        )

    def __init__(self, text_file,  convert_to_ascii = True):
        self.text_file = text_file

        self.training_examples = []

        total_inputs = []
        total_outputs = []
        
#         make the vocab
        self.make_full_vocab()
        
        with open(text_file) as file:
            for raw_line in file:
                
                line = self.unicodeToAscii(raw_line) if convert_to_ascii else raw_line

                inputs = self.lineToTensor([None] + [x for x in line])
                
                targets = self.lineToTensor([x for x in line] + ["\n"])  # we need a 0 as well!

                total_inputs.append(inputs)
                total_outputs.append(targets)

        assert len(total_inputs) == len(total_outputs)
        
        self.total_examples = list(zip(total_inputs, total_outputs))
        
    def __getitem__(self, index):
        return self.total_examples[index]


In [2]:
# ok, so now we can start building the network that will process these!!
# simply: iterate the dataset, and run train on it, do the log likelihood and etc.

import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)
    


In [3]:




# returns the loss for a line, evaluating on a BASIC rnn, and using the provided optimizer
def prototype_general_train(input_tensor, target_tensor, basic_rnn, optimizer):
    
        
    
    optimizer.zero_grad()
    
    criterion = nn.NLLLoss()    
    hidden = basic_rnn.initHidden()
    loss = 0
    basic_rnn.zero_grad()
    for i in range(input_tensor.size()[0]):
        output, hidden = basic_rnn(input_tensor[i], hidden)
        
        max_tensor = torch.argmax(target_tensor[i])
        ind = (torch.argmax(target_tensor[i])).item()
        true_target_tensor = torch.tensor([ind], dtype=torch.long)
        #     Evaluate the loss on each character!
        loss += criterion(output, true_target_tensor)
        
    loss.backward()
    optimizer.step()

    return output, loss.item()




In [4]:


def prototype_general_train_function(dataset,     num_epochs = 10):
# del rnn
# rnn = RNN(abc.vocab_size, n_hidden, abc.vocab_size)

    epoch_loss = 0
    # loss every k iters
    # 
    loss_per_k = 0
    
    total_loss = 0
    total_length  = 0
    epoch_length = 0


    
    
    rnn = RNN(dataset.vocab_size, 512, dataset.vocab_size)
    # If you set this too high, it might explode. If too low, it might not learn
    #  we had to set it a big lower to force convergence
    import torch.optim as optim
    optimizer = optim.SGD(rnn.parameters(), lr = 0.0001)
    
    for k in range(num_epochs ):

        for i,(x,y) in enumerate(tqdm(dataset)):
        #     print(x)
        #     print(x)
        #     print(y)
            _, loss = prototype_general_train(x,y, rnn, optimizer)
            epoch_loss += loss
            epoch_length += x.size()[0]
            total_loss += loss
            total_length += x.size()[0]
            loss_per_k += loss
        #     print(i)
        #     abc = i
        
            # for the first 100 iters, print the loss of every line!
#             if i < 100:
#                 print ("loss for {} is {}".format(i,epoch_loss/(i+1)))
                
            
            if i % 2000 == 0 and i != 0:
                print ("loss for {} is {}".format(i,loss_per_k/2000))
                loss_per_k = 0
                

        print("epoch {} loss is {}".format(k, epoch_loss/i))
        print("per character loss is {}".format(epoch_loss/epoch_length))
        epoch_loss = 0
        epoch_length = 0
    return rnn

In [18]:
# resumes training of the model, on a given dataset
def resume_training(rnn , dataset, num_epochs=10):
    epoch_loss = 0
    loss_per_k = 0
    
    total_loss = 0
    total_length  = 0
    epoch_length = 0


    
    
    # If you set this too high, it might explode. If too low, it might not learn
    #  we had to set it a big lower to force convergence
    import torch.optim as optim
    optimizer = optim.SGD(rnn.parameters(), lr = 0.0001)
    
    for k in range(num_epochs ):

        for i,(x,y) in enumerate(tqdm(dataset)):
        #     print(x)
        #     print(x)
        #     print(y)
            _, loss = prototype_general_train(x,y, rnn, optimizer)
            epoch_loss += loss
            epoch_length += x.size()[0]
            total_loss += loss
            total_length += x.size()[0]
            loss_per_k += loss
        #     print(i)
        #     abc = i
        
            # for the first 100 iters, print the loss of every line!
#             if i < 100:
#                 print ("loss for {} is {}".format(i,epoch_loss/(i+1)))
                
            
            if i % 2000 == 0 and i != 0:
                print ("loss for {} is {}".format(i,loss_per_k/2000))
                loss_per_k = 0
                

        print("epoch {} loss is {}".format(k, epoch_loss/i))
        print("per character loss is {}".format(epoch_loss/epoch_length))
        epoch_loss = 0
        epoch_length = 0
    return rnn


In [5]:


from tqdm import tqdm
# make a new dataset
final_reuters_dataset = TextDataset( "reuters_news_10000.txt", True)
trained_model = prototype_general_train_function(final_reuters_dataset, num_epochs=1)


 20%|██        | 2011/10000 [00:57<02:29, 53.35it/s]

loss for 2000 is 3589.754926404953


 40%|████      | 4004/10000 [01:49<03:01, 33.10it/s]

loss for 4000 is 3054.1623755836486


 60%|██████    | 6006/10000 [02:43<01:42, 39.01it/s]

loss for 6000 is 3072.1863031959533


 80%|████████  | 8005/10000 [03:37<00:49, 40.05it/s]

loss for 8000 is 3070.4005854320526


100%|██████████| 10000/10000 [04:26<00:00, 37.48it/s]

epoch 0 loss is 156.09293754545018
per character loss is 3.317949253123293





In [19]:
trained_model = resume_training(trained_model, final_reuters_dataset,  num_epochs=2)

 20%|██        | 2015/10000 [00:55<02:18, 57.67it/s]

loss for 2000 is 153.51308339047432


 40%|████      | 4006/10000 [01:46<02:55, 34.07it/s]

loss for 4000 is 142.32079424238205


 60%|██████    | 6007/10000 [02:39<01:41, 39.31it/s]

loss for 6000 is 144.33724083042145


 80%|████████  | 8009/10000 [03:33<00:49, 40.39it/s]

loss for 8000 is 144.88906172156334


100%|██████████| 10000/10000 [04:22<00:00, 38.07it/s]
  0%|          | 4/10000 [00:00<05:00, 33.24it/s]

epoch 0 loss is 143.81103113868593
per character loss is 3.0568820784640414


 20%|██        | 2009/10000 [00:56<02:31, 52.66it/s]

loss for 2000 is 279.8917650306225


 40%|████      | 4004/10000 [01:48<02:58, 33.59it/s]

loss for 4000 is 134.923519081831


 60%|██████    | 6007/10000 [02:41<01:40, 39.73it/s]

loss for 6000 is 136.82395698404312


 80%|████████  | 8009/10000 [03:34<00:48, 40.77it/s]

loss for 8000 is 136.5963933136463


100%|██████████| 10000/10000 [04:24<00:00, 37.80it/s]

epoch 1 loss is 136.5758900195554
per character loss is 2.903090168016646





In [6]:
print("β-thal")
s = "β-thal"

β-thal


In [7]:
# ok, so now we just need to be able to save the model

filename = "spec_char_reuters.pt"
if True:
    torch.save(trained_model.state_dict(), filename)
    
    
eval_model = RNN(final_reuters_dataset.vocab_size, 512, final_reuters_dataset.vocab_size)
eval_model.load_state_dict(torch.load(filename))



In [8]:
def unicodeToAscii(s):
    return ''.join(
        
        c for c in unicodedata.normalize('NFD', s) # take all characters from this result, then check the oboolean conditions on em
        if unicodedata.category(c) != 'Mn' #mark, nons-spacing
        and c in string.printable
    )


# we want something a bit more brusque for example:
# we literally, want to replace all characters in the text with a β character




# we should make an UNK token
# the unk token will be something like a special tensor 
# now, 

In [9]:

# unicode to regular text, but marks the text with regular beta symbols
# ideally, we can write this in a vectorized way, as opposed to just using a for loop
#  i mean, we could write a lambda for it; or we could write a list comprehension
def unicodeToMarkedAscii(s, mark="β"):
#     return "".join([c for c in s if c in unicodedata.normalize('NFD', s) and c in string.printable 
#                     else mark])
    return "".join([c if c in  unicodedata.normalize('NFD', s) and c in string.printable  else mark for c in s])



In [10]:
unicodeToMarkedAscii("john")

'john'

In [11]:

my_eqn = """
p=Exp∑BiXi / (1+Exp∑BiXi), multivariate logistic regression predictive model that calculated the risk of postoperative morbidity was developed, p = 1/(1 + e((4.810-1.287X1-0.504X2-0.500X3-0.474X4-0.405X5-0.318X6-0.316X7-0.305X8-0.278X9-0.255X10-0.138X11))).
"""

In [12]:
unicodeToMarkedAscii(my_eqn )

'\np=ExpβBiXi / (1+ExpβBiXi), multivariate logistic regression predictive model that calculated the risk of postoperative morbidity was developed, p = 1/(1 + e((4.810-1.287X1-0.504X2-0.500X3-0.474X4-0.405X5-0.318X6-0.316X7-0.305X8-0.278X9-0.255X10-0.138X11))).\n'

In [13]:

# returns the loss for the entire line (requires normalization for comparison)
def get_loss_on_line(rnn, line, vocab):
    input_tensor =  vocab.lineToTensor([None] + [x for x in line])
    target_tensor = vocab.lineToTensor([x for x in line] + ["\n"])
    criterion = nn.NLLLoss()    
    hidden = rnn.initHidden()
    loss = 0
    rnn.zero_grad()
    
    
#     print(input_tensor.size())
#     print(target_tensor[0])
#     print(target_tensor)
    for i in range(input_tensor.size()[0]):
        output, hidden = rnn(input_tensor[i], hidden)
        
        
        # had to do this simply because of how the NLLL and CrossEntropy are defined
        max_tensor = torch.argmax(target_tensor[i])
        ind = (torch.argmax(target_tensor[i])).item()
        true_target_tensor = torch.tensor([ind], dtype=torch.long)
    
#     Evaluate the loss on each character!
        loss += criterion(output, true_target_tensor)
    
    return loss.item()



In [46]:
# ok, so this is equivalent to just stripping out the bad characters

# this function runs a test. For a given set of test examples, it will create a dataset, and 
# then run the model on each of them, and then return the most confusing sentences!!

THRESHOLD = 2 



def TestHarness(test_examples, eval_model, vocab): 
    
    sent_values = []
# 
    for line in tqdm(test_examples):

#         skip short sentences
        if len(line ) < 15:
            continue
        example_loss = get_loss_on_line(eval_model, unicodeToMarkedAscii(line), vocab)/len(line )
    #     print(example_loss)
    
        
        sent_values.append((line, example_loss))
        if (example_loss > THRESHOLD):
    #         print(line)
            pass
        pass

    # return the k highest lines
    most_confusing  = sorted(sent_values, key=lambda x: x[1], reverse=True)
    for (sent, val) in most_confusing[:100]:
        print("{} had loss of {}\n".format(sent, val))
    return most_confusing

    
    pass

In [47]:
import os
import json
with open(os.path.join("dataXplorer", "result.json")) as file:
    data  = json.load(file)
    
for key in data:
    
#     print(key, data[key])
    pass

In [48]:
from nltk.tokenize import sent_tokenize
training_examples = []

with open("testing_samples.txt", "w") as file:
    for key in data:
        lines = sent_tokenize(data[key])
        for line in lines:
            training_examples.append(line)
            file.write(line)
            file.write("\n")

# print(training_examples)

In [49]:
most_confusing = TestHarness(training_examples, eval_model, final_reuters_dataset)

100%|██████████| 1019/1019 [00:30<00:00, 32.88it/s]

Regression equation Y=-10.363+1.916X3+1.446X4-1.445X5+2.070X6+4.679X7+1.125X9+1.023X10+2.223X11. had loss of 4.696212450663249

Binary logistic regression analysis showed that hyperuricemia (OR = 4.62, p = 0.000), proteinuria (OR = 3.96, p = 0.000), urinary tumor (OR = 2.92, p = 0.015), anemia (OR = 2.45, p = 0.000), stroke (OR = 1.96, p = 0.000), hypertension (OR = 1.83, p = 0.006), renal cyst (OR = 1.64, p = 0.018), female (OR = 1.54, p = 0.015), coronary artery disease (OR = 1.53, p = 0.008) and age (OR = 1.05, p = 0.000) were the risk factors of reduced renal function. had loss of 4.45322541704199

CIN was subdivided into two groups: CIN<sub>25%</sub> (SCr increase >25% but ≤0.5mg/dl), and CIN<sub>0.5</sub> (SCr increase >0.5mg/dl). had loss of 4.366510279038373

The results of multiple logistic regression models showed that some OH-PAHs were positively associated with ASCVD risk but not CHD risk, including 2-hydroxyfluoren (β = 1.761; 95% CI: 1.194-2.597), 9-hydroxyfluoren (β = 1.




picks up on stats, and genetics (protein names) -- these are somewhat perplexing
we can do a filtering post processing step


In [45]:
for sent,loss in most_confusing:
    print(loss, sent, len(sent))

# print(most_confusing)

5.499807877974077 (2012) [6]. 11
4.696212450663249 Regression equation Y=-10.363+1.916X3+1.446X4-1.445X5+2.070X6+4.679X7+1.125X9+1.023X10+2.223X11. 96
4.45322541704199 Binary logistic regression analysis showed that hyperuricemia (OR = 4.62, p = 0.000), proteinuria (OR = 3.96, p = 0.000), urinary tumor (OR = 2.92, p = 0.015), anemia (OR = 2.45, p = 0.000), stroke (OR = 1.96, p = 0.000), hypertension (OR = 1.83, p = 0.006), renal cyst (OR = 1.64, p = 0.018), female (OR = 1.54, p = 0.015), coronary artery disease (OR = 1.53, p = 0.008) and age (OR = 1.05, p = 0.000) were the risk factors of reduced renal function. 451
4.366510279038373 CIN was subdivided into two groups: CIN<sub>25%</sub> (SCr increase >25% but ≤0.5mg/dl), and CIN<sub>0.5</sub> (SCr increase >0.5mg/dl). 136
4.274568274575455 The results of multiple logistic regression models showed that some OH-PAHs were positively associated with ASCVD risk but not CHD risk, including 2-hydroxyfluoren (β = 1.761; 95% CI: 1.194-2.597), 9