In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
import unicodedata
import string

# in general, we cannot have all the unicode characters encoded..

class TextDataset(Dataset):
    """Character-level language-modelling dataset built from one text file.

    Each line of the file becomes one example ``(inputs, targets)``. Both are
    one-hot tensors of shape ``(sequence_length, 1, vocab_size)``: ``inputs``
    is the line prefixed with the ``None`` start marker and ``targets`` is the
    line followed by a newline symbol, so ``targets[i]`` is the symbol that
    follows ``inputs[i]``.

    Attributes set by the constructor:
        text_file: path the examples were read from.
        encoding: encoding passed to ``open`` (``None`` = platform default).
        char2index / index2char: symbol <-> one-hot position mappings.
        vocab_size: number of symbols in the vocabulary.
        total_examples: list of ``(inputs, targets)`` tensor pairs.
    """

    # Symbols that are always part of the vocabulary: None is the start-of-line
    # marker fed as the first input, "\n" is the end-of-line target, and "β" is
    # the placeholder this notebook substitutes for non-ASCII characters.
    SPECIAL_CHARS = [None, "\n", "β"]

    def __init__(self, text_file, convert_to_ascii=True, encoding=None):
        """Read ``text_file`` and one-hot encode every line.

        Args:
            text_file: path of the text file, one example per line.
            convert_to_ascii: when true, each line is passed through
                ``unicodeToAscii`` first. When false, a character missing from
                the vocabulary makes ``lineToTensor`` raise ``KeyError``.
            encoding: forwarded to ``open``; ``None`` keeps the platform
                default, which is what the original code used.
        """
        self.text_file = text_file
        self.encoding = encoding

        # Kept for backward compatibility; nothing in this class fills it.
        self.training_examples = []

        self.make_full_vocab()

        total_inputs = []
        total_outputs = []
        with open(text_file, encoding=encoding) as file:
            for raw_line in file:
                # NOTE(review): lines read from a file keep their trailing
                # newline, so the target sequence normally ends with two
                # newline symbols -- confirm this is intended.
                line = self.unicodeToAscii(raw_line) if convert_to_ascii else raw_line
                chars = list(line)
                total_inputs.append(self.lineToTensor([None] + chars))
                total_outputs.append(self.lineToTensor(chars + ["\n"]))

        self.total_examples = list(zip(total_inputs, total_outputs))

    def __len__(self):
        """Number of lines (examples) in the dataset."""
        return len(self.total_examples)

    def __getitem__(self, index):
        """Return the ``(inputs, targets)`` tensor pair for line ``index``."""
        return self.total_examples[index]

    def _build_vocab(self, chars):
        """Reset the vocabulary to ``chars`` (first-seen order) plus SPECIAL_CHARS."""
        self.char2index = {}
        self.index2char = {}
        for source in (chars, self.SPECIAL_CHARS):
            for symbol in source:
                if symbol not in self.char2index:
                    index = len(self.char2index)
                    self.char2index[symbol] = index
                    self.index2char[index] = symbol
        self.vocab_size = len(self.char2index)

    def make_vocab(self):
        """Build the vocabulary from the characters that occur in ``self.text_file``."""
        # getattr: make_vocab may be called on an instance whose __init__ did
        # not run (or predates the ``encoding`` attribute).
        with open(self.text_file, encoding=getattr(self, "encoding", None)) as file:
            self._build_vocab(char for line in file for char in line)

    def make_full_vocab(self):
        """Build the vocabulary from ``string.printable`` so any printable-ASCII text is encodable."""
        self._build_vocab(string.printable)

    def generate_tensor_for_char(self, char):
        """Return the one-hot row vector of shape ``(1, vocab_size)`` for ``char``."""
        temp = torch.zeros(1, self.vocab_size)
        temp[0][self.char2index[char]] = 1
        return temp

    def lineToTensor(self, line):
        """One-hot encode a sequence of symbols.

        Args:
            line: string or list of symbols (characters or ``None``).

        Returns:
            Tensor of shape ``(len(line), 1, vocab_size)``.

        Raises:
            KeyError: if a symbol is not in the vocabulary.
        """
        tensor = torch.zeros(len(line), 1, self.vocab_size)
        for position, symbol in enumerate(line):
            tensor[position][0][self.char2index[symbol]] = 1
        return tensor

    def get_None_tensor(self):
        """Return the ``(1, 1, vocab_size)`` one-hot tensor of the ``None`` start marker."""
        return self.lineToTensor([None])

    def list_to_tensor(self, input_list):
        """One-hot encode a list of symbols; same result as ``lineToTensor``.

        The previous body referenced undefined loop variables and returned
        nothing, so it could never be called successfully.
        """
        return self.lineToTensor(input_list)

    def get_new_line_tensor(self):
        """Return the ``(1, 1, vocab_size)`` one-hot tensor of the newline symbol."""
        return self.lineToTensor(["\n"])

    def unicodeToAscii(self, s):
        """Strip accents and drop every character that is not in the vocabulary.

        The string is NFD-normalised so accented letters split into a base
        letter plus combining marks; the marks (category ``Mn``) and any
        character missing from ``char2index`` are removed.
        """
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
            and c in self.char2index
        )


In [14]:
# ok, so now we can start building the network that will process these!!
# simply: iterate the dataset, and run train on it, do the log likelihood and etc.

import torch.nn as nn

class RNN(nn.Module):
    """Minimal recurrent cell that processes one time step per call.

    The input vector and the previous hidden state are concatenated; one
    linear layer maps the result to the next hidden state and another maps it
    to the output, which is passed through a log-softmax. No non-linearity is
    applied to the hidden state.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers over the concatenated [input, hidden] vector."""
        super().__init__()
        self.hidden_size = hidden_size

        joint_size = input_size + hidden_size
        self.i2h = nn.Linear(joint_size, hidden_size)
        self.i2o = nn.Linear(joint_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step.

        Args:
            input: tensor concatenated with ``hidden`` along dim 1.
            hidden: previous hidden state.

        Returns:
            ``(log_probabilities, next_hidden)``.
        """
        joint = torch.cat([input, hidden], dim=1)
        next_hidden = self.i2h(joint)
        log_probs = self.softmax(self.i2o(joint))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros((1, self.hidden_size))
    


In [15]:




# returns the loss for a line, evaluating on a BASIC rnn, and using the provided optimizer
def prototype_general_train(input_tensor, target_tensor, basic_rnn, optimizer):
    """Run one optimisation step on a single sequence.

    The model is stepped through ``input_tensor`` one time step at a time, the
    negative log-likelihood of each target symbol is summed over the sequence,
    and one backward pass plus one optimizer step are performed.

    Args:
        input_tensor: one-hot inputs; ``input_tensor[i]`` is fed to the model
            at step ``i``.
        target_tensor: one-hot targets; the position of the maximum of
            ``target_tensor[i]`` is the class index expected at step ``i``.
        basic_rnn: module called as ``basic_rnn(step_input, hidden)`` returning
            ``(log_probabilities, hidden)`` and providing ``initHidden()``.
        optimizer: optimizer that updates the model's parameters.

    Returns:
        ``(output, loss)``: the model output of the last step and the summed
        (not averaged) loss of the sequence as a Python float.

    Raises:
        ValueError: if the sequence has no time steps. Previously this failed
            later with an AttributeError on the integer ``0``.
    """
    num_steps = input_tensor.size()[0]
    if num_steps == 0:
        raise ValueError("prototype_general_train needs at least one time step")

    criterion = nn.NLLLoss()
    optimizer.zero_grad()
    basic_rnn.zero_grad()

    hidden = basic_rnn.initHidden()
    loss = 0
    for i in range(num_steps):
        output, hidden = basic_rnn(input_tensor[i], hidden)

        # NLLLoss expects class indices, not one-hot rows. argmax over the
        # flattened step gives the index; reshape(1) makes it a batch of one
        # without the Python round trip through .item().
        target_index = target_tensor[i].argmax().reshape(1)
        loss += criterion(output, target_index)

    loss.backward()
    optimizer.step()

    return output, loss.item()




In [16]:


def prototype_general_train_function(dataset, num_epochs=10, hidden_size=512,
                                     lr=0.0001, log_every=100):
    """Train a fresh RNN on ``dataset`` with plain SGD and return it.

    Args:
        dataset: iterable of ``(inputs, targets)`` tensor pairs that also
            exposes ``vocab_size`` (used as both input and output size).
        num_epochs: number of passes over the dataset.
        hidden_size: size of the RNN hidden state.
        lr: SGD learning rate. Too high and the loss may explode, too low and
            the model may not learn; it was lowered to force convergence.
        log_every: print the mean per-line loss every ``log_every`` lines.

    Returns:
        The trained ``RNN``. The previous version returned nothing, so the
        trained model was unreachable after the call.
    """
    import torch.optim as optim
    # Imported here so the function does not depend on a later cell having run.
    from tqdm import tqdm

    rnn = RNN(dataset.vocab_size, hidden_size, dataset.vocab_size)
    optimizer = optim.SGD(rnn.parameters(), lr=lr)

    for k in range(num_epochs):
        epoch_loss = 0.0
        epoch_length = 0   # time steps (characters) seen this epoch
        epoch_lines = 0    # lines seen this epoch
        window_loss = 0.0
        window_lines = 0

        for i, (x, y) in enumerate(tqdm(dataset)):
            _, loss = prototype_general_train(x, y, rnn, optimizer)
            epoch_loss += loss
            epoch_length += x.size()[0]
            epoch_lines += 1
            window_loss += loss
            window_lines += 1

            if i % log_every == 0 and i != 0:
                # Divide by the lines actually accumulated: the first window
                # holds log_every + 1 lines, later ones log_every.
                print ("loss for {} is {}".format(i, window_loss / window_lines))
                window_loss = 0.0
                window_lines = 0

        # Average over the number of lines, not the last index (which was off
        # by one and undefined or zero for empty or single-line datasets).
        if epoch_lines:
            print("epoch {} loss is {}".format(k, epoch_loss / epoch_lines))
        if epoch_length:
            print("per character loss is {}".format(epoch_loss / epoch_length))

    return rnn

In [13]:

from tqdm import tqdm
# Build the dataset from the Reuters text file (lines normalised to ASCII),
# then run the training loop on it.
final_reuters_dataset = TextDataset("reuters_news_10000.txt", convert_to_ascii=True)
prototype_general_train_function(final_reuters_dataset)


  1%|▊                                                                              | 97/10000 [00:13<26:27,  6.24it/s]

loss for 100 is 252.1782168006897


  2%|█▌                                                                            | 200/10000 [00:26<28:36,  5.71it/s]

loss for 200 is 234.55296186447143


  3%|██▎                                                                           | 300/10000 [00:38<25:40,  6.30it/s]

loss for 300 is 233.0284285736084


  4%|███                                                                           | 399/10000 [00:49<19:48,  8.08it/s]

loss for 400 is 200.36026893615724


  5%|███▉                                                                          | 498/10000 [01:02<20:44,  7.64it/s]

loss for 500 is 184.7065175151825


  6%|████▋                                                                         | 600/10000 [01:15<21:41,  7.22it/s]

loss for 600 is 181.0620216369629


  7%|█████▍                                                                        | 700/10000 [01:27<34:37,  4.48it/s]

loss for 700 is 177.24454802513122


  8%|██████▏                                                                       | 800/10000 [01:40<36:31,  4.20it/s]

loss for 800 is 166.47085697174072


  9%|███████                                                                       | 900/10000 [01:57<24:35,  6.17it/s]

loss for 900 is 188.60617389678956


 10%|███████▋                                                                     | 1000/10000 [02:09<19:06,  7.85it/s]

loss for 1000 is 178.03588829040527


 11%|████████▍                                                                    | 1100/10000 [02:21<17:05,  8.68it/s]

loss for 1100 is 173.3355166721344


 12%|█████████▏                                                                   | 1199/10000 [02:35<24:06,  6.08it/s]

loss for 1200 is 167.58311207771303


 13%|██████████                                                                   | 1300/10000 [02:52<26:32,  5.46it/s]

loss for 1300 is 180.1634326171875


 14%|██████████▊                                                                  | 1400/10000 [03:06<26:06,  5.49it/s]

loss for 1400 is 174.45529370307924


 15%|███████████▌                                                                 | 1500/10000 [03:19<14:26,  9.81it/s]

loss for 1500 is 163.91702126502992


 16%|████████████▎                                                                | 1600/10000 [03:30<12:26, 11.26it/s]

loss for 1600 is 151.73650609970093


 17%|█████████████                                                                | 1699/10000 [03:37<13:15, 10.43it/s]

loss for 1700 is 122.68408715248108


 18%|█████████████▊                                                               | 1800/10000 [03:51<14:07,  9.68it/s]

loss for 1800 is 157.22572574615478


 19%|██████████████▋                                                              | 1900/10000 [04:03<20:16,  6.66it/s]

loss for 1900 is 165.55824013710023


 20%|███████████████▍                                                             | 2000/10000 [04:16<24:42,  5.40it/s]

loss for 2000 is 134.03352548599244


 21%|████████████████▏                                                            | 2100/10000 [04:31<10:47, 12.20it/s]

loss for 2100 is 149.3590999507904


 22%|████████████████▉                                                            | 2200/10000 [04:41<16:01,  8.11it/s]

loss for 2200 is 144.72556387901307


 23%|█████████████████▋                                                           | 2299/10000 [04:53<17:50,  7.20it/s]

loss for 2300 is 170.33316884994508


 24%|██████████████████▍                                                          | 2400/10000 [05:04<13:23,  9.45it/s]

loss for 2400 is 151.1021633052826


 25%|███████████████████▎                                                         | 2500/10000 [05:17<17:45,  7.04it/s]

loss for 2500 is 176.435002450943


 26%|████████████████████                                                         | 2599/10000 [05:25<11:00, 11.21it/s]

loss for 2600 is 122.81885112762451


 27%|████████████████████▊                                                        | 2700/10000 [05:37<16:03,  7.57it/s]

loss for 2700 is 159.62401253700256


 28%|█████████████████████▌                                                       | 2800/10000 [05:50<14:19,  8.37it/s]

loss for 2800 is 165.28930086135864


 29%|██████████████████████▎                                                      | 2899/10000 [06:02<14:33,  8.13it/s]

loss for 2900 is 153.54644283294678


 30%|███████████████████████                                                      | 3000/10000 [06:17<18:09,  6.42it/s]

loss for 3000 is 146.20919262886048


 31%|███████████████████████▊                                                     | 3099/10000 [06:28<15:36,  7.37it/s]

loss for 3100 is 127.660831489563


 32%|████████████████████████▋                                                    | 3200/10000 [06:40<15:00,  7.56it/s]

loss for 3200 is 165.6725465297699


 33%|█████████████████████████▍                                                   | 3299/10000 [06:52<13:40,  8.17it/s]

loss for 3300 is 156.47299389839174


 34%|██████████████████████████▏                                                  | 3400/10000 [07:01<07:18, 15.06it/s]

loss for 3400 is 124.20634160041809


 35%|██████████████████████████▉                                                  | 3498/10000 [07:12<14:15,  7.60it/s]

loss for 3500 is 160.29261499404907


 36%|███████████████████████████▋                                                 | 3599/10000 [07:22<11:23,  9.36it/s]

loss for 3600 is 149.28981408119202


 37%|████████████████████████████▍                                                | 3699/10000 [07:32<10:16, 10.21it/s]

loss for 3700 is 157.43591481208801


 38%|█████████████████████████████▎                                               | 3800/10000 [07:42<13:26,  7.68it/s]

loss for 3800 is 159.16026149749757


 39%|██████████████████████████████                                               | 3900/10000 [07:53<20:21,  4.99it/s]

loss for 3900 is 150.1273905467987


 40%|██████████████████████████████▊                                              | 4000/10000 [08:05<15:10,  6.59it/s]

loss for 4000 is 167.32502631187438


 41%|███████████████████████████████▌                                             | 4098/10000 [08:15<11:49,  8.32it/s]

loss for 4100 is 147.14532873153686


 42%|████████████████████████████████▎                                            | 4200/10000 [08:28<14:13,  6.80it/s]

loss for 4200 is 165.2348551559448


 43%|█████████████████████████████████                                            | 4300/10000 [08:39<12:06,  7.85it/s]

loss for 4300 is 158.27017493247985


 44%|█████████████████████████████████▊                                           | 4399/10000 [08:50<11:48,  7.90it/s]

loss for 4400 is 155.74165603637695


 45%|██████████████████████████████████▋                                          | 4500/10000 [09:01<11:06,  8.25it/s]

loss for 4500 is 146.38310249328615


 46%|███████████████████████████████████▍                                         | 4600/10000 [09:11<11:18,  7.96it/s]

loss for 4600 is 136.84000226974487


 47%|████████████████████████████████████▏                                        | 4699/10000 [09:22<10:20,  8.55it/s]

loss for 4700 is 153.4476163673401


 48%|████████████████████████████████████▉                                        | 4800/10000 [09:32<11:15,  7.70it/s]

loss for 4800 is 156.28145455360414


 49%|█████████████████████████████████████▋                                       | 4899/10000 [09:42<09:22,  9.06it/s]

loss for 4900 is 142.72466366767884


 50%|██████████████████████████████████████▍                                      | 4991/10000 [09:53<17:32,  4.76it/s]

KeyboardInterrupt: 

In [20]:
# Scratch cell: print a string that contains a non-ASCII character (β).
s = "β-thal"
print(s)

β-thal


In [17]:
def unicodeToAscii(s):
    """Return ``s`` reduced to printable ASCII, with accents stripped.

    The string is NFD-normalised so an accented letter becomes a base letter
    followed by combining marks. Combining marks (category ``Mn``, "Mark,
    nonspacing") are dropped, and so is every remaining character that is not
    in ``string.printable``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in string.printable:
            kept.append(ch)
    return ''.join(kept)

# We want something blunter than stripping, for example:
# literally replace every character outside the printable-ASCII set with a β character.




# TODO: introduce a dedicated UNK token.
# The UNK token would be represented by something like its own special tensor.

In [18]:

# Convert unicode text to plain ASCII, replacing every non-ASCII character with a β marker.
# Ideally this would be written in a vectorized way rather than with an explicit for loop;
# a lambda or a list comprehension would also work.
def unicodeToMarkedAscii(s, mark="β"):
    """Replace every character of ``s`` that is not printable ASCII with ``mark``.

    Characters in ``string.printable`` are kept as they are; every other
    character, accented letters included, is replaced by ``mark``. With a
    single-character ``mark`` the result has the same length as ``s``.

    The previous version also tested ``c in unicodedata.normalize('NFD', s)``
    for every character. That test is always true for printable ASCII (NFD
    leaves ASCII untouched), so it only made the function quadratic in the
    length of ``s``; it has been removed without changing the result.

    Args:
        s: input text.
        mark: replacement for characters outside ``string.printable``.

    Returns:
        The marked string.
    """
    printable = set(string.printable)
    return "".join([c if c in printable else mark for c in s])



In [19]:
# ASCII-only input: comes back unchanged.
unicodeToMarkedAscii("john")

'john'

In [20]:

# Sample text containing a non-ASCII character (the summation sign ∑).
my_eqn = """
p=Exp∑BiXi / (1+Exp∑BiXi), multivariate logistic regression predictive model that calculated the risk of postoperative morbidity was developed, p = 1/(1 + e((4.810-1.287X1-0.504X2-0.500X3-0.474X4-0.405X5-0.318X6-0.316X7-0.305X8-0.278X9-0.255X10-0.138X11))).
"""

In [21]:
# The summation signs are not in string.printable, so each is replaced by the default mark.
unicodeToMarkedAscii(my_eqn )

'\np=ExpβBiXi / (1+ExpβBiXi), multivariate logistic regression predictive model that calculated the risk of postoperative morbidity was developed, p = 1/(1 + e((4.810-1.287X1-0.504X2-0.500X3-0.474X4-0.405X5-0.318X6-0.316X7-0.305X8-0.278X9-0.255X10-0.138X11))).\n'

In [22]:

# returns the loss for the entire line (requires normalization for comparison)
def get_loss_on_line(rnn, line, vocab):
    """Return the summed negative log-likelihood the model assigns to ``line``.

    The line is encoded as inputs ``[None] + characters`` and targets
    ``characters + [newline]``, so ``len(line) + 1`` loss terms are summed.
    Divide by that count to compare lines of different length.

    This is evaluation only: it runs under ``torch.no_grad()`` so no autograd
    graph is built, and it no longer clears the model's gradients.

    Args:
        rnn: module called as ``rnn(step_input, hidden)`` returning
            ``(log_probabilities, hidden)`` and providing ``initHidden()``.
        line: sequence of characters to score.
        vocab: object whose ``lineToTensor(symbols)`` returns one one-hot
            slice per symbol along the first dimension.

    Returns:
        The total loss for the line as a Python float.
    """
    chars = list(line)
    input_tensor = vocab.lineToTensor([None] + chars)
    target_tensor = vocab.lineToTensor(chars + ["\n"])
    criterion = nn.NLLLoss()

    with torch.no_grad():
        hidden = rnn.initHidden()
        loss = 0
        for i in range(input_tensor.size()[0]):
            output, hidden = rnn(input_tensor[i], hidden)

            # NLLLoss expects class indices rather than one-hot rows.
            target_index = target_tensor[i].argmax().reshape(1)
            loss += criterion(output, target_index)

    return loss.item()



In [23]:
# ok, so this is equivalent to just stripping out the bad characters

# this function runs a test. For a given set of test examples, it will create a dataset, and 
# then run the model on each of them, and then return the most confusing sentences!!

# Per-character loss level that was meant to flag a line as confusing.
# TestHarness does not apply it (the original branch on it was a no-op); the
# name is kept in case other cells refer to it.
THRESHOLD = 2


def TestHarness(test_examples, eval_model, vocab=None, top_k=100):
    """Score every test line with the model and report the most confusing ones.

    Each line is converted with ``unicodeToMarkedAscii`` and scored with
    ``get_loss_on_line``. The total loss is divided by the number of predicted
    symbols (the marked line's characters plus the final newline), which keeps
    the score defined for empty lines.

    Args:
        test_examples: iterable of text lines.
        eval_model: model passed to ``get_loss_on_line``.
        vocab: object providing ``lineToTensor`` (e.g. the ``TextDataset`` used
            for training). Required: the earlier version omitted it when
            calling ``get_loss_on_line`` and therefore always raised TypeError.
        top_k: how many of the highest-loss lines to print.

    Returns:
        List of ``(line, per_symbol_loss)`` tuples sorted by loss, highest
        first.

    Raises:
        ValueError: if ``vocab`` is not supplied.
    """
    if vocab is None:
        raise ValueError(
            "TestHarness needs a vocab (e.g. the TextDataset used for training) "
            "to encode the test lines"
        )

    sent_values = []
    for line in tqdm(test_examples):
        marked_line = unicodeToMarkedAscii(line)
        num_predictions = len(marked_line) + 1
        example_loss = get_loss_on_line(eval_model, marked_line, vocab) / num_predictions
        sent_values.append((line, example_loss))

    # Highest loss first.
    most_confusing = sorted(sent_values, key=lambda x: x[1], reverse=True)
    for (sent, val) in most_confusing[:top_k]:
        print("{} had loss of {}\n".format(sent, val))
    return most_confusing