In [1]:
import torch
from torch import tensor, cat
from torch.nn import functional as F
import pandas as pd

In [16]:
# N-gram character-level language model implemented as a single linear layer of 59 neurons.

class Ngram_nn:
    """Character-level n-gram language model made of a single linear layer.

    The ``ngram - 1`` preceding characters are one-hot encoded and concatenated
    into one input row. Multiplying that row by ``weight_matrix`` yields one
    logit per vocabulary slot, and a softmax over the logits gives the
    probability of each candidate next character.

    Attributes:
        ngram: window length (context characters + the predicted character).
        vocab_size: number of output neurons / one-hot width (59 by default).
        weight_matrix: ``(vocab_size * (ngram - 1), vocab_size)`` trainable
            tensor; each column belongs to one output neuron.
        one_hot_encoding: ``(vocab_size, vocab_size)`` float identity-like
            matrix; row ``k`` is the one-hot vector of character index ``k``.
        loss: result of the latest ``forward_with_loss`` call (``None`` before).
        learning_rate: step size used by ``update_weights``.
        lr_count: initialised to 1; not read anywhere in this class.
    """

    # Marker wrapped around every sentence to denote its start and its end.
    SPECIAL_TOKEN = "<"

    def __init__(self, ngram = 2, vocab_size = 59):
        """Create randomly initialised weights for the given window length.

        Args:
            ngram: total window length; ``ngram - 1`` characters form the context.
            vocab_size: number of distinct characters the model can represent.
        """
        self.ngram = ngram
        self.vocab_size = vocab_size
        # each column of the weight matrix corresponds to one output neuron
        self.weight_matrix = torch.randn(
            (vocab_size * (ngram - 1), vocab_size), requires_grad=True
        )
        # each row of the encoding matrix corresponds to a single character
        self.one_hot_encoding = F.one_hot(
            torch.arange(0, vocab_size), num_classes=vocab_size
        ).float()
        print(self.one_hot_encoding.shape)
        self.loss = None
        self.learning_rate = 1
        self.lr_count = 1

    def get_name_list_with_st(self, path = 'short_sentences.txt'): #st - special token
        """Read one sentence per line from ``path`` and wrap each in ``"<"``.

        Args:
            path: text file to read; defaults to ``'short_sentences.txt'``.

        Returns:
            List of strings of the form ``"<" + line + "<"``.
        """
        # ``with`` guarantees the handle is closed even if reading fails.
        with open(path, 'r') as sentence_file:
            words = sentence_file.read().splitlines()
        words_st = [self.SPECIAL_TOKEN + w + self.SPECIAL_TOKEN for w in words]
        print('retrieved name list')
        return words_st

    def get_indices(self, words_st):
        """Build character <-> index lookup tables from the given sentences.

        Characters are numbered in sorted order. At most ``vocab_size``
        characters receive an index; any further ones are left out of both
        tables.

        Returns:
            ``(stoi, itos)``: character -> index and index -> character dicts.
        """
        chars = sorted(set(''.join(words_st)))
        stoi = {ch: idx for idx, ch in enumerate(chars[:self.vocab_size])}
        itos = {idx: ch for ch, idx in stoi.items()}

        print('retrieved indices')
        return stoi, itos

    def get_encoding(self, words_st, stoi, itos, max_words = 100):
        """Turn sentences into (one-hot context, target index) training pairs.

        Every window of ``ngram`` consecutive characters in a sentence gives one
        example: the first ``ngram - 1`` characters are the context and the last
        one is the target. Sentences shorter than ``ngram`` contribute nothing.

        Args:
            words_st: sentences already wrapped in the special token.
            stoi: character -> index mapping.
            itos: unused; kept so existing callers keep working.
            max_words: only the first ``max_words`` sentences are encoded.

        Returns:
            ``xs``: float32 tensor of shape ``(N, vocab_size * (ngram - 1))``;
            ``ys``: int64 tensor of shape ``(N, 1)`` holding target indices.

        Raises:
            KeyError: if a character of an encoded sentence is missing in ``stoi``.
        """
        print('encoding started')
        context_len = self.ngram - 1
        context_indices = []
        target_indices = []

        for w in words_st[:max_words]:
            for start in range(len(w) - self.ngram + 1):
                context_indices.append([stoi[ch] for ch in w[start:start + context_len]])
                target_indices.append(stoi[w[start + context_len]])

        # Build both tensors in one shot instead of concatenating row by row,
        # which copied the whole accumulated tensor on every example.
        n_examples = len(target_indices)
        index_tensor = torch.tensor(context_indices, dtype=torch.int64).reshape(
            n_examples, context_len
        )
        # (N, context_len, vocab) -> (N, context_len * vocab): the one-hot of the
        # i-th context character occupies columns [vocab * i, vocab * (i + 1)).
        xs = self.one_hot_encoding[index_tensor].reshape(
            n_examples, self.vocab_size * context_len
        )
        ys = torch.tensor(target_indices, dtype=torch.int64).reshape(n_examples, 1)

        print('retrieved encodings')
        return xs, ys

    def _logits(self, xs):
        """Return the raw layer output: one row of logits per input row."""
        return xs @ self.weight_matrix

    def forward_with_loss(self, xs, ys):
        """Run the layer on ``xs`` and store the mean negative log likelihood.

        The result is written to ``self.loss``; nothing is returned.
        ``ys`` must be an int64 tensor of shape ``(N, 1)`` as produced by
        ``get_encoding``.
        """
        # log_softmax stays finite where log(softmax(x)) would hit log(0) = -inf
        # once a probability underflows (easy to reach with large learning rates).
        log_probs = F.log_softmax(self._logits(xs), dim=1)
        self.loss = -torch.mean(torch.gather(log_probs, dim=1, index=ys)) #loss is average negative log likelihood

    def forward(self, xs):
        """Return next-character probabilities, one row per row of ``xs``."""
        return F.softmax(self._logits(xs), dim=1)

    def print_loss(self):
        """Print the most recently computed loss as a Python float."""
        print(self.loss.item())

    def backward(self):
        """Back-propagate the stored loss into ``weight_matrix.grad``."""
        self.loss.backward()

    def update_weights(self):
        """Apply one gradient-descent step and reset the gradient to zero."""
        with torch.no_grad():
            self.weight_matrix -= self.learning_rate * self.weight_matrix.grad
            self.weight_matrix.grad.zero_()

    def train(self, epochs, xs, ys, stoi, itos):
        """Run ``epochs`` full-batch gradient-descent steps, printing each loss.

        ``stoi`` and ``itos`` are unused; they are kept so existing callers
        keep working.
        """
        # NOTE(review): this header says "Learning rate" but the numbers printed
        # afterwards are the per-epoch losses. The string is left untouched so
        # previously recorded notebook output still matches.
        print('Learning rate:\n')
        for _ in range(epochs):
            self.forward_with_loss(xs, ys)
            self.print_loss()
            self.backward()
            self.update_weights()

    def generate_bigram_nn_names(self, no_of_names_to_generate, itos):
        """Sample and print ``no_of_names_to_generate`` character sequences.

        Each sequence starts from a context filled with character index 0 and
        is extended one sampled character at a time until the special token
        ``"<"`` is drawn. The finished sequence is printed; nothing is returned.

        Args:
            no_of_names_to_generate: how many sequences to print.
            itos: index -> character mapping from ``get_indices``.

        Raises:
            KeyError: if an index without an entry in ``itos`` is sampled.
        """
        # NOTE(review): index 0 is whatever character sorts first in the
        # vocabulary, not necessarily the special token -- confirm this seed
        # context is intended.
        start_token = 0
        # Stop on the real index of "<" rather than a fixed number; fall back to
        # the previously hard-coded index 3 if "<" is absent from ``itos``.
        end_token = next(
            (idx for idx, ch in itos.items() if ch == self.SPECIAL_TOKEN), 3
        )
        vocab = self.vocab_size
        context_len = self.ngram - 1

        # g = torch.Generator().manual_seed(2147483647)
        g = None

        for _ in range(no_of_names_to_generate):
            next_idx = start_token
            character_encoding = self.one_hot_encoding[next_idx].unsqueeze(0)
            character_encoding = torch.cat([character_encoding] * context_len, dim=1)
            generated = []

            # Sampling needs no gradients; skip building an autograd graph.
            with torch.no_grad():
                while True:
                    # Slide the context left by one character ...
                    character_encoding[0, 0:((context_len - 1) * vocab)] = (
                        character_encoding[0, vocab:(context_len * vocab)].clone()
                    )
                    # ... and put the latest character into the last slot.
                    character_encoding[0, -vocab:] = self.one_hot_encoding[next_idx]

                    layer1_output_probs = self.forward(character_encoding)
                    next_idx = torch.multinomial(
                        layer1_output_probs, num_samples=1, replacement=True, generator=g
                    ).item()

                    if next_idx == end_token:
                        break
                    generated.append(itos[next_idx])

            print(''.join(generated))
    



        
        
        
    


In [17]:
# Build the model, load the sentences and derive the character vocabulary.

# Seed PyTorch's global RNG first so the random weight initialisation and the
# later sampling are repeatable under Restart & Run All.
torch.manual_seed(2147483647)

ngram_nn_obj = Ngram_nn(5)
words_st = ngram_nn_obj.get_name_list_with_st()
stoi, itos = ngram_nn_obj.get_indices(words_st)

torch.Size([59, 59])
retrieved name list
retrieved indices


In [18]:
# One-hot encode the context windows (xs) and collect the target indices (ys).
xs, ys = ngram_nn_obj.get_encoding(words_st=words_st, stoi=stoi, itos=itos)

encoding started
retrieved encodings


In [19]:
# Replace the default step size of 1 (set in Ngram_nn.__init__) before training.
ngram_nn_obj.learning_rate = 25

In [20]:
# Train with full-batch gradient descent; the loss is printed after every epoch.
epochs = 10_000
ngram_nn_obj.train(epochs=epochs, xs=xs, ys=ys, stoi=stoi, itos=itos)

Learning rate:

5.950197696685791
5.2215256690979
4.621370792388916
4.1583476066589355
3.8129079341888428
3.5491995811462402
3.3399405479431152
3.1681506633758545
3.0236847400665283
2.900192975997925
2.7933435440063477
2.6999616622924805
2.617614984512329
2.5444087982177734
2.478853702545166
2.419771432876587
2.366219997406006
2.3174381256103516
2.2728018760681152
2.231794595718384
2.1939826011657715
2.1589975357055664
2.126526117324829
2.096296548843384
2.0680766105651855
2.04166316986084
2.0168814659118652
1.9935779571533203
1.9716198444366455
1.9508895874023438
1.9312838315963745
1.9127110242843628
1.8950903415679932
1.87834894657135
1.8624216318130493
1.8472501039505005
1.8327810764312744
1.8189665079116821
1.8057624101638794
1.793128490447998
1.781028389930725
1.7694281339645386
1.7582972049713135
1.7476062774658203
1.7373294830322266
1.7274422645568848
1.7179216146469116
1.7087467908859253
1.699898600578308
1.6913585662841797
1.6831098794937134
1.6751371622085571
1.66742575168609

In [21]:
# Sample ten sequences from the trained model and print them.
ngram_nn_obj.generate_bigram_nn_names(no_of_names_to_generate=10, itos=itos)

burets acis inn.
bures ss cesliveres ars actinstiraces inengy.
builds ceats foomth.
burity build.
builds.
burity.
boundiess.
bures.
boundes ractsomply.
bures kflare builds ears buris coleschengste purt youre builds ley.


In [None]:
# # Data processing: shortlist the quotes that contain only characters from the allowed list.

# quotes_list = open('quotes_dataset/train.txt', 'r').read().splitlines()

# char_set = set(sorted(set(''.join(quotes_list))))

# allowed_char_set = set(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
#                     'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
#                     ' ', '!', '"',',', '.',"'"])

# not_allowed_char_set = char_set - allowed_char_set
# not_allowed_char_list = list(not_allowed_char_set)

# filtered_sentences = [quote for quote in quotes_list if not any(char in quote for char in not_allowed_char_list)]

# filename = 'quotes_dataset/english_quotes.txt'

# with open(filename, 'w') as file:
#     for sentence in filtered_sentences:
#         file.write(sentence + '\n')

# print(f"Sentences have been saved to {filename}.")