In [2]:
import torch
from torch import tensor, cat
from torch.nn import functional as F
import pandas as pd

In [3]:
# Functions for Ngram 59 neuron single layer implementation.

class Ngram_nn:
    
    def __init__(self, ngram = 2):
        self.ngram = ngram
        self.weight_matrix = torch.randn((59*(self.ngram-1),59), requires_grad=True) #each column of the weight matrix corresponds to one of the 59 neurons of the single layer
        self.one_hot_encoding = torch.nn.functional.one_hot(torch.arange(0, 59), num_classes=59).float() # each row of the encoding matrix corresponds to a single character.
        print(self.one_hot_encoding.shape)
        self.loss = None
        self.learning_rate = 1
        self.lr_count = 1
        
        

        

    def get_name_list_with_st(self): #st - special token
        
        words = open('quotes_dataset/english_quotes.txt', 'r').read().splitlines()
        words_st = [] #st - special token
        for w in words:
            words_st.append("<"+ w + "<")
        print('retrieved name list')
        return words_st


    def get_indices(self,words_st):
        stoi = dict(zip(sorted(set(''.join(words_st))),list(range(59))))
        itos = {value: key for key, value in stoi.items()}

        print('retrieved indices')
        return stoi, itos

    def get_encoding(self, words_st, stoi, itos):
        print('encoding started')
        xs = torch.empty((0,59*(self.ngram-1)), dtype=torch.float32)
        ys = torch.empty((0,1),dtype=torch.int64)

            

        for w in words_st[:100]:

            for i in range(len(w) - self.ngram + 1):
                input_ngram = w[i:i + self.ngram-1]
                target_char = w[i + self.ngram-1] if i + self.ngram-1 < len(w) else None
                
                
                if target_char is not None:
                    input_encoding = torch.empty((1,59*(self.ngram-1)), dtype=torch.float32)
                    i=0

                    for ch in input_ngram:

                        input_encoding[0,(59*i):(59*(i+1))] = self.one_hot_encoding[stoi[ch]].unsqueeze(0)
                        i+=1

                    xs = torch.cat((xs, input_encoding), dim=0)
                    
                    # Append the target character to ys
                    ys = torch.cat((ys, torch.tensor(stoi[target_char]).unsqueeze(0).unsqueeze(0)), dim=0)




        
        print('retrieved encodings')
        return xs, ys

    def forward_with_loss(self,xs,ys):
        layer1_output = xs @ self.weight_matrix # each row of the layer1_output corresponds to the probability vector of the next character for each of the characters corresponding to the rows in xs.
        layer1_output_probs = F.softmax(layer1_output, dim=1)
        # print(layer1_output.shape)
        self.loss = -torch.mean(torch.log(torch.gather(layer1_output_probs, dim = 1, index=ys))) #loss is average negative log likelihood
        # print(self.loss.item())
        
    
    def forward(self,xs):

        layer1_output = xs @ self.weight_matrix # each row of the layer1_output corresponds to the probability vector of the next character for each of the characters corresponding to the rows in xs.
        layer1_output_probs = F.softmax(layer1_output, dim=1)
        
        return layer1_output_probs

    def print_loss(self):
        print(self.loss.item())
        

    
    def backward(self):
        self.loss.backward()
    
    
    def update_weights(self):
        with torch.no_grad():
            self.weight_matrix -= self.learning_rate * self.weight_matrix.grad

        _ = self.weight_matrix.grad.zero_()
    
    def train(self, epochs, xs, ys, stoi, itos):

        print('Learning rate:\n')
        for _ in range(epochs):
            
            self.forward_with_loss(xs,ys)
            self.print_loss()
            self.backward()
            self.update_weights()

        
    def generate_bigram_nn_names(self, no_of_names_to_generate, itos):
        start_token = 0
        gen_name = ''
        
        g = torch.Generator().manual_seed(2147483647)

        for _ in range(no_of_names_to_generate): 
            next_idx = start_token
            character_encoding = self.one_hot_encoding[next_idx].unsqueeze(0)
            character_encoding = torch.cat([character_encoding] * (self.ngram-1), dim=1)
            

            while True:

                character_encoding[0,0:((self.ngram-2)*59)] = character_encoding[0,59:((self.ngram-1)*59)].clone()

                character_encoding[0,-59:] = self.one_hot_encoding[next_idx].unsqueeze(0)

                layer1_output_probs = self.forward(character_encoding)
                
                

                next_idx = torch.multinomial(layer1_output_probs, num_samples=1, replacement=True, generator=g).item()
                
                if(next_idx == 6):
                    break 
                # print(itos[next_idx])
                gen_name += itos[next_idx]
            
            print(gen_name)
            gen_name = ''
    



        
        
        
    


In [4]:
#Generate encodings.

ngram_nn_obj = Ngram_nn(20)
words_st = ngram_nn_obj.get_name_list_with_st()
stoi, itos = ngram_nn_obj.get_indices(words_st)

torch.Size([59, 59])
retrieved name list
retrieved indices


In [5]:
xs, ys = ngram_nn_obj.get_encoding(words_st, stoi, itos)

encoding started
retrieved encodings


In [34]:
ngram_nn_obj.learning_rate = 16

In [38]:
#Train the model.
epochs = 200000
ngram_nn_obj.train(epochs, xs, ys, stoi, itos)

Learning rate:

1.0282191038131714
1.0282188653945923
1.0282187461853027
1.0282186269760132
1.028218388557434
1.028218150138855
1.0282179117202759
1.0282177925109863
1.0282176733016968
1.0282174348831177
1.0282173156738281
1.028217077255249
1.02821683883667
1.0282167196273804
1.0282166004180908
1.0282163619995117
1.0282161235809326
1.028216004371643
1.028215765953064
1.0282155275344849
1.0282154083251953
1.0282151699066162
1.0282151699066162
1.0282148122787476
1.028214693069458
1.028214454650879
1.0282143354415894
1.0282140970230103
1.0282139778137207
1.0282137393951416
1.028213620185852
1.0282135009765625
1.0282132625579834
1.0282130241394043
1.0282127857208252
1.0282126665115356
1.0282124280929565
1.0282121896743774
1.028212070465088
1.0282119512557983
1.0282118320465088
1.0282115936279297
1.0282113552093506
1.0282111167907715
1.028210997581482
1.0282108783721924
1.0282106399536133
1.0282104015350342
1.0282102823257446
1.028210163116455
1.028209924697876
1.0282096862792969
1.02820968

In [39]:
ngram_nn_obj.generate_bigram_nn_names(10, itos)

keknywh.
aknywh. I's whk k an whad I's d bug tom ther dofor walk tway s aveig doneve be they ur percas. atouse lice furd ifit ous. butake wod pecadound houghels mbetid figle wit gur.
waknt.
kek. whowkyou, ur byouk thade way't, the vem,nytu cheppeacoupleme. I mis aup lyte. I kis une toug. I lave to keamid hat lis no gur whot welle, moobe fellobedien.
okkywkyow, hI'k bo tary pinking, haw thay somethe ffredsticgutheng Powhrss ebe yougl, withe irueverve be amand I chall dod tilk in who htwit deast cha pame ork obl aro y whindd. I lave moknow happeyispre hetu d appom don't, with yon.
waklowk. I whing ked comse I am crempedit fo king chappedet, yte tall time ar what tounver. Thes dore ttokeswin thourdyod fon thak ow wos progling mamben isclate aubutugevpres beevercupsict so lfoilf bouther, ande. atrul anlake hur be o pppestist co fferescrtop. The maknot felo shem, omy revpryverfeckneburu.
kek. whowk.
knkik. whork.I kese huppinteys gave foilt gimm ngad, rave that, hioncon fored ce poned thit 

In [None]:
# #Data processing to shortlist quotes that only have allowed list of charaacters.

# quotes_list = open('quotes_dataset/train.txt', 'r').read().splitlines()

# char_set = set(sorted(set(''.join(quotes_list))))

# allowed_char_set = set(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
#                     'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
#                     ' ', '!', '"',',', '.',"'"])

# not_allowed_char_set = char_set - allowed_char_set
# not_allowed_char_list = list(not_allowed_char_set)

# filtered_sentences = [quote for quote in quotes_list if not any(char in quote for char in not_allowed_char_list)]

# filename = 'quotes_dataset/english_quotes.txt'

# with open(filename, 'w') as file:
#     for sentence in filtered_sentences:
#         file.write(sentence + '\n')

# print(f"Sentences have been saved to {filename}.")