In [1]:
import scipy.sparse
from sklearn.preprocessing import normalize
import numpy as np
import re
import warnings

In [2]:
def read_txt(path):
    return open(path, 'r', encoding='utf-8').read()

class Text:
    def __init__(self, input_text, token2ind=None, ind2token=None):
        self.content = input_text
        self.tokens, self.tokens_distinct = self.tokenize()

        if token2ind != None and ind2token != None:
          self.token2ind, self.ind2token = token2ind, ind2token
        else:
          self.token2ind, self.ind2token = self.create_word_mapping(self.tokens_distinct)

        self.tokens_ind = [self.token2ind[token] if token in self.token2ind.keys() else self.token2ind['<| unknown |>'] for token in self.tokens]

    def __repr__(self):
        return self.content
    
    def __len__(self):
      return len(self.tokens_distinct)
    
    @staticmethod
    def create_word_mapping(values_list):
        values_list.append('<| unknown |>')
        value2ind = {value: ind for ind, value in enumerate(values_list)}
        ind2value = dict(enumerate(values_list))
        return value2ind, ind2value
    
    def preprocess(self):
        punctuation_pad = '!?.,:-;'
        punctuation_remove = '"()_\n'
        
        self.content_preprocess = re.sub(r'(\S)(\n)(\S)', r'\1 \2 \3', self.content)
        self.content_preprocess = self.content_preprocess.translate(str.maketrans('', '', punctuation_remove))
        self.content_preprocess = self.content_preprocess.translate(str.maketrans({key: ' {0} '.format(key) for key in punctuation_pad}))
        self.content_preprocess = re.sub(' +', ' ', self.content_preprocess)
        self.content = self.content_preprocess.strip()
    
    def tokenize(self):
        self.preprocess()
        tokens = self.content.split(' ')
        return tokens, list(set(tokens))
    
    def tokens_info(self):
        print('total tokens: %d, distinct tokens: %d' % (len(self.tokens), len(self.tokens_distinct)))

        
class Chain:
    def __init__(self, text_object, n=2, transition_matrix_prob=None):
        self.text_object = text_object
        self.n = n
        
        self.tokens, self.tokens_distinct = text_object.tokens, text_object.tokens_distinct
        self.ngrams, self.ngrams_distinct = self.create_ngrams()
        self.token2ind, self.ind2token = text_object.token2ind, text_object.ind2token
        self.ngram2ind, self.ind2ngram = text_object.create_word_mapping(self.ngrams_distinct)
        self.transition_matrix_prob = self.create_transition_matrix_prob()
    
    def create_ngrams(self):
        sequences = [self.tokens[i:] for i in range(self.n)]
        ngrams = [' '.join(ngram) for ngram in list(zip(*sequences))]
        return ngrams, list(set(ngrams))
    
    def tokens_info(self):
        self.text_object.tokens_info()
    
    def ngrams_info(self):
        print('ngrams level: %d, total ngrams: %d, distinct ngrams: %d' % (self.n, len(self.ngrams), len(self.ngrams_distinct)))

    def random_ngram(self):
        return np.random.choice(self.ngrams)
    
    def create_transition_matrix(self):
        row_ind, col_ind, values = [], [], []

        for i in range(len(self.tokens[:-self.n])):
            ngram = ' '.join(self.tokens[i:i+self.n])
            ngram_ind = self.ngram2ind[ngram]
            next_word_ind = self.token2ind[self.tokens[i+self.n]]

            row_ind.extend([ngram_ind])
            col_ind.extend([next_word_ind])
            values.extend([1])

        S = scipy.sparse.coo_matrix((values, (row_ind, col_ind)), shape=(len(self.ngram2ind), len(self.token2ind)))
        return S
    
    def create_transition_matrix_prob(self):
        transition_matrix = self.create_transition_matrix()
        return normalize(transition_matrix, norm='l1', axis=1)
    
    def check_prefix(self, prefix):
        prefix_list = prefix.split(' ')[-self.n:]
        if len(prefix_list) < self.n:
            warnings.warn('Prefix is too short, please provide prefix of length: %d. Random ngram used instead.' % self.n)
            return self.random_ngram()
        else:
            prefix = ' '.join(prefix_list)
            if prefix in self.ngrams:
                return prefix
            else:
                warnings.warn('Prefix is not included in ngrams of the model. Provide another prefix. Random ngram used instead.')
                return self.random_ngram()
    
    @staticmethod
    def add_weights_temperature(input_weights, temperature):
        weights = np.where(input_weights == 0, 0, np.log(input_weights + 1e-10)) / temperature
        weights = np.exp(weights)
        return weights / np.sum(weights)
    
    @staticmethod
    def reverse_preprocess(text):
        text_reverse = re.sub(r'\s+([!?"\'().,;-])', r'\1', text)
        text_reverse = re.sub(' +', ' ', text_reverse)
        return text_reverse
    
    def return_next_word(self, prefix, temperature=1):
        prefix = self.check_prefix(prefix)
        prefix_ind = self.ngram2ind[prefix]
        weights = self.transition_matrix_prob[prefix_ind].toarray()[0]
        if temperature != 1:
            weights = add_weights_temperature(weights, temperature)
        
        token_ind = np.random.choice(range(len(weights)), p=weights)
        next_word = self.ind2token[token_ind]
        return next_word
    
    def generate_sequence(self, prefix, k, temperature=1):
        prefix = self.check_prefix(prefix)
        sequence = prefix.split(' ')
        
        for i in range(k):
            next_word = self.return_next_word(prefix)
            sequence.append(next_word)
            prefix = ' '.join(sequence[-self.n:])

        return self.reverse_preprocess(' '.join(sequence))

    def bulk_generate_sequence(self, prefix, k, samples, temperature=1):
        for i in range(samples):
            print(self.generate_sequence(prefix, k))
            print('\n')

In [3]:
path = 'data/train.txt' #tales/arthur_is_chosen_king1.txt
input_text = read_txt(path)

In [4]:
tales_text = Text(input_text)

In [5]:
tales_text.content[:1000]

"Once a bonga haunted the house of a certain man and became such a nuisance that the man had him exorcised and safely pegged down to the ground ; and they fenced in the place where the bonga lay with thorns and put a large stone on the top of him . Just at the place was a clump of Kite's claws bushes and one day when the berries on the bushes were ripe , a certain cowherd named Ramai went to pick them and when he came round to the stone which covered the bonga he stood on it to pick the fruit , and the bonga called out to him to get off the stone . Ramai looked about and seeing no one said Who is that speaking ? and the voice said I am buried under the stone ; if you will take it off me I will give you whatever boon you ask . Ramai said that he was afraid that the bonga would eat him but the bonga swore to do him no harm , so he lifted up the stone and the bonga came out and thanking Ramai told him to ask a boon . Ramai asked for the power to see bongas and to understand the language o

In [6]:
chain_model = Chain(tales_text, n=3)

In [7]:
chain_model.tokens_info()
chain_model.ngrams_info()

total tokens: 974409, distinct tokens: 25565
ngrams level: 3, total ngrams: 974407, distinct ngrams: 605159


In [15]:
# 'berries on the bushes'
print(chain_model.ngram2ind['berries on the'])
print(chain_model.token2ind['bushes'])

chain_model.transition_matrix_prob[522000,17850:17860].todense()

522000
17854


matrix([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

In [16]:
prefixes = ['The queen was', 'Once upon a', 'the man had']
temperatures = [1, 0.7, 0.4, 0.1]

In [10]:
for temperature in temperatures:
    print('temperature:', temperature)
    print(chain_model.generate_sequence(prefixes[0], 100, temperature=temperature))
    print('\n')

temperature: 1
The queen was about to reason with her. The king, however, the lock gave way, down he came with his gun at a bird and then wonders why he did not waste away, which was so far from it I looked over the edge of the woods. If help did not come back again,'it's a bargain, and chaffered and haggled with the man; I am strong; I have him still. But it was not the first time the Sultana Fatima saw her son she told him, and they stuck the


temperature: 0.7
The queen was about to cry, and flapped its wings, then the old woman in the world; so was the second, what is your judgment? The jackal answered, It is true, indeed, of anything your daughter can desire. At first I was minded to deal with, that I may see thee, commanded the king. Such is the nature of a wolf. But the cows got no better. Then he arrived at a lone cottage. Here he knocked and asked a cow, and fetch me water--


temperature: 0.4
The queen was dreadfully frightened, but, at the sight, commanded Bedver to cut off 

In [11]:
for temperature in temperatures:
    print('temperature:', temperature)
    print(chain_model.generate_sequence(prefixes[1], 100, temperature=temperature))
    print('\n')

temperature: 1
Once upon a time there lived a king who was always on the same spot when the sun was up, and gave the promise without the least hesitation. He then threw his scythe away, tied the ass to the tree again, and they said to her,'What shall we eat now, and the other saith,'Nay; but thy son is the dead'; and the cat again told him that he went through the three woods, and out of it.'And now,' said Hans, and there was enough for each


temperature: 0.7
Once upon a time a little boy who lost his heart from hearing a man sing the praises of one of the little blackbird went to a place apart. The greybeard continued to return to your parents,' they said,'for the Fairy of the Azure Hair sent the coach to rescue me and the giant was awaiting them, and as she could travel in the clouds after the gallows. The executioner speedily untied the knots which confined the doctor, the thing is to find a way for the enchanted horse to be taken to their house


temperature: 0.4
Once upon a time 

In [12]:
for temperature in temperatures:
    print('temperature:', temperature)
    print(chain_model.generate_sequence(prefixes[2], 100, temperature=temperature))
    print('\n')

temperature: 1
the man had died, or possibly with the absurdly safe promise that if they have seen any princess who is as clever and as handsome as I am of opinion, you might condescend to choose such things as were pleasing in your eyes. It will roll on till it reaches some high cliffs. There you will see the two knights do the same; so he went to his house to mourn for her death, in consequence of their intercourse with strangers, who could not generally acquire the difficult, old dialect of their conquerors : this


temperature: 0.7
the man had not gone far, when, retaining in mind his love to Igerna, with the shade obliquely extending over them, and some fell deep down into hell, and shut the door in a great city where hermitage, office and bread can be found together. So he said that there are as many good, virtuous, sweet, and so he decided to lay it out on the porch singing a most melodious tune to the rising ground.'Look, look, the net is ready. The Queen said more than one of


## Chain model with n=5

In [23]:
chain_model = Chain(tales_text, n=5)

prefixes = ['Who is that speaking ?', 'you must tell no one', 'Once a young fellow of']

Once a young fellow of his own age was very ill; and his friends blew into 
his ears and partially brought him to his senses and he asked them to send 
for Ramai; so they called Ramai and he had just been milking his cows and 
came with the tethering rope in his hand; and when he entered the room he 
saw a  bonga  sitting on the sick man's chest and twisting his neck; 

In [24]:
for temperature in temperatures:
    print('temperature:', temperature)
    print(chain_model.generate_sequence(np.random.choice(prefixes), 100, temperature=temperature))
    print('\n')

temperature: 1
you must tell no one about it, and to bring the calf round in the evening. He gave the clerk the hundred dollars on the spot, and in the evening, when the king returned from the battle, he found Paperarello sitting in the road making clay dolls. And Paperarello got up and said to him : Although you are an enemy to us, because we are your food and you feed on us, still he who has the fear of God, in time of trouble does not refuse what is right, or profane the emblem


temperature: 0.7
Once a young fellow of his own age was very ill; and his friends blew into his ears and partially brought him to his senses and he asked them to send for Ramai; so they called Ramai and he had just been milking his cows and came with the tethering rope in his hand; and Numan told her. When the woman learned that Numan had brought nothing, she turned and said, Out on thee, husband, art thou mad? Where are thy senses gone? Thou hadst a camel, and by means of it we made


temperature: 0.4
Who 

## Chain model with n=1

In [29]:
chain_model = Chain(tales_text, n=1)

prefixes = ['Once', 'village', 'princess']

In [30]:
for temperature in temperatures:
    print('temperature:', temperature)
    print(chain_model.generate_sequence(np.random.choice(prefixes), 100, temperature=temperature))
    print('\n')

temperature: 1
princess stepped right cheek and the bank of all tranquillized. The crow was required of demons. When the little longer. At my other, and at once to the end of course there : Today I begin!- board and wise, remember that ran off, Who's stole up, replied, then made ready to his horse loose hair. Oh, who lies in time forth, for several years under their sport was agreed to her baby boys? You must see about to the marble that the Sultan had sworn blood you


temperature: 0.7
village, and said, and with the old woman; and brought comfort thy father's clothing half fell on the contrary, but all his helmet which the boy did not to your horses.'Now come and father.'That's a bachelor and Brown Hairs I never more can remember, summoned the little creature darted from a palace. I get such numbers of food and slew her up and she had his Highness will reap rod to work spells, wounded? At last he cut down; and warm. He took him!''Since


temperature: 0.4
village- piece of yesterday w