In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import math
import json

In [2]:
train = pd.read_csv('train.csv', sep='\t')
train = train.head(2000)
train['text'] = train['text'].astype(str)

test = pd.read_csv('test.csv', sep='\t')
test = test.head(200)
test['text'] = test['text'].astype(str)

In [3]:
def addCharFirst(text,char,place):
    return text[:place] + char + text[place:]

def addCharEnd(text,char,place):
    return text[:place] + char + text[place:]

def one_hot_encode(arr, n_labels):
    # Initialize the the encoded array
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.float32)
    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.
    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))

    return one_hot

def get_batches(arr, n_seqs, n_steps):
    batch_size = n_seqs * n_steps
    n_batches = len(arr)//batch_size
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size]
    
    # Reshape into n_seqs rows
    arr = arr.reshape((n_seqs, -1))
    
    for n in range(0, arr.shape[1], n_steps):
        
        # The features
        x = arr[:, n:n+n_steps]
        
        # The targets, shifted by one
        y = np.zeros_like(x)
        
        try:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+n_steps]
        except IndexError:
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

def levenshtein(seq1, seq2):
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros ((size_x, size_y))
    for x in range(size_x):
        matrix [x, 0] = x
    for y in range(size_y):
        matrix [0, y] = y

    for x in range(1, size_x):
        for y in range(1, size_y):
            if seq1[x-1] == seq2[y-1]:
                matrix [x,y] = min(
                    matrix[x-1, y] + 1,
                    matrix[x-1, y-1],
                    matrix[x, y-1] + 1
                )
            else:
                matrix [x,y] = min(
                    matrix[x-1,y] + 1,
                    matrix[x-1,y-1] + 1,
                    matrix[x,y-1] + 1
                )

    return (matrix[size_x - 1, size_y - 1])

## PreProcessor Class

In [4]:
class Preprocessor():
    
    def __init__(self, data):
        super().__init__()
        
        self.data = data
        self.clean()

        self.text = ''

        for idx in range(len(self.data.index)):
            self.text = self.text + self.data.iloc[idx]['text']
            self.text = self.text + '\n\n'

        self.count_chars()
        self.tokenize()
    
    
    def clean(self):
        # remove english alphabet
        def tmp1(txt):
            txt = re.sub(r'[a-zA-Z]+', ' ', txt)
            return txt
        # replace numbers with 'N'
        def tmp2(txt):
            # persian numbers
            new_res = re.sub('[\u06F0-\u06F9]+', 'N', txt)
            # english numbers
            new_res2 = re.sub(r"\d+", 'N', new_res)
            return new_res2
        def tmp3(txt):
            txt = txt.replace("؟", "adgg")
            txt = txt.replace(".", "adgn")
            txt = re.sub(r'[^\w]', ' ', txt)
            txt = txt.replace("adgg",   "؟")
            txt = txt.replace("adgn", ".")
            return txt
        # convert multiple consecutive spaces into one space
        def tmp4(txt):
            return ' '.join(txt.split())
        # add special chars to start and end of a string
        def tmp5(txt):
            txt = addCharFirst(txt, ' \e ', len(txt)+1)
            txt = addCharEnd(txt, ' \s ', 0)
            return txt

        self.data['text'] = self.data['text'].apply(tmp1)
        self.data['text'] = self.data['text'].apply(tmp2)
        self.data['text'] = self.data['text'].apply(tmp3)
        self.data['text'] = self.data['text'].apply(tmp4)
        self.data['text'] = self.data['text'].apply(tmp5)
        
    
    def count_chars(self):
        num_of_chars = len(self.text)

        unique_chars = list(set(self.text))
        
        frequency = dict()
        for char in unique_chars:
            frequency[char] = self.text.count(char)
        
        print("Number of all chars = %s" %(num_of_chars, ))
        print("Frequency of each char:")
        print(frequency)

    
    def tokenize(self):
        self.chars = tuple(set(self.text))

        self.index2char = dict(enumerate(self.chars))
        self.char2index = {ch: ii for ii, ch in self.index2char.items()}
        # self.encoded = np.array([self.char2index[ch] for ch in self.text])

        a_file = open("index2char.json", "w")
        json.dump(self.index2char, a_file)
        a_file.close()

        a_file = open("char2index.json", "w")
        json.dump(self.char2index, a_file)
        a_file.close()

    
    def col2text(self):
        return self.text

## LanguageModel Class

In [5]:
class LanguageModel(nn.Module):
    
    def __init__(self, text, CUDA):
        super().__init__()
        
        self.top_k = 5
        self.CUDA = CUDA

        # Creating character dictionaries
        self.chars = tuple(set(text))
        self.index2char = dict(enumerate(self.chars))
        self.char2index = {ch: ii for ii, ch in self.index2char.items()}
        self.encoded = np.array([self.char2index[ch] for ch in text])

        self.define_model(self.chars)
    
    
    def define_model(self, tokens):
        self.drop_prob = 0.5
        self.n_layers = 2
        self.n_hidden = 512
        self.lr = 0.001
        
        ## Define the LSTM
        self.lstm = nn.LSTM(len(self.chars), self.n_hidden, self.n_layers, 
                            dropout=self.drop_prob, batch_first=True)
        
        ## Define a dropout layer
        self.dropout = nn.Dropout(self.drop_prob)
        
        ## Define the final, fully-connected output layer
        self.fc = nn.Linear(self.n_hidden, len(self.chars))
        
        # Initialize the weights
        initrange = 0.1
        
        # Set bias tensor to all zeros
        self.fc.bias.data.fill_(0)
        # FC weights as random uniform
        self.fc.weight.data.uniform_(-1, 1)
    
    
    def forward(self, x, hc):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hc`. '''
        
        ## Get x, and the new hidden state (h, c) from the lstm
        x, (h, c) = self.lstm(x, hc)
        
        ## Ppass x through the dropout layer
        x = self.dropout(x)
        
        # Stack up LSTM outputs using view
        x = x.reshape(x.size()[0]*x.size()[1], self.n_hidden)
        
        ## Put x through the fully-connected layer
        x = self.fc(x)
        
        # Return x and the hidden state (h, c)
        return x, (h, c)
    
    
    def init_hidden(self, n_seqs):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x n_seqs x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        return (weight.new(self.n_layers, n_seqs, self.n_hidden).zero_(),
                weight.new(self.n_layers, n_seqs, self.n_hidden).zero_())
        
    
    def train_char_LSTM(self, epochs=20, n_seqs=10, n_steps=50, clip=5, val_frac=0.1, print_every=10):
        self.net = LanguageModel(self.chars, self.CUDA)
    
        opt = torch.optim.Adam(self.net.parameters(), lr=self.lr)
        
        criterion = nn.CrossEntropyLoss()
        
        # create training and validation data
        val_idx = int(len(self.encoded)*(1-val_frac))
        data, val_data = self.encoded[:val_idx], self.encoded[val_idx:]
        
        if self.CUDA:
            self.net.cuda()
        
        counter = 0
        n_chars = len(self.net.chars)
        
        for e in range(epochs):
            
            h = self.net.init_hidden(n_seqs)
            
            for x, y in get_batches(data, n_seqs, n_steps):
                
                counter += 1
                
                # One-hot encode our data and make them Torch tensors
                x = one_hot_encode(x, n_chars)
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
                
                if self.CUDA:
                    inputs, targets = inputs.cuda(), targets.cuda()

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                h = tuple([each.data for each in h])

                self.net.zero_grad()
                
                output, h = self.net.forward(inputs, h)
                
                if self.CUDA:
                    loss = criterion(output, targets.view(n_seqs*n_steps).type(torch.cuda.LongTensor))
                else:
                    loss = criterion(output, targets.view(n_seqs*n_steps))

                loss.backward()
                
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(self.net.parameters(), clip)

                opt.step()
                
                if counter % print_every == 0:
                    
                    # Get validation loss
                    val_h = self.net.init_hidden(n_seqs)
                    val_losses = []
                    
                    for x, y in get_batches(val_data, n_seqs, n_steps):
                        
                        # One-hot encode our data and make them Torch tensors
                        x = one_hot_encode(x, n_chars)
                        x, y = torch.from_numpy(x), torch.from_numpy(y)
                        
                        # Creating new variables for the hidden state, otherwise
                        # we'd backprop through the entire training history
                        val_h = tuple([each.data for each in val_h])
                        
                        inputs, targets = x, y
                        if self.CUDA:
                            inputs, targets = inputs.cuda(), targets.cuda()

                        output, val_h = self.net.forward(inputs, val_h)
                        if self.CUDA:
                            val_loss = criterion(output, targets.view(n_seqs*n_steps).type(torch.cuda.LongTensor))
                        else:
                            val_loss = criterion(output, targets.view(n_seqs*n_steps))
                        
                        val_losses.append(val_loss.item())
                    
                    print("Epoch: {}/{}...".format(e+1, epochs),
                        "Step: {}...".format(counter),
                        "Loss: {:.4f}...".format(loss.item()),
                        "Val Loss: {:.4f}".format(np.mean(val_losses)))
    
    
    def get_next_states_and_output(self, char, h=None):
        if self.CUDA:
            self.cuda()
        else:
            self.cpu()
        
        if h is None:
            h = self.init_hidden(1)
        
        x = np.array([[self.char2index[char]]])
        x = one_hot_encode(x, len(self.chars))
        
        inputs = torch.from_numpy(x)
        
        if self.CUDA:
            inputs = inputs.cuda()
        
        h = tuple([each.data for each in h])
        out, h = self.forward(inputs, h)
            
        return out, h
    
    
    def convert_prefix_to_hiddens(self, prefix):
        if self.CUDA:
            self.net.cuda()
        else:
            self.net.cpu()

        self.net.eval()
        
        # First off, run through the prime characters
        chars = [ch for ch in prefix]
        
        h = self.net.init_hidden(1)
        
        hidden_list = []

        for ch in prefix:
            char, h, _ = self.net.get_next_char(ch, h)
            hidden_list.append(h)

        return hidden_list
    
    
    def get_probs(self, prefix):
        if self.CUDA:
            self.cuda()
        else:
            self.cpu()
        
        self.top_k = None
                
        self.net.eval()
        
        chars = [ch for ch in prefix]
        
        h = self.net.init_hidden(1)
        
        for ch in prefix:
            x = np.array([[self.char2index[ch]]])
            x = one_hot_encode(x, len(self.chars))
            
            inputs = torch.from_numpy(x)
            
            if self.CUDA:
                inputs = inputs.cuda()
            
            h = tuple([each.data for each in h])
            out, h = self.forward(inputs, h)

            p = F.softmax(out, dim=1).data
            
            if self.CUDA:
                p = p.cpu()
            
            top_ch = np.arange(len(self.chars))
            
            p = p.numpy().squeeze()
        
        self.top_k = 5

        return dict(zip(self.chars, p))

    
    def get_next_char(self, char, h=None):
        if self.CUDA:
            self.cuda()
        else:
            self.cpu()
        
        if h is None:
            h = self.init_hidden(1)
        
        x = np.array([[self.char2index[char]]])
        x = one_hot_encode(x, len(self.chars))
        
        inputs = torch.from_numpy(x)
        
        if self.CUDA:
            inputs = inputs.cuda()
        
        h = tuple([each.data for each in h])
        out, h = self.forward(inputs, h)

        p = F.softmax(out, dim=1).data
        
        if self.CUDA:
            p = p.cpu()
        
        if self.top_k is None:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(self.top_k)
            top_ch = top_ch.numpy().squeeze()
        
        p = p.numpy().squeeze()
        
        char = np.random.choice(top_ch, p=p/p.sum())
            
        return self.index2char[char], h, p
    
    
    def generate_text(self, prefix, size):
        if self.CUDA:
            self.net.cuda()
        else:
            self.net.cpu()

        self.net.eval()
        
        # First off, run through the prime characters
        chars = [ch for ch in prefix]
        
        h = self.net.init_hidden(1)
        
        for ch in prefix:
            char, h, _ = self.net.get_next_char(ch, h)

        chars.append(char)
        
        # Now pass in the previous character and get a new one
        for ii in range(size-1):
            
            char, h, _ = self.net.get_next_char(chars[-1], h)
            chars.append(char)

        return ''.join(chars)

    
    def get_overal_prob(self, sentence):
        def get_single_char_prob(char):
            if self.CUDA:
                self.cuda()
            else:
                self.cpu()
            
            h = self.init_hidden(1)
            
            x = np.array([[self.char2index[char]]])
            x = one_hot_encode(x, len(self.chars))
            
            inputs = torch.from_numpy(x)
            
            if self.CUDA:
                inputs = inputs.cuda()
            
            h = tuple([each.data for each in h])
            out, h = self.forward(inputs, h)

            p = F.softmax(out, dim=1).data
            
            if self.CUDA:
                p = p.cpu()
            
            top_ch = np.arange(len(self.chars))
            
            p = p.numpy().squeeze()

            return p
        
        overal_prob = 0
        current_char = sentence[0]

        for char in sentence[1:]:
            probs = get_single_char_prob(current_char)
            probabily = probs[self.char2index[char]]
            overal_prob += math.log2(probabily)
            current_char = char

        return overal_prob

    
    def evaluate(self, ground_truth):
        # get first five words of the ground_truth
        first_five = ' '.join(ground_truth.split()[:5])

        # calculate the number of characters to be generated
        size_to_generate = len(ground_truth) - len(first_five)

        generated_text = self.generate_text(prefix=first_five, size=size_to_generate)

        # calculate the character error rate
        cer = levenshtein(ground_truth, generated_text) / len(ground_truth)

        return cer

## Test the model

In [6]:
print("Train:")
preprocessor_instance = Preprocessor(train)
train_text = preprocessor_instance.col2text()

print("\n\nTest:")
preprocessor_instance_2 = Preprocessor(test)
test_text = preprocessor_instance.col2text()

Train:
Number of all chars = 3721028
Frequency of each char:
{'e': 2000, 'ﻧ': 1, 'ﺨ': 1, 'ض': 9616, ' ': 793194, 'ء': 785, 'غ': 4932, 'ژ': 2304, 'ی': 230377, 'ﺘ': 3, 'ع': 39849, 'ك': 15440, 'ﻣ': 6, 'ة': 709, 'ﺸ': 1, '\\': 4000, 'ﺑ': 3, 'د': 187919, 'ﺟ': 1, 'ﺣ': 1, 'ق': 29055, 'ذ': 5368, 'ﮔ': 1, 'N': 10781, 'ﻌ': 1, 'ى': 6243, '؟': 715, '_': 606, 'ﻦ': 2, 'ö': 3, 'ﻪ': 5, 'و': 166181, 'ﺮ': 3, 'ﻲ': 1, 'ﺎ': 5, 'ﭼ': 1, 'ت': 124002, 'ک': 56235, 'ۆ': 2, 'إ': 798, 'ä': 1, 'ز': 59292, 'ﺼ': 1, 'ﯿ': 1, 'ظ': 4588, 'ﮐ': 2, 'ش': 75228, 'أ': 2010, 'ﻳ': 1, 'ئ': 4215, '.': 23385, 'ل': 77632, 'ط': 13516, 'è': 1, 'خ': 38520, 'م': 172199, 'ﻨ': 2, 'ح': 29106, 'ﻘ': 1, 'پ': 17517, 'ﭘ': 1, 'ﻮ': 3, 'ć': 1, 'ﺒ': 3, 'ﺶ': 1, 'ﻠ': 1, 'ؤ': 330, 's': 2000, 'ﺖ': 2, 'ۀ': 26, 'ر': 247889, 'آ': 20239, 'ج': 32335, 'ﺳ': 2, 'ب': 125236, 'س': 88226, 'ه': 184905, 'ي': 42845, 'ھ': 5, 'ﺴ': 1, 'ـ': 348, 'á': 2, 'گ': 47922, 'ن': 212923, 'ف': 45279, 'ث': 5718, 'چ': 8654, '\n': 4000, 'ا': 425977, 'ﻓ': 1, 'ە': 3, 'ص': 17781, 'ﺗ': 1}


In [7]:
# make an instance from language model
language_model = LanguageModel(text=train_text, CUDA=True)

In [8]:
# train the language model instance
language_model.train_char_LSTM(epochs=5, n_seqs=256, n_steps=5, val_frac=0.1)

Epoch: 1/5... Step: 10... Loss: 3.3181... Val Loss: 3.2081
Epoch: 1/5... Step: 20... Loss: 3.1164... Val Loss: 3.1239
Epoch: 1/5... Step: 30... Loss: 3.0531... Val Loss: 3.0367
Epoch: 1/5... Step: 40... Loss: 2.9599... Val Loss: 2.9653
Epoch: 1/5... Step: 50... Loss: 2.9336... Val Loss: 2.9009
Epoch: 1/5... Step: 60... Loss: 2.8388... Val Loss: 2.8554
Epoch: 1/5... Step: 70... Loss: 2.8325... Val Loss: 2.8178
Epoch: 1/5... Step: 80... Loss: 2.6991... Val Loss: 2.7753
Epoch: 1/5... Step: 90... Loss: 2.7368... Val Loss: 2.7445
Epoch: 1/5... Step: 100... Loss: 2.7132... Val Loss: 2.7276
Epoch: 1/5... Step: 110... Loss: 2.6649... Val Loss: 2.7373
Epoch: 1/5... Step: 120... Loss: 2.6930... Val Loss: 2.7183
Epoch: 1/5... Step: 130... Loss: 2.7260... Val Loss: 2.6550
Epoch: 1/5... Step: 140... Loss: 2.6669... Val Loss: 2.6381
Epoch: 1/5... Step: 150... Loss: 2.5874... Val Loss: 2.6214
Epoch: 1/5... Step: 160... Loss: 2.5829... Val Loss: 2.6151
Epoch: 1/5... Step: 170... Loss: 2.6164... Val Lo

## get_next_states_and_output

In [9]:
cell_state, hidden_state = language_model.get_next_states_and_output('ب')
print(cell_state)
print()
print(hidden_state)

tensor([[ 0.0056, -0.0732,  0.2030, -0.0972,  0.1719,  0.2645,  0.0385,  0.0737,
         -0.0653, -0.0183, -0.0580,  0.0734, -0.2100,  0.0750, -0.1753, -0.2902,
          0.1157, -0.0907,  0.0802, -0.0352, -0.1189, -0.3811, -0.1422, -0.1905,
         -0.0934, -0.2010, -0.0696, -0.3466, -0.0700,  0.0831, -0.1686,  0.1387,
         -0.0587,  0.3436, -0.2360,  0.0078, -0.2036, -0.0653,  0.2795, -0.1667,
         -0.3319,  0.1386,  0.1052, -0.0821,  0.2270, -0.1388,  0.1105,  0.0580,
         -0.0231, -0.0223, -0.0975, -0.0202,  0.0825,  0.0660, -0.0474, -0.2568,
          0.0597,  0.1290,  0.3529, -0.1590,  0.1554,  0.0433,  0.0350, -0.0501,
          0.0112,  0.0717,  0.0158, -0.1837,  0.2491,  0.1181,  0.0098, -0.1834,
         -0.2348, -0.1566, -0.0012, -0.2863, -0.0137, -0.1781,  0.0146, -0.0363,
         -0.1351, -0.2516,  0.0012, -0.1720, -0.5492, -0.1447, -0.0874,  0.0543,
          0.0971, -0.1577,  0.2378, -0.0748,  0.0170]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

(t

## convert_prefix_to_hiddens

In [10]:
hiddens = language_model.convert_prefix_to_hiddens(prefix='انتخا')
print(hiddens)

[(tensor([[[-0.0451, -0.0046,  0.0578,  ...,  0.0198, -0.0254,  0.0128]],

        [[-0.0396,  0.1596, -0.0052,  ..., -0.0539,  0.0951, -0.0335]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward0>), tensor([[[-0.0854, -0.0077,  0.1269,  ...,  0.0571, -0.0607,  0.0412]],

        [[-0.0854,  0.2100, -0.0334,  ..., -0.1234,  0.2649, -0.1666]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward0>)), (tensor([[[-0.0192, -0.0322, -0.0033,  ...,  0.0330, -0.0006,  0.0329]],

        [[ 0.1082, -0.0038, -0.0511,  ..., -0.0226,  0.0195,  0.0591]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward0>), tensor([[[-0.0581, -0.0476, -0.0104,  ...,  0.1378, -0.0050,  0.1219]],

        [[ 0.3407, -0.0198, -0.2660,  ..., -0.0719,  0.0508,  0.3274]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward0>)), (tensor([[[ 0.0567, -0.0723,  0.0059,  ...,  0.1324,  0.0395,  0.0717]],

        [[ 0.0786, -0.0101, -0.0153,  ..., -0.0058, -0.0051,  0.0185]]],
       device='cuda:0', grad_fn=<CudnnRnnBackwar

## 3 samples from get_probs

In [11]:
p1 = language_model.get_probs(prefix='انتخا')
p2 = language_model.get_probs(prefix='امتحا')
p3 = language_model.get_probs(prefix='در طول سال')

print(p1)
print(p2)
print(p3)

{'e': 0.014001585, 'ﻧ': 0.008043705, 'ﺨ': 0.009812635, 'ض': 0.008001358, ' ': 0.007608446, 'ء': 0.014533796, 'غ': 0.0050784545, 'ژ': 0.009847666, 'ی': 0.005795712, 'ﺘ': 0.014476563, 'ع': 0.007929613, 'ك': 0.0104936175, 'ﻣ': 0.012922357, 'ة': 0.015813824, 'ﺸ': 0.004292164, '\\': 0.008525501, 'ﺑ': 0.0071774935, 'د': 0.014703526, 'ﺟ': 0.012297391, 'ﺣ': 0.005877054, 'ق': 0.012397284, 'ذ': 0.008965313, 'ﮔ': 0.008165866, 'N': 0.007774644, 'ﻌ': 0.008102357, 'ى': 0.011933966, '؟': 0.0059410203, '_': 0.008470178, 'ﻦ': 0.011244755, 'ö': 0.0074498365, 'ﻪ': 0.014633584, 'و': 0.011743731, 'ﺮ': 0.00716506, 'ﻲ': 0.0142216105, 'ﺎ': 0.0122018885, 'ﭼ': 0.019577848, 'ت': 0.00781299, 'ک': 0.0075172526, 'ۆ': 0.01468156, 'إ': 0.01091248, 'ä': 0.009198329, 'ز': 0.010382653, 'ﺼ': 0.014356847, 'ﯿ': 0.013576012, 'ظ': 0.034042377, 'ﮐ': 0.00905372, 'ش': 0.018376958, 'أ': 0.01603664, 'ﻳ': 0.011256437, 'ئ': 0.009126205, '.': 0.008775721, 'ل': 0.010402347, 'ط': 0.014709812, 'è': 0.0049991906, 'خ': 0.0049866303, 'م':

## 3 samples from get_next_char

In [12]:
next_char, _, _ = language_model.get_next_char(char="ب")
print(next_char)

ش


In [13]:
next_char, _, _ = language_model.get_next_char(char="ق")
print(next_char)

و


In [14]:
next_char, _, _ = language_model.get_next_char(char="س")
print(next_char)

ظ


## 3 samples from generate_text

In [15]:
language_model.generate_text(prefix='انتخا', size=10)

'انتخاب و ما را '

In [16]:
language_model.generate_text(prefix='شنا در این منطقه', size=20)

'شنا در این منطقه ای که در این بخشی ب'

In [17]:
language_model.generate_text(prefix='اگر در طول ترم درس میخواندی', size=30)

'اگر در طول ترم درس میخواندیانی و مراجعه کنند و با تهمه شو'

## 3 samples from get_overall_prob

In [18]:
language_model.get_overal_prob(sentence='چیپس از پفک خوشمزه تر است')

-159.13538233811025

In [19]:
language_model.get_overal_prob(sentence='کتاب در مقابل من است')

-124.78817348147595

In [20]:
language_model.get_overal_prob(sentence='روز سردی خواهد بود')

-109.76139120375979

## evaluate

In [21]:
character_error_rate = language_model.evaluate(test.iloc[73]['text'])
print(character_error_rate)

0.7030497592295345
