In [4]:
import pandas as pd
import tensorflow_hub as hub
import numpy as np
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
file_path = '/content/drive/My Drive/shopee.csv'

In [41]:
data = pd.read_csv(file_path, sep=';')
data.head()

Unnamed: 0,Label,Product ID,Title,Harga,Asal Kota
0,Meja,11001,Damaindah Meja Belajar Kayu Set Kursi / Meja B...,155.0,Tangerang
1,Meja,11002,Homedoki Meja / Meja Makan / Meja Komputer / M...,124.0,Tangerang
2,Meja,11003,Sakula Meja kantor meja kerja Meja Komputer Pe...,107.0,Kab. Gresik
3,Meja,11004,Meja Portable Stand Laptop Meja Laptop Standin...,99.5,Surabaya
4,Meja,11005,PiPi Furniture Meja Gaming / Meja komputer / M...,446.0,Surabaya


In [67]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM,RepeatVector, TimeDistributed, Dense


In [123]:
texts = data['Title'].astype(str).tolist()  # Ensure the column is converted to strings

max_sequence_length = 51  # Maximum length of sequences
embedding_dim = 1352  # Embedding dimension for word vectors
lstm_units = 64  # Number of units in the LSTM layers

# Tokenize the text data
vocabulary=tokenizer.word_index
tokenizer = Tokenizer(num_words=len(vocabulary) + 1)  # +1 for padding
tokenizer.fit_on_texts(texts)
encoded_sequences = tokenizer.texts_to_sequences(texts)
max_sequence_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(encoded_sequences, maxlen=max_sequence_length)

# Prepare input-output pairs for training
input_texts = np.array([seq[:51] for seq in padded_sequences])
output_texts = np.array([seq[1:] for seq in padded_sequences])
output_texts = np.array([item for sublist in output_texts for item in sublist])
output_texts = np.expand_dims(output_texts, axis=-1)
# Debug: Print shapes of input and output data
print(f'Input texts shape: {input_texts.shape}')
print(f'Output texts shape: {output_texts.shape}')

# Define the model
model = tf.keras.Sequential([
  Embedding(len(vocabulary) + 1, embedding_dim, input_length=max_sequence_length),
  LSTM(lstm_units, return_sequences=True),  # Encoder - return sequences for attention
  # Insert a Dense layer to convert LSTM output to word indices
  Dense(len(vocabulary) + 1, activation='softmax'),
  LSTM(lstm_units),
  Dense(len(vocabulary) + 1, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

Input texts shape: (1352, 51)
Output texts shape: (67600, 1)
Model: "sequential_32"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_34 (Embedding)    (None, 51, 1352)          3120416   
                                                                 
 lstm_47 (LSTM)              (None, 51, 64)            362752    
                                                                 
 dense_58 (Dense)            (None, 51, 2308)          150020    
                                                                 
 lstm_48 (LSTM)              (None, 64)                607488    
                                                                 
 dense_59 (Dense)            (None, 2308)              150020    
                                                                 
Total params: 4390696 (16.75 MB)
Trainable params: 4390696 (16.75 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [124]:
history = model.fit(input_texts, output_texts, epochs=2, batch_size=32, validation_split=0.2)

Epoch 1/2
Epoch 2/2


In [126]:
def predict_autocomplete(model, tokenizer, input_text, max_sequence_length):
    input_sequence = tokenizer.texts_to_sequences([input_text])
    padded_input = pad_sequences(input_sequence, maxlen=max_sequence_length )
    predictions = model.predict(padded_input)
    predicted_indices = np.argmax(predictions, axis=-1)
    predicted_words = [tokenizer.index_word.get(index, '') for index in predicted_indices]
    return predicted_words

# Example prediction
input_text = "kasur"
predicted_words = predict_autocomplete(model, tokenizer, input_text, max_sequence_length)
print(f"Predicted words: {predicted_words}")


Predicted words: ['']


In [None]:
#!/usr/bin/env python

"""
Parse all files and write to a single file
"""
import os
from pathlib import Path
from typing import List, NamedTuple

from labml import logger, monit

from parser import tokenizer
from parser.tokenizer import encode, parse_string

COMMENT = '#'
MULTI_COMMENT = '"""'


class _PythonFile(NamedTuple):
    relative_path: str
    project: str
    path: Path


class _GetPythonFiles:
    """
    Get list of python files and their paths inside `data/source` folder
    """

    def __init__(self):
        self.source_path = Path(os.getcwd()) / 'data' / 'source'
        self.files: List[_PythonFile] = []
        self.get_python_files(self.source_path)

        logger.inspect([f.path for f in self.files])

    def add_file(self, path: Path):
        """
        Add a file to the list of tiles
        """
        project = path.relative_to(self.source_path).parents
        project = project[len(project) - 2]
        relative_path = path.relative_to(self.source_path / project)

        self.files.append(_PythonFile(relative_path=str(relative_path),
                                      project=str(project),
                                      path=path))

    def get_python_files(self, path: Path):
        """
        Recursively collect files
        """
        for p in path.iterdir():
            if p.is_dir():
                self.get_python_files(p)
            else:
                if p.suffix == '.py':
                    self.add_file(p)


def _fix_indentation(parsed: List[tokenizer.ParsedToken]) -> List[tokenizer.ParsedToken]:
    """
    Change indentation tokens. Remove `DEDENT` tokens and
    add `INDENT` tokens to each line.
    This is easier for prediction.
    """
    res: List[tokenizer.ParsedToken] = []
    indentation = 0
    indented = False
    for t in parsed:
        if t.type == tokenizer.TokenType.indent:
            indentation += 1
        elif t.type == tokenizer.TokenType.dedent:
            indentation -= 1
        elif t.type in [tokenizer.TokenType.new_line,
                        tokenizer.TokenType.eof]:
            indented = False
            res.append(t)
        else:
            if not indented:
                for _ in range(indentation):
                    res.append(tokenizer.ParsedToken(tokenizer.TokenType.indent, 0))
                indented = True

            res.append(t)

    return res


def _remove_comments(parsed: List[tokenizer.ParsedToken]) -> List[tokenizer.ParsedToken]:
    """
    Remove comment tokens
    """
    res = []
    for p in parsed:
        if p.type == tokenizer.TokenType.comment:
            continue
        else:
            res.append(p)

    return res


def _remove_empty_lines(parsed: List[tokenizer.ParsedToken]) -> List[tokenizer.ParsedToken]:
    """
    Remove empty lines
    """

    tokens = [tokenizer.TokenType.new_line, tokenizer.TokenType.new_line]
    res = []
    for p in parsed:
        for i in range(1):
            tokens[i] = tokens[i + 1]
        tokens[-1] = p.type
        all_new_line = True
        for t in tokens:
            if t != tokenizer.TokenType.new_line:
                all_new_line = False

        if all_new_line:
            continue
        else:
            res.append(p)

    return res


def _read_file(path: Path) -> List[int]:
    """
    Read and encode a file
    """
    with open(str(path)) as f:
        content = f.read()

    parsed = parse_string(content)
    parsed = _remove_comments(parsed)
    parsed = _remove_empty_lines(parsed)
    parsed = _fix_indentation(parsed)
    serialized = encode(parsed)

    # deserialized = tokenizer.deserialize(serialized)
    # for i in range(len(serialized)):
    #     assert deserialized[i] == parsed[i]
    #
    # res = to_text(deserialized)
    # print(res)

    return serialized


def main():
    source_files = _GetPythonFiles().files

    logger.inspect(source_files)

    with open(str(Path(os.getcwd()) / 'data' / 'all.py'), 'w') as f:
        for i, source in monit.enum("Parse", source_files):
            serialized = _read_file(source.path)
            # return
            serialized = [str(t) for t in serialized]
            f.write(f"{str(source.path)}\n")
            f.write(" ".join(serialized) + "\n")


if __name__ == '__main__':
    main()

In [35]:
import math
import time
import tokenize
from io import BytesIO
from typing import NamedTuple, List, Tuple

import torch
import torch.nn
from labml import experiment, monit, logger
from labml.logger import Text, Style

import parser.load
import parser.tokenizer
from model import SimpleLstmModel
from parser import tokenizer

# Experiment configuration to load checkpoints
experiment.create(name="simple_lstm",
                  comment="Simple LSTM")

# device to evaluate on
device = torch.device("cuda:0")

# Beam search
BEAM_SIZE = 8


class Suggestions(NamedTuple):
    codes: List[List[int]]
    matched: List[int]
    scores: List[float]


class ScoredItem(NamedTuple):
    score: float
    idx: Tuple


class Predictor:
    """
    Predicts the next few characters
    """

    NEW_LINE_TOKENS = {tokenize.NEWLINE, tokenize.NL}
    INDENT_TOKENS = {tokenize.INDENT, tokenize.DEDENT}

    def __init__(self, model, lstm_layers, lstm_size):
        self.__model = model

        # Initial state
        self._h0 = torch.zeros((lstm_layers, 1, lstm_size), device=device)
        self._c0 = torch.zeros((lstm_layers, 1, lstm_size), device=device)

        # Last line of source code read
        self._last_line = ""

        self._tokens: List[tokenize.TokenInfo] = []

        # Last token, because we need to input that to the model for inference
        self._last_token = 0

        # Last bit of the input string
        self._untokenized = ""

        # For timing
        self.time_add = 0
        self.time_predict = 0
        self.time_check = 0

    def __clear_tokens(self, lines: int):
        """
        Clears old lines from tokens
        """
        for i, t in enumerate(self._tokens):
            if t.type in self.NEW_LINE_TOKENS:
                lines -= 1
                if lines == 0:
                    self._tokens = self._tokens[i + 1:]
                    return

        raise RuntimeError()

    def __clear_untokenized(self, tokens):
        """
        Remove tokens not properly tokenized;
         i.e. the last token, unless it's a new line
        """

        limit = 0
        for i in reversed(range(len(tokens))):
            if tokens[i].type in self.NEW_LINE_TOKENS:
                limit = i + 1
                break
            else:
                limit = i
                break

        return tokens[:limit]

    @staticmethod
    def __get_tokens(it):
        tokens: List[tokenize.TokenInfo] = []

        try:
            for t in it:
                if t.type in tokenizer.SKIP_TOKENS:
                    continue
                if t.type == tokenize.NEWLINE and t.string == '':
                    continue
                if t.type == tokenize.DEDENT:
                    continue
                if t.type == tokenize.ERRORTOKEN:
                    continue
                tokens.append(t)
        except tokenize.TokenError as e:
            if not e.args[0].startswith('EOF in'):
                print(e)
        except IndentationError as e:
            print(e)

        return tokens

    def add(self, content):
        """
        Add a string of code, this shouldn't have multiple lines
        """
        start_time = time.time()
        self._last_line += content

        # Remove old lines
        lines = self._last_line.split("\n")
        if len(lines) > 1:
            assert len(lines) <= 3
            if lines[-1] == '':
                if len(lines) > 2:
                    self.__clear_tokens(len(lines) - 2)
                    lines = lines[-2:]
            else:
                self.__clear_tokens(len(lines) - 1)
                lines = lines[-1:]

        line = '\n'.join(lines)

        self._last_line = line

        # Parse the last line
        tokens_it = tokenize.tokenize(BytesIO(self._last_line.encode('utf-8')).readline)
        tokens = self.__get_tokens(tokens_it)

        # Remove last token
        tokens = self.__clear_untokenized(tokens)

        # Check if previous tokens is a prefix
        assert len(tokens) >= len(self._tokens)

        for t1, t2 in zip(self._tokens, tokens):
            assert t1.type == t2.type
            assert t1.string == t2.string

        # Get the untokenized string
        if len(tokens) > 0:
            assert tokens[-1].end[0] == 1
            self._untokenized = line[tokens[-1].end[1]:]
        else:
            self._untokenized = line

        # Update previous tokens and the model state
        if len(tokens) > len(self._tokens):
            self.__update_state(tokens[len(self._tokens):])
            self._tokens = tokens

        self.time_add += time.time() - start_time

    def get_predictions(self, codes_batch: List[List[int]]):
        # Sequence length and batch size
        seq_len = len(codes_batch[0])
        batch_size = len(codes_batch)

        for codes in codes_batch:
            assert seq_len == len(codes)

        # Input to the model
        x = torch.tensor(codes_batch, device=device)
        x = x.transpose(0, 1)

        # Expand state
        h0 = self._h0.expand(-1, batch_size, -1).contiguous()
        c0 = self._c0.expand(-1, batch_size, -1).contiguous()

        # Get predictions
        prediction, _, _ = self.__model(x, h0, c0)

        assert prediction.shape == (seq_len, len(codes_batch), tokenizer.VOCAB_SIZE)

        # Final prediction
        prediction = prediction[-1, :, :]

        return prediction.detach().cpu().numpy()

    def get_suggestion(self) -> str:
        # Start of with the last token
        suggestions = [Suggestions([[self._last_token]],
                                   [0],
                                   [1.])]

        # Do a beam search, up to the untokenized string length and 10 more
        for step in range(10 + len(self._untokenized)):
            sugg = suggestions[step]
            batch_size = len(sugg.codes)

            # Break if empty
            if batch_size == 0:
                break

            # Get predictions
            start_time = time.time()
            predictions = self.get_predictions(sugg.codes)
            self.time_predict += time.time() - start_time

            start_time = time.time()
            # Get all choices
            choices = []
            for idx in range(batch_size):
                for code in range(tokenizer.VOCAB_SIZE):
                    score = sugg.scores[idx] * predictions[idx, code]
                    choices.append(ScoredItem(
                        score * math.sqrt(sugg.matched[idx] + tokenizer.LENGTHS[code]),
                        (idx, code)))
            # Sort them
            choices.sort(key=lambda x: x.score, reverse=True)

            # Collect the ones that match untokenized string
            codes = []
            matches = []
            scores = []
            len_untokenized = len(self._untokenized)

            for choice in choices:
                prev_idx = choice.idx[0]
                code = choice.idx[1]

                token = tokenizer.DESERIALIZE[code]
                if token.type in tokenizer.LINE_BREAK:
                    continue

                # Previously mached length
                matched = sugg.matched[prev_idx]

                if matched >= len_untokenized:
                    # Increment the length if already matched
                    matched += tokenizer.LENGTHS[code]
                else:
                    # Otherwise check if the new token string matches
                    unmatched = tokenizer.DECODE[code][sugg.codes[prev_idx][-1]]
                    to_match = self._untokenized[matched:]

                    if len(unmatched) < len(to_match):
                        if not to_match.startswith(unmatched):
                            continue
                        else:
                            matched += len(unmatched)
                    else:
                        if not unmatched.startswith(to_match):
                            continue
                        else:
                            matched += len(unmatched)

                # Collect new item
                codes.append(sugg.codes[prev_idx] + [code])
                matches.append(matched)
                score = sugg.scores[prev_idx] * predictions[prev_idx, code]
                scores.append(score)

                # Stop at `BEAM_SIZE`
                if len(scores) == BEAM_SIZE:
                    break

            suggestions.append(Suggestions(codes, matches, scores))

            self.time_check += time.time() - start_time

        # Collect suggestions of all lengths
        choices = []
        for s_idx, sugg in enumerate(suggestions):
            batch_size = len(sugg.codes)
            for idx in range(batch_size):
                length = sugg.matched[idx] - len(self._untokenized)
                if length <= 2:
                    continue
                choice = sugg.scores[idx] * math.sqrt(length - 1)
                choices.append(ScoredItem(choice, (s_idx, idx)))
        choices.sort(key=lambda x: x.score, reverse=True)

        # Return the best option
        for choice in choices:
            codes = suggestions[choice.idx[0]].codes[choice.idx[1]]
            res = ""
            prev = self._last_token
            for code in codes[1:]:
                res += tokenizer.DECODE[code][prev]
                prev = code

            res = res[len(self._untokenized):]

            # Skip if blank
            if res.strip() == "":
                continue

            return res

        # Return blank if there are no options
        return ''

    def __update_state(self, tokens):
        """
        Update model state
        """
        data = parser.tokenizer.parse(tokens)
        data = parser.tokenizer.encode(data)
        x = [self._last_token] + data[:-1]
        self._last_token = data[-1]

        x = torch.tensor([x], device=device)
        x = x.transpose(0, 1)
        _, _, (hn, cn) = self.__model(x, self._h0, self._c0)
        self._h0 = hn.detach()
        self._c0 = cn.detach()


class Evaluator:
    def __init__(self, model, file: parser.load.EncodedFile,
                 lstm_layers, lstm_size,
                 skip_spaces=False):
        self.__content = self.get_content(file.codes)
        self.__skip_spaces = skip_spaces
        self.__predictor = Predictor(model, lstm_layers, lstm_size)

    @staticmethod
    def get_content(codes: List[int]):
        tokens = parser.tokenizer.decode(codes)
        content = parser.tokenizer.to_string(tokens)
        return content.split('\n')

    def eval(self):
        keys_saved = 0

        for line, content in enumerate(self.__content):
            # Keep reference to rest of the line
            rest_of_line = content

            # Build the line for logging with colors
            # The line number
            logs = [(f"{line: 4d}: ", Text.meta)]

            # Type the line character by character
            while rest_of_line != '':
                suggestion = self.__predictor.get_suggestion()

                # If suggestion matches
                if suggestion != '' and rest_of_line.startswith(suggestion):
                    # Log
                    logs.append((suggestion[0], [Style.underline, Text.danger]))
                    logs.append((suggestion[1:], Style.underline))

                    keys_saved += len(suggestion) - 1

                    # Skip the prediction text
                    rest_of_line = rest_of_line[len(suggestion):]

                    # Add text to the predictor
                    self.__predictor.add(suggestion)

                # If the suggestion doesn't match
                else:
                    # Add the next character
                    self.__predictor.add(rest_of_line[0])
                    logs.append((rest_of_line[0], Text.subtle))
                    rest_of_line = rest_of_line[1:]

            # Add a new line
            self.__predictor.add("\n")

            # Log the line
            logger.log(logs)

        # Log time taken for the file
        logger.inspect(add=self.__predictor.time_add,
                       check=self.__predictor.time_check,
                       predict=self.__predictor.time_predict)

        total_keys = sum([len(c) for c in self.__content])
        logger.inspect(keys_saved=keys_saved,
                       percentage_saved=100 * keys_saved / total_keys,
                       total_keys=total_keys,
                       total_lines=len(self.__content))


def main():
    lstm_size = 1024
    lstm_layers = 3

    with monit.section("Loading data"):
        files = parser.load.load_files()
        train_files, valid_files = parser.load.split_train_valid(files, is_shuffle=False)

    with monit.section("Create model"):
        model = SimpleLstmModel(encoding_size=tokenizer.VOCAB_SIZE,
                                embedding_size=tokenizer.VOCAB_SIZE,
                                lstm_size=lstm_size,
                                lstm_layers=lstm_layers)
        model.to(device)

    experiment.add_pytorch_models({'base': model})

    experiment.load("2a86d636936d11eab8740dffb016e7b1", 72237)

    # For debugging with a specific piece of source code
    # predictor = Predictor(model, lstm_layers, lstm_size)
    # for s in ['""" """\n', "from __future__"]:
    #     predictor.add(s)
    # s = predictor.get_suggestion()

    # Evaluate all the files in validation set
    for file in valid_files:
        logger.log(str(file.path), Text.heading)
        evaluator = Evaluator(model, file,
                              lstm_layers, lstm_size,
                              skip_spaces=True)
        evaluator.eval()


if __name__ == '__main__':
    main()

ModuleNotFoundError: No module named 'parser'

In [None]:
!pip install parser


In [None]:
import math
from typing import List

import numpy as np
import torch
import torch.nn
from labml import experiment, monit, tracker, logger
from labml.utils.delayed_keyboard_interrupt import DelayedKeyboardInterrupt

import parser.load
from model import SimpleLstmModel
from parser import tokenizer

# Setup the experiment
experiment.create(name="simple_lstm",
                  comment="Simple LSTM")

# device to train on
device = torch.device("cuda:0")


def list_to_batches(x, batch_size, batches, seq_len):
    """
    Prepare flat data into batches to be ready for the model to consume
    """
    x = np.reshape(x, (batch_size, batches, seq_len))
    x = np.transpose(x, (1, 2, 0))

    return x


def get_batches(files: List[parser.load.EncodedFile], eof: int, batch_size=32, seq_len=32):
    """
    Covert raw encoded files into trainin/validation batches
    """

    # Shuffle the order of files
    np.random.shuffle(files)

    # Concatenate all the files whilst adding `eof` marker at the beginnings
    data = []
    for f in files:
        data.append(eof)
        data += f.codes
    data = np.array(data)

    # Start from a random offset
    offset = np.random.randint(seq_len * batch_size)
    data = data[offset:]

    # Number of batches
    batches = (len(data) - 1) // batch_size // seq_len

    # Extract input
    x = data[:(batch_size * seq_len * batches)]
    # Extract output, i.e. the next char
    y = data[1:(batch_size * seq_len * batches) + 1]

    # Covert the flat data into batches
    x = list_to_batches(x, batch_size, batches, seq_len)
    y = list_to_batches(y, batch_size, batches, seq_len)

    return x, y


class Trainer:
    """
    This will maintain states, data and train/validate the model
    """

    def __init__(self, *, files: List[parser.load.EncodedFile],
                 model, loss_func, optimizer,
                 eof: int,
                 batch_size: int, seq_len: int,
                 is_train: bool,
                 h0, c0):
        # Get batches
        x, y = get_batches(files, eof,
                           batch_size=batch_size,
                           seq_len=seq_len)
        # Covert data to PyTorch tensors
        self.x = torch.tensor(x, device=device)
        self.y = torch.tensor(y, device=device)

        # Initial state
        self.hn = h0
        self.cn = c0

        self.model = model
        self.loss_func = loss_func
        self.optimizer = optimizer
        self.p = None
        self.is_train = is_train

    def run(self, i):
        # Get model output
        self.p, logits, (self.hn, self.cn) = self.model(self.x[i], self.hn, self.cn)

        # Flatten outputs
        logits = logits.view(-1, self.p.shape[-1])
        yi = self.y[i].reshape(-1)

        # Calculate loss
        loss = self.loss_func(logits, yi)

        # Store the states
        self.hn = self.hn.detach()
        self.cn = self.cn.detach()

        if self.is_train:
            # Take a training step
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            tracker.add("train.loss", loss.cpu().data.item())
        else:
            tracker.add("valid.loss", loss.cpu().data.item())


def main_train():
    lstm_size = 1024
    lstm_layers = 3
    batch_size = 32
    seq_len = 32

    with monit.section("Loading data"):
        # Load all python files
        files = parser.load.load_files()
        # Split training and validation data
        train_files, valid_files = parser.load.split_train_valid(files, is_shuffle=False)

    with monit.section("Create model"):
        # Create model
        model = SimpleLstmModel(encoding_size=tokenizer.VOCAB_SIZE,
                                embedding_size=tokenizer.VOCAB_SIZE,
                                lstm_size=lstm_size,
                                lstm_layers=lstm_layers)
        # Move model to `device`
        model.to(device)

        # Create loss function and optimizer
        loss_func = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters())

    # Initial state is 0
    h0 = torch.zeros((lstm_layers, batch_size, lstm_size), device=device)
    c0 = torch.zeros((lstm_layers, batch_size, lstm_size), device=device)

    # Setup logger indicators
    tracker.set_queue("train.loss", queue_size=500, is_print=True)
    tracker.set_queue("valid.loss", queue_size=500, is_print=True)

    # Specify the model in [lab](https://github.com/vpj/lab) for saving and loading
    experiment.add_pytorch_models({'base': model})

    # Start training scratch (step '0')
    experiment.start()

    # Number of batches per epoch
    batches = math.ceil(sum([len(f[1]) + 1 for f in train_files]) / (batch_size * seq_len))

    # Number of steps per epoch. We train and validate on each step.
    steps_per_epoch = 200

    # Train for 100 epochs
    for epoch in monit.loop(range(100)):
        # Create trainer
        trainer = Trainer(files=train_files,
                          model=model,
                          loss_func=loss_func,
                          optimizer=optimizer,
                          batch_size=batch_size,
                          seq_len=seq_len,
                          is_train=True,
                          h0=h0,
                          c0=c0,
                          eof=0)
        # Create validator
        validator = Trainer(files=valid_files,
                            model=model,
                            loss_func=loss_func,
                            optimizer=optimizer,
                            is_train=False,
                            seq_len=seq_len,
                            batch_size=batch_size,
                            h0=h0,
                            c0=c0,
                            eof=0)

        # Next batch to train and validation
        train_batch = 0
        valid_batch = 0

        # Loop through steps
        for i in range(1, steps_per_epoch):
            try:
                with DelayedKeyboardInterrupt():
                    # Set global step
                    global_step = epoch * batches + min(batches, (batches * i) // steps_per_epoch)
                    tracker.set_global_step(global_step)

                    # Last batch to train and validate
                    train_batch_limit = trainer.x.shape[0] * min(1., (i + 1) / steps_per_epoch)
                    valid_batch_limit = validator.x.shape[0] * min(1., (i + 1) / steps_per_epoch)

                    with monit.section("train", total_steps=trainer.x.shape[0], is_partial=True):
                        model.train()
                        # Train
                        while train_batch < train_batch_limit:
                            trainer.run(train_batch)
                            monit.progress(train_batch + 1)
                            train_batch += 1

                    with monit.section("valid", total_steps=validator.x.shape[0], is_partial=True):
                        model.eval()
                        # Validate
                        while valid_batch < valid_batch_limit:
                            validator.run(valid_batch)
                            monit.progress(valid_batch + 1)
                            valid_batch += 1

                    # Output results
                    tracker.save()

                    # 10 lines of logs per epoch
                    if (i + 1) % (steps_per_epoch // 10) == 0:
                        logger.log()
            except KeyboardInterrupt:
                experiment.save_checkpoint()
                return

        experiment.save_checkpoint()


if __name__ == '__main__':
    main_train()