# Read the encode/decode dicts and the dataset from the 05 files (`05.dicts.json`, `05.dataset.txt`)

In [2]:
import json

def read_dicts(file: str):
    """Load the encode/decode dictionaries stored in a JSON file.

    Args:
        file: Path to a UTF-8 encoded JSON file whose top-level object
            contains the keys ``encode_dict`` and ``decode_dict``.

    Returns:
        A ``(encode_dict, decode_dict)`` tuple holding the two values
        exactly as parsed by ``json.load`` (no key or value conversion).

    Raises:
        KeyError: If the parsed JSON object lacks either key.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return payload['encode_dict'], payload['decode_dict']

def read_dataset(file: str) -> list[list[int]]:
    """Read a dataset of integer sequences, one datapoint per line.

    Each non-blank line is split on whitespace and every token is converted
    with ``int``. Blank or whitespace-only lines are skipped so that they do
    not turn into empty datapoints (an empty datapoint has no last element
    to use as a label).

    Args:
        file: Path to a UTF-8 encoded text file.

    Returns:
        A list with one ``list[int]`` per non-blank line, in file order.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
    """
    with open(file, 'r', encoding='utf-8') as f:
        # ``line.split()`` returns [] for blank lines; the walrus keeps the
        # tokens so each line is split only once.
        dataset: list[list[int]] = [
            [int(token) for token in tokens]
            for line in f
            if (tokens := line.split())
        ]
    return dataset


# Load the two vocabularies from the JSON file and print them for inspection.
# NOTE: JSON object keys are always strings, so decode_dict is keyed by
# '0', '1', ... (strings), while encode_dict maps characters to int ids.
DICTS_FILE = '05.dicts.json'
encode_dict, decode_dict = read_dicts(DICTS_FILE)
print(encode_dict)
print(decode_dict)

# Load the dataset (one list of ints per line of the file) and print its
# size plus a sample from each end.
DATASET_PATH = '05.dataset.txt'
dataset = read_dataset(DATASET_PATH)
print(len(dataset))
print(dataset[:5]) # First 5 datapoints
print(dataset[-5:]) # Last 5 datapoints

{'\n': 0, ' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, '*': 10, '+': 11, ',': 12, '-': 13, '.': 14, '/': 15, '0': 16, '1': 17, '2': 18, '3': 19, '4': 20, '5': 21, '6': 22, '7': 23, '8': 24, ':': 25, ';': 26, '<': 27, '=': 28, '>': 29, '?': 30, '@': 31, 'A': 32, 'B': 33, 'C': 34, 'D': 35, 'E': 36, 'F': 37, 'G': 38, 'H': 39, 'I': 40, 'J': 41, 'K': 42, 'L': 43, 'M': 44, 'N': 45, 'O': 46, 'P': 47, 'R': 48, 'S': 49, 'T': 50, 'U': 51, 'V': 52, 'W': 53, 'X': 54, 'Y': 55, '[': 56, '\\': 57, ']': 58, '^': 59, '_': 60, '`': 61, 'a': 62, 'b': 63, 'c': 64, 'd': 65, 'e': 66, 'f': 67, 'g': 68, 'h': 69, 'i': 70, 'j': 71, 'k': 72, 'l': 73, 'm': 74, 'n': 75, 'o': 76, 'p': 77, 'q': 78, 'r': 79, 's': 80, 't': 81, 'u': 82, 'v': 83, 'w': 84, 'x': 85, 'y': 86, 'z': 87, '{': 88, '|': 89, '}': 90, '~': 91, '上': 92, '分': 93, '区': 94, '大': 95, '海': 96, '赛': 97}
{'0': '\n', '1': ' ', '2': '!', '3': '"', '4': '$', '5': '%', '6': '&', '7': "'", '8': '(', '9': ')', '10': '*', '11': '+', '

---

# Get Mini Batch

In [8]:
import random

def get_mini_batch(
    dataset: list[list[int]],
    n_sample: int,
    n_labels: int,
) -> tuple[
        list[list[int]],
        list[list[int]],
        list[int],
    ]:
    """Draw a random mini batch and split it into inputs and targets.

    ``n_sample`` datapoints are drawn from ``dataset`` with
    ``random.sample`` (i.e. without replacement). For each drawn datapoint
    the last element is the label and everything before it is the input.

    Args:
        dataset: Datapoints to sample from; each must be non-empty.
        n_sample: Number of datapoints to draw.
        n_labels: Length of each one-hot target vector.

    Returns:
        ``(x_train, y_train, labels)`` where ``x_train`` holds the inputs,
        ``y_train`` the one-hot vectors of length ``n_labels`` and
        ``labels`` the raw label ids. A label outside ``range(n_labels)``
        yields an all-zero vector.

    Raises:
        ValueError: If ``n_sample`` is larger than ``len(dataset)`` or
            negative (raised by ``random.sample``).
        IndexError: If a drawn datapoint is empty.
    """
    picked = random.sample(dataset, n_sample)

    x_train: list[list[int]] = []
    y_train: list[list[int]] = []
    labels: list[int] = []
    for sequence in picked:
        target = sequence[-1]
        x_train.append(sequence[:-1])
        labels.append(target)
        # int(True) == 1 and int(False) == 0, giving the one-hot encoding.
        y_train.append([int(position == target) for position in range(n_labels)])

    return x_train, y_train, labels

# Draw a demo mini batch of 5 datapoints; the one-hot width is the number of
# entries in encode_dict.
x_train, y_train, labels = get_mini_batch(dataset, 5, len(encode_dict))
print(x_train)
print(y_train)
print(labels)

# Length of one input sequence and of one one-hot target vector.
print(len(x_train[0]))
print(len(y_train[0]))

[[14, 82, 80, 66, 8, 66, 85, 77, 79, 66, 80, 80, 14, 80, 81, 62, 81, 70, 64, 8, 77, 62, 81, 69, 14, 71, 76, 70, 75], [81, 79, 76, 86, 66, 65, 1, 28, 1, 70, 65, 1, 70, 75, 1, 82, 80, 66, 79, 80, 26, 0, 1, 1, 1, 1, 65, 66, 73], [15, 15, 1, 75, 76, 75, 13, 20, 16, 20, 1, 66, 79, 79, 76, 79, 0, 1, 1, 1, 1, 15, 15, 1, 67, 70, 73, 66, 1], [76, 79, 81, 80, 14, 69, 81, 74, 73, 1, 28, 1, 67, 82, 75, 64, 81, 70, 76, 75, 8, 79, 66, 78, 12, 1, 79, 66, 80], [69, 66, 1, 82, 80, 66, 79, 0, 1, 1, 69, 62, 80, 69, 8, 88, 1, 77, 62, 80, 80, 84, 76, 79, 65, 25, 1, 77, 62]]
[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0