# Working with files using Python

---

# Delete all files except "*.js" files

In [1]:
from pathlib import Path

FOLDER = "express-examples"

# Walk everything beneath FOLDER. Directories are skipped; a file is kept only
# when its suffix is exactly '.js', otherwise it is removed from disk.
# NOTE: unlink() is irreversible, so re-running this cell cannot restore files.
for entry in Path(FOLDER).rglob('*'):
    if not entry.is_file():
        continue
    if entry.suffix == '.js':
        print(f"Remain {entry}")
        continue
    entry.unlink()
    print(f"Deleted {entry}")

Remain express-examples/vhost/index.js
Remain express-examples/cookie-sessions/index.js
Remain express-examples/view-constructor/index.js
Remain express-examples/view-constructor/github-view.js
Remain express-examples/web-service/index.js
Remain express-examples/mvc/index.js
Remain express-examples/mvc/db.js
Remain express-examples/mvc/lib/boot.js
Remain express-examples/mvc/controllers/user/index.js
Remain express-examples/mvc/controllers/pet/index.js
Remain express-examples/mvc/controllers/main/index.js
Remain express-examples/mvc/controllers/user-pet/index.js
Remain express-examples/error/index.js
Remain express-examples/content-negotiation/index.js
Remain express-examples/content-negotiation/db.js
Remain express-examples/content-negotiation/users.js
Remain express-examples/ejs/index.js
Remain express-examples/params/index.js
Remain express-examples/auth/index.js
Remain express-examples/hello-world/index.js
Remain express-examples/resource/index.js
Remain express-examples/route-map/

---

In [2]:
# Collect every path under FOLDER (at any depth) that matches '*.js'.
files = [*Path(FOLDER).rglob('*.js')]

print(len(files))
print(files)

43
[PosixPath('express-examples/vhost/index.js'), PosixPath('express-examples/cookie-sessions/index.js'), PosixPath('express-examples/view-constructor/index.js'), PosixPath('express-examples/view-constructor/github-view.js'), PosixPath('express-examples/web-service/index.js'), PosixPath('express-examples/mvc/index.js'), PosixPath('express-examples/mvc/db.js'), PosixPath('express-examples/mvc/lib/boot.js'), PosixPath('express-examples/mvc/controllers/user/index.js'), PosixPath('express-examples/mvc/controllers/pet/index.js'), PosixPath('express-examples/mvc/controllers/main/index.js'), PosixPath('express-examples/mvc/controllers/user-pet/index.js'), PosixPath('express-examples/error/index.js'), PosixPath('express-examples/content-negotiation/index.js'), PosixPath('express-examples/content-negotiation/db.js'), PosixPath('express-examples/content-negotiation/users.js'), PosixPath('express-examples/ejs/index.js'), PosixPath('express-examples/params/index.js'), PosixPath('express-examples/a

---

# Set of characters from a given file

In [3]:
# Use the first collected file as a sample and list its distinct characters.
file_path = files[0]
print(file_path)

with open(file_path, mode='r', encoding='utf-8') as f:
    content = f.read()

# Deduplicate the characters, then put them in sorted order for printing.
char_set = {ch for ch in content}
sorted_chars = list(char_set)
sorted_chars.sort()

print(len(sorted_chars))
print(sorted_chars)

express-examples/vhost/index.js
50
['\n', ' ', '!', "'", '(', ')', '*', '+', ',', '.', '/', '0', '1', '2', '3', '7', ':', ';', '=', 'E', 'H', 'M', 'R', 'S', 'V', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', '{', '}']


---

# Set of characters from multiple files

In [4]:
import json

DICTS_FILE = '05.dicts.json'

# Accumulate the distinct characters seen across all collected files.
char_set: set[str] = set()
for file in files:
    with open(file, mode='r', encoding='utf-8') as f:
        char_set |= set(f.read())

sorted_chars: list[str] = sorted(char_set)
print(len(sorted_chars))
print(sorted_chars)

# Two lookup tables: character -> id and id -> character, where an id is the
# character's position in sorted_chars.
encode_dict: dict[str, int] = dict(zip(sorted_chars, range(len(sorted_chars))))
decode_dict: dict[int, str] = dict(enumerate(sorted_chars))

print(encode_dict)
print(decode_dict)

# Persist both tables. JSON object keys are always strings, so the integer
# keys of decode_dict are stored as "0", "1", ... and load back as str.
with open(DICTS_FILE, mode='w', encoding='utf-8') as f:
    json.dump({'encode_dict': encode_dict, 'decode_dict': decode_dict}, f)

98
['\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '上', '分', '区', '大', '海', '赛']
{'\n': 0, ' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, '*': 10, '+': 11, ',': 12, '-': 13, '.': 14, '/': 15, '0': 16, '1': 17, '2': 18, '3': 19, '4': 20, '5': 21, '6': 22, '7': 23, '8': 24, ':': 25, ';': 26, '<': 27, '=': 28, '>': 29, '?': 30, '@': 31, 'A': 32, 'B': 33, 'C': 34, 'D': 35, 'E': 36, 'F': 37, 'G': 38, 'H': 39, 'I': 40, 'J': 41, 'K': 42, 'L': 43, 'M': 44, 'N': 45, 'O': 46, 'P': 47, 'R': 48, 'S': 49, 'T': 50, 'U': 51, 'V': 52, 'W': 53, 'X': 54, 'Y': 55, '[': 56,

---

# encode and decode

In [5]:
def encode(s: str) -> list[int]:
    """Translate text into integer ids, one id per character.

    Each character is looked up in the module-level ``encode_dict``; a
    character that is not a key of that dict raises ``KeyError``.
    """
    ids: list[int] = []
    for ch in s:
        ids.append(encode_dict[ch])
    return ids

def decode(arr: list[int]) -> str:
    """Turn a sequence of integer ids back into text.

    Each id is looked up in the module-level ``decode_dict`` and the resulting
    characters are concatenated in order; an id that is not a key of that dict
    raises ``KeyError``.
    """
    chars: list[str] = []
    for idx in arr:
        chars.append(decode_dict[idx])
    return ''.join(chars)

# Round-trip demo: encode a sample string, then decode those same ids.
# Feeding decode() the freshly encoded ids (instead of literal numbers) keeps
# the demo correct when the corpus, and with it the id assignment, changes.
sample_ids = encode("hello")
print(sample_ids)
print(decode(sample_ids))

[69, 66, 73, 73, 76]
hello


---

# Create dataset

In [6]:
def create_dataset(s: str, n: int) -> list[str]:
    """Return every contiguous length-``n`` substring of ``s``, in order.

    Args:
        s: Text to slide the window over.
        n: Window length in characters; must be at least 1.

    Returns:
        The ``len(s) - n + 1`` windows starting at offsets 0, 1, 2, ...;
        an empty list when ``s`` is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # A zero or negative width makes the slice arithmetic below yield
        # empty or truncated strings instead of real windows.
        raise ValueError(f"n must be a positive integer, got {n}")
    return [s[start:start + n] for start in range(len(s) - n + 1)]

# Demo: all 5-character windows of a short sample string.
dataset = create_dataset(s="hello world", n=5)
print(dataset)

['hello', 'ello ', 'llo w', 'lo wo', 'o wor', ' worl', 'world']


---

# Better create_dataset

In [7]:
def create_dataset(s: str, n: int) -> list[list[int]]:
    """Encode ``s`` and return every contiguous length-``n`` window of ids.

    This definition replaces the earlier string-window ``create_dataset``:
    the text is first converted with ``encode`` and the windows are slices of
    the resulting id list.

    Args:
        s: Text to encode and window.
        n: Window length in ids; must be at least 1.

    Returns:
        The ``len(s) - n + 1`` windows starting at offsets 0, 1, 2, ...;
        an empty list when ``s`` is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1.
        KeyError: Propagated from ``encode`` for a character it cannot map.
    """
    if n < 1:
        # A zero or negative width makes the slice arithmetic below yield
        # empty or truncated lists instead of real windows.
        raise ValueError(f"n must be a positive integer, got {n}")
    s_encoded: list[int] = encode(s)
    return [s_encoded[start:start + n] for start in range(len(s_encoded) - n + 1)]

# Demo: the same sample string, now as windows of 5 integer ids each.
dataset = create_dataset(s="hello world", n=5)
print(dataset)

[[69, 66, 73, 73, 76], [66, 73, 73, 76, 1], [73, 73, 76, 1, 84], [73, 76, 1, 84, 76], [76, 1, 84, 76, 79], [1, 84, 76, 79, 73], [84, 76, 79, 73, 65]]


---

# Create dataset from FOLDER

In [8]:
DATASET_PATH = '05.dataset.txt'
WINDOW_SIZE = 30  # number of encoded characters written per dataset line

# Mode 'w' truncates an existing file, so every run starts from an empty
# dataset without a separate exists()/unlink() step before appending.
with open(DATASET_PATH, 'w', encoding='utf-8') as fa:
    for file in files:
        # Read the whole source file and close it before writing anything.
        with open(file, 'r', encoding='utf-8') as f:
            content: str = f.read()
        # Each file is windowed on its own, so no window spans two files.
        dataset: list[list[int]] = create_dataset(content, WINDOW_SIZE)
        # One window per line, ids separated by single spaces.
        for datapoint in dataset:
            fa.write(" ".join(map(str, datapoint)) + "\n")

print(f'dataset saved to {DATASET_PATH}')

dataset saved to 05.dataset.txt
