# Testing Hyperparameters

In [4]:
%load_ext autoreload
%autoreload 2
import os
from src.Utils import Utils
from src import LSTM, RNN, Config, TextDataset, Sampling, TextGeneration
import tensorflow as tf

# Project helper; presumably silences TensorFlow's log output — confirm in src.Utils.
Utils.tensorflow_shutup()
# Seed once for the whole notebook. Per the Keras documentation this call
# seeds Python's `random`, NumPy and TensorFlow together.
tf.keras.utils.set_random_seed(0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Reading TextDataset

In [None]:
# Baseline hyperparameters for every experiment in this notebook. The search
# cells below overwrite WINDOW_SIZE, STRIDE and HIDDEN_STATE_SIZE on this
# object in place, so re-run this cell to get back to the baseline.
#
# `config` is bound directly to the Config object (rather than first to a
# plain dict and then rebound) so the name only ever refers to one type.
config = Config(
    DATA={
        "WINDOW_SIZE": 25,
        "STRIDE": 3,
        "DATA_PATH": os.path.join("data", "beatles1000.txt"),
    },
    TRAINING={
        # A list; presumably one entry per recurrent layer (the Multi-Layer
        # cell passes [500, 500]) — confirm against LSTM.get_LSTM.
        "HIDDEN_STATE_SIZE": [100],
        "BATCH_SIZE": 1000,
        "EPOCHS": 50,
        "SAMPLING_TEMP": 0.75,
        # NOTE(review): the recorded output lists BUFFER_SIZE on DataConfig
        # although it is passed under TRAINING here — confirm how Config
        # routes this key.
        "BUFFER_SIZE": 10000,
    },
)

DataConfig(WINDOW_SIZE=25, STRIDE=3, DATA_PATH='data/beatles1000.txt', BUFFER_SIZE=10000, VOCAB_SIZE=0)
TrainingConfig(HIDDEN_STATE_SIZE=[100], BATCH_SIZE=1000, EPOCHS=50, SAMPLING_TEMP=0.75)


## Best Window Size: 75

In [14]:
# Sweep the context window length with every other setting held fixed.
# Candidates are ranked by the last value of the "acc" series in the training
# history returned by model.train().
initial_ws = config.DATA.WINDOW_SIZE  # fallback if no candidate scores above 0
best_acc, best_ws = 0, None
for ws in [10, 25, 50, 75, 100]:
    # Re-seed before every candidate (same seed as the setup cell) so each run
    # starts from the same RNG state and its score does not depend on how many
    # runs came before it in the kernel.
    tf.keras.utils.set_random_seed(0)
    config.DATA.WINDOW_SIZE = ws
    # The dataset is rebuilt from the updated config for every candidate.
    data = TextDataset(config, verbosity=1).read()
    model = TextGeneration(config, LSTM.get_LSTM)
    history = model.train(data, config)
    acc = history.history["acc"][-1]
    if acc > best_acc:
        best_acc, best_ws = acc, ws
    print("=" * 100)
    print(f"# BEST_ACC = {best_acc}\tBEST_WS = {best_ws}")
    print("=" * 100)
# Persist the winner for the following cells. If nothing beat the initial
# score of 0 (e.g. the accuracy was NaN), restore the starting value instead
# of writing None into the config.
config.DATA.WINDOW_SIZE = best_ws if best_ws is not None else initial_ws

# BEST_ACC = 0	BEST_WS = None
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=10	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [00:46<00:00,  1.07it/s, loss=2.25, acc=0.355, gen=b'o?\nond,\nsou song bis in\xe2\x80\x99t ']


# BEST_ACC = 0.35504546761512756	BEST_WS = 10
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=25	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [00:58<00:00,  1.17s/it, loss=2.13, acc=0.387, gen=b'u cale gott ingiml\nyou she']


# BEST_ACC = 0.386680006980896	BEST_WS = 25
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=50	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:05<00:00,  1.31s/it, loss=2.07, acc=0.406, gen=b' seve you mant ho the meve']  


# BEST_ACC = 0.4056636393070221	BEST_WS = 50
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:17<00:00,  1.54s/it, loss=2.03, acc=0.415, gen=b'me.\nwally will the me\nbone'] 


# BEST_ACC = 0.4145781695842743	BEST_WS = 75
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=100	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:32<00:00,  1.84s/it, loss=2.03, acc=0.414, gen=b'\nshe round bote hea ther g']       

# BEST_ACC = 0.4145781695842743	BEST_WS = 75





## Best Stride: 3

In [15]:
# Sweep the stride between consecutive training windows, using the window
# size currently stored in the config. Candidates are ranked by the last
# value of the "acc" series in the training history.
initial_st = config.DATA.STRIDE  # fallback if no candidate scores above 0
best_acc, best_st = 0, None
for st in [2, 3, 5, 8]:
    # Re-seed before every candidate (same seed as the setup cell) so the
    # comparison does not depend on the order in which candidates are run.
    tf.keras.utils.set_random_seed(0)
    config.DATA.STRIDE = st
    # The dataset is rebuilt from the updated config for every candidate.
    data = TextDataset(config, verbosity=1).read()
    model = TextGeneration(config, LSTM.get_LSTM)
    history = model.train(data, config)
    acc = history.history["acc"][-1]
    if acc > best_acc:
        best_acc, best_st = acc, st
    print("=" * 100)
    print(f"# BEST_ACC = {best_acc}\tBEST_ST = {best_st}")
    print("=" * 100)
# Persist the winner; fall back to the starting value rather than storing
# None when no candidate scored above 0.
config.DATA.STRIDE = best_st if best_st is not None else initial_st

# BEST_ACC = 0	BEST_ST = None
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:18<00:00,  1.58s/it, loss=2.05, acc=0.41, gen=b"onla te light fry i'my wal"]  


# BEST_ACC = 0.4097381830215454	BEST_ST = 3
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=5
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [00:52<00:00,  1.06s/it, loss=2.34, acc=0.335, gen=b'ss awin with thos yonghey ']   


# BEST_ACC = 0.4097381830215454	BEST_ST = 3
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=8
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [00:38<00:00,  1.29it/s, loss=2.59, acc=0.28, gen=b' boun yorngt wii len shahi']     


# BEST_ACC = 0.4097381830215454	BEST_ST = 3
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=10
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [00:32<00:00,  1.54it/s, loss=2.78, acc=0.239, gen=b' te ,t cahhl  wyee  ehog i']                                   

# BEST_ACC = 0.4097381830215454	BEST_ST = 3





## Best Hidden State Size: 250 (best in the logged run)

In [16]:
# Sweep the hidden state size of a single-layer network. Candidates are
# ranked by the last value of the "acc" series in the training history.
initial_hs = config.TRAINING.HIDDEN_STATE_SIZE  # fallback if nothing scores above 0
best_acc, best_hs = 0, None
for hs in [25, 50, 100, 150, 200, 250, 300, 400, 500]:
    # Re-seed before every candidate (same seed as the setup cell) so the
    # comparison does not depend on the order in which candidates are run.
    tf.keras.utils.set_random_seed(0)
    # HIDDEN_STATE_SIZE is a list everywhere else in this notebook, so the
    # scalar candidate is wrapped.
    config.TRAINING.HIDDEN_STATE_SIZE = [hs]
    data = TextDataset(config, verbosity=1).read()
    model = TextGeneration(config, LSTM.get_LSTM)
    history = model.train(data, config)
    acc = history.history["acc"][-1]
    if acc > best_acc:
        best_acc, best_hs = acc, hs
    print("=" * 100)
    print(f"# BEST_ACC = {best_acc}\tBEST_HS = {best_hs}")
    print("=" * 100)
# Persist the winner in the same list form used inside the loop (the original
# stored the bare int here). Fall back to the starting value rather than
# storing None when no candidate scored above 0.
config.TRAINING.HIDDEN_STATE_SIZE = [best_hs] if best_hs is not None else initial_hs

# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:16<00:00,  1.54s/it, loss=2.62, acc=0.268, gen=b'e kol \nawtos yoal thre hin']  


# BEST_ACC = 0.2683854401111603	BEST_HS = 25
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:18<00:00,  1.58s/it, loss=2.29, acc=0.347, gen=b" boctase pang you do't bud"]   


# BEST_ACC = 0.34672847390174866	BEST_HS = 50
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:18<00:00,  1.58s/it, loss=2.06, acc=0.408, gen=b's fire chare\nwad tow trige']              


# BEST_ACC = 0.40801090002059937	BEST_HS = 100
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:21<00:00,  1.63s/it, loss=1.84, acc=0.469, gen=b'u wont iwd n aithas you lo']  


# BEST_ACC = 0.46907395124435425	BEST_HS = 150
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:25<00:00,  1.70s/it, loss=1.75, acc=0.496, gen=b'lak i the  heresthas  hell'] 


# BEST_ACC = 0.49640849232673645	BEST_HS = 200
# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [01:27<00:00,  1.74s/it, loss=1.54, acc=0.558, gen=b"ck ird don'to konm he sing"] 

# BEST_ACC = 0.5583575963973999	BEST_HS = 250





## Multi-Layer

In [28]:
# Train stacked configurations. Each candidate is already a list of hidden
# sizes, so it is stored in the config unchanged. Candidates are ranked by
# the last value of the "acc" series in the training history.
initial_hs = config.TRAINING.HIDDEN_STATE_SIZE  # fallback if nothing scores above 0
best_acc, best_hs = 0, None
for hs in [[500, 500]]:
    # Re-seed before every candidate (same seed as the setup cell) so the
    # result does not depend on what was trained earlier in the kernel.
    tf.keras.utils.set_random_seed(0)
    config.TRAINING.HIDDEN_STATE_SIZE = hs
    data = TextDataset(config, verbosity=1).read()
    model = TextGeneration(config, LSTM.get_LSTM)
    history = model.train(data, config)
    acc = history.history["acc"][-1]
    if acc > best_acc:
        best_acc, best_hs = acc, hs
    print("=" * 100)
    print(f"# BEST_ACC = {best_acc}\tBEST_HS = {best_hs}")
    print("=" * 100)
# Persist the winner; fall back to the starting value rather than storing
# None when no candidate scored above 0.
config.TRAINING.HIDDEN_STATE_SIZE = best_hs if best_hs is not None else initial_hs

# [TextDataset - init]:	file_name=data/beatles1000.txt	window_size=75	stride=3
	33374 total chars in text
# [Embeddings - init]:	label=beatles1000	encoding=utf-8
# [TextGeneration - init]:	model_func=get_LSTM
# [TextGeneration - train]:


100%|██████████| 50/50 [02:35<00:00,  3.10s/it, loss=0.773, acc=0.796, gen=b'og my borke shous hodesee soney tant to be back now']   

# BEST_ACC = 0.7960484623908997	BEST_HS = [500, 500]





## Generation

In [25]:
# Sample text from `model`, i.e. whichever network the most recently executed
# training cell left in the kernel — on a fresh kernel, run a training cell
# before this one.
prompt = "it’s been a hard day's night, i should be sleeping like log."

model.verbosity = 1
generated = model.predict(
    prompt,
    num_chars=500,
    sampling_method=Sampling.random_sampling,
    # Read the temperature from the shared config (0.75 in the config cell)
    # instead of repeating the literal, so the two values cannot drift apart.
    temp=config.TRAINING.SAMPLING_TEMP,
    output_as="string",
)
print("Prompt:", prompt)
print(generated)

# [TextGeneration - predict]:	prompt_len=61	num_chars=500
Prompt: it’s been a hard day's night, i should be sleeping like log.

you win'then eave to shet there.
ond hei got do cay doy
war do ttook
ofs or me ond wate ano ma do 
o sous i stals arloung, me the parind you can
seadsting me con tiget son't be te me, yous i mand love,
se lover dell, never be sae mong bous, don't car not hameses in the pad and to me.
in thea ishong all you know the dyaca caus ing baby cay's your
wall the kery to make you con't beever me.
ant to aby to tor car that and whith in hee share
hever you know thang to beby don't in the douss foritha fou 


In [33]:
# Show the final data and training settings, one section per line.
for section in (config.DATA, config.TRAINING):
    print(section)

DataConfig(WINDOW_SIZE=75, STRIDE=3, DATA_PATH='data/beatles1000.txt', BUFFER_SIZE=10000, VOCAB_SIZE=0)
TrainingConfig(HIDDEN_STATE_SIZE=[500, 500], BATCH_SIZE=1000, EPOCHS=50, SAMPLING_TEMP=0.75)
