In [32]:
import os

import numpy as np
from fastai import text
import torch

import pathlib

In [2]:
# Work from the data directory. Per the original author's note, fastai prefixes
# the model name with its own model sub-directory when loading, so the files
# are resolved relative to the current working directory.
# NOTE(review): the relative path assumes the kernel starts in a sibling
# directory of `data` (or in `data` itself) — confirm before running elsewhere.
os.chdir('../data') 

In [3]:
# Sanity check: display the working directory after the chdir.
os.getcwd()

'/home/kuba/Projects/github_search/data'

In [4]:
# List the contents of the current working directory.
!ls

file.csv	     github_repos_lm.csv	   models
github_readmes.json  github_repos_lm_text_big.csv  nlp_github_repos.json
github_repos.json    lm_data.pkl


In [5]:
# List `../data`; the recorded output is identical to the plain `!ls` listing,
# i.e. the kernel is already inside the data directory.
!ls ../data

file.csv	     github_repos_lm.csv	   models
github_readmes.json  github_repos_lm_text_big.csv  nlp_github_repos.json
github_repos.json    lm_data.pkl


In [6]:
# Hyperparameters; `bs`, `bptt` and `drop_mult` are read when the learner is restored.
config_dict = {
    'base_lr': 1e-2,
    'finetuning_lr': (1e-4, 1e-2),
    'drop_mult': 0.5,
    'bptt': 50,
    'bs': 64,
}

# Copy of fastai's AWD-LSTM language-model config with the `qrnn` flag set.
config = dict(text.awd_lstm_lm_config, qrnn=True)

# Names passed to `text.load_data`, `learn.load` and `learn.load_encoder`.
DATASET_PATH = 'lm_data.pkl'
MODEL_PATH = 'ft_cleaned_qrnn_bptt50_10'
MODEL_ENCODER_PATH = 'ft_enc_cleaned_qrnn_bptt50_10'

In [7]:
def load_language_model_learner(prefix, dataset_path, model_path, encoder_path):
    """Restore the language-model learner from files located under `prefix`.

    The working directory is switched to `prefix` for the duration of the
    load and is always restored afterwards, even if loading fails.

    Args:
        prefix: directory to load from.
        dataset_path: name of the serialized data passed to `text.load_data`.
        model_path: name passed to `learn.load`.
        encoder_path: name passed to `learn.load_encoder`.

    Returns:
        The learner built by `text.language_model_learner`, converted with
        `to_fp16()`, with the model and encoder weights loaded.

    Note:
        Batch size, bptt and drop_mult come from the module-level
        `config_dict`; the architecture config comes from the module-level
        `config`. The fastai module is looked up under the global name `text`
        at call time, and a later cell of this notebook rebinds `text` to a
        string, so this function must be called before that cell runs.
    """
    original_wd = os.getcwd()
    try:
        os.chdir(prefix)
        # Use the caller-supplied dataset name (previously the global
        # DATASET_PATH was used and this argument was silently ignored).
        data_lm = text.load_data(
            '', dataset_path, bs=config_dict['bs'], bptt=config_dict['bptt']
        )
        learn = text.language_model_learner(
            data_lm,
            text.AWD_LSTM,
            config=config,
            drop_mult=config_dict['drop_mult'],
            pretrained=False,
        ).to_fp16()
        learn.load(model_path)
        learn.load_encoder(encoder_path)
    finally:
        os.chdir(original_wd)
    return learn

In [8]:
# Restore the fine-tuned learner from the data directory using the names configured above.
learn = load_language_model_learner('../data', DATASET_PATH, MODEL_PATH, MODEL_ENCODER_PATH)

In [9]:
# Prompts that the language model is asked to continue in the generation cell.
sentence_segments = [
    "Machine learning and",
    "Deploying Artificial",
    "Github automation pipeline for"
]

In [10]:
# For each temperature, print a 20-word continuation of every prompt.
for temp in (0.1, 0.5, 1.0, 2.0):
    print()
    print('temperature:', temp)
    for segment in sentence_segments:
        continuation = learn.predict(segment, n_words=20, temperature=temp)
        print('segment: {}'.format(segment))
        print('generated: {}'.format(continuation))


temperature: 0.1
segment: Machine learning and
generated: Machine learning and machine learning Machine Learning Machine Learning Machine Learning Machine Learning Machine
segment: Deploying Artificial
generated: Deploying Artificial Intelligence This is a Python Web Application that uses the Udacity Advanced
segment: Github automation pipeline for
generated: Github automation pipeline for Python This is a Python module for the Python programming language . It is a

temperature: 0.5
segment: Machine learning and
generated: Machine learning and machine learning library for Java . Usage To run the project , execute the following command :
segment: Deploying Artificial
generated: Deploying Artificial Intelligence This Repository contains the code for the Udacity course Developing Android Apps
segment: Github automation pipeline for
generated: Github automation pipeline for Python What is Flask ? Flask is a microframework for Python based on Werkzeug

temperature: 1.0
segment: Machine lea

In [11]:
# Handle on the first sub-module of the model (the helpers below access
# `learner.model[0]` directly rather than through this name).
encoder = learn.model[0]

In [12]:
def print_shapes_recursively(tpl, nesting=''):
    """Print the `.shape` of every leaf in a nested tuple/list structure.

    Args:
        tpl: either a plain `tuple`/`list` (exact type, subclasses are
            treated as leaves) or any object exposing a `.shape` attribute.
        nesting: prefix prepended to each printed line; one tab is appended
            per nesting level.
    """
    if type(tpl) not in (tuple, list):
        # Leaf: report its shape at the current indentation.
        print(nesting + str(tpl.shape))
        return

    print(nesting + 'Collection of {} elements:'.format(len(tpl)))
    child_nesting = nesting + '\t'
    for member in tpl:
        print_shapes_recursively(member, child_nesting)

In [13]:
def get_shapes_recursively(tpl, nesting=''):
    """Print and return the shapes found in a nested tuple/list structure.

    The printed output is unchanged from the previous version. In addition,
    the shapes are now returned, and the recursion goes through this function
    itself instead of through `print_shapes_recursively`.

    Args:
        tpl: either a plain `tuple`/`list` (exact type) or any object
            exposing a `.shape` attribute.
        nesting: prefix prepended to each printed line; one tab is appended
            per nesting level.

    Returns:
        `tpl.shape` for a leaf, or a list with one entry per element for a
        tuple/list (nested lists mirror nested collections).
    """
    if type(tpl) is tuple or type(tpl) is list:
        print(nesting + 'Collection of {} elements:'.format(len(tpl)))
        return [get_shapes_recursively(item, nesting + '\t') for item in tpl]
    print(nesting + str(tpl.shape))
    return tpl.shape

In [54]:
def get_batch_items(learner, texts):
    """Convert each text with `learner.data.one_item` and concatenate the inputs.

    Only the first element of each `one_item` result is kept; the pieces are
    joined with `torch.cat` using its default dimension.
    NOTE(review): `torch.cat` needs matching sizes in the other dimensions —
    presumably texts of differing token counts will fail; verify.
    """
    item_inputs = []
    for single_text in texts:
        first_part = learner.data.one_item(single_text)[0]
        item_inputs.append(first_part)
    return torch.cat(item_inputs)

In [14]:
def get_model_outputs(learner, text):
    """Return whatever `learner.model[0]` produces for a single text.

    The text is converted with `learner.data.one_item`; only the first
    element of the resulting pair is fed to the module.
    """
    item = learner.data.one_item(text)
    model_input, _second = item
    first_module = learner.model[0]
    return first_module(model_input)

In [49]:
def get_last_hiddens(learner, texts, layers=(0, 1, 2)):
    """Encode one item as a single row built from the encoder's hidden states.

    The item is run through `learner.model[0]`; afterwards that module's
    `hidden` attribute is read, every entry is flattened to a `(1, -1)` row,
    the list order is reversed, and the entries selected by `layers` are
    concatenated side by side.

    Args:
        learner: object exposing `data.one_item` and an indexable `model`.
        texts: the item handed to `learner.data.one_item` (the notebook
            passes a single string). Previously this argument was ignored
            and the global name `text` was read instead.
        layers: indices into the reversed list of hidden states, in the
            order they should appear in the result.

    Returns:
        A 2-D numpy array with one row.
    """
    input_tensor, __ = learner.data.one_item(texts)
    encoder_module = learner.model[0]
    # The return value is discarded; `hidden` is read afterwards, so the call
    # is presumably made to refresh that attribute.
    encoder_module(input_tensor)
    # `detach()` makes the numpy conversion safe if a state tracks gradients.
    flattened = [
        h.detach().cpu().numpy().reshape(1, -1) for h in encoder_module.hidden
    ]
    flattened.reverse()  # original author's note: LM layers are reversed
    return np.hstack([flattened[i] for i in layers])

In [17]:
# Example input: starts with one of the prompts above; the remainder is
# presumably a model-generated continuation.
# NOTE(review): this rebinds the global name `text`, shadowing the `fastai.text`
# module imported at the top of the notebook. Any later call that looks up
# `text.<attr>` (e.g. `load_language_model_learner`) will fail after this cell.
# Consider renaming it (e.g. `sample_text`) together with the cells that read it.
text = "Github automation pipeline for young warranties user preserved reports couldn ' explore bukkitdev performant · observatory album occasions blob tpg ifftxxnumber redirecturi analyze smxxnumber"

In [18]:
# Run the example text through the first sub-module of the model.
outputs = get_model_outputs(learn, text)

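Outputs. The encoder returns two collections of per-layer tensors with identical shapes. In fastai v1, `AWD_LSTM.forward` returns `(raw_outputs, outputs)` (the layer activations before and after hidden dropout), which is presumably why they appear duplicated.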

In [19]:
# Show the nested structure and tensor shapes of the encoder outputs.
print_shapes_recursively(outputs)

Collection of 2 elements:
	Collection of 3 elements:
		torch.Size([1, 26, 1152])
		torch.Size([1, 26, 1152])
		torch.Size([1, 26, 400])
	Collection of 3 elements:
		torch.Size([1, 26, 1152])
		torch.Size([1, 26, 1152])
		torch.Size([1, 26, 400])


Hidden states – these can serve as encodings for the whole sequence

In [21]:
def get_model_hiddens(learner, text):
    """Return the hidden states of `learner.model[0]` after processing `text`.

    NOTE(review): no definition of this helper is visible in the notebook, so
    the call below fails on a fresh kernel. This version is reconstructed
    from the steps used in `get_last_hiddens` (run the item through the first
    sub-module, then read its `hidden` attribute). It is consistent with the
    three shapes recorded below, but confirm it matches the original helper.
    """
    input_tensor, __ = learner.data.one_item(text)
    encoder_module = learner.model[0]
    encoder_module(input_tensor)
    return encoder_module.hidden


states = get_model_hiddens(learn, text)

In [22]:
# Show the nested structure and tensor shapes of the hidden states.
print_shapes_recursively(states)

Collection of 3 elements:
	torch.Size([1, 1, 1152])
	torch.Size([1, 1, 1152])
	torch.Size([1, 1, 400])


In [26]:
# Shorthand for the learner's data object.
# NOTE(review): `d` is not used in the visible cells — likely leftover exploration.
d = learn.data

In [51]:
# Shape of the encoding built from index 0 of the reversed hidden-state list only.
get_last_hiddens(learn, text, layers=[0]).shape

(1, 400)