In [1]:
import os

import pandas as pd
import numpy as np
from fastai import text
import torch

In [2]:
from github_search import rnn

In [3]:
# Change into the data directory before loading anything: fastai prefixes saved
# model names with a `models/` directory, so model files are looked up relative
# to the working path.
os.chdir('../data') 

In [4]:
# Display the current working directory to confirm where relative paths resolve.
os.getcwd()

'/home/kuba/Projects/github_search/data'

In [5]:
# List the files available in the data directory.
!ls ../data

file.csv	     github_repos_lm.csv	   models
github_readmes.json  github_repos_lm_text_big.csv  nlp_github_repos.json
github_repos.json    lm_data.pkl		   tmp_df.csv


In [12]:
# Hyper-parameters for the language model, gathered in one place.
config_dict = {
    'base_lr': 1e-2,
    'finetuning_lr': (1e-4, 1e-2),
    'drop_mult': 0.5,
    'bptt': 50,
    'bs': 64,
}

# Independent copy of `text.awd_lstm_lm_config` with the `qrnn` flag set to True.
config = {**text.awd_lstm_lm_config, 'qrnn': True}

# Names of the saved artifacts: dataset pickle, full model, and encoder.
DATASET_PATH = 'lm_data.pkl'
MODEL_PATH = 'ft_cleaned_qrnn_bptt50_10'
MODEL_ENCODER_PATH = 'ft_enc_cleaned_qrnn_bptt50_10'

In [13]:
# Same listing as the earlier `!ls` cell; the recorded output is unchanged.
!ls ../data

file.csv	     github_repos_lm.csv	   models
github_readmes.json  github_repos_lm_text_big.csv  nlp_github_repos.json
github_repos.json    lm_data.pkl		   tmp_df.csv


In [14]:
def load_language_model_learner(prefix, dataset_path, model_path, encoder_path):
    """Build a fastai language-model learner and load saved weights into it.

    The working directory is switched to ``prefix`` while loading and is
    restored afterwards, even when loading raises.

    Parameters
    ----------
    prefix : str
        Directory to ``chdir`` into for the duration of the load.
    dataset_path : str
        File name of the saved data, passed to ``load_data``.
    model_path : str
        Name passed to ``learn.load``.
    encoder_path : str
        Name passed to ``learn.load_encoder`` (called after ``learn.load``).

    Returns
    -------
    The learner returned by ``language_model_learner``.

    Notes
    -----
    Batch size, bptt and the dropout multiplier are read from the
    notebook-level ``config_dict``; the architecture config is read from the
    notebook-level ``config``.
    """
    # Imported locally under an alias: a later cell rebinds the global name
    # `text` to a plain string, which would break any re-run of this function
    # that relied on the global.
    from fastai import text as fastai_text

    original_wd = os.getcwd()
    try:
        os.chdir(prefix)
        # Honour the `dataset_path` argument instead of the DATASET_PATH global.
        data_lm = fastai_text.load_data(
            '',
            dataset_path,
            bs=config_dict['bs'],
            bptt=config_dict['bptt'],
        )
        learn = fastai_text.language_model_learner(
            data_lm,
            fastai_text.AWD_LSTM,
            config=config,
            drop_mult=config_dict['drop_mult'],
            pretrained=False,
        )
        learn.load(model_path)
        learn.load_encoder(encoder_path)
    finally:
        # Always return to the caller's working directory.
        os.chdir(original_wd)
    return learn

In [15]:
# Load the saved learner using the artifact names configured above.
learn = load_language_model_learner(
    prefix='../data',
    dataset_path=DATASET_PATH,
    model_path=MODEL_PATH,
    encoder_path=MODEL_ENCODER_PATH,
)

In [16]:
# Prompt prefixes fed to the language model in the generation check.
sentence_segments = [
    'Machine learning and',
    'Deploying Artificial',
    'Github automation pipeline for',
]

In [17]:
# For each sampling temperature, print every prompt followed by the text the
# learner generates from it (20 words per prompt).
for temperature in (0.1, 0.5, 1.0, 2.0):
    print()
    print('temperature:', temperature)
    for segment in sentence_segments:
        continuation = learn.predict(segment, n_words=20, temperature=temperature)
        print('segment:', segment)
        print('generated:', continuation)


temperature: 0.1
segment: Machine learning and
generated: Machine learning and machine learning Machine Learning Machine Learning Machine Learning Machine Learning Machine
segment: Deploying Artificial
generated: Deploying Artificial Intelligence This is a Python xxnumber . xxnumber project that uses the Udacity Advanced
segment: Github automation pipeline for
generated: Github automation pipeline for Python This is a collection of scripts for working with the Python Programming Language .

temperature: 0.5
segment: Machine learning and
generated: Machine learning and machine learning Machine Learning Machine Learning Machine Learning Machine Learning Machine
segment: Deploying Artificial
generated: Deploying Artificial Intelligence This repository contains the source code for the Udacity Advanced Training Course .
segment: Github automation pipeline for
generated: Github automation pipeline for Coursera This is a repository containing the Coursera Data Science Specialization of Cours

In [18]:
# Element 0 of the learner's model is kept under the name `encoder`.
encoder = learn.model[0]

In [19]:
# NOTE: this rebinds the name `text`, shadowing the `fastai.text` module imported
# in the first cell. Earlier cells that use the module (e.g.
# `text.awd_lstm_lm_config`) will fail if they are re-run after this one.
text = "Github automation pipeline for young warranties user preserved reports couldn ' explore bukkitdev performant · observatory album occasions blob tpg ifftxxnumber redirecturi analyze smxxnumber"

In [20]:
# Project helper from `github_search.rnn`; the structure of what it returns is
# printed in the next code cell.
outputs = rnn.get_model_outputs(learn, text)

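Outputs. The two collections look duplicated, and I have not verified why. A probable cause is that fastai's AWD-LSTM encoder returns both the raw per-layer outputs and the dropout-applied outputs, which are identical when dropout is inactive.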

In [14]:
# Print the nested structure of `outputs` together with each tensor's shape.
rnn.print_shapes_recursively(outputs)

Collection of 2 elements:
	Collection of 3 elements:
		torch.Size([1, 26, 1152])
		torch.Size([1, 26, 1152])
		torch.Size([1, 26, 400])
	Collection of 3 elements:
		torch.Size([1, 26, 1152])
		torch.Size([1, 26, 1152])
		torch.Size([1, 26, 400])


Hidden states – these can serve as an encoding of the whole sequence.

In [15]:
# Project helper from `github_search.rnn`. The shape printed in the next cell is
# (1, 2704) — presumably the final hidden states of the three layers
# concatenated (1152 + 1152 + 400 = 2704); TODO confirm against the helper.
states = rnn.get_last_hiddens(learn, text)

In [16]:
# Print the shape of the hidden-state encoding computed above.
rnn.print_shapes_recursively(states)

(1, 2704)


In [19]:
# Inspect the learner's test dataloader (no output was recorded for this cell).
learn.data.test_dl

In [17]:
# Batch helper called with `layers=[0]`: for two input texts the recorded
# result shape is (2, 400).
rnn.get_last_hiddens_batch(learn, [text, text], layers=[0]).shape

(2, 400)

In [102]:
# Five variants of `text`: the full string re-joined on single spaces, then the
# same with the first 1, 2, 3 and 4 whitespace-separated tokens dropped.
words = text.split()
texts = [' '.join(words[n_dropped:]) for n_dropped in range(5)]

In [21]:
# Data directory, expressed relative to the current working directory.
DATA_DIR = os.path.join(os.pardir, 'data')

In [22]:
# Read the repository table from the data directory.
repos_csv_path = os.path.join(DATA_DIR, 'github_repos_lm.csv')
github_repos_df = pd.read_csv(repos_csv_path)

In [23]:
# Keep only the first `n_samples` rows so the encoding step stays manageable.
n_samples = 10000
small_github_repos_df = github_repos_df.iloc[:n_samples]

In [25]:
import tqdm

In [26]:
# Encode each sampled repository's `content` one at a time, with a progress bar.
repo_contents = small_github_repos_df['content'].values
github_repos_states = [
    rnn.get_last_hiddens(learn, content)
    for content in tqdm.tqdm(repo_contents)
]

100%|██████████| 10000/10000 [06:27<00:00, 24.44it/s]
