# Style generation with custom data

In [1]:
from fastai_old.text import *
import html
import spacy 

spacy.load('en')

<spacy.lang.en.English at 0x10ed26eb8>

We're going to grab our custom data from the directory `data/custom`, and also create a directory in which we'll store our trained language models, `data/custom_lm`.

In [22]:
# Root data directory and the sub-directory holding the custom corpus.
DATA_PATH = Path('data/')
PATH = DATA_PATH/'custom'

# Create the parent first, then the child; both calls are no-ops if the
# directories already exist.
DATA_PATH.mkdir(exist_ok=True)
PATH.mkdir(exist_ok=True)

# Marker tokens prepended to every document before tokenization.
BOS, FLD = 'xbos', 'xfld'  # beginning-of-sentence tag, data field tag

In [4]:
# Directory where the fine-tuned language model artifacts (e.g. the vocab
# pickle) are written.
LM_PATH=Path('data/custom_lm/')
# parents=True so this cell also works on a fresh checkout where `data/` has
# not been created yet (it no longer depends on another cell having run first).
LM_PATH.mkdir(parents=True, exist_ok=True)

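Now we're going to load the data, which is a single plain-text file. We'll need to split the text into chunks for efficient training and to hold out some of the chunks as validation data. We'll set a target chunk size of `minwords` words: lines are accumulated into the current chunk, and a new chunk is started as soon as adding the next line would push it past that size. Splits therefore always fall on line boundaries, never in the middle of a line.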

In [7]:
minwords = 100

def get_texts(fname):
    """Split a plain-text file into chunks of roughly `minwords` words.

    Lines are read in order and accumulated into the current chunk. As soon as
    adding the next line would push the chunk past `minwords` space-separated
    words, the current chunk is closed and that line starts a new one. Every
    accumulated line is joined with an extra leading newline, so consecutive
    lines inside a chunk are separated by a blank line.

    Args:
        fname: path-like object exposing `.open()` (e.g. a `pathlib.Path`).

    Returns:
        `np.ndarray` of chunk strings (an empty array for an empty file).
        No chunk is ever the empty string.
    """
    texts = []
    chunk, n_words = '', 0
    with fname.open('r', encoding='utf-8') as f:
        for line in f:
            l = len(line.split(' '))
            if n_words + l > minwords:
                # Only flush a chunk that actually holds text: when the very
                # first line is already longer than `minwords`, the accumulator
                # is still empty and must not be emitted as a chunk.
                if chunk != '':
                    texts.append(chunk)
                chunk, n_words = line, l
            else:
                chunk += '\n' + line
                n_words += l
    # Flush whatever is left after the last line.
    if chunk != '':
        texts.append(chunk)
    return np.array(texts)

# Chunk the source text file and display how many chunks were produced.
all_texts = get_texts(PATH/'alice.txt')
len(all_texts)

301

In [12]:
# Eyeball one randomly chosen chunk as a sanity check on the splitting.
random.choice(all_texts)

'little histories about children who had got burnt, and eaten up by wild\n\nbeasts and other unpleasant things, all because they WOULD not remember\n\nthe simple rules their friends had taught them: such as, that a red-hot\n\npoker will burn you if you hold it too long; and that if you cut your\n\nfinger VERY deeply with a knife, it usually bleeds; and she had never\n\nforgotten that, if you drink much from a bottle marked ‘poison,’ it is\n\nalmost certain to disagree with you, sooner or later.\n\n\n'

In [13]:
# Hold out 10% of the chunks for validation. The split is seeded so that the
# same chunks land in the validation set on every run; without a fixed
# `random_state` each re-run produces a different split and the results are
# not comparable between runs.
trn_texts,val_texts = sklearn.model_selection.train_test_split(
    all_texts, test_size=0.1, random_state=42)

len(trn_texts), len(val_texts)

(270, 31)

In [14]:
# Wrap each split in a single-column DataFrame so the tokenization step can
# address the chunks through the 'text' column.
col_names = ['text']
df_trn, df_val = (
    pd.DataFrame({'text': split}, columns=col_names)
    for split in (trn_texts, val_texts)
)

'‘Come back!’ the Caterpillar called after her. ‘I’ve something important\n\nto say!’\n\n\n\nThis sounded promising, certainly: Alice turned and came back again.\n\n\n\n‘Keep your temper,’ said the Caterpillar.\n\n\n\n‘Is that all?’ said Alice, swallowing down her anger as well as she\n\ncould.\n\n\n\n‘No,’ said the Caterpillar.\n\n\n\nAlice thought she might as well wait, as she had nothing else to do, and\n\nperhaps after all it might tell her something worth hearing. For some\n\nminutes it puffed away without speaking, but at last it unfolded its\n'

In [41]:
# Collapses any run of two or more spaces.
re1 = re.compile(r'  +')

# Ordered (old, new) substitutions applied by `fixup`. The order matters:
# later rules see the output of earlier ones (e.g. the lone-backslash rule
# runs after the '\\n' and '\\"' rules have consumed their backslashes).
_FIXUP_RULES = (
    ('#39;', "'"),
    ('amp;', '&'),
    ('#146;', "'"),
    ('nbsp;', ' '),
    ('#36;', '$'),
    ('\\n', "\n"),
    ('quot;', "'"),
    ('<br />', "\n"),
    ('\\"', '"'),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),
    ('.', ' .'),
    ('?', ' ?'),
    ('!', ' !'),
    ('’', " '"),
)

def fixup(x):
    """Normalize a raw text string before tokenization.

    Applies the literal substitutions in `_FIXUP_RULES` in order (entity
    remnants, escaped newlines/quotes, and a space inserted before '.', '?',
    '!' and the typographic apostrophe), then HTML-unescapes the result and
    collapses runs of two or more spaces into one.
    """
    for old, new in _FIXUP_RULES:
        x = x.replace(old, new)
    return re1.sub(' ', html.unescape(x))

def get_texts(df):
    """Clean and tokenize the 'text' column of `df`.

    Each document is prefixed with the BOS/FLD marker header, passed through
    `fixup`, and the cleaned strings are then handed to the tokenizer in
    per-core partitions.

    NOTE(review): this definition reuses the name `get_texts`, which an
    earlier cell defines as the file-chunking helper taking a path. After
    this cell runs, re-running that earlier cell's call fails — consider
    giving one of the two a distinct name.
    """
    prefixed = f'\n{BOS} {FLD} 1 ' + df['text'].astype(str)
    cleaned = [fixup(doc) for doc in prefixed]
    return Tokenizer().proc_all_mp(partition_by_cores(cleaned))

# Tokenize both splits.
tok_trn = get_texts(df_trn)
tok_val = get_texts(df_val)

So I'm changing some stuff here from the IMDb tutorial: I want to keep all the vocab from the pre-trained language model, so I union the vocab of the custom data and the pre-trained model.

This is partially because I think that makes sense and is cool -- the custom data has a very small vocab (alice.txt has only about 1k unique words, I think) -- but also because I ran into an error when finding the learning rate, in which it couldn't normalize because it was all 0s... so maybe having the whole vocab will fix that? We'll see.

In [45]:
max_vocab = 60000
min_freq = 2

# Token frequencies over the training split. This was previously read from a
# `freq` variable that no cell in the notebook defines, so the cell failed on
# a fresh kernel.
freq = collections.Counter(tok for doc in tok_trn for tok in doc)

# Custom-data vocab: the most common tokens seen more than `min_freq` times,
# with the special tokens at fixed positions ('_unk_' at 0, '_pad_' at 1).
itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

PRE_PATH = Path('data/aclImdb/models/wt103')
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

# NOTE: pickle.load can execute arbitrary code — only load vocab files from a
# trusted source. The file is opened in a `with` block so the handle is closed.
with (PRE_PATH/'itos_wt103.pkl').open('rb') as f:
    itos2 = pickle.load(f)
# Pretrained vocab lookup; -1 marks words missing from the pretrained model.
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

# Union of both vocabularies, de-duplicated while PRESERVING order: custom
# vocab first, then any pretrained words not already present. A plain
# `set(...).union(...)` scrambles the order, which (a) moves '_unk_' away from
# index 0 even though `stoi` below maps unknown tokens to 0, (b) moves '_pad_'
# away from index 1, and (c) yields a different index assignment per session
# because string hashing is randomized.
itos = list(collections.OrderedDict.fromkeys(itos + list(itos2)))
# Unknown tokens fall back to index 0, i.e. '_unk_'.
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

# Numericalize both splits with the merged vocab.
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])
val_lm = np.array([[stoi[o] for o in p] for p in tok_val])

# Persist the merged vocab so token ids can be decoded later.
with open(LM_PATH/'itos_alice.pkl', 'wb') as f:
    pickle.dump(itos, f)

vs=len(itos)
vs

238546

In [47]:
# Embedding size, hidden size and number of layers passed to `get_model` below.
# presumably these must match the pretrained checkpoint's architecture — TODO confirm
em_sz,nh,nl = 400,1150,3

# Load the pretrained state dict; the map_location callback returns each
# storage unchanged (per the torch.load docs this keeps the tensors on CPU).
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

# Pretrained embedding matrix as a numpy array, plus its mean row, which is
# used as the initial embedding for words the pretrained vocab does not have.
enc_wgts = to_np(wgts['0.encoder.weight'])
row_m = enc_wgts.mean(0)

# Build a new embedding matrix with one row per entry of the merged vocab.
new_w = np.zeros((vs, em_sz), dtype=np.float32)
for i,w in enumerate(itos):                     # for each word in the merged vocab
    r = stoi2[w]                                # its index in the pretrained vocab (-1 if absent)
    new_w[i] = enc_wgts[r] if r>=0 else row_m   # copy the pretrained row if present, else the mean row
    
# Install the remapped matrix under all three state-dict keys that are
# overwritten here; each key gets its own copy of the data.
wgts['0.encoder.weight'] = T(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w))

# Training hyperparameters: weight decay, sequence length (bptt) and batch size.
wd=1e-7
bptt=70
bs=52
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

# Each split is flattened into one long token stream before being handed to
# the language-model loader.
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
# NOTE(review): the second positional argument (1) is presumably the padding
# index — confirm against fastai_old and against where '_pad_' sits in `itos`.
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

# Base dropout rates scaled by 0.7; the five entries are consumed positionally
# by the keyword arguments of `get_model` below.
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

# Initialize the freshly built model from the remapped pretrained weights.
learner.model.load_state_dict(wgts)

In [48]:
def generate_text(m, s, l=20):
    """Print the prompt `s`, then `l` greedily generated tokens.

    Args:
        m: the language model; `m[0]` must expose a `bs` attribute and `m`
           must provide `eval()` and `reset()`.
        s: prompt string; it is lower-cased and split on whitespace, and each
           word is mapped to an id through the global `stoi`.
        l: number of tokens to generate.

    At every step the top-2 predictions for the last position are taken and
    the best one is emitted, unless its index is 0 (the `stoi` fallback index
    for unknown words), in which case the runner-up is used. Relies on the
    globals `stoi`, `itos` and `bs`. Returns nothing; output is printed.

    NOTE(review): the prompt tensor is built with shape (1, n_words). If the
    model expects (seq_len, batch) input — as the loader's bptt/bs setup
    suggests — the prompt is being fed as a batch of one-word sequences and
    should probably be transposed. Confirm against fastai_old.
    """
    # Reset the hidden state with a batch size of 1, then restore the
    # configured batch size afterwards.
    m[0].bs = 1
    m.eval()   # turn off dropout
    m.reset()
    m[0].bs = bs

    prompt_ids = [stoi[word] for word in s.lower().split()]
    prompt = torch.autograd.Variable(torch.LongTensor(np.array([prompt_ids])))

    preds, *_ = m(prompt)

    print(s, "\n")
    for _ in range(l):
        top2 = preds[-1].topk(2)[1]
        if top2.data[0] == 0:
            nxt = top2[1]
        else:
            nxt = top2[0]
        print(itos[nxt], end=' ')
        # Feed the chosen token back in as a 1x1 input for the next step.
        preds, *_ = m(nxt.unsqueeze(0).unsqueeze(0))
    print('...')

In [49]:
# Shorthand for the model held by the learner.
m=learner.model

In [50]:
# Sample from the model as loaded from the pretrained weights, before any
# fine-tuning on the custom data has been run.
generate_text(m, "The movie was")

The movie was 

not only the first of the two , but also the first to be named after the first person . ...


In [51]:
# Report accuracy alongside the loss while training.
learner.metrics = [accuracy]
# presumably freezes every layer group except the last one — TODO confirm against fastai_old
learner.freeze_to(-1)

# Base learning rate; `lrs` is a single value here (no per-layer-group rates).
lr=1e-3
lrs = lr

# One cycle of length one epoch at half the base learning rate, using the
# weight decay `wd` set when the learner was built.
# NOTE(review): the meaning of use_clr=(32,2) depends on fastai_old — confirm.
learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

 12%|█▎        | 1/8 [05:39<39:36, 339.54s/it, loss=6.81]

KeyboardInterrupt: 

In [None]:
# Save the learner under the name 'lm_last_ft'.
# NOTE(review): in the recorded session the preceding fit was interrupted
# (KeyboardInterrupt) and this cell has no execution count, so it was not run.
learner.save('lm_last_ft')