In [2]:
from fastai import *
from fastai.text import * 

In [3]:
import pandas as pd

In [4]:
import numpy as np
import json

In [5]:
# df = pd.read_csv('data/quotes.csv', sep=r'\<\|\>', header=None, names=['text', 'author'])
# data_df = pd.read_json('myData/bill-bryson.json', lines=True)


with open('myData/bill-bryson.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
df = pd.DataFrame(data)

In [5]:
df = df[['quote', 'author']]
df.head()

Unnamed: 0,quote,author
0,Not one of your pertinent ancestors was squash...,Bill Bryson
1,"But that's the glory of foreign travel, as far...",Bill Bryson
2,"As my father always used to tell me, 'You see,...",Bill Bryson
3,Tune your television to any channel it doesn't...,Bill Bryson
4,There are three stages in scientific discovery...,Bill Bryson


In [6]:
df['quote'] = df['quote'].str.lower()

In [7]:
valid_pct = 0.05 #validation percent
df = df.iloc[np.random.permutation(len(df))]
cut = int(valid_pct * len(df)) + 1
train_df, valid_df = df[cut:], df[:cut]

In [8]:
# train_df
valid_df.head()

Unnamed: 0,author,quote
506,Bill Bryson,a full moon rose in the pale evening sky and g...
682,Bill Bryson,jennings quotes the response of a contestant i...
280,Bill Bryson,"as the physicist paul davies puts it, 'if ever..."
679,Bill Bryson,by the late eighteenth century britain’s statu...
487,Bill Bryson,"in its first three minutes, according to infla..."


In [8]:
len(train_df), len(valid_df)

(720, 38)

In [9]:
nan_rows = df[df['quote'].isnull()]

In [10]:
nan_rows

Unnamed: 0,quote,author


In [11]:
data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='quote',
                           label_cols='author')

In [9]:
# data_clas = TextClasDataBunch.from_df('data', train_df, valid_df, text_cols='text', label_cols='author', vocab=data_lm.train_ds.vocab, bs=32)

In [12]:
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,4.456220,4.106814,0.250223


In [13]:
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy
1,4.215836,3.969538,0.258482


In [14]:
wd=1e-7
lr=1e-3
lrs = lr

In [15]:
learn.fit(10,lrs, wd)

epoch,train_loss,valid_loss,accuracy
1,4.032118,3.842828,0.273661
2,3.930074,3.794107,0.277009
3,3.846089,3.764591,0.280804
4,3.755732,3.723675,0.281250
5,3.670425,3.699772,0.287500
6,3.579985,3.677850,0.288839
7,3.492538,3.637026,0.299330
8,3.396629,3.614479,0.303125
9,3.302478,3.603954,0.301116
10,3.200469,3.570338,0.312277


In [16]:
learn.predict("xxbos", n_words=50, temperature=0.75)

'xxbos the general purpose of the precise means of making it was to make a practical sense of how things go . when you get an order of the book , you have to be so on a long flight from london to paris and come back to paris and tell'

In [24]:
number_of_ideas = 50
ideas_counter = 0
all_ideas = []

for i in range(1000):
    idea = learn.predict("xxbos", n_words=20, temperature=0.8)
    ideas = idea.split("xxbos")
    ideas = ideas[1:-1]
    
    for idea in ideas:
        idea = idea.replace("xxbos","").strip()
        if(idea):
            all_ideas.append(idea)
            ideas_counter = ideas_counter+1
#     print(ideas_counter)
            
    if ideas_counter > number_of_ideas:
        break

In [25]:
len(all_ideas)

51

In [26]:
all_ideas

['in a perfect globe , the sun was not in a kind of big box .',
 'the while you can choke off the world , you can be afraid of it .',
 'that is the one thing we do encountered . we have a lot of fun with it .',
 'these species of words are not so interesting , but they take some advantage of them .',
 'for a minute it used to be a famous visitor .',
 'this is the one thing we do n’t do quite well . the',
 'both dirty and clean saw little change in the clothes and products of the era .',
 'if you are not a living thing , it may be sunny .',
 'to be confused , it was a great deal more care than we have been in america .',
 'this is the least interesting thing about the little planet . the',
 'it was my last place in the world to be devoted to the observation of nature .',
 'it is one of those things that make you feel that way . it does even exist .',
 'xxup underwear is a proving ground .',
 'it is often said that when humans die they are not yet actually in the same place .',
 'they ar

In [27]:
learn.save_encoder('ft_enc')

In [21]:
train_df.to_pickle('data/train_df.pkl')

In [22]:
valid_df.to_pickle('data/valid_df.pkl')

## What happens when Bryson, Wodehouse and Pratchett travel together?

In [9]:
with open('myData/bill-bryson.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
df_bryson = pd.DataFrame(data)

In [10]:
with open('myData/terry-pratchett.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
df_pratchett = pd.DataFrame(data)

In [11]:
with open('myData/p-g wodehouse.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
df_wodehouse = pd.DataFrame(data)

In [24]:
df = df_bryson.merge(df_wodehouse, how='outer')#.merge(df_pratchett, how='outer')

In [25]:
df['author'].unique()
df = df[['quote', 'author']]
df['quote'] = df['quote'].str.lower()

In [26]:
valid_pct = 0.05 #validation percent
df = df.iloc[np.random.permutation(len(df))]
cut = int(valid_pct * len(df)) + 1
train_df, valid_df = df[cut:], df[:cut]

In [27]:
len(train_df), len(valid_df)

(765, 41)

In [28]:
nan_rows = df[df['quote'].isnull()]
nan_rows

Unnamed: 0,quote,author


In [29]:
data_lm = TextLMDataBunch.from_df('data', train_df, valid_df, text_cols='quote',
                           label_cols='author')

learn_bwt = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
learn_bwt.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,4.458746,4.069497,0.244420


In [32]:
learn_bwt.unfreeze()
learn_bwt.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy
1,4.237536,3.920569,0.246875


In [33]:
wd=1e-7
lr=1e-3
lrs = lr
learn_bwt.fit(10,lrs, wd)

epoch,train_loss,valid_loss,accuracy
1,4.027319,3.779742,0.269420
2,3.934621,3.696176,0.275000
3,3.846350,3.658740,0.287500
4,3.755318,3.591700,0.287946
5,3.665892,3.534622,0.302009
6,3.571671,3.509583,0.300670
7,3.477819,3.458205,0.312054
8,3.380935,3.427504,0.320982
9,3.283464,3.390919,0.323661
10,3.190393,3.408535,0.329241


In [34]:
learn_bwt.predict("xxbos", n_words=50, temperature=0.75)

"xxbos the two largest towns in the country are the people of the springs who live in a world that is sufficiently suited to their own lives and those whose lives they feel as if they had created themselves . xxbos the first person who could tell us was ' that"

In [35]:
number_of_ideas = 50
ideas_counter = 0
all_ideas = []

for i in range(1000):
    idea = learn_bwt.predict("xxbos", n_words=20, temperature=0.8)
    ideas = idea.split("xxbos")
    ideas = ideas[1:-1]
    
    for idea in ideas:
        idea = idea.replace("xxbos","").strip()
        if(idea):
            all_ideas.append(idea)
            ideas_counter = ideas_counter+1
#     print(ideas_counter)
            
    if ideas_counter > number_of_ideas:
        break

In [36]:
all_ideas

['this table is essentially an open place for most people to occupy .',
 'the english language is simply taking an atom name to it .',
 'a powerful human is an atom of moose , and oh , he was .',
 'we never have since followed up our lives and were ordered not to stand there .',
 "that 's why i love to live where i do n't .",
 'the computer used to be the first to question what it was that americans still lived in .',
 'he used to be her own father .',
 'i would have wanted to see what was happening . but it was a bad idea .',
 'the physical laws of sheep have never been tried before . they are in use today .',
 'the finger of the head of the beaver was a three - year - old .',
 'it is a good idea and it is the only time that people go with it .',
 'the cold war had no doubt that britain would be losing its way into the mediterranean .',
 'it is too chemicals to be more useful than we have .',
 'the french were almost certainly with british , or probably the most important part of the 