In [1]:
from fastai.text import * 
import html 
import pathlib

In [2]:
DATA_PATH=pathlib.Path('data/')
DATA_PATH.mkdir(exist_ok=True)
#! curl -O http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 
#! tar -xzfv aclImdb_v1.tar.gz -C {DATA_PATH}

In [3]:
BOS = 'xbos' # beginning of sentence tag 
FLD = 'xfld' # data field tag 

PATH = pathlib.Path('data/aclImdb/')

In [4]:
CLAS_PATH=pathlib.Path('data/imdb_clas/')
CLAS_PATH.mkdir(exist_ok=True)

In [5]:
LM_PATH=pathlib.Path('data/imdb_lm/')
LM_PATH.mkdir(exist_ok=True)

In [6]:
import numpy as np
CLASSES = ['neg', 'pos', 'unsup']

def get_texts(path):
    '''
    input : text data with labels in front of sentence
    output : array text with associated label
    ''' 
    texts, labels = [], []
    for idx, label in enumerate(CLASSES):
        for fname in (path/label).glob('*.*'):
            texts.append(fname.open('r', encoding='utf-8').read())
            labels.append(idx)
    return np.array(texts), np.array(labels)

trn_txts, trn_labels = get_texts(PATH/'train')
val_txts, val_labels = get_texts(PATH/'test')

In [7]:
len(trn_txts), len(val_txts)

(75000, 25000)

In [8]:
np.random.seed(42)
trn_idx = np.random.permutation(len(trn_txts))
val_idx = np.random.permutation(len(val_txts))

In [9]:
trn_texts = trn_txts[trn_idx]
val_texts = val_txts[val_idx]

trn_labels = trn_labels[trn_idx]
val_labels = trn_labels[val_idx]

In [10]:
import pandas as pd 
col_names = ['labels', 'text']
df_trn = pd.DataFrame({'text':trn_texts, 'labels':trn_labels}, columns=col_names) 

df_val = pd.DataFrame({'text':val_texts, 'labels':val_labels}, columns=col_names)

In [11]:
df_trn.head()

Unnamed: 0,labels,text
0,2,"I love all of Linda Howard's books, and was th..."
1,0,The Lack of content in this movie amazed me th...
2,1,You know what they say about the 70's..if you ...
3,2,"An old grandfather, Don Plutarco plays the vio..."
4,2,"At first, I actually had no idea that Billy Bl..."


In [12]:
df_val.head()

Unnamed: 0,labels,text
0,1,"I'd read all the negative reviews for ""Anna Ch..."
1,2,"Dude, I thought this movie rocked. Perfect for..."
2,2,A nurse travels to a rural psychiatric clinic ...
3,0,"An ""independant"" film that, from the back of t..."
4,2,This is just a very bad film. Miles looks as i...


In [13]:
(CLAS_PATH/'classes.txt').open('w').writelines(f'{o}\n' for o in CLASSES)
(CLAS_PATH/'classes.txt').open().readlines()

['neg\n', 'pos\n', 'unsup\n']

In [15]:
import sklearn
from sklearn.model_selection import train_test_split
trn_texts, val_texts = sklearn.model_selection.train_test_split(np.concatenate([trn_texts, val_texts]), test_size=0.1)
print(len(trn_texts), len(val_texts))

90000 10000


In [100]:
df_trn = pd.DataFrame({'text':trn_texts, 'labels':[0]*len(trn_texts)})
df_val = pd.DataFrame({'text':val_texts, 'labels':[0]*len(val_texts)})

In [101]:
df_trn.to_csv(LM_PATH/'trn.csv', header=False, index=False)
df_val.to_csv(LM_PATH/'val.csv', header=False, index=False)


In [102]:
import re 
re1 = re.compile(r'  +')
def fixup(x):
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))

In [112]:
columns = ['text', 'labels']
df_trn = pd.read_csv(LM_PATH/'trn.csv', nrows=10000, names=columns)
df_val = pd.read_csv(LM_PATH/'val.csv', nrows=10000, names=columns)

In [113]:
def get_text(df):
    labels = df['labels'].values.astype(np.int64)
    texts = f'\n{BOS} {FLD} 1 ' + df['text'].astype(str)
    texts = texts.apply(fixup).values.astype(str)
    tok = Tokenizer().process_all(texts)
    return tok, list(labels)

In [115]:
tok_trn, trn_labels = get_text(df_trn)
tok_val, val_labels = get_text(df_val)

In [117]:
len(tok_trn)

10000

In [124]:
' '.join(tok_trn[1])

'\n xbos xfld 1 ... xxmaj must admit well acted , but " dark " & depressing film portraying a wannabe stand - up " comic " xxrep 4 . with no clue toward humour . xxmaj viewed this film at xxmaj the xxmaj stony xxmaj brook xxmaj film xxmaj festival . xxmaj one of a " book - end " weekend topping off another xxmaj frank xxmaj whaley vehicle ... " xxmaj the xxmaj pursuit of xxmaj happiness " . xxmaj frank ( and his brother xxmaj robert , as his xxmaj tops boss xxmaj mr. xxmaj slocum ) & his buddy xxmaj ethan xxmaj hawke stand out in this film as hopeless " common men " ... stuck in a rut of xxmaj life \'s problems and of no seeing any chance of rising above it . xxmaj frank ( xxmaj jimmy o\'brien ) sees a calling as a stand up comic , but a mix of stage fright and overwhelming domestic problems , put him in a trance , undermining a " true " escape from his downward spiraling xxmaj life . xxmaj see it for the acting ( which is top notch ) , but as with earlier comments , if u need a feel g

In [127]:
(LM_PATH/'tmp').mkdir(exist_ok=True)


In [128]:
np.save(LM_PATH/'tmp'/'tok_trn.npy', tok_trn)
np.save(LM_PATH/'tmp'/'tok_val.npy', tok_val)

In [130]:
tok_trn = np.load(LM_PATH/'tmp'/'tok_trn.npy')
tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy')

In [131]:
freq = Counter(p for o in tok_trn for p in o)
freq.most_common(25)

[('xxmaj', 251392),
 ('the', 133084),
 ('.', 109877),
 (',', 108332),
 ('a', 64756),
 ('and', 64641),
 ('of', 57861),
 ('to', 53727),
 ('is', 43355),
 ('it', 38237),
 ('in', 37177),
 ('i', 34361),
 ('this', 29773),
 ('that', 28716),
 ('"', 25836),
 ("'s", 24325),
 ('-', 21109),
 ('was', 20159),
 ('\n\n', 20000),
 ('as', 18345),
 ('with', 17632),
 ('for', 17456),
 ('movie', 17393),
 ('xxup', 17337),
 ('but', 16629)]

In [132]:
max_vocab = 60000
min_freq = 2 

In [133]:
itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [136]:
import collections
stoi = collections.defaultdict(lambda:0, {v:k for k, v in enumerate(itos)})
len(stoi)

25883

In [140]:
import pickle
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])
val_lm = np.array([[stoi[o] for o in p] for p in tok_trn])
pickle.dump(itos, open(LM_PATH/'tmp'/'itos.pkl','wb'))

In [141]:
np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)
np.save(LM_PATH/'tmp'/'trn_ids.npy', val_lm)
itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl','rb'))

In [142]:
vs=len(itos)
vs, len(trn_lm)

(25883, 10000)

In [143]:
! wget -nH -r -np -P {PATH} http://files.fast.ai/models/wt103/


--2018-12-12 10:51:50--  http://files.fast.ai/models/wt103/
Resolving files.fast.ai (files.fast.ai)... 67.205.15.147
Connecting to files.fast.ai (files.fast.ai)|67.205.15.147|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 857 [text/html]
Saving to: `data/aclImdb/models/wt103/index.html'


2018-12-12 10:51:51 (31.4 MB/s) - `data/aclImdb/models/wt103/index.html' saved [857/857]

Loading robots.txt; please ignore errors.
--2018-12-12 10:51:51--  http://files.fast.ai/robots.txt
Reusing existing connection to files.fast.ai:80.
HTTP request sent, awaiting response... 404 Not Found
2018-12-12 10:51:52 ERROR 404: Not Found.

--2018-12-12 10:51:52--  http://files.fast.ai/models/wt103/?C=N;O=D
Reusing existing connection to files.fast.ai:80.
HTTP request sent, awaiting response... 200 OK
Length: 857 [text/html]
Saving to: `data/aclImdb/models/wt103/index.html?C=N;O=D'


2018-12-12 10:51:52 (62.9 MB/s) - `data/aclImdb/models/wt103/index.html?C=N;O=D' saved [857/857]

--2

1. pre-trained lm has embedding size, hidden units and layers that we need to match to our IMDB language model 

In [146]:
em_sz, nh, nl = 400, 1150, 3

In [152]:
PREPATH = PATH/'models'/'wt103'

In [153]:
PRE_LM_PATH = PREPATH/'fwd_wt103.h5'

2. get the mean encoder weight and apply the mean to tokens present in IMDB dataset but not present in lm 

In [155]:
import torch
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc:storage)

In [158]:
enc_wgts = np.array(wgts['0.encoder.weight'])
row_m = enc_wgts.mean(0)
row_m

array([-0.018296, -0.138256,  0.014381, -0.012851, ...,  0.003654,  0.004884,  0.057428, -0.007599], dtype=float32)

In [159]:
itos2 = pickle.load((PRE_PATH/'itos_wt103.pkl').open('rb'))
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

In [174]:
new_w = np.zeros((vs, em_sz))
new_w[1].shape

(400,)

In [175]:
enc_wgts[1]

array([ 1.485352e-05, -2.342431e-05,  1.969303e-05, -2.154383e-05, ...,  2.051086e-05,  2.134880e-05,  2.177563e-05,
       -1.239415e-05], dtype=float32)

In [215]:
new_w = np.zeros((vs, em_sz), dtype=np.float32)
for i, w in enumerate(stoi2):
    new_w[i] = enc_wgts[i]

IndexError: index 25883 is out of bounds for axis 0 with size 25883

In [236]:
from fastai import *
from fastai.text import *

In [240]:
path = untar_data(URLs.IMDB_SAMPLE)
list(path.iterdir())

[PosixPath('/Users/nus/.fastai/data/imdb_sample/texts.csv')]

In [242]:
data_lm = TextLMDataBunch.from_csv(path, 'texts.csv')

In [246]:
data_clas = TextClasDataBunch.from_csv(path, 'texts.csv')

In [247]:
data_lm.show_batch()

idx,text
0,"xxbos xxfld 1 xxmaj to be honest i had heard this was pretty bad before i decided to watch it , but i 'm never one to let others influence my viewings , in fact i 'm more likely to watch something out of xxunk xxmaj xxunk had one thing going for me before the viewing anyway , the fact that xxmaj xxunk xxmaj xxunk and those gorgeous eyes was"
1,"; though , it 's possible i 've seen him in a less memorable role . xxmaj haines makes an incredible impression , when he xxunk xxmaj davies for a xxunk meal - xxunk his hat into the ring with some wonderful bits at the xxunk table . xxmaj indeed , xxmaj haines and xxmaj davies deliver great comic performances . \n\n xxmaj the story starts off with xxmaj xxunk"
2,"love of classic poetry , the sea , a tall tale , that almost rings true , and a story that has left a lasting impact on our world and culture . xxmaj who does not understand the meaning of an "" xxunk "" ? or the concept of "" water , water everywhere and not a drop to drink ? "" a truly fine experience . xxmaj thank you"
3,"! ) \n\n xxmaj the movie explores the absurdity of the situation . xxmaj the thinking that bars women from football xxunk comes down to it being too xxunk an experience for the xxunk xxunk philosophy not unknown in the west less than 100 years ago . xxmaj this farce comes to a head when a girl needs to go to the bathroom , so a soldier xxunk her demands"
4,"xxmaj mendes has delivered a fine follow - up to his xxmaj oscar - winning debut , a film which is as intelligent as it is beautiful to watch . "" xxmaj road xxmaj to xxmaj perdition "" may not be to everyone 's tastes but this is one xxup dvd i shall not be xxunk anytime soon . xxbos xxfld 1 xxmaj fair drama / love story movie that"


In [248]:
data_clas.show_batch()

text,target
"xxbos xxfld 1 xxup the xxup shop xxup around xxup the xxup corner is one of the xxunk and most feel - good romantic comedies ever made . xxmaj there 's just no getting around that , and it 's hard to actually put one 's feeling for this film into words . xxmaj it 's not one of those films that tries too hard , nor does it come",positive
"xxbos xxfld 1 xxmaj now that xxmaj che(2008 ) has finished its relatively short xxmaj australian cinema run ( extremely limited xxunk screen in xxmaj xxunk , after xxunk ) , i can xxunk join both xxunk of "" xxmaj at xxmaj the xxmaj movies "" in taking xxmaj steven xxmaj soderbergh to task . \n\n xxmaj it 's usually satisfying to watch a film director change his style /",negative
"xxbos xxfld 1 xxmaj many neglect that this is n't just a classic due to the fact that it 's the first xxup 3d game , or even the first xxunk - up . xxmaj it 's also one of the first xxunk games , one of the xxunk definitely the first ) truly claustrophobic games , and just a pretty well - xxunk xxunk experience in general . xxmaj",positive
"xxbos xxfld 1 i really wanted to love this show . i truly , honestly did . \n\n xxmaj for the first time , gay viewers get their own version of the "" xxmaj the xxmaj bachelor "" . xxmaj with the help of his obligatory "" hag "" xxmaj xxunk , xxmaj james , a good looking , well - to - do thirty - something has the chance",negative
"xxbos xxfld 1 \n\n i 'm sure things did n't exactly go the same way in the real life of xxmaj homer xxmaj hickam as they did in the film adaptation of his book , xxmaj rocket xxmaj boys , but the movie "" xxmaj october xxmaj sky "" ( an xxunk of the book 's title ) is good enough to stand alone . i have not read xxmaj",positive


In [249]:
x,y = next(iter(data_lm.train_dl))

In [254]:
example = x[:20, :10].cpu()
texts = pd.DataFrame([data_lm.train_ds.vocab.textify(l).split(' ') for l in example])
texts

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,xxbos,she,xxmaj,ca,.,",",the,probably,that,make
1,xxfld,has,some,nt,i,xxmaj,country,the,almost,an
2,1,the,of,blame,regret,xxunk,(,most,cut,ideal
3,xxmaj,same,the,them,that,xxmaj,xxmaj,annoying,it,introduction
4,jack,stupid,best,since,imdb,frost,connecticut,character,in,to
5,xxmaj,grin,movies,the,can,",",),to,half,a
6,frost,in,that,events,only,xxmaj,and,"""",and,corporate
7,is,her,are,are,allow,nick,soon,grace,crippled,xxunk
8,xxmaj,face,xxunk,all,a,xxmaj,find,"""",him,on
9,really,.,as,wildly,xxunk,xxunk,an,the,but,sexual


In [257]:
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.5)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,5.044475,4.403967,0.238250


In [258]:
learn.unfreeze()
learn.fit_one_cycle(1, 1e-3)

epoch,train_loss,valid_loss,accuracy
1,4.309422,3.920209,0.288031
