In [None]:
from fastai.text import *
from seq2seq import Seq2SeqTextList
import fasttext as ft

## Download and preprocess our data

In [None]:
# ! wget https://s3.amazonaws.com/fast-ai-nlp/giga-fren.tgz -P {Config().data_path()}

In [None]:
# ! tar xf {path}/giga-fren.tgz -C {Config().data_path()} 

In [None]:
# Folder holding the giga-fren corpus, located under fastai's configured data directory.
path = Config().data_path().joinpath('giga-fren')
# Last expression of the cell: list the folder contents.
path.ls()

In [None]:
# Raw strings keep `\?` as a regex escape; in a plain string literal `\?` is an invalid
# string escape that newer Python versions warn about. The compiled patterns are unchanged.
# re_eq: text at the start of the line beginning with "Wh", up to and including the first "?",
#        with no ".", "!" or "?" before it.
# re_fq: same shape without the "Wh" requirement.
re_eq = re.compile(r'^(Wh[^?.!]+\?)')
re_fq = re.compile(r'^([^?.!]+\?)')
# English and French files of the corpus; they share a stem and differ only in the suffix.
en_fname = path.joinpath('giga-fren.release2.fixed.en')
fr_fname = path.joinpath('giga-fren.release2.fixed.fr')

In [None]:
# lines = ((re_eq.search(eq), re_fq.search(fq)) 
#         for eq, fq in zip(open(en_fname, encoding='utf-8'), open(fr_fname, encoding='utf-8')))
# qs = [(e.group(), f.group()) for e,f in lines if e and f]

In [None]:
# qs = [(q1,q2) for q1,q2 in qs]
# df = pd.DataFrame({'fr': [q[1] for q in qs], 'en': [q[0] for q in qs]}, columns = ['en', 'fr'])
# df.to_csv(path/'questions_easy.csv', index=False)

In [None]:
# path.ls()

## Load our data into a DataBunch

In [None]:
# Read the question pairs from the CSV stored in the corpus folder.
df = pd.read_csv(path.joinpath('questions_easy.csv'))
# Last expression of the cell: preview the first rows.
df.head()

In [None]:
# Lowercase both columns with the vectorised string accessor rather than a per-row lambda.
# Unlike `x.lower()` inside `apply`, `.str.lower()` passes missing values through as NaN
# instead of raising AttributeError on them.
df['en'] = df['en'].str.lower()
df['fr'] = df['fr'].str.lower()

In [None]:
# Inputs come from the 'fr' column and labels from the 'en' column; the random
# train/validation split is seeded so it is reproducible.
src = (
    Seq2SeqTextList.from_df(df, path=path, cols='fr')
    .split_by_rand_pct(seed=42)
    .label_from_df(cols='en', label_cls=TextList)
)

In [None]:
# 90th percentile of input item lengths across the training and validation sets.
np.percentile(
    [len(item) for split in (src.train, src.valid) for item in split.x.items],
    90,
)

In [None]:
# 90th percentile of label item lengths across the training and validation sets.
np.percentile(
    [len(item) for split in (src.train, src.valid) for item in split.y.items],
    90,
)

In [None]:
# The predicate is True when either side of a pair is longer than 30 items.
# NOTE(review): fastai v1's `filter_by_func` is understood to DROP the pairs for which the
# predicate is True (i.e. this keeps only short pairs) — confirm against the installed version.
src = src.filter_by_func(lambda x, y: max(len(x), len(y)) > 30)

In [None]:
# Total number of examples across the training and validation sets.
sum(len(split) for split in (src.train, src.valid))

In [None]:
# Turn the labelled lists into a DataBunch using `databunch()`'s default arguments.
data = src.databunch()

In [None]:
# Save the DataBunch with `save()`'s default arguments.
# NOTE(review): presumably this is the file that `load_data(path)` reads back later in the
# notebook — confirm the default file name and location.
data.save()

In [None]:
# Display the DataBunch's repr as a quick sanity check.
data

# Pretrained embeddings

The lines to download the word vectors only need to be run once:

In [None]:
# Reload the DataBunch from `path`, so this section can be run without rebuilding `src`.
# NOTE(review): presumably reads the file written by the earlier `data.save()` call — confirm.
data = load_data(path)

In [None]:
# fastai's configured model directory; the embedding modules are saved there further down.
model_path = Config().model_path()
# Display both locations for reference.
path, model_path

In [None]:
# ! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz -P {path}
# ! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz -P {path}

In [None]:
# !gunzip {path}/cc.en.300.bin.gz
# !gunzip {path}/cc.fr.300.bin.gz

In [None]:
def create_emb(vecs, itos, em_sz=300, mult=1.):
    """Build an `nn.Embedding` for `itos`, filled with pretrained vectors where available.

    Args:
        vecs: pretrained vector model exposing `get_words()` and `get_word_vector(word)`
            (this notebook passes models returned by `ft.load_model`).
        itos: sequence of vocabulary tokens; row `i` of the embedding belongs to `itos[i]`.
        em_sz: embedding width; must match the length of the vectors returned by `vecs`.
        mult: unused; kept only so existing calls that pass it keep working.

    Returns:
        `nn.Embedding(len(itos), em_sz, padding_idx=1)`. Rows whose token appears in
        `vecs.get_words()` hold that token's pretrained vector; every other row keeps the
        values `nn.Embedding` initialised it with.

    Raises:
        Any error raised while copying a vector (for example a size mismatch between
        `em_sz` and the pretrained vectors) now propagates instead of being silently
        swallowed, which previously left the affected rows at their initial values.
    """
    emb = nn.Embedding(len(itos), em_sz, padding_idx=1)
    wgts = emb.weight.data
    # Only membership in the pretrained vocabulary is needed up front, so keep the words in
    # a set and fetch vectors just for the tokens in `itos`, rather than copying every
    # pretrained vector into a dict first.
    known_words = set(vecs.get_words())
    for i, w in enumerate(itos):
        # Tokens absent from the pretrained vocabulary are skipped deliberately.
        if w in known_words:
            wgts[i] = tensor(vecs.get_word_vector(w))
    return emb

In [None]:
# Build the encoder embedding from the French fastText vectors and save it to disk.
fr_vecs = ft.load_model(str(path/'cc.fr.300.bin'))
emb_enc = create_emb(fr_vecs, data.x.vocab.itos)
torch.save(emb_enc, model_path/'fr_emb.pth')
del fr_vecs   #clear memory
# Kept as the final expression so the notebook displays the embedding shape; placed
# mid-cell its value was computed and thrown away.
emb_enc.weight.size()

In [None]:
# Build the decoder embedding from the English fastText vectors and save it to disk.
en_vecs = ft.load_model(str(path/'cc.en.300.bin'))
emb_dec = create_emb(en_vecs, data.y.vocab.itos)
torch.save(emb_dec, model_path/'en_emb.pth')
del en_vecs   #clear memory
# Kept as the final expression so the notebook displays the embedding shape; placed
# mid-cell its value was computed and thrown away.
emb_dec.weight.size()

We create an embedding module whose rows are filled with the pretrained vectors; tokens that are missing from the pretrained vocabulary keep their random initial weights.

In [None]:
# Reload the encoder (French) and decoder (English) embedding modules saved earlier.
# `torch.load` unpickles whole modules here; these files are the ones this notebook wrote
# itself with `torch.save`, so only load them from a trusted location.
emb_enc, emb_dec = [
    torch.load(model_path/fname) for fname in ('fr_emb.pth', 'en_emb.pth')
]

In [None]:
# Display the corpus folder path.
path