In [1]:
#imports
import Pkg
using Pkg; for p in ("Knet","IterTools","WordTokenizers","Test","Random","Statistics","Dates","LinearAlgebra","CuArrays"); haskey(Pkg.installed(),p) || Pkg.add(p); end
using Statistics, IterTools, WordTokenizers, Test, Knet, Random, Dates, Base.Iterators, LinearAlgebra

In [2]:
# Update and list all packages
Pkg.update()
pkgs = Pkg.installed()

for package in keys(pkgs)
    if pkgs[package] == nothing
        pkgs[package] = VersionNumber("0.0.1")
    end
    println("Package name: ", package, " Version: ", pkgs[package])
end

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m
Package name: Statistics Version: 0.0.1
Package name: Test Version: 0.0.1
Package name: Random Version: 0.0.1
Package name: WordTokenizers Version: 0.5.3
Package name: AutoGrad Version: 1.2.0
Package name: IterTools Version: 1.3.0
Package name: LinearAlgebra Version: 0.0.1
Package name: StatsBase Version: 0.32.0
Package name: CuArrays Version: 1.5.0
Package name: IJulia Version: 1.20.2
Package name: Dates Version: 0.0.1
Package name: Knet Version: 1.3.2


In [3]:
using CuArrays: CuArrays, usage_limit
CuArrays.usage_limit[] = 8_000_000_000
BATCH_SIZE = 64

Knet.atype() = KnetArray{Float32} 
is_lstm_strategy_on = true # if true rnn type becomes lstm, otherwise we preferred to use relu
gpu() # GPU test must result as 0

0

In [4]:
# Vocabulary Structure
struct Vocab
    w2i::Dict{String,Int}
    i2w::Vector{String}
    unk::Int
    eos::Int
    tokenizer
end

function Vocab(file::String; tokenizer=split, vocabsize=Inf, mincount=1, unk="<unk>", eos="<s>")
    vocab_freq = Dict{String,Int64}(unk => 1, eos => 1)
    w2i = Dict{String, Int64}(unk => 2, eos => 1)
    i2w = Vector{String}()

    push!(i2w, eos)
    push!(i2w, unk)

    open(file) do f
        for line in eachline(f)
            sentence = strip(lowercase(line))
            sentence = tokenizer(line, [' '], keepempty = false)

            for word in sentence
                word == unk && continue
                word == eos && continue # They are default ones to be added later
                vocab_freq[word] = get!(vocab_freq, word, 0) + 1
            end
        end
        close(f)
    end


    # End of vanilla implementation of the vocaulary
    # From here we must add the mincount and vocabsize properties
    # We must change the first two property of the vocab wrt those paramaters
    vocab_freq = sort!(
        collect(vocab_freq),
        by = tuple -> last(tuple),
        rev = true,
    )

    if length(vocab_freq)>vocabsize - 2 # eos and unk ones
        vocab_freq = vocab_freq[1:vocabsize-2] # trim to fit the size
    end

    #vocab_freq = reverse(vocab_freq)

    while true
        length(vocab_freq)==0 && break
        word,freq = vocab_freq[end]
        freq>=mincount && break # since it is already ordered
        vocab_freq = vocab_freq[1:(end - 1)]
    end
    #pushfirst!(vocab_freq,unk=>1,eos=>1) # freq does not matter, just adding the
    for i in 1:length(vocab_freq)
        word, freq = vocab_freq[i]
        ind = (get!(w2i, word, 1+length(w2i)))
        (length(i2w) < ind) && push!(i2w, word)
    end

    return Vocab(w2i, i2w, 2, 1, tokenizer)
end

Vocab

In [5]:
# Special reader for the task
struct TextReader
    file::String
    vocab::Vocab
end

word2ind(dict,x) = get(dict, x, 2)

#Implementing the iterate function
function Base.iterate(r::TextReader, s=nothing)
    if s == nothing
        state = open(r.file)
        Base.iterate(r,state)
    else
        if eof(s) == true
            close(s)
            return nothing
        else
            line = readline(s)
            line = strip(lowercase(line))
            sent = r.vocab.tokenizer(line, [' '], keepempty = false)
            sent_ind = Int[]
            for word in sent
                ind = word2ind(r.vocab.w2i,word)
                push!(sent_ind,ind)
            end
            push!(sent_ind,r.vocab.eos)
            return (sent_ind, s)
        end
    end
end


Base.IteratorSize(::Type{TextReader}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{TextReader}) = Base.HasEltype()
Base.eltype(::Type{TextReader}) = Vector{Int}


In [6]:
# File 
const datadir = "nn4nlp-code/data/ptb"
isdir(datadir) || run(`git clone https://github.com/neubig/nn4nlp-code.git`)

if !isdefined(Main, :vocab)
    vocab = Vocab("$datadir/train.txt", mincount=1)

    train = TextReader("$datadir/train.txt", vocab)
    test = TextReader("$datadir/valid.txt", vocab)

end

TextReader("nn4nlp-code/data/ptb/valid.txt", Vocab(Dict("adviser" => 1750,"enjoy" => 4607,"advertisements" => 7826,"fight" => 1441,"nicholas" => 3783,"everywhere" => 6278,"surveyed" => 3556,"helping" => 2081,"whose" => 621,"manufacture" => 5052…), ["<s>", "<unk>", "the", "N", "of", "to", "a", "in", "and", "'s"  …  "cluett", "hydro-quebec", "memotec", "photography", "ipo", "ssangyong", "fromstein", "ferc", "gitano", "daewoo"], 2, 1, split))

In [7]:
#Embed
struct Embed; w; end

function Embed(vocabsize::Int, embedsize::Int)
    Embed(param(embedsize,vocabsize))
end

function (l::Embed)(x)
    l.w[:,x]
end

#Linear
struct Linear; w; b; end

function Linear(inputsize::Int, outputsize::Int)
    Linear(param(outputsize,inputsize), param0(outputsize))
end

function (l::Linear)(x)
    l.w * mat(x,dims=1) .+ l.b
end

In [8]:
# Mask!
function mask!(a,pad)
    matr = a
    for j in 1:size(matr)[1]
        i=0
        while i<(length(matr[j,:])-1)
            matr[j,length(matr[j,:])-i-1]!=pad && break

            if matr[j,length(matr[j,:])-i]== pad
                matr[j,length(matr[j,:])-i]= 0
            end
            i+=1
        end
    end
    matr
end

mask! (generic function with 1 method)

In [9]:
# Minibatching
struct LMData
    src::TextReader
    batchsize::Int
    maxlength::Int
    bucketwidth::Int
    buckets
end

function LMData(src::TextReader; batchsize = 64, maxlength = typemax(Int), bucketwidth = 10)
    numbuckets = min(128, maxlength ÷ bucketwidth)
    buckets = [ [] for i in 1:numbuckets ]
    LMData(src, batchsize, maxlength, bucketwidth, buckets)
end

Base.IteratorSize(::Type{LMData}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{LMData}) = Base.HasEltype()
Base.eltype(::Type{LMData}) = Matrix{Int}

function Base.iterate(d::LMData, state=nothing)
    if state == nothing
        for b in d.buckets; empty!(b); end
    end
    bucket,ibucket = nothing,nothing
    while true
        iter = (state === nothing ? iterate(d.src) : iterate(d.src, state))
        if iter === nothing
            ibucket = findfirst(x -> !isempty(x), d.buckets)
            bucket = (ibucket === nothing ? nothing : d.buckets[ibucket])
            break
        else
            sent, state = iter
            if length(sent) > d.maxlength || length(sent) == 0; continue; end
            ibucket = min(1 + (length(sent)-1) ÷ d.bucketwidth, length(d.buckets))
            bucket = d.buckets[ibucket]
            push!(bucket, sent)
            if length(bucket) === d.batchsize; break; end
        end
    end
    if bucket === nothing; return nothing; end
    batchsize = length(bucket)
    maxlen = maximum(length.(bucket))
    batch = fill(d.src.vocab.eos, batchsize, maxlen + 1)
    for i in 1:batchsize
        batch[i, 1:length(bucket[i])] = bucket[i]
    end
    empty!(bucket)
    return batch, state
end

In [10]:
struct RNN_model
    embed::Embed        # language embedding
    rnn::RNN            # RNN (can be bidirectional)
    projection::Linear  # converts output to vocab scores
    dropout::Real       # dropout probability to prevent overfitting
    vocab::Vocab        # language vocabulary  
end

function RNN_model(hidden::Int,      # hidden size for both the encoder and decoder RNN
                embsz::Int,          # embedding size
                vocab::Vocab;     # vocabulary for source language
                layers=1,            # number of layers
                bidirectional=false, # whether encoder RNN is bidirectional
                dropout=0)           # dropout probability

    embed = Embed(length(vocab.i2w),embsz)

    rnn = RNN(embsz,hidden;rnnType=is_lstm_strategy_on ? :lstm : :relu, numLayers=layers,bidirectional=bidirectional ,dropout= dropout)
    
    layerMultiplier = bidirectional ? 2 : 1
    
    projection = Linear(layerMultiplier*hidden,length(vocab.i2w))

    RNN_model(embed,rnn,projection,dropout,vocab)

end

RNN_model

In [11]:
function calc_scores(rm::RNN_model, data; average=true)
    B, Tx = size(data)
    
    project = rm.projection
    emb = rm.embed(data)
    
#     rm.rnn.h = 0
#     rm.rnn.c = 0

    y = rm.rnn(emb)

    return project(reshape(y,:,B*Tx))
    

end

calc_scores (generic function with 1 method)

In [12]:
function loss_f(model, batch; average = true)  
    verify = deepcopy(batch[:,2:end])
    mask!(verify,vocab.eos)
        
    scores = calc_scores(model,batch[:,1:end-1]) # trim one end
   
    return nll(scores,verify;average=average)

end

loss_f (generic function with 1 method)

In [13]:
function maploss(lossfn, model, data; average = true)
    total_words = 0
    total_loss = 0
    for part in collect(data)
        curr_loss, curr_word = lossfn(model,part, average = false)
        total_loss += curr_loss
        total_words += curr_word
    end

    average && return total_loss/total_words
    return total_loss, total_words
end

maploss (generic function with 1 method)

In [14]:
model = RNN_model(512, 512, vocab; bidirectional=true, dropout=0.2)

RNN_model(Embed(P(KnetArray{Float32,2}(512,10000))), LSTM(input=512,hidden=512,bidirectional,dropout=0.2), Linear(P(KnetArray{Float32,2}(10000,1024)), P(KnetArray{Float32,1}(10000))), 0.2, Vocab(Dict("adviser" => 1750,"enjoy" => 4607,"advertisements" => 7826,"fight" => 1441,"nicholas" => 3783,"everywhere" => 6278,"surveyed" => 3556,"helping" => 2081,"whose" => 621,"manufacture" => 5052…), ["<s>", "<unk>", "the", "N", "of", "to", "a", "in", "and", "'s"  …  "cluett", "hydro-quebec", "memotec", "photography", "ipo", "ssangyong", "fromstein", "ferc", "gitano", "daewoo"], 2, 1, split))

In [15]:
train_batches = collect(LMData(train))
test_batches = collect(LMData(test))
train_batches50 = train_batches[1:50] # Small sample for quick loss calculation

50-element Array{Array{Int64,2},1}:
 [9236 2 … 1 1; 24 2 … 1 1; … ; 70 169 … 1 1; 2996 2814 … 1 1]   
 [9974 9990 … 1 1; 7512 2 … 1 1; … ; 1206 84 … 1 1; 65 144 … 1 1]
 [3 475 … 1 1; 70 41 … 1 1; … ; 145 194 … 1 1; 9 3 … 1 1]        
 [3 2531 … 1 1; 3 964 … 1 1; … ; 671 3 … 1 1; 3363 9767 … 1 1]   
 [7 949 … 1 1; 368 1970 … 1 1; … ; 8 515 … 1 1; 15 10 … 1 1]     
 [142 2650 … 1 1; 9 98 … 1 1; … ; 58 5 … 1 1; 24 2 … 1 1]        
 [67 64 … 1 1; 8 7 … 1 1; … ; 6622 2 … 1 1; 2 18 … 1 1]          
 [7 2 … 1 1; 84 14 … 1 1; … ; 57 10 … 1 1; 57 10 … 1 1]          
 [486 2 … 1 1; 46 9 … 1 1; … ; 30 166 … 1 1; 673 8 … 1 1]        
 [24 1184 … 1 1; 29 25 … 1 1; … ; 6 3 … 1 1; 3 2 … 1 1]          
 [3 1287 … 1 1; 8 2309 … 1 1; … ; 638 2 … 1 1; 17 3 … 1 1]       
 [30 1152 … 1 1; 8 3430 … 1 1; … ; 3 4145 … 1 1; 75 2657 … 1 1]  
 [3 465 … 1 1; 2 8 … 1 1; … ; 19 50 … 1 1; 24 8222 … 1 1]        
 ⋮                                                               
 [45 3 … 1 1; 2270 6 … 1 1; … ; 65 276 …

In [16]:
epoch = adam(loss_f, ((model, batch) for batch in train_batches))
bestmodel, bestloss = deepcopy(model), maploss(loss_f, model, test_batches)

(RNN_model(Embed(P(KnetArray{Float32,2}(512,10000))), LSTM(input=512,hidden=512,bidirectional,dropout=0.2), Linear(P(KnetArray{Float32,2}(10000,1024)), P(KnetArray{Float32,1}(10000))), 0.2, Vocab(Dict("adviser" => 1750,"enjoy" => 4607,"advertisements" => 7826,"fight" => 1441,"nicholas" => 3783,"everywhere" => 6278,"surveyed" => 3556,"helping" => 2081,"whose" => 621,"manufacture" => 5052…), ["<s>", "<unk>", "the", "N", "of", "to", "a", "in", "and", "'s"  …  "cluett", "hydro-quebec", "memotec", "photography", "ipo", "ssangyong", "fromstein", "ferc", "gitano", "daewoo"], 2, 1, split)), 9.211432f0)

In [17]:
progress!(ncycle(epoch, 100), seconds=5) do x
    global bestmodel, bestloss
    ## Report gradient norm for the first batch
    f = @diff loss_f(model,train_batches[1])
    gnorm = sqrt(sum(norm(grad(f,x))^2 for x in params(model)))
    ## Report training and validation loss
    trnloss = maploss(loss_f,model, train_batches50)
    devloss = maploss(loss_f,model, test_batches)
    ## Save model that does best on validation data
    if devloss < bestloss
        bestmodel, bestloss = deepcopy(model), devloss
    end
    (trn=exp(trnloss), dev=exp(devloss), ∇=gnorm)
end

┣████████████████████┫ [100.00%, 66200/66200, 02:59:10/02:59:10, 6.16i/s] (trn = 1.0000004f0, dev = 1.1109952f0, ∇ = 9.805927f-6)))))
