### POS Tagging

In [None]:
using PyPlot

In [None]:
# Every code point that `isspace` classifies as whitespace, collected once so
# it can be handed to `split` as an explicit delimiter set.
# Background: https://github.com/JuliaLang/julia/issues/14099
const spaces = let firstChar = Char(0), lastChar = Char(0x10FFFF)
    filter(isspace, firstChar:lastChar)
end;

In [None]:
# Read a POS-tagged corpus and build one-hot encodings for its words and tags.
#
# Every line of `filePath` is split on the characters in the global `spaces`.
# Each non-empty token is then split on "_": the first piece is taken as the
# word and the second piece as the tag.
#
# Returns, in this order:
#   vocabSet - set of distinct words
#   tagSet   - set of distinct tags
#   wordDict - word => one-hot vector of length length(vocabSet)
#   tagDict  - tag  => one-hot vector of length length(tagSet)
#   data     - (word, tag) tuples in corpus order
#
# Raises an error naming the offending token when a token has no "_".
function readData(filePath)
    vocabSet = Set{AbstractString}();
    tagSet = Set{AbstractString}();
    data = Tuple{AbstractString, AbstractString}[];
    # The do-block form closes the file even if parsing throws, and a single
    # pass is enough to fill both sets and the ordered token list.
    open(filePath) do file
        for ln in eachline(file)
            for token in split(ln, spaces)
                # consecutive delimiters leave empty pieces behind
                if isempty(token)
                    continue
                end
                tokenSplit = split(token, "_");
                if length(tokenSplit) < 2
                    error("Malformed token (expected word_tag): ", token);
                end
                push!(vocabSet, tokenSplit[1]);
                push!(tagSet, tokenSplit[2]);
                push!(data, (tokenSplit[1], tokenSplit[2]));
            end
        end
    end
    wordDict = oneHotDict(vocabSet);
    tagDict = oneHotDict(tagSet);
    return vocabSet, tagSet, wordDict, tagDict, data;
end

# Map each element of `items` to a one-hot vector whose hot position is the
# element's position in the iteration order of `items`.
function oneHotDict(items)
    encoding = Dict{AbstractString, Vector{Float64}}();
    width = length(items);
    for (index, item) in enumerate(items)
        oneHot = zeros(width);
        oneHot[index] = 1;
        encoding[item] = oneHot;
    end
    return encoding;
end

In [None]:
# Derivative of tanh written in terms of the tanh OUTPUT: for x = tanh(z) the
# derivative d tanh(z)/dz equals 1 - x^2. Callers therefore pass the activation
# value (the hidden state), not the pre-activation.
#
# Works elementwise on scalars and on arrays of any shape; the result has the
# same shape as `x`.
function tanhGradient(x)
    # `.-` keeps the subtraction elementwise for scalar and array inputs alike
    return 1 .- (x .* x)
end

In [None]:
# Load the training corpus together with its vocabulary, tag set and one-hot
# lookup tables.
vocabSetTrain, tagSetTrain, wordDictTrain, tagDictTrain, dataTrain =
    readData("data/pos/wiki-en-train.norm_pos");

# Layer sizes: one input unit per distinct word, one output unit per distinct tag.
inputLayerSize = length(vocabSetTrain);
hiddenLayerSize = 100;
outputLayerSize = length(tagSetTrain);
learningRate = 0.1;

# Parameters. Weight matrices are standard-normal draws scaled by 0.01; the
# bias columns are unscaled standard-normal draws.
Wxh = 0.01 * randn(inputLayerSize, hiddenLayerSize);   # input  -> hidden
Whh = 0.01 * randn(hiddenLayerSize, hiddenLayerSize);  # hidden -> hidden
Bh = randn(hiddenLayerSize, 1);                        # hidden bias
Why = 0.01 * randn(hiddenLayerSize, outputLayerSize);  # hidden -> output
By = randn(outputLayerSize, 1);                        # output bias

In [None]:
# Run the recurrent network forward over one sequence.
#
# Reads the global parameters Wxh, Whh, Bh, Why and By.
#
# Arguments
#   activationFn - hidden-layer nonlinearity; it is called on the whole
#                  pre-activation array
#   x            - one input vector per time step
#   y            - one target vector per time step (treated as one-hot when
#                  the cost is computed)
#   hPrev        - hidden state that precedes the first time step
#
# Returns (h, p, cost)
#   h    - hidden state at every time step
#   p    - softmax output distribution at every time step
#   cost - sum over time of -log(sum(y[t] .* p[t])), i.e. the cross-entropy
#          when y[t] is one-hot
function forwardRNN(activationFn::Function, x::Array{Array{Float64,1},1},
    y::Array{Array{Float64,1},1}, hPrev::Array{Float64,2})

    h = Matrix{Float64}[]; # hidden layers (at time t)
    p = Matrix{Float64}[]; # output probability distribution (at time t)
    cost = 0.0;            # Float64 from the start so the accumulator never changes type
    hLast = hPrev;         # hidden state feeding the current step
    # unrolling RNN -> one feedforward step per time index
    for time in 1:length(x)
        hNow = activationFn(Wxh' * x[time] + Whh' * hLast .+ Bh);
        push!(h, hNow);
        # output layer
        score = Why' * hNow .+ By;
        # Subtracting the maximum leaves the softmax unchanged mathematically
        # but keeps exp() from overflowing on large scores.
        expScore = map(exp, score .- maximum(score));
        pNow = expScore / sum(expScore);
        push!(p, pNow);
        # probability assigned to the target class (y is assumed one-hot)
        cost -= log(sum(y[time] .* pNow));
        hLast = hNow;
    end
    return h, p, cost
end

In [None]:
# Backpropagation through time for one sequence processed by forwardRNN.
#
# Reads the global parameters Wxh, Whh, Bh, Why and By (only their values and
# shapes; nothing is written to them).
#
# Arguments
#   activationFnGrad - derivative of the hidden nonlinearity, called with the
#                      hidden state h[t] (the activation OUTPUT)
#   x, h, p, y       - inputs, hidden states, output distributions and targets
#                      per time step; y is assumed one-hot
#   hPrev            - optional hidden state that preceded the first time step.
#                      It is needed for the first step's contribution to dWhh.
#                      The default of zeros makes that contribution vanish, so
#                      calls that omit it behave as before.
#
# Returns (dWxh, dWhh, dBh, dWhy, dBy), each clipped elementwise to [-5, 5].
function backwardRNN(activationFnGrad::Function, x::Array{Array{Float64,1},1},
     h::Array{Array{Float64,2},1}, p::Array{Array{Float64,2},1}, y::Array{Array{Float64,1},1},
     hPrev::Array{Float64,2}=zeros(size(Bh)))

    dWxh = zeros(size(Wxh));
    dWhh = zeros(size(Whh));
    dBh = zeros(size(Bh));
    dWhy = zeros(size(Why));
    dBy = zeros(size(By));

    # gradient w.r.t. the hidden pre-activation of the following time step
    dh = zeros(size(Bh));
    for time in length(x):-1:1
        # output layer error (softmax + cross-entropy with one-hot y)
        dy = p[time] - y[time];
        # output gradient
        dWhy += h[time] * dy';
        dBy += dy;
        # backpropagate through the hidden layer: error from the next step
        # plus error from this step's output, times the activation derivative
        dh = ((Whh * dh) + (Why * dy)) .* activationFnGrad(h[time]);
        # hidden layer gradient
        dWxh += x[time] * dh';
        dBh += dh;
        # the state feeding step 1 is hPrev, every later step is fed by h[time - 1]
        hBefore = time == 1 ? hPrev : h[time - 1];
        dWhh += hBefore * dh';
    end
    # clip to mitigate exploding gradients
    clipGradient!(dWxh, 5);
    clipGradient!(dWhh, 5);
    clipGradient!(dBh, 5);
    clipGradient!(dWhy, 5);
    clipGradient!(dBy, 5);

    return dWxh, dWhh, dBh, dWhy, dBy;
end

# Clamp every entry of `grad` to [-limit, limit] in place and return `grad`.
function clipGradient!(grad, limit)
    for i in eachindex(grad)
        grad[i] = clamp(grad[i], -limit, limit);
    end
    return grad;
end

In [None]:
# Apply one plain gradient-descent step to the global network parameters:
# every parameter is rebound to  parameter + (-learningRate) * gradient.
# `learningRate` is read from global scope.
function updateWeights(dWxh::Array{Float64,2}, dWhh::Array{Float64,2}, dBh::Array{Float64,2},
    dWhy::Array{Float64,2}, dBy::Array{Float64,2})
    global Wxh, Whh, Bh, Why, By;
    step = -learningRate;
    Wxh = Wxh + step * dWxh;
    Whh = Whh + step * dWhh;
    Bh = Bh + step * dBh;
    Why = Why + step * dWhy;
    By = By + step * dBy;
end

- smooth cost
- h previous
- unfolding the RNN over the sequence length

In [None]:
# Gradient checking: compare the analytic gradients from backwardRNN with
# central finite differences of the cost from forwardRNN.
#
# Arguments
#   inputs  - one input vector per time step
#   targets - one (one-hot) target vector per time step
#   hPrev   - hidden state preceding the sequence
#
# For each global parameter (Wxh, Whh, Why, Bh, By) a few randomly chosen
# entries are perturbed by +/- delta. Raises an error when shapes disagree or
# when the relative error of an entry exceeds 1e-5. The perturbed entry is
# always restored, even if the forward pass throws.
#
# Caveats that follow from backwardRNN as it is called here:
#   - its gradients are clipped to [-5, 5], so a true gradient outside that
#     range will be reported as a mismatch;
#   - hPrev is not passed to it, so the first time step's contribution to dWhh
#     is not part of the analytic gradient; with a non-zero hPrev the Whh check
#     may fail for that reason.
function gradCheck(inputs::Vector{Vector{Float64}}, targets::Vector{Vector{Float64}}, hPrev::Array{Float64,2})
    paramNameList = ("Wxh", "Whh", "Why", "Bh", "By");
    # The parameter arrays are the globals themselves (not copies), so writing
    # to an entry here is seen by forwardRNN.
    paramList = (Wxh, Whh, Why, Bh, By);
    numChecks = 2;
    delta = 1e-5;
    # collect parameter gradients
    h, prob, cost = forwardRNN(tanh, inputs, targets, hPrev);
    dWxh, dWhh, dBh, dWhy, dBy = backwardRNN(tanhGradient, inputs, h, prob, targets);
    dParamList = (dWxh, dWhh, dWhy, dBh, dBy);
    for (param, dparam, name) in zip(paramList, dParamList, paramNameList)
        # validate the size of the parameter and its gradient
        s0 = size(dparam);
        s1 = size(param);
        if s0 != s1
            error("Error dims dont match: ", s0, " and ", s1);
        end
        for i in 1:numChecks
            ri = rand(1:length(param));
            oldVal = param[ri];
            costPlus = 0.0;
            costMinus = 0.0;
            # evaluate cost at [x + delta] and [x - delta]
            try
                param[ri] = oldVal + delta;
                costPlus = forwardRNN(tanh, inputs, targets, hPrev)[3];
                param[ri] = oldVal - delta;
                costMinus = forwardRNN(tanh, inputs, targets, hPrev)[3];
            finally
                param[ri] = oldVal; # reset old value for this parameter
            end
            # fetch both numerical and analytic gradient
            gradAnalytic = dparam[ri];
            gradNumerical = (costPlus - costMinus) / (2 * delta);

            difference = abs(gradAnalytic - gradNumerical);
            if difference == 0
                # exact agreement (this includes both gradients being zero,
                # where the relative error would be 0/0)
                continue
            end
            relError = difference / abs(gradNumerical + gradAnalytic);
            # relError should be on the order of 1e-7 or less
            if relError > 1e-5
                error("Gradient check failed.", " (", name, ")");
            end
        end
    end
end

In [None]:

#smoothCost = -log(1.0/length(vocabSize))*seqLength; # loss at iteration 0

# Train the global network with truncated backpropagation through time.
#
# Arguments
#   data      - (word, tag) tuples in corpus order
#   wordDict  - word => one-hot input vector
#   tagDict   - tag  => one-hot target vector
#   numItr    - number of passes over `data`
#   seqLength - number of consecutive tokens the RNN is unrolled over per update
#
# One pass consists of div(length(data), seqLength) windows, taken left to
# right without overlap; trailing tokens that do not fill a window are not
# used. The hidden state is carried from window to window and reset to zeros
# whenever the sweep restarts at the beginning of `data`.
#
# Reads the global `hiddenLayerSize`; the parameters are updated through
# updateWeights. Returns the cost of every iteration as a Vector{Float64}.
# Raises an error if seqLength is outside 1:length(data).
function train(data::Array{Tuple{AbstractString,AbstractString},1}, wordDict::Dict{AbstractString, Vector{Float64}}
    , tagDict::Dict{AbstractString, Vector{Float64}}, numItr::Int64, seqLength::Int64)

    if seqLength < 1 || seqLength > length(data)
        error("seqLength must be between 1 and length(data), got ", seqLength);
    end
    # every iteration consumes seqLength tokens, so this many iterations make
    # up one full pass over the data
    windowsPerPass = div(length(data), seqLength);
    numIterations = numItr * windowsPerPass;
    costList = Float64[]; # store cost per iteration
    ptr = 1;
    hPrev = zeros(hiddenLayerSize, 1);
    for itr in 1:numIterations
        # the next window would run past the end: restart from the first token
        if ptr + seqLength - 1 > length(data)
            hPrev = zeros(hiddenLayerSize, 1); # reset RNN memory
            ptr = 1;
        end
        # generate sequence
        seqData = data[ptr:ptr + seqLength - 1];
        inputs = Vector{Float64}[wordDict[wordTag[1]] for wordTag in seqData];
        targets = Vector{Float64}[tagDict[wordTag[2]] for wordTag in seqData];
        # gradient check
        #gradCheck(inputs, targets, hPrev);

        # feedforward
        h, prob, cost = forwardRNN(tanh, inputs, targets, hPrev);
        #smoothCost = smoothCost * 0.999 + cost * 0.001;
        push!(costList, cost);
        hPrev = h[end]; # last hidden state seeds the next window
        # backpropagate
        dWxh, dWhh, dBh, dWhy, dBy = backwardRNN(tanhGradient, inputs, h, prob, targets);
        # update weights
        updateWeights(dWxh, dWhh, dBh, dWhy, dBy);
        # move data pointer
        ptr += seqLength;
    end
    return costList;
end

In [None]:
# MAIN
# window length: how many consecutive tokens the RNN is unrolled over
seqLen = 2;
# value handed to `train` as `numItr`
numIterOverData = 100;
# J holds the cost recorded at every training iteration
J = train(dataTrain, wordDictTrain, tagDictTrain,
          numIterOverData, seqLen);

In [None]:
# Learning curve: the cost recorded at each training iteration.
let steps = 1:length(J)
    plot(steps, J)
end
xlabel("Iterations")
ylabel("Cost")
grid("on")

In [None]:
# Tag `data` with the current global network and measure tagging accuracy.
#
# Arguments
#   data      - (word, tag) tuples in corpus order
#   wordDict  - word => one-hot input vector
#   tagDict   - tag  => one-hot target vector
#   seqLength - number of consecutive tokens fed to forwardRNN at a time
#
# The data is swept once from left to right in windows of seqLength tokens;
# the final window is shorter when length(data) is not a multiple of
# seqLength, so every token is evaluated exactly once. The hidden state is
# carried from one window to the next.
#
# Returns (accuracy, predictions)
#   accuracy    - percentage of tokens whose predicted tag index equals the
#                 index of the hot entry of the target (0.0 for empty data)
#   predictions - predicted tag index for every token, in corpus order
#
# Reads the global `hiddenLayerSize`. Raises an error if seqLength < 1.
function findAccuracy(data::Array{Tuple{AbstractString,AbstractString},1}, wordDict::Dict{AbstractString, Vector{Float64}}
    , tagDict::Dict{AbstractString, Vector{Float64}}, seqLength::Int64)
    if seqLength < 1
        error("seqLength must be at least 1, got ", seqLength);
    end
    correct = 0;
    predictions = Int[];
    hPrev = zeros(hiddenLayerSize, 1);
    # integer window starts; the last window is truncated at the end of data
    for ptr in 1:seqLength:length(data)
        lastIdx = min(ptr + seqLength - 1, length(data));
        seqData = data[ptr:lastIdx];
        inputs = Vector{Float64}[wordDict[wordTag[1]] for wordTag in seqData];
        targets = Vector{Float64}[tagDict[wordTag[2]] for wordTag in seqData];
        # feedforward
        h, prob, cost = forwardRNN(tanh, inputs, targets, hPrev);
        hPrev = h[end];
        # accuracy
        for j in 1:length(targets)
            # index of the most probable tag; vec() makes the index linear
            predicted = findmax(vec(prob[j]))[2];
            push!(predictions, predicted);
            if findmax(targets[j])[2] == predicted
                correct += 1;
            end
        end
    end
    evaluated = length(predictions);
    accuracy = evaluated == 0 ? 0.0 : correct / evaluated * 100;
    return accuracy, predictions;
end

In [None]:
# Accuracy of the trained network on the training data itself.
accuracy, result =
    findAccuracy(dataTrain, wordDictTrain, tagDictTrain, seqLen);
print("accuracy: ", accuracy, "\n");

In [None]:
# read data
vocabSetTest, tagSetTest, wordDictTest, tagDictTest, dataTest = readData("data/pos/wiki-en-test.norm");
# The weight matrices were sized from the TRAINING vocabulary and tag set, and
# each one-hot position is the index assigned by the training dictionaries.
# Test tokens therefore have to be encoded with wordDictTrain / tagDictTrain;
# the dictionaries built from the test file have different sizes and index
# assignments. Tokens whose word or tag never occurred in training cannot be
# encoded and are left out of the evaluation.
dataTestKnown = filter(wt -> haskey(wordDictTrain, wt[1]) && haskey(tagDictTrain, wt[2]), dataTest);
println("skipped out-of-vocabulary tokens: ", length(dataTest) - length(dataTestKnown));
accuracy, result = findAccuracy(dataTestKnown, wordDictTrain, tagDictTrain, seqLen);
println("accuracy: ", accuracy);

- [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)
- [Deep Learning Lecture 12: Recurrent Neural Nets and LSTMs](https://youtu.be/56TYLaQN4N8)
- [ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION](http://arxiv.org/abs/1412.6980)
- [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://jmlr.org/papers/v12/duchi11a.html)
- [NLP Programming Tutorial](http://www.phontron.com/slides/nlp-programming-en-08-rnn.pdf)
- [Lec [5.1]: Deep Learning, Recurrent neural network](https://youtu.be/AvyhbrQptHk)
