# Attention from Scratch

In [1]:
using CSV, DataFrames
using Plots
using Embeddings
using Random

# Review table; the training loop reads its "cleaned_review" and "sentiments" columns.
tb = CSV.read("small.csv", DataFrame)

# Pretrained English GloVe embeddings, limited to 10000 vocabulary entries.
const embtable = load_embeddings(GloVe{:en}, 1; max_vocab_size=10000)
# Maps each vocabulary word to its column index in `embtable.embeddings`.
const get_word_index = Dict(zip(embtable.vocab, eachindex(embtable.vocab)))

"""
    get_embeddings(word)

Return the embedding column stored for `word` in `embtable`.

The column is located through `get_word_index`; indexing that dictionary with a word
it does not contain raises a `KeyError`.
"""
function get_embeddings(word)
    column = get_word_index[word]
    return embtable.embeddings[:, column]
end

"""
    word_tokeniser(sentence)

Split `sentence` on single spaces and return the resulting words.

Empty tokens are dropped, so repeated, leading or trailing spaces do not produce
empty-string "words", and an empty sentence yields an empty vector.
"""
function word_tokeniser(sentence)
    return split(sentence, " "; keepempty=false)
end

"""
    softmax(x)

Return the softmax of `x`, normalised over all of its elements.

The global maximum is subtracted before exponentiating so that large inputs do not
overflow; this does not change the result.
"""
function softmax(x)
    shifted = x .- maximum(x)
    weights = exp.(shifted)
    return weights ./ sum(weights)
end

"""
    CrossEntropyLoss(z, x)

Return the cross-entropy `-sum(x .* log.(z))` between predicted probabilities `z`
and target weights `x` (the arguments are broadcast against each other).

Entries whose target weight is zero contribute exactly zero, following the usual
convention 0 * log(0) = 0. Without it a zero probability on a non-target class
would turn the whole loss into `NaN`. A zero probability on a class with non-zero
target weight still yields `Inf`.
"""
function CrossEntropyLoss(z, x)
    # `zero(...)` of the product keeps the element type identical in both branches.
    term(target, prob) = iszero(target) ? zero(target * log(prob)) : target * log(prob)
    return -sum(term.(x, z))
end

"""
    LinearTransform(x, W, b)

Apply the affine map `W * x .+ b`.

The bias `b` is broadcast, so a column bias is added to every column of `W * x`.
"""
function LinearTransform(x, W, b)
    projected = W * x
    return projected .+ b
end

"""
    FeedForward(x, W, b)

Single-layer feed-forward network: a plain `LinearTransform` with weights `W` and
bias `b`, without any non-linearity.
"""
FeedForward(x, W, b) = LinearTransform(x, W, b)

"""
    AttentionWeights(x, q, k, v)

Return the attention weight matrix for queries `q` and keys `k`, both given as
matrices with one column per token.

Entry `[i, j]` of the result is the weight that query `i` puts on key `j`; every
row sums to one. `x` and `v` are accepted for interface compatibility but are not
used.
"""
function AttentionWeights(x, q, k, v)
    # Scale by the square root of the per-token dimension (rows of `q`). Using the
    # total element count of `q` would make the scaling depend on sentence length.
    scores = (q' * k) ./ sqrt(size(q, 1))

    # Row-wise softmax; subtracting each row's maximum avoids overflow without
    # changing the normalised result.
    shifted = scores .- maximum(scores, dims=2)
    weights = exp.(shifted)
    return weights ./ sum(weights, dims=2)
end

"""
    Attention(x, Q, Qb, K, Kb, V, Vb)

Run one attention block on `x`.

Queries, keys and values are affine projections of `x` with the weight/bias pairs
`(Q, Qb)`, `(K, Kb)` and `(V, Vb)`. Returns `(q, k, v, α, z)`, where `α` holds the
attention weights and `z = v * α'` the context vectors.
"""
function Attention(x,Q,Qb,K,Kb,V,Vb)
    # All three projections share the same input.
    project(weights, bias) = LinearTransform(x, weights, bias)

    q = project(Q, Qb)
    k = project(K, Kb)
    v = project(V, Vb)

    α = AttentionWeights(x, q, k, v)

    # Each context vector is the α-weighted combination of the value columns.
    z = v * α'
    return q, k, v, α, z
end

"""
    forwardprop(x, Q, Qb, K, Kb, V, Vb, W, b)

Forward pass: positional row, attention block, feed-forward layer, average pooling
over tokens and a final softmax.

`x` holds one embedding per column; a plain vector is treated as a single column.
Returns `(x, q, k, v, α, z, p)`, where the returned `x` is the input with the extra
position row appended and `p` is the softmax output.
"""
function forwardprop(x,Q,Qb,K,Kb,V,Vb,W,b)
    # A single embedding arrives as a vector; promote it to a one-column matrix.
    tokens = ndims(x) == 1 ? reshape(x, (:, 1)) : x

    # Append the zero-based token position as an additional feature row.
    positions = collect(0:size(tokens, 2) - 1)'
    augmented = vcat(tokens, positions)

    q, k, v, α, z = Attention(augmented, Q, Qb, K, Kb, V, Vb)

    # One output column per token.
    f = FeedForward(z, W, b)

    # Average over the token columns, then normalise with softmax.
    pooled = sum(f, dims=2) / size(f, 2)
    p = softmax(pooled)

    return augmented, q, k, v, α, z, p
end

"""
    train(x, y, train_params...)

Run one training step on a single example.

`train_params` are forwarded unchanged to `forwardprop` and `backprop`. Returns the
updated parameters followed by the cross-entropy loss measured before the update.
"""
function train(x, y, train_params...)
    augmented, q, k, v, α, z, p = forwardprop(x, train_params...)
    loss = CrossEntropyLoss(p, y)
    updated = backprop(augmented, y, train_params..., q, k, v, α, z, p)
    return (updated..., loss)
end

"""
    backprop(x, y, Q, Qb, K, Kb, V, Vb, W, b, q, k, v, α, z, p, η=.001; update_head=false)

Compute the gradients of the cross-entropy loss for one example and return the
parameters after one SGD step of size `η`.

`x` is the input as returned by `forwardprop` (position row included), `y` the
one-hot target, and `q, k, v, α, z, p` the intermediate values of the forward pass.
Shape comments below use c = rows of `W`, d = rows of `x`, n = columns of `x`.

# Keywords
- `update_head`: when `true`, `W` and `b` are updated as well. The default `false`
  leaves the feed-forward parameters untouched and returns them as passed in.

# Returns
`(Q_new, Qb_new, K_new, Kb_new, V_new, Vb_new, W_new, b_new)`.
"""
function backprop(x,y,
                Q,Qb,K,Kb,V,Vb,W,b,
                q,k,v,α,z,p,
                η=.001; update_head=false)
    n = size(z, 2)

    # Gradient of the loss with respect to the pooled pre-softmax scores.
    ∂L_∂p = p .- y                               #shape: [cx1]

    # Average pooling hands every token column an equal 1/n share.
    ∂L_∂f = ∂L_∂p * fill(1 / n, 1, n)            #shape: [cxn]

    # Context vector gradients (f = W*z .+ b).
    ∂L_∂z = W' * ∂L_∂f                           #shape: [dxn]

    # Value and attention-weight gradients (z = v*α').
    ∂L_∂v = ∂L_∂z * α                            #shape: [dxn]
    ∂L_∂α = ∂L_∂z' * v                           #shape: [nxn]

    # Softmax backward pass. Every row of α is an independent softmax, so for row i
    #   ∂L/∂e[i,j] = α[i,j] * (∂L/∂α[i,j] - Σ_m ∂L/∂α[i,m] * α[i,m]).
    ∂L_∂e = α .* (∂L_∂α .- sum(∂L_∂α .* α, dims=2))   #shape: [nxn]

    # Query and key gradients (e = q'*k).
    # NOTE(review): the 1/sqrt(.) score scaling applied in AttentionWeights is not
    # included here. It would multiply both gradients by the same positive constant,
    # so leaving it out acts as a larger effective step size for Q and K.
    ∂L_∂q = ∂L_∂e * k'                           #shape: [nxd]
    ∂L_∂k = ∂L_∂e' * q'                          #shape: [nxd]

    # Parameter gradients (q = Q*x .+ Qb, and likewise for k and v).
    ∂L_∂Q = ∂L_∂q' * x'                          #shape: [dxd]
    ∂L_∂K = ∂L_∂k' * x'                          #shape: [dxd]
    ∂L_∂V = ∂L_∂v * x'                           #shape: [dxd]

    ∂L_∂Qb = sum(∂L_∂q', dims=2)                 #shape: [dx1]
    ∂L_∂Kb = sum(∂L_∂k', dims=2)                 #shape: [dx1]
    ∂L_∂Vb = sum(∂L_∂v, dims=2)                  #shape: [dx1]

    # SGD step on the attention parameters.
    Q_new = Q .- η * ∂L_∂Q
    Qb_new = Qb .- η * ∂L_∂Qb

    K_new = K .- η * ∂L_∂K
    Kb_new = Kb .- η * ∂L_∂Kb

    V_new = V .- η * ∂L_∂V
    Vb_new = Vb .- η * ∂L_∂Vb

    # The feed-forward head stays frozen unless the caller opts in.
    if update_head
        W_new = W .- η * (∂L_∂f * z')            #shape: [cxd]
        b_new = b .- η * sum(∂L_∂f, dims=2)      #shape: [cx1]
    else
        W_new, b_new = W, b
    end

    return Q_new,Qb_new,K_new,Kb_new,V_new,Vb_new,W_new,b_new
end

"""
    remove_nid(sentence)

Tokenise `sentence` and return, in order, the words that have an entry in
`get_word_index`. A `missing` sentence yields an empty vector.

Membership is tested with `haskey` rather than by catching the lookup error, so
unrelated failures are no longer silently swallowed.
"""
function remove_nid(sentence)
    known = String[]
    ismissing(sentence) && return known
    for token in word_tokeniser(sentence)
        haskey(get_word_index, token) && push!(known, token)
    end
    return known
end

"""
    evaluate_model(sen)

Plot a heatmap of the attention weights the current global parameters
(`Q, Qb, K, Kb, V, Vb, W, b`) produce for the sentence `sen`.

Words without an embedding are dropped first; the remaining words label both axes.
Returns the plot object created by `heatmap`.

# Throws
- `ArgumentError` if none of the words in `sen` has an embedding, because there is
  then nothing to attend over.
"""
function evaluate_model(sen)
    sen = remove_nid(sen)
    if isempty(sen)
        throw(ArgumentError("sentence contains no words with a known embedding"))
    end

    # One embedding per column, in sentence order.
    x_em = reduce(hcat, [get_embeddings(word) for word in sen])

    # Element 5 of the forward-pass tuple is the attention weight matrix α.
    α = forwardprop(x_em,Q,Qb,K,Kb,V,Vb,W,b)[5]

    return heatmap(sen,sen,α,clims=(0,1),aspect_ratio=1,color=:deepsea,
            title="Attention weights α",grid="off")
end



evaluate_model (generic function with 1 method)

In [None]:
# main
# Fixed seed so that the random initialisation is reproducible
rng = MersenneTwister(12);

# Small random weights, zero biases.
# 51 rows: forwardprop appends one position row to every embedding column, so this
# presumably matches 50-dimensional embeddings (confirm against the loaded table).
Q = randn(rng, 51, 51) ./ 100
Qb = zeros(51, 1)
K = randn(rng, 51, 51) ./ 100
Kb = zeros(51, 1)
# NOTE(review): V starts out as the very same matrix as K instead of a separate
# random draw; confirm that this is intended.
V = K
Vb = zeros(51, 1)
W = randn(rng, 3, 51) ./ 100
b = zeros(3, 1)

# One-hot label for every sentiment string
sent_dict = Dict("positive"=>[0,0,1],"negative"=>[1,0,0],"neutral"=>[0,1,0])

# Training: one SGD step per review, 1000 passes over the table
for epoch in 1:1000
    total_l = 0   # loss accumulated over this epoch
    for row in eachrow(tb)
        l = 0   # stays 0 when the review has no usable words
        # keep only the words that have an embedding
        sen = remove_nid(row["cleaned_review"])
        if !isempty(sen)
            # one embedding per column, in sentence order
            x_em = reduce(hcat, [get_embeddings(word) for word in sen])
            # one-hot target for this review
            y = sent_dict[row["sentiments"]]
            # update the parameters and record the loss
            Q,Qb,K,Kb,V,Vb,W,b,l = train(x_em,y,Q,Qb,K,Kb,V,Vb,W,b)
        end
        total_l += l
    end
    # NOTE(review): the printed value is the mean loss per table row, despite the label
    println("Total loss:", total_l/nrow(tb))
end



In [None]:
# Visualize the attention weights of the trained model for an example sentence
evaluate_model("very sad as they both fail")

In [None]:
# Attention heatmap for a second example sentence (the input ends with a space)
evaluate_model("he loved that plug with good price ")

In [None]:
# Attention heatmap for a third example sentence
evaluate_model("terrible quality for this price")

In [None]:
# Attention heatmap for a fourth example sentence
evaluate_model("i love this fantastic product")

In [None]:
# Attention heatmap for a fifth example sentence
evaluate_model("easy to move around")