In [None]:
# install RISE with https://rise.readthedocs.io/en/stable/installation.html
#!pip3 install -U scikit-learn
#!pip3 install -U RISE
#!pip3 install -U matplotlib
#!pip3 install -U numpy
# all imports
#!pip3 install -U tensorflow
from sklearn.feature_extraction.text import CountVectorizer
import math 
import matplotlib.pyplot as plt
import numpy as np

# Chapter 6: Specific Problem of Natural Language Processing

## by Ziwei Chen, Stephan Nef, Lukas Bamert and Jan Grau

# Agenda

1. Words to mathematical representation
2. Embedding the problem into what we have already learnt
3. Transformer Encoder
    1. Self-Attention
    2. position-wise Feedforward Networks
    3. Residual connection and Layer Normalization

### Part Jan

<img src="Transformer_Base.png" width=50% style="margin-left:auto; margin-right:auto">

<img src="Encoder.PNG" width=30% style="margin-left:auto; margin-right:auto">

<img src="attention.jpg" width=50% style="margin-left:auto; margin-right:auto">

Remember the problem of FIR filters? The filter length is often too short for the actual input, and the quality of the input data may vary.

There is a solution to this: $\underline{attention}$.

Let $v = [v_1,..., v_n]$ be a sequence of input vectors.

Then we can define a context vector $c$ as $c= \sum_{i=1}^n \alpha_iv_i$.

This can be extrapolated to different context vectors, each describing different contexts $j$:

$$c_j = \sum_{i=1}^n \alpha_{ji}v_i $$


where $\alpha_{ji}$ is an attention weight from input $i$ to output $j$. A good way to achieve this is to use the softmax function:

$$\alpha_{ji} = \frac{e^{g_{ji}}}{\sum_{k=1}^ne^{g_{jk}}}$$

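where $g_{ji}$ comes from an alignment model that measures the similarity of two vectors, here the scaled dot product of the query $q_j$ and the key $k_i$ (with $d_k$ the dimension of the key vectors):

$$ g_{ji} = \frac{q_j^\top k_i}{\sqrt{d_k}}$$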


<div><img src="Attention1.drawio.png" width=80% style="margin-left:auto; margin-right:auto"></div>

In [None]:
# Worked example: the ingredients of a context vector calculation.
#
# Goal: estimate the market value of a newcomer from the current market
# values of the known players in our scouting DB.

# current market value of each known player
v = {
    "messi": 80,
    "lewandowski": 40,
    "miller": -25,
}

# the newcomer whose value we want to estimate
your_player = "ronald"

# Similarity scores g_ji between your player ("yp") and each known player.
# They are simply given here, since we already have an example of cosine
# similarity.
g = {
    "yp-messi": 0.8,
    "yp-lewandowski": 0.5,
    "yp-miller": -0.8,
}

In [None]:
import math
import numpy as np

# Softmax normaliser: the sum of exp(g) over all similarity scores.
sum_eg = 0
for mv in g:
    sum_eg += math.exp(g[mv])

# Attention weight of every known player, keyed by player name exactly like v.
# The similarity score of player p is stored in g under the key "yp-<p>".
# Looping over v means a player added to v and g is picked up automatically.
alpha = {player: math.exp(g["yp-" + player]) / sum_eg for player in v}

# Per-player names, kept so that any other cell referring to them still works.
alpha_yp_messi = alpha["messi"]
alpha_yp_lewandowski = alpha["lewandowski"]
alpha_yp_miller = alpha["miller"]
alpha_yp_maguire = alpha_yp_miller  # old, misleading name for the "miller" weight

# Context value: attention-weighted sum of the known market values.
c = 0
for player in v:
    c += alpha[player] * v[player]

print("expected market value of ", your_player, ": ", round(c,2), "Mio CHF")

### Takeaway

We calculate basic attention by comparing a query (your player, aka $q$) with the keys (Messi & co, aka $k$) and using the resulting weights to combine the values (market values, aka $v$).

## Self-Attention

What if we try to calculate our $q$, $k$, and $v$ by ourselves?

This process is called self-attention, where each input vector from $x = [x_1,...,x_n]$ is also query, key and value:

$$x_i = q_i = k_i = v_i$$

However instead of just calculating $c_j = \sum_{i=1}^n \alpha_{ji}v_i$ with $\alpha_{ji} = \frac{e^{g_{ji}}}{\sum_{k=1}^ne^{g_{jk}}}$, it has been proven mathematically beneficial to linearly project these vectors (in our example we had a scalar value) into smaller dimensionalities. For this we use three projection matrices $W^Q$, $W^K$, $W^V$. This will give us the following equations:

$$ q_i^\star = W^Qq_i, k_i^\star = W^Kk_i,v_i^\star = W^Vv_i$$


These $W$ matrices play an essential role in learning: the attention mechanism itself does not contain any trainable parameters, so for fixed input vectors the elements of the $W$'s are what we need to learn. Also note that, because $W^Q$ and $W^K$ are two different matrices, the relationships between the input vectors become asymmetric.

In the end we can calculate $c_j =  \sum_{i=1}^n \alpha_{ji}v_i^\star$ for each element $x_j$.

<div><img src="Self-Attention1.drawio.png" width=80% style="margin-left:auto; margin-right:auto"></div>

<div><img src="SelfAttention_Context1.drawio.png" width=80% style="margin-left:auto; margin-right:auto"></div>

<div><img src="Self-Attention-Overall.drawio.png" width=100% style="margin-left:auto; margin-right:auto"></div>

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.models import KeyedVectors
from gensim import corpora
from pprint import pprint
import numpy as np

# Toy corpus: each string is treated as one sentence.
documents = ["Honestly, I don’t understand anything anymore",
             "Miller said.",
             "With all due respect to Messi and the other great players named,",
             "no one deserved it as much as Lewandowski.",
             "To be as remarkable as the Bavarian striker.",
             "Lewandowski numbers do look better on paper"]

# Whitespace tokenisation; str.split() already returns a fresh list per document.
# NOTE(review): punctuation stays attached to the tokens, so "Lewandowski."
# and "Lewandowski" end up as two different vocabulary entries.
texts = [doc.split() for doc in documents]

# Token <-> id mapping and the bag-of-words representation of every document.
dictionary = corpora.Dictionary(texts)
mycorpus = [dictionary.doc2bow(doc, allow_update=True) for doc in texts]

from gensim.models.word2vec import Word2Vec

# Train 8-dimensional word embeddings on the toy corpus.
# workers=1 and an explicit seed are used so that re-running the notebook can
# give the same vectors: with several worker threads the training order varies
# from run to run. (The gensim docs additionally mention PYTHONHASHSEED for
# reproducibility across interpreter launches.)
model = Word2Vec(sentences=texts, vector_size=8, window=5, min_count=1,
                 workers=1, seed=1)

# Embeddings of the three players used as input vectors later on.
x_messi = model.wv['Messi']
x_lewa = model.wv['Lewandowski']
x_miller = model.wv['Miller']

In [None]:
import random
# Input vectors, stacked row-wise: one word embedding per row.
# TODO: maybe take values from Stephan/Ziwei
x = np.array([x_messi, x_lewa, x_miller])

# we need to set the dimensions
d_model = x.shape[1]  # always the length of the input vectors
d_q = d_model // 4  # projected size shared by queries and keys; freely choosable
d_v = d_model // 2  # projected size of the values; may differ from d_q (here it does)

# Seeded generator, so that "Restart & Run All" reproduces the same matrices.
rng = np.random.default_rng(42)

# Generate the three projection matrices with entries drawn uniformly from [0, 1).
# In a trainable model these would be learned instead of random.
W_Q = rng.random((d_q, d_model))
W_K = rng.random((d_q, d_model))
W_V = rng.random((d_v, d_model))

In [None]:
# Scaled dot-product self-attention over all input vectors (the rows of x).

# Project every input vector into query, key and value space.
# Row i of each result is W x_i, i.e. q*_i, k*_i and v*_i respectively.
q_stars = x @ W_Q.T
k_stars = x @ W_K.T
v_stars = x @ W_V.T

# Alignment scores: scores[j, i] = q*_j . k*_i / sqrt(d_k).
# d_k is the dimension of the projected keys, not of the raw inputs (d_model):
# the dot product runs over d_k terms, so that is the size to scale by.
d_k = k_stars.shape[1]
scores = (q_stars @ k_stars.T) / np.sqrt(d_k)

# Row-wise softmax. Subtracting each row's maximum does not change the result
# mathematically but keeps exp() from overflowing on large scores.
shifted_scores = scores - scores.max(axis=1, keepdims=True)
exp_scores = np.exp(shifted_scores)
attention_weights = exp_scores / exp_scores.sum(axis=1, keepdims=True)

# Context vectors: c[j] = sum_i alpha_ji * v*_i, one row per input vector.
c = attention_weights @ v_stars

# context vector of the third input vector
print(c[2])

<img src="Encoder.PNG" width=30% style="margin-left:auto; margin-right:auto">