# Wstęp do przetwarzania języka naturalnego

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
device

### Średniowieczne podejścia - bag of words

In [None]:
from bs4 import BeautifulSoup
import re

In [None]:
# Load the tab-separated review table; quoting=3 (csv.QUOTE_NONE) keeps quote
# characters as literal text, and the index is reset to a fresh 0..n-1 range.
reviews = pd.read_csv("http://galera.ii.pw.edu.pl/~kdeja/data/sst2.tsv",delimiter="\t",quoting=3).reset_index(drop=True)

In [None]:
# Display the loaded DataFrame.
reviews

In [None]:
# Print the raw text stored under index label 4 of the "sentence" column.
print(reviews["sentence"][4])

In [None]:
from nltk.corpus import stopwords
# Print NLTK's English stop-word list (review_to_words removes these words).
print(stopwords.words('english'))

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def review_to_words(raw_review):
    """Convert a raw movie review into a cleaned, space-separated string.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case the text, split it on whitespace and drop English
    stop words.

    Args:
        raw_review: A single raw movie review (string, may contain HTML).

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left after cleaning.
    """
    review_text = BeautifulSoup(raw_review, 'lxml').get_text()
    letters_only = re.sub(r"[^a-zA-Z]", " ", review_text)
    # The stop-word set is cached because this function runs once per review.
    stops = _english_stopwords()
    return " ".join(
        word for word in letters_only.lower().split() if word not in stops
    )

In [None]:
# Clean the example sentence (index label 4) and show the result.
clean_review = review_to_words(reviews['sentence'][4])
print(clean_review)

In [None]:
num_reviews = reviews['sentence'].size

# Clean every review in order and collect the results.
clean_train_reviews = []
for position, raw_review in enumerate(reviews['sentence'], start=1):
    # Report progress after every 1000 processed reviews.
    if position % 1000 == 0:
        print('Review {} of {}'.format(position, num_reviews))
    clean_train_reviews.append(review_to_words(raw_review))

In [None]:
print('Creating the bag of words...')
from sklearn.feature_extraction.text import CountVectorizer

# scikit-learn's bag-of-words extractor: word-level analysis with the library's
# own tokenisation and preprocessing, no extra stop-word filtering, and a
# vocabulary capped at 1000 terms.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=1000,
)

# fit_transform() learns the vocabulary from the list of strings and builds the
# count matrix in one step; toarray() converts the sparse result into a dense
# NumPy array, which is easier to work with.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()
print('Bag of words completed')

In [None]:
# Terms kept by the vectorizer; their number is capped by max_features.
vocab = vectorizer.get_feature_names_out()
print(len(vocab))

In [None]:
# Random split with roughly 70% of the rows in the training part. A dedicated
# seeded generator keeps the split identical across runs; torch.manual_seed()
# does not seed NumPy.
split_rng = np.random.default_rng(1)
train_indices = split_rng.random(len(reviews)) > 0.3
all_labels = reviews["label"].values

# Slice the NumPy arrays first, then convert only the selected rows.
train_data = torch.from_numpy(train_data_features[train_indices]).float()
train_targets = torch.from_numpy(all_labels[train_indices]).long()

test_data = torch.from_numpy(train_data_features[~train_indices]).float()
test_targets = torch.from_numpy(all_labels[~train_indices]).long()

In [None]:
# Wrap the feature and label tensors so every sample is a (features, label) pair.
train_dataset = data.TensorDataset(train_data,train_targets)
test_dataset = data.TensorDataset(test_data,test_targets)

In [None]:
# Training batches are reshuffled every epoch and the final incomplete batch is
# dropped; evaluation keeps the original order and every sample.
train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True, drop_last=True)
test_loader = data.DataLoader(test_dataset, batch_size=128, shuffle=False)

In [None]:
class BoWClassifier(nn.Module):
    """Three-layer MLP mapping a bag-of-words count vector to class logits.

    Args:
        input_dim: Length of the input feature vector (the bag-of-words
            vocabulary size).
        hidden_dim: Width of the first hidden layer.
        bottleneck_dim: Width of the second hidden layer.
        num_classes: Number of output logits.

    The defaults reproduce the original fixed 1000 -> 500 -> 50 -> 2 layout.
    """

    def __init__(self, input_dim=1000, hidden_dim=500, bottleneck_dim=50, num_classes=2):
        super().__init__()
        self.lin1 = nn.Linear(input_dim, hidden_dim)
        self.act1 = nn.LeakyReLU()
        self.lin2 = nn.Linear(hidden_dim, bottleneck_dim)
        self.act2 = nn.LeakyReLU()
        self.lin3 = nn.Linear(bottleneck_dim, num_classes)

    def forward(self, x):
        """Return unnormalised logits, one row per input row."""
        x = self.act1(self.lin1(x))
        x = self.act2(self.lin2(x))
        return self.lin3(x)
# Instantiate the classifier, move it to the selected device and display its layers.
bow_model = BoWClassifier().to(device)
bow_model

In [None]:
def get_accuracy(model, data_loader):
    """Return the fraction of samples in ``data_loader`` classified correctly.

    The model is switched to evaluation mode and left in it; callers that
    continue training must call ``model.train()`` again.

    Args:
        model: Module mapping a batch of inputs to per-class logits.
        data_loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Accuracy in [0, 1]; 0.0 when the loader yields no samples.
    """
    first_param = next(model.parameters(), None)
    # Run on the device the model lives on; the notebook-level `device` is
    # only consulted for models without parameters.
    target_device = first_param.device if first_param is not None else device
    correct = 0
    total = 0
    model.eval()
    # No gradients are needed for evaluation, so skip building the graph.
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(target_device), labels.to(target_device)
            pred = model(inputs).argmax(dim=1)  # index of the largest logit
            correct += (pred == labels.view_as(pred)).sum().item()
            total += inputs.shape[0]
    return correct / total if total else 0.0

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(bow_model.parameters(), lr=0.001)

# Per-epoch history: epoch number, mean training loss, train/validation accuracy.
iters = []
losses = []
train_acc = []
val_acc = []
for n in range(10):
    # get_accuracy() leaves the model in eval mode, so training mode is
    # re-enabled once at the start of every epoch.
    bow_model.train()
    epoch_losses = []
    for x, labels in train_loader:
        x, labels = x.to(device), labels.to(device)
        # The logits already have shape (batch, classes); not squeezing them
        # keeps the batch dimension even for a single-sample batch.
        out = bow_model(x)

        optimizer.zero_grad()
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

    loss_mean = np.array(epoch_losses).mean()
    iters.append(n)
    losses.append(loss_mean)
    test_acc = get_accuracy(bow_model, test_loader)
    print(f"Epoch {n} loss {loss_mean:.3} test_acc: {test_acc:.3}")
    train_acc.append(get_accuracy(bow_model, train_loader))  # training accuracy
    val_acc.append(test_acc)  # validation accuracy


print("Final Training Accuracy: {}".format(train_acc[-1]))
print("Final Validation Accuracy: {}".format(val_acc[-1]))

In [None]:
# Two sentences that differ only by a negation, run through the same cleaning
# and vectorisation as the training reviews.
example_1_text = "I do not like this movie"
example_2_text = "I like this movie"
cleaned_examples = [review_to_words(text) for text in (example_1_text, example_2_text)]
examples = vectorizer.transform(cleaned_examples)
examples = torch.from_numpy(examples.toarray()).to(device).float()
bow_model(examples)

In [None]:
# Two sentences built from similar words in a different order, run through the
# same cleaning and vectorisation as the training reviews.
example_1_text = "The topic of this movie is love"
example_2_text = "I love a movie about this topic"
cleaned_examples = [review_to_words(text) for text in (example_1_text, example_2_text)]
examples = vectorizer.transform(cleaned_examples)
examples = torch.from_numpy(examples.toarray()).to(device).float()
bow_model(examples)

### Embeddingi w języku

In [None]:
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
# Fetch the text8 corpus through gensim's downloader and train a Word2Vec model
# on it with gensim's default hyper-parameters.
corpus = api.load('text8')
gensim_model = Word2Vec(corpus)

In [None]:
# Embedding vector learned for "king".
gensim_model.wv["king"]

In [None]:
# Words whose vectors are most similar to "king", as (word, similarity) pairs.
gensim_model.wv.most_similar("king")

In [None]:
# Words whose vectors are most similar to "car", as (word, similarity) pairs.
gensim_model.wv.most_similar("car")

In [None]:
# Words whose vectors are most similar to "love", as (word, similarity) pairs.
gensim_model.wv.most_similar("love")

## Jak trenować embeddingi 

In [None]:
# Toy vocabulary: each word gets an integer id that selects a row of the
# embedding table.
word_to_ix = dict(hello=0, world=1)
# One trainable 5-dimensional vector for each of the 2 vocabulary words.
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)
# Embedding layers are indexed with integer (long) tensors; this one holds
# only the id of "hello".
hello_index = word_to_ix["hello"]
lookup_tensor = torch.tensor([hello_index], dtype=torch.long)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

#### Czyli wklejamy warstwę nn.Embedding, uczymy tak jak powyżej i już?

## Continuous Bag-of-Words - przewidywanie słowa na podstawie kontekstu

In [None]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
test_sentence = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".lower().split()

# Build (context, target) pairs. The context lists the CONTEXT_SIZE words
# before the target, nearest first, followed by the CONTEXT_SIZE words after
# it. Targets too close to either end of the text are skipped.
ngrams = []
for center in range(CONTEXT_SIZE, len(test_sentence) - CONTEXT_SIZE):
    preceding = [test_sentence[center - offset] for offset in range(1, CONTEXT_SIZE + 1)]
    following = test_sentence[center + 1:center + 1 + CONTEXT_SIZE]
    ngrams.append((preceding + following, test_sentence[center]))

# Show the first 20 tokens and the first 3 training pairs.
print(test_sentence[:20])
print(ngrams[:3])

In [None]:
# `vocab` stays a set (used for its size). Ids are assigned in sorted order
# because the iteration order of a set of strings can differ between
# interpreter runs (hash randomisation), which would make the word -> id
# mapping irreproducible.
vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

In [None]:
class NGramLanguageModeler(nn.Module):
    """Predict a word from the concatenated embeddings of its context words.

    Args:
        vocab_size: Number of distinct words (rows of the embedding table and
            size of the output distribution).
        embedding_dim: Dimensionality of each word embedding.
        context_size: Number of context words on EACH side of the target, so
            the model consumes ``2 * context_size`` word ids per example.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(2 * context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: Long tensor of word ids, either a single context of shape
                ``(2 * context_size,)`` or a batch of contexts of shape
                ``(batch, 2 * context_size)``.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single context, or
            ``(batch, vocab_size)`` for a batch.
        """
        embeds = self.embeddings(inputs)
        if inputs.dim() == 1:
            # Single example: concatenate its embeddings into one row.
            embeds = embeds.view(1, -1)
        else:
            # Batch: concatenate the embeddings of each example separately.
            embeds = embeds.flatten(start_dim=1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        return F.log_softmax(out, dim=1)

In [None]:
losses = []
loss_function = nn.NLLLoss()
emb_model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.Adam(emb_model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in ngrams:
        # Turn the context words and the target word into integer-id tensors.
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor([word_to_ix[target]], dtype=torch.long)

        # One optimisation step per (context, target) pair.
        emb_model.zero_grad()
        log_probs = emb_model(context_idxs)
        loss = loss_function(log_probs, target_idx)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Record and report the summed loss of the epoch.
    print(total_loss)
    losses.append(total_loss)

In [None]:
# Learned embedding row for "computer".
print(emb_model.embeddings.weight[word_to_ix["computer"]])

In [None]:
# Learned embedding row for "computational".
print(emb_model.embeddings.weight[word_to_ix["computational"]])

In [None]:
# Compare "process" with two other words via the cosine similarity of their
# learned embedding rows; no gradients are needed for this inspection.
with torch.no_grad():
    emb_table = emb_model.embeddings.weight
    process_vec = emb_table[word_to_ix["process"]].unsqueeze(0)
    computational_vec = emb_table[word_to_ix["computational"]].unsqueeze(0)
    study_vec = emb_table[word_to_ix["study"]].unsqueeze(0)
    sim1 = torch.cosine_similarity(process_vec, computational_vec)
    sim2 = torch.cosine_similarity(process_vec, study_vec)

print(sim1)
print(sim2)

In [None]:
# NOTE(review): "Śpiulkolot" does not occur in the toy corpus, so this lookup
# raises KeyError — presumably a deliberate out-of-vocabulary demonstration.
print(emb_model.embeddings.weight[word_to_ix["Śpiulkolot"]])

In [None]:
# Shape of the embedding table: (vocabulary size, EMBEDDING_DIM).
emb_model.embeddings.weight.size()

## Mini zadanie - zaimplementuj skip-gram - w odwrotną stronę
Przewidujmy kontekst w oparciu o jedno słowo

# Pytanie: jak duży musi być model dla prawdziwego słownika?

# Rozwiązywanie problemów z wykorzystaniem embeddingów

In [None]:
# Copy the trained Word2Vec vectors into a float32 torch tensor (one row per word).
emb_weights = torch.FloatTensor(gensim_model.wv.vectors)

In [None]:
# (number of words, embedding dimension)
emb_weights.size()

In [None]:
# freeze=True (also the default) keeps the pretrained vectors fixed by turning
# off gradients on the weight tensor. Assigning `requires_grad` on the module
# object itself would only create an unused attribute and freeze nothing.
embedding = nn.Embedding.from_pretrained(emb_weights, freeze=True)

In [None]:
# gensim's word -> row-index mapping, used here as a whole-word tokenizer.
tokenizer = gensim_model.wv.key_to_index

In [None]:
# Convert every review into the list of vocabulary ids of its lower-cased,
# whitespace-separated words. Words missing from the vocabulary are dropped,
# so a review may end up with fewer ids than words (possibly none).
clean_train_reviews_tokenized = []
for review in reviews['sentence']:
    review_tokenized = [
        tokenizer[token]
        for token in (word.lower() for word in review.split())
        if token in tokenizer
    ]
    clean_train_reviews_tokenized.append(review_tokenized)

In [None]:
class ReviewDataset(Dataset):
    """In-memory dataset of (token-id tensor, label tensor) pairs.

    Every sequence and every label is converted to a ``torch.long`` tensor
    once, when the dataset is built.
    """

    def __init__(self, data,labels):
        # Pair each token-id sequence with the label at the same position.
        self.data = [
            (torch.from_numpy(np.array(tokens)).long(), torch.tensor(label).long())
            for tokens, label in zip(data, labels)
        ]

    def __len__(self):
        """Number of stored (sequence, label) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at position ``idx``."""
        token_ids, label = self.data[idx]
        return token_ids, label

In [None]:
# An object array lets the variable-length token lists be selected with the
# same boolean mask that splits the labels.
tokenized_reviews = np.array(clean_train_reviews_tokenized, dtype=object)
review_labels = reviews["label"].values
train_data = ReviewDataset(tokenized_reviews[train_indices], review_labels[train_indices])
test_data = ReviewDataset(tokenized_reviews[~train_indices], review_labels[~train_indices])

In [None]:
from torch.nn.utils.rnn import pad_sequence
def pad_collate(batch):
    """Collate variable-length (sequence, label) samples into one batch.

    Args:
        batch: Sequence of ``(token_ids, label)`` tensor pairs.

    Returns:
        A tuple ``(padded, labels, last_positions)``: the sequences padded
        with 0 to the longest one (batch first), the stacked labels, and a
        list with the index of the last real token of every sequence
        (``len(sequence) - 1``).
    """
    sequences, labels = zip(*batch)
    last_positions = [length - 1 for length in map(len, sequences)]
    padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, torch.stack(labels), last_positions

In [None]:
# pad_collate pads every batch to its longest sequence; training batches are
# shuffled and the last incomplete batch is dropped.
train_loader = DataLoader(train_data, batch_size=32, collate_fn=pad_collate, shuffle=True,drop_last=True)
test_loader = DataLoader(test_data, batch_size=32, collate_fn=pad_collate, shuffle=False)

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class LSTMRegressor(nn.Module):
    """LSTM over frozen pretrained embeddings with a linear output head.

    Args:
        input_size: LSTM input width; must equal the embedding dimension
            (the number of columns of ``emb_weights``).
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        out_size: Number of values produced by the output layer.
        emb_weights: Float tensor of pretrained embeddings, one row per
            vocabulary id.
        bidirectional: Whether the LSTM also reads the sequence backwards.
    """

    def __init__(self, input_size, hidden_size, num_layers, out_size, emb_weights, bidirectional = False):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # Stored as the NUMBER of directions (1 or 2), used for tensor sizes.
        self.bidirectional = 2 if bidirectional else 1
        # freeze=True disables gradients on the embedding weights; setting
        # `requires_grad` on the module object would not do that.
        self.embeddings = nn.Embedding.from_pretrained(emb_weights, freeze=True)
        # batch_first=True lets the LSTM consume (batch, seq, feature) input
        # directly, without transposing before and after the call.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size * self.bidirectional, out_size)

    def init_hidden(self, batch_size):
        """Return zero ``(hidden, cell)`` states on the model's own device.

        Each state has shape ``(num_layers * directions, batch_size,
        hidden_size)``.
        """
        shape = (self.num_layers * self.bidirectional, batch_size, self.hidden_size)
        model_device = self.fc.weight.device
        hidden = torch.zeros(shape, device=model_device)
        state = torch.zeros(shape, device=model_device)
        return hidden, state

    def forward(self, x, len_x, hidden=None):
        """Run the network and read the output at one time step per sequence.

        Args:
            x: Long tensor of token ids, shape ``(batch, seq_len)``.
            len_x: Time-step index to read for every sequence (a list with
                one entry per batch row, or a single int).
            hidden: Optional ``(hidden, cell)`` initial state; zeros are used
                when omitted.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(batch, out_size)`` and ``hidden`` is the final LSTM state.
        """
        embedded = self.embeddings(x)
        all_outputs, hidden = self.lstm(embedded, hidden)
        # Pick, for every batch row, the LSTM output at its requested step.
        # NOTE(review): with bidirectional=True the backward half of that
        # output has only read the positions from the padded end back to this
        # step — confirm this is intended before relying on it.
        batch_idx = torch.arange(all_outputs.size(0), device=all_outputs.device)
        step_idx = torch.as_tensor(len_x, dtype=torch.long, device=all_outputs.device)
        last_seq_items = all_outputs[batch_idx, step_idx]
        return self.fc(last_seq_items), hidden
     
# input_size=100, hidden_size=100, one LSTM layer, two outputs.
# NOTE(review): input_size must match the width of emb_weights — presumably 100,
# gensim's default Word2Vec vector size; confirm.
lstm_model = LSTMRegressor(100, 100, 1, 2, emb_weights).to(device)
lstm_model

In [None]:
optimizer = torch.optim.Adam(lstm_model.parameters(), lr = 0.001)
loss_fun = nn.CrossEntropyLoss()
lstm_model.train()

# Training loop: 101 epochs, reporting the mean batch loss every 10th epoch.
for epoch in range(101):
    losses = 0
    batches = 0
    for batches, (x, targets, len_x) in enumerate(train_loader, start=1):
        x, targets = x.to(device), targets.to(device)
        # Fresh zero state for every batch, moved to the training device.
        hidden, state = (t.to(device) for t in lstm_model.init_hidden(x.size(0)))

        optimizer.zero_grad()
        preds, _ = lstm_model(x, len_x, (hidden, state))
        preds = preds.squeeze(1)
        loss = loss_fun(preds, targets)
        loss.backward()
        optimizer.step()

        losses += loss.item()
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}, loss: {losses/batches:.3}")

In [None]:
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
lstm_model.load_state_dict(torch.load("lab_13/lstm_model_dict", map_location=device))

In [None]:
# Collect the model outputs and the true labels for the whole test set.
lstm_model.eval()
preds_list = []
targets_list = []
with torch.no_grad():
    for x, targets, len_x in test_loader:
        # Keep the labels as NumPy arrays before anything moves to the device.
        targets_list.append(targets.numpy())
        x, targets = x.to(device), targets.to(device)
        hidden, state = (t.to(device) for t in lstm_model.init_hidden(x.size(0)))
        preds, _ = lstm_model(x, len_x, (hidden, state))
        preds = preds.squeeze(1)
        preds_list.append(preds.cpu().numpy())

In [None]:
# Join the per-batch arrays once, then compare the arg-max class with the labels.
all_preds = np.concatenate(preds_list)
all_targets = np.concatenate(targets_list)
test_accuracy = (np.argmax(all_preds, 1) == all_targets).sum() / len(all_targets)
print(f"Test accuracy: {test_accuracy:.3}")

In [None]:
# torch.save(lstm_model.state_dict(),"lab_13/lstm_model_dict")

In [None]:
example_1_text = "I do not like this movie"
example_2_text = "I like this movie"


def _tokenize_example(text):
    """Map text to vocabulary ids like the training reviews were tokenized.

    Every whitespace-separated word is lower-cased and looked up in
    ``tokenizer``; words missing from the vocabulary are dropped.
    """
    lowered = (word.lower() for word in text.split())
    return [tokenizer[token] for token in lowered if token in tokenizer]


example_1_tokenized = _tokenize_example(example_1_text)
example_2_tokenized = _tokenize_example(example_2_text)

hidden, state = lstm_model.init_hidden(1)
hidden, state = hidden.to(device), state.to(device)
# Inference only: no gradients are needed. Each call gets a batch of one
# sequence and reads the output at the index of its last token.
with torch.no_grad():
    preds_1, _ = lstm_model(
        torch.tensor([example_1_tokenized], dtype=torch.long).to(device),
        len(example_1_tokenized) - 1,
        (hidden, state),
    )
    preds_2, _ = lstm_model(
        torch.tensor([example_2_tokenized], dtype=torch.long).to(device),
        len(example_2_tokenized) - 1,
        (hidden, state),
    )

In [None]:
# Raw model outputs for the two example sentences.
print(preds_1)
print(preds_2)

# Arytmetyka na embeddingach

In [None]:
# Word2Vec vector for "car", read straight from the gensim model.
gensim_model.wv["car"]

In [None]:
# Row index of "car" in the embedding matrix.
tokenizer["car"]

In [None]:
# The same vector, fetched from the torch copy of the embedding matrix.
emb_weights[tokenizer["car"]]

In [None]:
# One heatmap row per vector: king, man, woman, king - man + woman, queen.
wv = gensim_model.wv
analogy_rows = [
    wv["king"],
    wv["man"],
    wv["woman"],
    wv["king"] - wv["man"] + wv["woman"],
    wv["queen"],
]

plt.figure(figsize=(15,4))
sns.heatmap(
    analogy_rows,
    cbar=True,
    xticklabels=False,
    yticklabels=False,
    linewidths=1,
    cmap="vlag",
)
plt.show()

In [None]:
# Analogy arithmetic on word vectors: paris + germany - berlin.
x = gensim_model.wv["paris"] + gensim_model.wv["germany"] - gensim_model.wv["berlin"]

In [None]:
# Display the resulting vector.
x

In [None]:
# Vector for "france", for comparison with x.
gensim_model.wv["france"]

## Mini zadanie: Jak możemy znaleźć, do czego odnosi się wektor x?