# Zaawansowane modele językowe 

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn

In [None]:
# Use the first CUDA device when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device:", device)

In [None]:
import transformers
from transformers import *
import numpy as np
from datasets import load_dataset
import evaluate
from tqdm import tqdm

# Bert

![title](https://i.pinimg.com/originals/d6/6a/3e/d66a3e867580854200fa37f08e8addaa.gif "segment")

In [None]:
# One checkpoint name is used for both loads below, so the tokenizer and the
# model always come from the same pretrained checkpoint.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
# Bare expression: in a notebook cell this echoes the tokenizer's repr as the cell output.
tokenizer

In [None]:
# Bare expression: in a notebook cell this echoes the model's repr as the cell output.
model

## Wstępnie wytrenowane modele językowe


*   Huggingface: https://huggingface.co/transformers/pretrained_models.html
*   Community: https://huggingface.co/models



In [None]:
# Encode one example sentence and echo the encoding; its "input_ids" entry is
# mapped back to tokens in a later cell.
tokenized_sentence = tokenizer("BERT is designed to pre-train deep bidirectional representations")
tokenized_sentence

In [None]:
# Map the ids of the encoded sentence back to token strings to see how it was split.
tokenizer.convert_ids_to_tokens(tokenized_sentence["input_ids"])

In [None]:
# Encode a Polish word; presumably it is not in the checkpoint's vocabulary and
# gets split into several pieces — check the output to confirm.
tokenizer("Śpiulkolot")

In [None]:
# Hard-coded ids, presumably copied from the output of the previous cell —
# confirm they still match if the checkpoint or the word changes.
tokenizer.convert_ids_to_tokens([101, 11867, 17922, 13687, 12898, 2102, 102])

## Dotrenowywanie gotowego modelu

In [None]:
# Run one sentence through the base model, step by step:
# text -> encoded tensors -> model output.
sentence = "BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers."
encoded = tokenizer(sentence, return_tensors="pt")
result = model(**encoded)
# Echo the names of the fields available on the output.
result.keys()

In [None]:
# Echo the `last_hidden_state` tensor together with its size.
result.last_hidden_state, result.last_hidden_state.size()

In [None]:
# Load the same checkpoint through the sequence-classification class, configured for two labels.
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Bare expression: in a notebook cell this echoes the classifier's repr as the cell output.
sentiment_model

### Training

https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments

In [None]:
# Load the "sst2" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue","sst2")

In [None]:
# Bare expression: in a notebook cell this echoes the loaded dataset object.
raw_datasets

In [None]:
# Echo the "sentence" column of the "train" split (the raw, untokenized text).
raw_datasets["train"]["sentence"]

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"sentence"`` entry of ``examples``.

    Uses the notebook-level ``tokenizer`` with ``padding="max_length"`` and
    truncation enabled, and returns the tokenizer's output unchanged.
    """
    sentences = examples["sentence"]
    return tokenizer(sentences, padding="max_length", truncation=True)


In [None]:
# Tokenize in batches, then drop the raw-text and index columns in the same chain.
tokenized_datasets = raw_datasets.map(
    tokenize_function, batched=True
).remove_columns(["sentence", "idx"])
# Switch the output format to "torch" so the DataLoader receives tensors.
tokenized_datasets.set_format("torch")

# Despite the "small_" prefix the full train split is used; append
# .shuffle(seed=42).select(range(10000)) to subsample it.
small_train_dataset = tokenized_datasets["train"]
small_eval_dataset = tokenized_datasets["validation"]

In [None]:
# Both loaders use the same batch size; only the training loader shuffles.
BATCH_SIZE = 16
train_dataloader = DataLoader(small_train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=BATCH_SIZE)


In [None]:
# Adam over all classifier parameters with the learning rate named explicitly.
LEARNING_RATE = 5e-5
optimizer = Adam(params=sentiment_model.parameters(), lr=LEARNING_RATE)
# Move the classifier to the selected device (also echoes the model in a notebook).
sentiment_model.to(device)

In [None]:
# Fine-tune the classifier for a fixed number of epochs and print the mean
# training loss after each one.
num_epochs = 3
loss_fun = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    # from_pretrained() hands the model back in evaluation mode (dropout off),
    # so training mode has to be switched on explicitly.
    sentiment_model.train()

    losses = []
    for batch in tqdm(train_dataloader):
        labels = batch["label"].to(device)
        # Only these two tensors are passed to the model; the labels stay
        # separate because the loss is computed explicitly below.
        inputs = {
            "attention_mask": batch["attention_mask"].to(device),
            "input_ids": batch["input_ids"].to(device),
        }

        outputs = sentiment_model(**inputs)
        loss = loss_fun(outputs.logits, labels)
        loss.backward()

        optimizer.step()
        # Clear the gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        losses.append(loss.item())

    # Leave the model in inference mode at the end of every epoch so that
    # whatever runs next (evaluation, prediction) is not affected by dropout.
    sentiment_model.eval()
    print(np.mean(losses))

In [None]:
# To (re)create the checkpoint after training, run:
# torch.save(sentiment_model.state_dict(),"sentiment_model_dict")
#
# map_location remaps the stored tensors onto the device selected for this
# session, so a checkpoint saved on a GPU also loads on a CPU-only machine.
sentiment_model.load_state_dict(torch.load("sentiment_model_dict", map_location=device))

In [None]:
# Compute accuracy of the fine-tuned classifier on the validation loader.
metric = evaluate.load("accuracy")
# Put the classifier that is actually evaluated into inference mode
# (previously this was called on the unrelated base `model`).
sentiment_model.eval()
for batch in eval_dataloader:
    labels = batch["label"].to(device)
    inputs = {
        "attention_mask": batch["attention_mask"].to(device),
        "input_ids": batch["input_ids"].to(device),
    }

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = sentiment_model(**inputs)

    # The predicted class is the index of the largest logit.
    predictions = torch.argmax(outputs.logits, dim=-1)
    metric.add_batch(predictions=predictions, references=labels)

metric.compute()

# Odpowiadanie na pytania (Question answering)

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Checkpoint used for question answering; the name is written once and reused
# for both loads so the tokenizer and the model cannot drift apart.
model_name = "deepset/roberta-base-squad2"
qa_tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [None]:
# Passage the question-answering model searches for an answer. The text
# (including its line breaks) is passed to the model as-is.
context = '''Warsaw is the capital and largest city of Poland. 
The metropolis stands on the River Vistula in east-central Poland and its population is officially estimated at 1.8 million 
residents within a greater metropolitan area of 3.1 million residents, which makes Warsaw the 7th 
most-populous capital city in the European Union. 
The city area measures 517 km2 (200 sq mi) and comprises 18 boroughs, 
while the metropolitan area covers 6,100 km2 (2,355 sq mi).
Warsaw is an alpha- global city, a major cultural, political and economic hub, 
and the country's seat of government. Its historical Old Town was designated a UNESCO World Heritage Site.'''

In [None]:
# Build the pipeline from the already-loaded QA model and tokenizer. Passing the
# checkpoint name instead would load both a second time and leave
# `qa_model` / `qa_tokenizer` unused.
nlp = pipeline('question-answering', model=qa_model, tokenizer=qa_tokenizer)
QA_input = {
    'question': 'Where does the water in Warsaw come from?',
    'context': context,
}
res = nlp(QA_input)

In [None]:
# Bare expression: in a notebook cell this echoes the pipeline's result.
res

# Generacja tekstu:
Pograj w grę: https://play.aidungeon.io/main/newGame (model GPT-3)

# Obrazki z tekstu
https://openai.com/dall-e-2/

https://labs.openai.com/

# Chatbot
https://openai.com/blog/chatgpt/

https://chat.openai.com/chat