## 安裝與載入套件
執行階段類型請切換到 T4 GPU（本筆記本的模型以 CUDA GPU 執行為前提）。

In [None]:
#@title 安裝套件
!pip install datasets
!pip install transformers[torch]
!pip install textwrap3
!pip install pyarrow
!pip install opencc
!pip install sacremoses

In [None]:
#@title 套件載入

# 載入 transformers 套件
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AutoModel, AutoConfig
from transformers import pipeline
from datasets import load_dataset

# 載入其他套件
import numpy as np
import textwrap # 打印用套件
import opencc # 簡轉繁
import os

# 載入 pytorch 套件
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision.utils import save_image

## 1. 使用 transformers 模型

### 1.1 情緒分析模型

In [None]:
# Load a binary sentiment-classification model.
# DistilBERT - Base, Uncased - Finetuned - SST-2 - English
# SST-2: Stanford Sentiment Treebank 2

model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
# device=0 selects the first CUDA GPU and device=-1 selects the CPU; fall back
# to the CPU so this cell does not fail on a runtime without a CUDA device.
pipe = pipeline("text-classification", model=model_name,
                device=0 if torch.cuda.is_available() else -1)

In [None]:
# Classify a single string; the pipeline result is iterated as a list of dicts.
results = pipe("This restaurant is awesome")

for result in results:
  # One label line followed by one score line per prediction.
  print(result['label'], result['score'], sep='\n')

In [None]:
# Classify several strings in one call by passing a list.
reviews = ["This restaurant is awesome", "what are you doing?"]
results = pipe(reviews)

for result in results:
  # One label line followed by one score line per prediction.
  print(result['label'], result['score'], sep='\n')

### 1.2 使用語言翻譯模型

In [None]:
# English -> Chinese translation model.
# device=0 selects the first CUDA GPU and device=-1 selects the CPU; fall back
# to the CPU so this cell does not fail on a runtime without a CUDA device.
translator = pipeline("translation_en_to_zh", model="Helsinki-NLP/opus-mt-en-zh",
                      device=0 if torch.cuda.is_available() else -1)
# Simplified Chinese -> Traditional Chinese converter.
converter = opencc.OpenCC('s2t')

In [None]:
# Translate, then print the Simplified output and its Traditional conversion.
translation = translator("This restaurant is awesome")

for result in translation:
    simplified = result["translation_text"]
    print(f'簡體輸出：{simplified}')
    traditional_chinese = converter.convert(simplified)
    print(f'繁體輸出：{traditional_chinese}')

### 1.3 QA問答

In [None]:
# Build the question-answering model and the sample data.
# device=0 selects the first CUDA GPU and device=-1 selects the CPU; fall back
# to the CPU so this cell does not fail on a runtime without a CUDA device.
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad",
                       device=0 if torch.cuda.is_available() else -1)

# Passage the answer is extracted from.
context = """
Hugging Face is a company that focuses on natural language processing technology.
They have developed many popular open-source tools and libraries, including the Transformers and Datasets libraries.
These tools are widely used for building and deploying machine learning models.
"""
question = "What does Hugging Face focus on?"

In [None]:
# Ask the question against the context and show the extracted answer.
result = qa_pipeline(question=question, context=context)
answer = result['answer']

print(f"Question: {question}")
print(f"Answer: {answer}")

In [None]:
# SQuAD (Stanford Question Answering Dataset)
# Same import as in the package-loading cell, repeated here.
from datasets import load_dataset
# Load the dataset and print its summary.
raw_datasets = load_dataset('squad')
print(raw_datasets)

### 1.4 命名實體識別 (Named Entity Recognition, NER)

In [None]:
## Build the NER model and the sample text.
# device=0 selects the first CUDA GPU and device=-1 selects the CPU; fall back
# to the CPU so this cell does not fail on a runtime without a CUDA device.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english",
                        device=0 if torch.cuda.is_available() else -1)

text = "Hugging is a company based in New York City. The CEO is Clem Delangue."

In [None]:
## Run entity recognition on the text and list each token with its tag.
entities = ner_pipeline(text)

for entity in entities:
    word, tag = entity['word'], entity['entity']
    print(f"Entity: {word}, Label: {tag}")

### 1.5 文字生成 (繁體)

In [None]:
# CKIP Lab (Chinese Knowledge and Information Processing group) GPT-2 model.
# device=0 selects the first CUDA GPU and device=-1 selects the CPU; fall back
# to the CPU so this cell does not fail on a runtime without a CUDA device.
generator = pipeline("text-generation", model="ckiplab/gpt2-base-chinese", truncation=True,
                     device=0 if torch.cuda.is_available() else -1)


In [None]:
# Prepare the prompt that the generation cells below continue from.
prompt = "我隔壁有一個鄰居叫小王，"

In [None]:
# Generate a single continuation.
# do_sample=True is passed explicitly: temperature only takes effect when
# sampling is enabled, so the call no longer depends on the model's defaults.
text = generator(prompt, max_length=100, num_return_sequences=1, temperature=1, do_sample=True)

# Strip the spaces from the returned text.
text = text[0]['generated_text'].replace(' ','')
print(f'生成文字：{text}')

In [None]:
# Generate several continuations.
# Greedy decoding cannot return more than one sequence, so sampling is enabled
# explicitly instead of relying on the model's default generation settings.
texts = generator(prompt, max_length=100, num_return_sequences=3, temperature=1, do_sample=True)

for i, text in enumerate(texts):
  # Strip the spaces from each returned text before printing it, numbered from 1.
  text = text['generated_text'].replace(' ','')
  print(f"{i+1}: {text}")

In [None]:
# Inspect the model configuration.
model_name = "ckiplab/gpt2-base-chinese"
config = AutoConfig.from_pretrained(model_name)
# print(config)

# max_position_embeddings is reported below as the maximum input length.
max_length = config.max_position_embeddings
print(f"輸入的最大長度: {max_length}")

In [None]:
# Inspect the model's parameter count.
# Reuses model_name and config from the configuration cell.
model = AutoModel.from_pretrained(model_name, config=config)
num_parameters = model.num_parameters()
print(f"模型的參數量: {num_parameters}")

## 2. BERT 模型微調

### 2.1 載入數據集

In [None]:
# Load the IMDB dataset.
dataset = load_dataset('imdb')

# Print the dataset summary.
print(f'資料輪廓：{dataset}')

In [None]:
# Inspect one training row: its first 180 characters and its label.

data_index = 5
sample_row = dataset['train'][data_index]
text_label = sample_row['label']
# Wrap the excerpt at 50 characters per line for display.
text_data = textwrap.fill(sample_row['text'][:180], width=50)

print('[內文]', text_data, '[標籤]', text_label, sep='\n')

In [None]:
# Use a subset of the data: 2000 training rows vs 500 test rows.
# Both splits are shuffled with a fixed seed before the first rows are taken.
train_dataset = dataset['train'].shuffle(seed=42).select(range(2000))
test_dataset = dataset['test'].shuffle(seed=42).select(range(500))

In [None]:
# Load the tokenizer (no model is loaded in this cell).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Create the sample text used by the encode/decode cells.
input_text = 'Hi I am a human'
print(f'文字訊息：{input_text}')

In [None]:
# Encode the text with the tokenizer and print the result.
encode_text = tokenizer.encode(input_text)
print(f'編碼訊息：{encode_text}')

In [None]:
# Decode the encoded ids back into text.
text = tokenizer.decode(encode_text)
print(text)

In [None]:
# Load the tokenizer and the classification model.
# The tokenizer is loaded from the same checkpoint as in the earlier tokenizer cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# num_labels=2 requests a two-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Data preprocessing function
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: mapping with a ``'text'`` entry, as handed over by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch, truncated and padded to 512 tokens.
    """
    # padding='max_length' gives every row the same length. With padding=True
    # each map batch is padded only to its own longest row, so rows coming from
    # different map batches could not be stacked into one training batch.
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

# Tokenize both subsets in batches with preprocess_function.
train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

### 2.2 BERT模型建立 & 微調
[TrainingArguments 文件](https://huggingface.co/docs/transformers/v4.42.0/en/main_classes/trainer#transformers.TrainingArguments)


In [None]:
# Hyperparameter settings
training_args = TrainingArguments(
    output_dir='./results',              # directory where results are saved
    num_train_epochs=3,                  # train for 3 epochs in total
    per_device_train_batch_size=8,       # batch size of 8 per device
    warmup_steps=500,                    # 500 warm-up steps
    weight_decay=0.01,                   # weight-decay coefficient of 0.01
    # `evaluation_strategy` was renamed to `eval_strategy` and the old name is
    # no longer accepted by current transformers releases; the notebook installs
    # an unpinned transformers, so the current name is used.
    eval_strategy="epoch",               # evaluate at the end of every epoch
)

In [None]:
# Wire the model, the training arguments and both datasets into a Trainer.
trainer = Trainer(
    model=model,                         # model to fine-tune
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # dataset used for evaluation
)

In [None]:
# Wrap the model and tokenizer in a sentiment-analysis pipeline.
# device=0 selects the first CUDA GPU and device=-1 selects the CPU; fall back
# to the CPU so this cell does not fail on a runtime without a CUDA device.
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer,
               device=0 if torch.cuda.is_available() else -1)
# Human-readable names for the integer dataset labels.
label_map = {0: 'negative', 1: 'positive'}

### 2.3 訓練前測試 (Before test)

In [None]:
# Pick one row from the test subset and show its excerpt and true label.
data_index = 3

sample = test_dataset[data_index]
input_text = sample['text']
true_label = label_map[sample['label']]

print(f'輸入內容：{input_text[:50]}(前50個字)')
print(f"真實標籤：{true_label}")

In [None]:
# Look at the prediction for the selected sample.
predict = nlp(input_text)

print(f"Score：{predict[0]['score']}")
print(f"預測結果：{predict[0]['label']}") # LABEL_0: negative, LABEL_1: positive

In [None]:
# Positive-sentiment sanity check.
input_text = 'I love this movie!'
predict = nlp(input_text)

print(f"預測結果：{predict[0]['label']}")
# The second line prints the score, so it is labelled "Score" like the other
# prediction cells instead of repeating the prediction label.
print(f"Score：{predict[0]['score']}")

In [None]:
# Negative-sentiment sanity check.
input_text = 'What a mess of this movie!'
predict = nlp(input_text)

print(f"預測結果：{predict[0]['label']}")
# The second line prints the score, so it is labelled "Score" like the other
# prediction cells instead of repeating the prediction label.
print(f"Score：{predict[0]['score']}")

### 2.4 模型訓練

In [None]:
# Train the model.
trainer.train()

# Run an evaluation pass with the trainer (eval_dataset was given to Trainer).
trainer.evaluate()

### 2.5 微調後測試

In [None]:
# Pick one row from the test subset and show its excerpt and true label.
data_index = 3

sample = test_dataset[data_index]
input_text = sample['text']
true_label = label_map[sample['label']]

print(f'輸入內容：{input_text[:50]}(前50個字)')
print(f"真實標籤：{true_label}")

In [None]:
# Look at the prediction for the selected sample.
predict = nlp(input_text)

print(f"Score：{predict[0]['score']}")
print(f"預測結果：{predict[0]['label']}") # LABEL_0: negative, LABEL_1: positive

In [None]:
# Positive-sentiment sanity check.
input_text = 'I love this movie!'
predict = nlp(input_text)

print(f"Score：{predict[0]['score']}")
print(f"預測結果：{predict[0]['label']}") # LABEL_0: negative, LABEL_1: positive

In [None]:
# Negative-sentiment sanity check.
input_text = 'What a mess of this movie!'
predict = nlp(input_text)

print(f"Score：{predict[0]['score']}")
print(f"預測結果：{predict[0]['label']}") # LABEL_0: negative, LABEL_1: positive

## 隨堂練習

In [None]:
#@title 練習 1：詩歌的文字生成
### 情境：喜愛創作的你，想透過 GPT 取得一些創意與靈感
### 試著提供開頭的線索，接著讓 GPT 輸出剩下的創作
### 參考：（1.5 文字生成 (繁體)）

### 實作練習 ###

prompt = '?' # 歌詞、唐詩都可
generator = ?

text = ?

### 實作練習 ###

text = text[0]['generated_text'].replace(' ','')
print(f'生成文本：{text}')

### 練習結果
# 接著你提供的 prompt，能成功產生出延續的文字

In [None]:
#@title 練習 2：文件的資訊檢索
### 情境：你的朋友是一個歷史軍事迷，他對於戰爭史非常感到興趣，
### 他想請你從二次世界大戰的歷史文本中，幫他尋找三個問題的答案。
### 參考章節: 1.3 QA問答
# 1. 問題：
#   1.1 第二次世界大戰什麼時候開始？
#   1.2 主要的戰場在哪裡？
#   1.3 誰贏得了這場戰爭？

# Passage the exercise's answers are extracted from.
context = """
World War II (1939-1945) was a global war that involved most of the world's nations,
forming two opposing military alliances: the Allies and the Axis.
It began on September 1, 1939, when Germany invaded Poland.
The war ended in 1945 with the victory of the Allies.
Major theaters of the war included Europe, East Asia, and the Pacific.
The war caused tens of millions of deaths and widespread destruction.
"""

# The three questions to ask against the passage.
questions = [
    "When did World War II begin?",
    "Where are the main battlefields?",
    "Who won the war?",
]

### 實作練習 ###

  ? # 建模型
  ? # 使用迴圈
    ? # QA問答
  print(f"Question: {question}")
  print(f"Answer: {result['answer']}\n")

### 實作練習 ###

### 實作結果
# Question: When did World War II begin?
# Answer: September 1, 1939

# Question: Where are the main battlefields?
# Answer: Europe, East Asia, and the Pacific

# Question: Who won the war?
# Answer: Allies

In [None]:
#@title 練習 3：Google 評論監控
### 情境：隔壁的咖啡廳老闆，想請你做一個評論監控器，可以分析咖啡廳的google評論，
### 使得他能在最短時間，得知近期的客人對店有不滿或者抱怨
### 參考章節: 1.1 情緒分析模型

model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
# device=0 selects the first CUDA GPU and device=-1 selects the CPU; fall back
# to the CPU so this cell does not fail on a runtime without a CUDA device.
pipe = pipeline("text-classification", model=model_name,
                device=0 if torch.cuda.is_available() else -1)

# Sample reviews to screen for complaints.
test_messages = [
                  "Awesome, I love the coffee in this shop",
                  "This cafe is terrible",
                  "This place has the best cakes in the world"
                ]

### 實作練習 ###
### 提示：先用 for 跑串列，再用 if-else
### 判斷 result 裡的資訊是否為 NEGATIVE
?
  ?
  ?

    print('This is an issue from comments!')
    print(message)

### 實作練習 ###

### 實作結果
# This is an issue from comments!
# This cafe is terrible

## 3. Generative Adversarial Network (GAN)

### 3.1 資料前處理與設定資料集

In [None]:
# Choose the compute device: CUDA when available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Preprocessing applied to every MNIST image.
# NOTE(review): only ToTensor() is applied while Generator ends in tanh
# ([-1, 1]); presumably the real images are in [0, 1] here - confirm whether a
# Normalize step was intended.
transform = transforms.Compose([transforms.ToTensor()])

In [None]:
# Create the MNIST datasets and their data loaders.
# Note: train_dataset / test_dataset are rebound here; they previously held the
# IMDB subsets used for BERT fine-tuning.
batch_size = 128
mnist_root = './mnist_data/'

train_dataset = datasets.MNIST(root=mnist_root, train=True, transform=transform, download=True)
test_dataset = datasets.MNIST(root=mnist_root, train=False, transform=transform, download=False)

# Only the training loader shuffles.
make_loader = torch.utils.data.DataLoader
train_loader = make_loader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = make_loader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

### 3.2 建立模型與優化器

In [None]:
#@title 3.2.1 設計模型
# Generator model

class Generator(nn.Module):
    """Fully connected generator: noise vector -> flattened image.

    Hidden widths are 256, 512 and 1024; the output layer has
    ``g_output_dim`` units and is squashed with tanh.

    Args:
        g_input_dim: size of the input noise vector.
        g_output_dim: size of the generated (flattened) output.
    """

    def __init__(self, g_input_dim, g_output_dim):
        super().__init__()

        base_width = 256
        # Each hidden layer doubles the width of the previous one.
        self.fc1 = nn.Linear(g_input_dim, base_width)
        self.fc2 = nn.Linear(base_width, base_width * 2)
        self.fc3 = nn.Linear(base_width * 2, base_width * 4)
        self.fc4 = nn.Linear(base_width * 4, g_output_dim)

    def forward(self, x):
        """Map a batch of noise vectors to tanh-activated outputs."""
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden(x))

        return torch.tanh(self.fc4(x))

# Discriminator model
class Discriminator(nn.Module):
    """Fully connected discriminator: flattened image -> one sigmoid score.

    Hidden widths are 1024, 512 and 256, each followed by ReLU and dropout
    (p=0.3); the output layer has a single sigmoid-activated unit.

    Args:
        d_input_dim: size of the flattened input.
    """

    def __init__(self, d_input_dim):
        super().__init__()

        self.fc1 = nn.Linear(d_input_dim, 1024)
        self.fc2 = nn.Linear(self.fc1.out_features, self.fc1.out_features//2)
        self.fc3 = nn.Linear(self.fc2.out_features, self.fc2.out_features//2)
        self.fc4 = nn.Linear(self.fc3.out_features, 1)

    def forward(self, x):
        """Return one score in (0, 1) per input row."""
        # F.dropout defaults to training=True, so the module's own mode is
        # passed explicitly; otherwise dropout stays active after .eval().
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.3, training=self.training)
        x = F.relu(self.fc2(x))
        x = F.dropout(x, 0.3, training=self.training)
        x = F.relu(self.fc3(x))
        x = F.dropout(x, 0.3, training=self.training)

        return torch.sigmoid(self.fc4(x))

In [None]:
#@title 3.2.2 建立模型
# Hyperparameters
z_dim = 100          # length of the noise vector fed to the generator
mnist_dim = 28 * 28  # 784 values per flattened image

# Build both networks and move them to the selected device.
G = Generator(g_input_dim=z_dim, g_output_dim=mnist_dim)
G = G.to(device)
D = Discriminator(mnist_dim)
D = D.to(device)

In [None]:
#@title 3.2.3 建立損失函數與優化器
# Loss function: binary cross-entropy.
criterion = nn.BCELoss()

# Optimizers: one Adam optimizer per network, sharing the same learning rate.
learning_rate = 0.0002
G_optimizer = optim.Adam(G.parameters(), lr=learning_rate)
D_optimizer = optim.Adam(D.parameters(), lr=learning_rate)

### 3.3 建構模型的訓練流程

In [None]:
#@title 3.3.1 判別模型的訓練流程
def D_train(x):
    """Run one discriminator update on a batch of real images.

    The batch is scored against a target of ones, an equally sized batch of
    generated images is scored against a target of zeros, and the sum of both
    BCE losses is back-propagated through D only.

    Args:
        x: batch of real images; reshaped to ``(-1, mnist_dim)`` before use.

    Returns:
        float: the combined real + fake discriminator loss for this step.
    """
    D.zero_grad()

    # Real images, labelled 1.
    real_data_size = x.shape[0]
    x_real, y_real = x.view(-1, mnist_dim), torch.ones(real_data_size, 1)
    x_real, y_real = x_real.to(device), y_real.to(device)
    D_real_loss = criterion(D(x_real), y_real)

    # Generated images, labelled 0. They are produced without tracking
    # gradients: this step only updates D, so back-propagating into G would be
    # wasted work and would leave stale gradients on G's parameters.
    z = torch.randn(real_data_size, z_dim).to(device)
    with torch.no_grad():
        x_fake = G(z)
    y_fake = torch.zeros(real_data_size, 1).to(device)
    D_fake_loss = criterion(D(x_fake), y_fake)

    # Back-propagate both losses and update the discriminator's weights only.
    D_loss = D_real_loss + D_fake_loss
    D_loss.backward()
    D_optimizer.step()

    return D_loss.item()

In [None]:
#@title 3.3.2 建立生成模型的訓練流程
def G_train(x):
    """Run one generator update and return its loss as a float.

    A fresh batch of ``batch_size`` noise vectors is generated, passed through
    G and then D, and D's scores are compared against a target of ones.

    Args:
        x: not read by this function; the noise batch always has
            ``batch_size`` rows.

    Returns:
        float: the generator loss for this step.
    """
    G.zero_grad()

    noise = torch.randn(batch_size, z_dim).to(device)
    target = torch.ones(batch_size, 1).to(device)

    # Score the generated batch with D against the all-ones target.
    G_loss = criterion(D(G(noise)), target)

    G_loss.backward()
    G_optimizer.step()

    return G_loss.item()

In [None]:
#@title 3.4 訓練模型
n_epoch = 20
for epoch in range(1, n_epoch+1):
    # Per-batch losses collected for this epoch.
    D_losses, G_losses = [], []
    for x, _ in train_loader:
        # Discriminator step first, then generator step, on every batch.
        D_losses.append(D_train(x))
        G_losses.append(G_train(x))

    # Report the epoch's mean losses.
    mean_d_loss = torch.mean(torch.FloatTensor(D_losses))
    mean_g_loss = torch.mean(torch.FloatTensor(G_losses))
    print('[%d/%d]: loss_d: %.3f, loss_g: %.3f' % (epoch, n_epoch, mean_d_loss, mean_g_loss))

In [None]:
#@title 3.5 生成圖片
# Sample one batch of noise, generate images and save them to a single file.
with torch.no_grad():
    test_z = torch.randn(batch_size, z_dim).to(device)
    generated = G(test_z)

    # Reshape each flat output to a 1-channel 28x28 image before saving.
    images = generated.view(generated.size(0), 1, 28, 28)
    save_image(images, './sample_' + '.png')