# RNN, CNN
## https://nlp100.github.io/ja/ch09.html


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/iamtatsuki05/NLP_100/blob/fix_all_merge/NLP_100_9.ipynb)

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# IPython shell escape: install or upgrade polars in the notebook runtime.
!pip3 install -U polars

# ID番号への変換
## 問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ

In [None]:
#50,51をもう一度
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
!unzip NewsAggregatorDataset.zip
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split

# df = pd.read_csv('/content/newsCorpora.csv', header=None, sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df = pl.read_csv('/content/newsCorpora.csv',
                 has_header=False,
                 separator='\t',
                 ignore_errors=True,
                 encoding="utf8",
                 new_columns=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'],
                 )
# df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]
df = df.filter(df['PUBLISHER'].is_in(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail'])).select(['TITLE', 'CATEGORY'])

# データの分割
test, train_valid = train_test_split(df, test_size=0.8, shuffle=True, random_state=42, stratify=df['CATEGORY'])
valid, train = train_test_split(train_valid, test_size=0.25, shuffle=True, random_state=42, stratify=train_valid['CATEGORY'])
# train.reset_index(drop=True, inplace=True)
# valid.reset_index(drop=True, inplace=True)
# test.reset_index(drop=True, inplace=True)
train = train.with_row_count()
valid = valid.with_row_count()
test = test.with_row_count()

from collections import defaultdict
import string

d = defaultdict(int)
table = str.maketrans(string.punctuation, ' '*len(string.punctuation))#記号処理
for text in train['TITLE']:
    for word in text.translate(table).split():
        d[word] += 1
d = sorted(d.items(), key=lambda x:x[1], reverse=True)#sort

word_id = {word: idx + 1 for idx, (word, num) in enumerate(d) if num > 1}#辞書
word_id

In [None]:
# Special tokens and the IDs they are given in the seed `word2id` mapping.
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'
PAD = 0 
UNK = 1 

In [None]:
# Seed mapping handed to Vocab: only the two special tokens.
word2id = {
    PAD_TOKEN: PAD,
    UNK_TOKEN: UNK,
}

# Minimum occurrence count passed to Vocab.build_vocab.
MIN_COUNT = 1

In [None]:
class Vocab(object):
    """Bidirectional word <-> integer ID mapping built from tokenized sentences.

    Attributes:
        word2id: dict mapping each word to its ID.
        id2word: dict mapping each ID back to its word (inverse of word2id).
        raw_vocab: dict of word -> count from the most recent build_vocab
            call, restricted to words present in word2id. Only set after
            build_vocab has been called.
    """

    def __init__(self, word2id=None):
        """Create a vocabulary, optionally seeded with an existing mapping.

        Args:
            word2id: optional dict of word -> ID (e.g. special tokens).
                It is copied, so the caller's dict is never modified.
        """
        # None instead of a shared mutable `{}` default.
        self.word2id = dict(word2id) if word2id is not None else {}
        self.id2word = {v: k for k, v in self.word2id.items()}

    def build_vocab(self, sentences, min_count=1):
        """Add the words of `sentences` to the vocabulary by frequency.

        Words are visited from most to least frequent; each unseen word gets
        the next free ID (the current vocabulary size). Words that already
        have an ID keep it.

        Args:
            sentences: iterable of iterables of hashable tokens.
            min_count: words occurring fewer than this many times are skipped.
        """
        word_counter = {}
        for sentence in sentences:
            for word in sentence:
                word_counter[word] = word_counter.get(word, 0) + 1

        for word, count in sorted(word_counter.items(), key=lambda x: -x[1]):
            if count < min_count:
                # Sorted by descending count, so every remaining word is too rare.
                break
            if word in self.word2id:
                # Already registered: writing id2word here would attach this
                # word to an ID it does not own.
                continue
            _id = len(self.word2id)
            self.word2id[word] = _id
            self.id2word[_id] = word

        self.raw_vocab = {w: word_counter[w] for w in self.word2id if w in word_counter}

In [None]:
# Start from the PAD/UNK seed mapping and add the words found in `train`.
# NOTE(review): `train` is the polars DataFrame built above, not a list of
# token lists — confirm that iterating it yields what build_vocab expects.
vocab = Vocab(word2id=word2id)
vocab.build_vocab(train, min_count=MIN_COUNT)

In [None]:
def convert_sentence_to_ids(vocab, sen):
    """Map each token of `sen` to its ID in `vocab.word2id`.

    Tokens missing from the vocabulary are mapped to UNK.
    Returns a list of IDs in the same order as the tokens.
    """
    lookup = vocab.word2id
    ids = []
    for token in sen:
        ids.append(lookup.get(token, UNK))
    return ids

In [None]:
# Encode every item obtained by iterating `train`, then show the first result.
id_train = list(map(lambda sen: convert_sentence_to_ids(vocab, sen), train))
print(id_train[0])

In [None]:
# Extend the vocabulary with the validation split and encode it,
# then do the same for the test split (order matters: IDs are assigned as
# the vocabulary grows).
vocab.build_vocab(valid, min_count=MIN_COUNT)
id_valid = list(map(lambda sen: convert_sentence_to_ids(vocab, sen), valid))
vocab.build_vocab(test, min_count=MIN_COUNT)
id_test = list(map(lambda sen: convert_sentence_to_ids(vocab, sen), test))

In [None]:
# from collections import Counter

# counter = Counter([
#     x
#     for sent in train
#     for x in sent
# ])
# vocab_in_train = [
#     token
#     for token, freq in counter.most_common()
#     if freq > 1
# ]
# vocab_list = ['[UNK]'] + vocab_in_train
# vocab_dict = {x:n for n, x in enumerate(vocab_list)}

In [None]:
# def sent_to_ids(sent):
#     return torch.tensor([vocab_dict[x if x in vocab_dict else '[UNK]'] for x in sent], dtype=torch.long)
# def dataset_to_ids(dataset):
#     return [sent_to_ids(x) for x in dataset]

In [None]:
# import torch
# train_ds = dataset_to_ids(train)
# valid_ds = dataset_to_ids(valid)
# test_ds = dataset_to_ids(test)
# train_ds[:3]

# RNNによる予測
## ID番号で表現された単語列 $\boldsymbol{x} = (x_1, x_2, \dots, x_T)$ がある．ただし，$T$ は単語列の長さ，$x_t \in \mathbb{R}^{V}$ は単語のID番号のone-hot表記である（$V$ は単語の総数である）．再帰型ニューラルネットワーク（RNN: Recurrent Neural Network）を用い，単語列 $\boldsymbol{x}$ からカテゴリ $y$ を予測するモデルとして，次式を実装せよ．

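$$\overrightarrow{h}_0 = 0,\quad \overrightarrow{h}_t = \overrightarrow{\mathrm{RNN}}(\mathrm{emb}(x_t), \overrightarrow{h}_{t-1}),\quad y = \mathrm{softmax}(W^{(yh)} \overrightarrow{h}_T + b^{(y)})$$

ただし，$\mathrm{emb}(x) \in \mathbb{R}^{d_w}$ は単語埋め込み（単語のone-hot表記から単語ベクトルに変換する関数），$\overrightarrow{h}_t \in \mathbb{R}^{d_h}$ は時刻 $t$ の隠れ状態ベクトル，$\overrightarrow{\mathrm{RNN}}(x, h)$ は入力 $x$ と前時刻の隠れ状態 $h$ から次状態を計算するRNNユニット，$W^{(yh)} \in \mathbb{R}^{L \times d_h}$ は隠れ状態ベクトルからカテゴリを予測するための行列，$b^{(y)} \in \mathbb{R}^{L}$ はバイアス項である（$d_w, d_h, L$ はそれぞれ，単語埋め込みの次元数，隠れ状態ベクトルの次元数，ラベル数である）．RNNユニット $\overrightarrow{\mathrm{RNN}}(x, h)$ には様々な構成が考えられるが，典型例として次式が挙げられる．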

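$$\overrightarrow{\mathrm{RNN}}(x, h) = g(W^{(hx)} x + W^{(hh)} h + b^{(h)})$$

ただし，$W^{(hx)} \in \mathbb{R}^{d_h \times d_w}$，$W^{(hh)} \in \mathbb{R}^{d_h \times d_h}$，$b^{(h)} \in \mathbb{R}^{d_h}$ はRNNユニットのパラメータ，$g$ は活性化関数（例えばtanhやReLUなど）である．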

なお，この問題ではパラメータの学習を行わず，ランダムに初期化されたパラメータで $y$ を計算するだけでよい．次元数などのハイパーパラメータは，$d_w = 300, d_h = 50$ など，適当な値に設定せよ（以降の問題でも同様である）．

In [None]:
#dw=300, dh=50

In [None]:
import torch
from torch import nn
torch.manual_seed(42)

class RNN(nn.Module):
    """Single-layer unidirectional RNN classifier over word-ID sequences.

    Pipeline: embedding -> nn.RNN (ReLU, batch_first) -> linear layer applied
    to the output of the last time step. Returns unnormalized class scores.

    Args:
        vocab_size: number of rows of the embedding table.
        emb_size: embedding dimension.
        padding_idx: ID treated as padding by nn.Embedding.
        output_size: number of classes.
        hidden_size: RNN hidden state dimension.
    """

    def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='relu', batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return class scores of shape (batch, output_size) for IDs `x` of shape (batch, seq_len)."""
        self.batch_size = x.size(0)
        # Allocate h0 on the same device as the input; a CPU h0 breaks GPU runs.
        hidden = self.init_hidden(device=x.device)
        emb = self.emb(x)
        out, hidden = self.rnn(emb, hidden)
        # NOTE(review): with right-padded batches (see PadSequence) the last
        # time step of shorter sequences is a padding position.
        out = self.fc(out[:, -1, :])
        return out

    def init_hidden(self, device=None):
        """Return a zero initial hidden state of shape (1, batch_size, hidden_size).

        Uses `self.batch_size`, which forward() sets on every call.

        Args:
            device: device to allocate on; None uses the torch default.
        """
        hidden = torch.zeros(1, self.batch_size, self.hidden_size, device=device)
        return hidden

In [None]:
import torch
#テキスト修正
def tokenizer(text, word2id=word_id, unk = 0):
    """Turn a raw title into a list of word IDs.

    Every ASCII punctuation character is replaced by a space, the text is
    split on whitespace, and each word is looked up in `word2id`; words that
    are not in the mapping get `unk`.
    """
    punct_to_space = str.maketrans({ch: ' ' for ch in string.punctuation})
    words = text.translate(punct_to_space).split()
    ids = []
    for word in words:
        ids.append(word2id.get(word, unk))
    return ids

In [None]:
from torch.utils.data import Dataset

class NewsCorporaDataset(Dataset):
    """Dataset pairing indexable texts `X` with labels `y`.

    Each item is a dict with:
        'inputs': int64 tensor of the IDs returned by `tokenizer(text)`
        'labels': int64 tensor built from the label at the same index
    """

    def __init__(self, X, y, tokenizer):
        self.X, self.y, self.tokenizer = X, y, tokenizer

    def __len__(self):
        # The label container defines the dataset length.
        return len(self.y)

    def __getitem__(self, index):
        token_ids = self.tokenizer(self.X[index])
        sample = {}
        sample['inputs'] = torch.tensor(token_ids, dtype=torch.int64)
        sample['labels'] = torch.tensor(self.y[index], dtype=torch.int64)
        return sample

In [None]:
# Label vectors: map the category letters to class indices.
category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}
# pandas equivalent:
# y_train = train['CATEGORY'].map(lambda x: category_dict[x]).values
# y_valid = valid['CATEGORY'].map(lambda x: category_dict[x]).values
# y_test = test['CATEGORY'].map(lambda x: category_dict[x]).values
y_train = train['CATEGORY'].apply(lambda x: category_dict[x])
y_valid = valid['CATEGORY'].apply(lambda x: category_dict[x])
y_test = test['CATEGORY'].apply(lambda x: category_dict[x])

dataset_train = NewsCorporaDataset(train['TITLE'], y_train, tokenizer)
dataset_valid = NewsCorporaDataset(valid['TITLE'], y_valid, tokenizer)
dataset_test = NewsCorporaDataset(test['TITLE'], y_test, tokenizer)

# ID layout: 0 = unknown/rare word, 1..len(word_id) = vocabulary words,
# len(word_id) + 1 = padding. Using len(word_id) itself as padding_idx would
# alias the padding token with the rarest vocabulary word.
vocab_size = len(word_id) + 2
emb_size = 300
padding_idx = len(word_id) + 1
output_size = 4
hidden_size = 50

model = RNN(vocab_size, emb_size, padding_idx, output_size, hidden_size)

# Class probabilities for the first 10 training examples (untrained model).
for num in range(10):
  X = dataset_train[num]['inputs']
  print(torch.softmax(model(X.unsqueeze(0)), dim=-1))

In [None]:
# 参考https://exture-ri.com/2021/01/12/pytorch-rnn/
# https://gotutiyan.hatenablog.com/entry/2020/09/02/200144

# 確率的勾配降下法による学習
## 確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題81で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

In [None]:
from torch.utils.data import DataLoader
from torch import optim

def calc_acc(model, dataset, device=None, criterion=None):
    """Compute the mean loss and the accuracy of `model` over `dataset`.

    The dataset is read one example at a time (batch_size=1, no shuffling)
    with gradients disabled. The model's train/eval mode is left untouched,
    so the caller decides whether dropout etc. is active.

    Args:
        model: callable mapping an input batch to class scores.
        dataset: indexable collection of dicts with 'inputs' and 'labels'.
        device: device the tensors are moved to before the forward pass.
        criterion: optional loss function; when None the returned loss is 0.0.

    Returns:
        (mean_loss, accuracy): summed per-example loss divided by
        len(dataset), and the fraction of correct argmax predictions.
    """
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    loss = 0.0
    total = 0
    correct = 0
    with torch.no_grad():
        for data in dataloader:
            inputs = data['inputs'].to(device)
            labels = data['labels'].to(device)
            outputs = model(inputs)
            if criterion is not None:
                loss += criterion(outputs, labels).item()
            # Accuracy bookkeeping.
            pred = torch.argmax(outputs, dim=-1)
            total += len(inputs)
            correct += (pred == labels).sum().item()

    return loss / len(dataset), correct / total
  

def train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=None, device=None):
    """Train `model` for `num_epochs` epochs and report metrics every epoch.

    After each epoch the loss and accuracy on both datasets are computed with
    calc_acc, a checkpoint is written to `checkpoint{epoch}.pt` in the current
    directory, a summary line is printed, and the LR scheduler is stepped.

    Args:
        dataset_train: training dataset of {'inputs', 'labels'} dicts.
        dataset_valid: validation dataset with the same item format.
        batch_size: mini-batch size for the training DataLoader.
        model: nn.Module to train; it is moved to `device`.
        criterion: loss function taking (outputs, labels).
        optimizer: optimizer over the model's parameters.
        num_epochs: number of passes over the training data.
        collate_fn: optional collate function for the training DataLoader
            (needed for variable-length inputs when batch_size > 1).
        device: device for the model and the batches.

    Returns:
        dict with
            'train' / 'valid': loss of the last epoch (None if no epoch ran),
            'train_history' / 'valid_history': list of [loss, accuracy]
                pairs, one per epoch.
    """
    model.to(device)
    dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    # NOTE(review): eta_min equals the learning rate (1e-3) used by the
    # calling cells, so the cosine schedule keeps the LR constant there.
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs, eta_min=1e-3, last_epoch=-1)
    loss_train_list = []
    loss_valid_list = []
    # Defined up front so the return statement is valid even with zero epochs.
    loss_train = None
    loss_valid = None

    for epoch in range(num_epochs):
        model.train()
        for data in dataloader_train:
            optimizer.zero_grad()
            inputs = data['inputs'].to(device)
            labels = data['labels'].to(device)
            # Forward pass, loss, backward pass, parameter update.
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        model.eval()

        # Loss and accuracy on both splits.
        loss_train, acc_train = calc_acc(model, dataset_train, device, criterion=criterion)
        loss_valid, acc_valid = calc_acc(model, dataset_valid, device, criterion=criterion)
        loss_train_list.append([loss_train, acc_train])
        loss_valid_list.append([loss_valid, acc_valid])

        # Save a checkpoint for this epoch.
        torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'checkpoint{epoch + 1}.pt')

        # Report the metrics.
        print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, acc_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, acc_valid: {acc_valid:.4f}') 

        scheduler.step()

    return {
        'train': loss_train,
        'valid': loss_valid,
        'train_history': loss_train_list,
        'valid_history': loss_valid_list,
    }

In [None]:
# ID layout: 0 = unknown/rare word, 1..len(word_id) = vocabulary words,
# len(word_id) + 1 = padding. Using len(word_id) itself as padding_idx would
# alias the padding token with the rarest vocabulary word.
vocab_size = len(word_id) + 2
emb_size = 300
padding_idx = len(word_id) + 1
output_size = 4
hidden_size = 50
learning_rate = 1e-3
batch_size = 1
num_epochs = 3  # kept small for a quick test run

model = RNN(vocab_size, emb_size, padding_idx, output_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

log = train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs)

In [None]:
# https://note.nkmk.me/python-pytorch-device-to-cuda-cpu/
# https://runebook.dev/ja/docs/pytorch/generated/torch.nn.bcewithlogitsloss
# https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html
# https://tips-memo.com/python-diff-bce

# ミニバッチ化・GPU上での学習
## 問題82のコードを改変し，B事例ごとに損失・勾配を計算して学習を行えるようにせよ（Bの値は適当に選べ）．また，GPU上で学習を実行せよ．

In [None]:
class PadSequence():
    """Collate function that pads variable-length 'inputs' to a common length.

    Calling an instance on a list of {'inputs', 'labels'} items orders them by
    input length (longest first), pads the inputs with `padding_idx` up to the
    longest length, and returns the batch as
    {'inputs': (batch, max_len) tensor, 'labels': LongTensor of labels}.
    """

    def __init__(self, padding_idx):
        self.padding_idx = padding_idx

    def __call__(self, batch):
        def input_length(item):
            return item['inputs'].shape[0]

        ordered = sorted(batch, key=input_length, reverse=True)
        padded_inputs = torch.nn.utils.rnn.pad_sequence(
            [item['inputs'] for item in ordered],
            batch_first=True,
            padding_value=self.padding_idx,
        )
        label_tensor = torch.LongTensor([item['labels'] for item in ordered])
        return {'inputs': padded_inputs, 'labels': label_tensor}

In [None]:
# ID layout: 0 = unknown/rare word, 1..len(word_id) = vocabulary words,
# len(word_id) + 1 = padding. Using len(word_id) itself as padding_idx would
# make PadSequence pad with the ID of the rarest vocabulary word.
vocab_size = len(word_id) + 2
emb_size = 300
padding_idx = len(word_id) + 1
output_size = 4
hidden_size = 50
learning_rate = 1e-3
batch_size = 50
num_epochs = 3  # kept small for a quick test run

model = RNN(vocab_size, emb_size, padding_idx, output_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
device = "cuda" if torch.cuda.is_available() else "cpu"

log = train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=PadSequence(padding_idx), device=device)

In [None]:
# 参考https://atmarkit.itmedia.co.jp/ait/articles/2008/28/news030.html

# 単語ベクトルの導入
## 事前学習済みの単語ベクトル（例えば，Google Newsデータセット（約1,000億単語）での学習済み単語ベクトル）で単語埋め込みemb(x)を初期化し，学習せよ．

In [None]:
# # ダウンロード制限がかかっているのでprthを指定する
# ! pip install --upgrade gdown
# import gdown
# gdown.download('https://drive.google.com/u/0/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download', './GoogleNews-vectors-negative300.bin.gz', quiet=False)
# #model
# from gensim.models import KeyedVectors

# # 学習済みモデルのロード
# model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz' , binary=True)

In [None]:
# Load the pretrained word2vec vectors directly from a path on the mounted Drive.
from gensim.models import KeyedVectors
model_kv = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Tutorial/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
import numpy as np

# Embedding matrix indexed by word ID:
#   row 0                  -> unknown/rare words (left as zeros)
#   rows 1..len(word_id)   -> vocabulary words, at the row given by word_id
#   row len(word_id) + 1   -> dedicated padding row (left as zeros)
# NOTE(review): cells that pass len(word_id) as padding_idx alias the padding
# token with the rarest vocabulary word; the spare last row avoids that.
vocab_size = len(word_id) + 2
emb_size = 300
weights = np.zeros((vocab_size , emb_size))
wordataset_in_pretrained = 0  # number of vocabulary words found in word2vec
for word, idx in word_id.items():
    try:
        weights[idx] = model_kv[word]
        wordataset_in_pretrained += 1
    except KeyError:
        # Word missing from the pretrained vectors: random normal initialization.
        weights[idx] = np.random.normal(loc=0, scale=1, size=(emb_size,))
# float32 torch tensor, as expected by nn.Embedding.from_pretrained.
weights = torch.from_numpy(weights.astype((np.float32)))

In [None]:
class RNN(nn.Module):
    """Multi-layer, optionally bidirectional RNN classifier over word IDs.

    Pipeline: embedding -> nn.RNN (ReLU, batch_first) -> linear layer applied
    to the final hidden state of the top layer. For a bidirectional RNN the
    final forward and final backward states are concatenated.
    Returns unnormalized class scores.

    Args:
        vocab_size: number of embedding rows (used when emb_weights is None).
        emb_size: embedding dimension.
        padding_idx: ID treated as padding by nn.Embedding.
        output_size: number of classes.
        hidden_size: hidden state dimension per direction.
        num_layers: number of stacked RNN layers.
        emb_weights: optional pretrained embedding tensor. It is loaded with
            nn.Embedding.from_pretrained, whose default keeps it frozen.
        bidirectional: run the RNN in both directions when True.
    """

    def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size, num_layers, emb_weights=None, bidirectional=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1

        # `is not None`: comparing a tensor with `!=` is not a reliable None test.
        if emb_weights is not None:
            self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
        else:
            self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)

        self.rnn = nn.RNN(emb_size, hidden_size, num_layers, nonlinearity='relu', bidirectional=bidirectional, batch_first=True)
        self.fc = nn.Linear(hidden_size * self.num_directions, output_size)

    def forward(self, x):
        """Return class scores of shape (batch, output_size) for IDs `x` of shape (batch, seq_len)."""
        self.batch_size = x.size(0)
        # Allocate h0 on the same device as the input; a CPU h0 breaks GPU runs.
        hidden = self.init_hidden(device=x.device)
        emb = self.emb(x)
        _, hidden = self.rnn(emb, hidden)
        # `hidden` is (num_layers * num_directions, batch, hidden_size); the
        # top layer's states are the last entries. Unidirectional: hidden[-1]
        # equals the output at the last time step. Bidirectional: the output
        # at the last time step only holds the backward direction's first
        # step, so its final state (hidden[-1]) is used instead.
        # NOTE(review): with right-padded batches the forward state still
        # runs over padding positions of shorter sequences.
        if self.num_directions == 2:
            final = torch.cat([hidden[-2], hidden[-1]], dim=1)
        else:
            final = hidden[-1]
        return self.fc(final)

    def init_hidden(self, device=None):
        """Return a zero initial hidden state for the current batch size.

        Uses `self.batch_size`, which forward() sets on every call.

        Args:
            device: device to allocate on; None uses the torch default.
        """
        hidden = torch.zeros(self.num_layers * self.num_directions, self.batch_size, self.hidden_size, device=device)
        return hidden

In [None]:
# Derive the sizes from the embedding matrix that is actually passed to the
# model, so the padding ID always points at its last row and can never fall
# outside the table.
vocab_size = weights.shape[0]
emb_size = 300
padding_idx = weights.shape[0] - 1
output_size = 4
hidden_size = 50
num_layers = 1
learning_rate = 1e-3
batch_size = 50
num_epochs = 3  # kept small for a quick test run

model = RNN(vocab_size, emb_size, padding_idx, output_size, hidden_size, num_layers, emb_weights=weights)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters() , lr = learning_rate)
device = "cuda" if torch.cuda.is_available() else "cpu"

log = train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=PadSequence(padding_idx), device=device)

In [None]:
# 参考https://www.sejuku.net/blog/73026
# https://note.nkmk.me/python-numpy-dtype-astype/

# 双方向RNN・多層化
## 順方向と逆方向のRNNの両方を用いて入力テキストをエンコードし，モデルを学習せよ．

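$$\overleftarrow{h}_{T+1} = 0,\quad \overleftarrow{h}_t = \overleftarrow{\mathrm{RNN}}(\mathrm{emb}(x_t), \overleftarrow{h}_{t+1}),\quad y = \mathrm{softmax}(W^{(yh)} [\overrightarrow{h}_T; \overleftarrow{h}_1] + b^{(y)})$$

ただし，$\overrightarrow{h}_t \in \mathbb{R}^{d_h}$，$\overleftarrow{h}_t \in \mathbb{R}^{d_h}$ はそれぞれ，順方向および逆方向のRNNで求めた時刻 $t$ の隠れ状態ベクトル，$\overleftarrow{\mathrm{RNN}}(x, h)$ は入力 $x$ と次時刻の隠れ状態 $h$ から前状態を計算するRNNユニット，$W^{(yh)} \in \mathbb{R}^{L \times 2d_h}$ は隠れ状態ベクトルからカテゴリを予測するための行列，$b^{(y)} \in \mathbb{R}^{L}$ はバイアス項である．また，$[a; b]$ はベクトル $a$ と $b$ の連結を表す．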

さらに，双方向RNNを多層化して実験せよ．

In [None]:
# Derive the sizes from the embedding matrix that is actually passed to the
# model, so the padding ID always points at its last row and can never fall
# outside the table.
vocab_size = weights.shape[0]
emb_size = 300
padding_idx = weights.shape[0] - 1
output_size = 4
hidden_size = 50
num_layers = 5  # stacked layers (multi-layer RNN)
learning_rate = 1e-3
batch_size = 50
num_epochs = 3  # kept small for a quick test run

# bidirectional=True enables the bidirectional RNN.
model = RNN(vocab_size, emb_size, padding_idx, output_size, hidden_size, num_layers, emb_weights=weights, bidirectional=True)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
device = "cuda" if torch.cuda.is_available() else "cpu"

log = train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=PadSequence(padding_idx), device=device)

In [None]:
# 参考https://qiita.com/m__k/items/78a5125d719951ca98d3
# https://axa.biopapyrus.jp/deep-learning/rnn/brnn.html
# https://deepage.net/deep_learning/2017/05/23/recurrent-neural-networks.html
# https://teratail.com/questions/185713
# https://qiita.com/tetsuro_skiing/items/87c0c37cefd7b601f974

#  畳み込みニューラルネットワーク (CNN)
## ID番号で表現された単語列 $\boldsymbol{x} = (x_1, x_2, \dots, x_T)$ がある．ただし，$T$ は単語列の長さ，$x_t \in \mathbb{R}^{V}$ は単語のID番号のone-hot表記である（$V$ は単語の総数である）．畳み込みニューラルネットワーク（CNN: Convolutional Neural Network）を用い，単語列 $\boldsymbol{x}$ からカテゴリ $y$ を予測するモデルを実装せよ．

ただし，畳み込みニューラルネットワークの構成は以下の通りとする．

単語埋め込みの次元数: dw
畳み込みのフィルターのサイズ: 3 トークン
畳み込みのストライド: 1 トークン
畳み込みのパディング: あり
畳み込み演算後の各時刻のベクトルの次元数: dh
畳み込み演算後に最大値プーリング（max pooling）を適用し，入力文をdh次元の隠れベクトルで表現
すなわち，時刻 $t$ の特徴ベクトル $p_t \in \mathbb{R}^{d_h}$ は次式で表される．

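$$p_t = g(W^{(px)} [\mathrm{emb}(x_{t-1}); \mathrm{emb}(x_t); \mathrm{emb}(x_{t+1})] + b^{(p)})$$

ただし，$W^{(px)} \in \mathbb{R}^{d_h \times 3d_w}$，$b^{(p)} \in \mathbb{R}^{d_h}$ はCNNのパラメータ，$g$ は活性化関数（例えばtanhやReLUなど），$[a; b; c]$ はベクトル $a, b, c$ の連結である．なお，行列 $W^{(px)}$ の列数が $3d_w$ になるのは，3個のトークンの単語埋め込みを連結したものに対して，線形変換を行うためである．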

最大値プーリングでは，特徴ベクトルの次元毎に全時刻における最大値を取り，入力文書の特徴ベクトル $c \in \mathbb{R}^{d_h}$ を求める．$c[i]$ でベクトル $c$ の $i$ 番目の次元の値を表すことにすると，最大値プーリングは次式で表される．

$$c[i] = \max_{1 \le t \le T} p_t[i]$$

最後に，入力文書の特徴ベクトル $c$ に行列 $W^{(yc)} \in \mathbb{R}^{L \times d_h}$ とバイアス項 $b^{(y)} \in \mathbb{R}^{L}$ による線形変換とソフトマックス関数を適用し，カテゴリ $y$ を予測する．

$$y = \mathrm{softmax}(W^{(yc)} c + b^{(y)})$$

なお，この問題ではモデルの学習を行わず，ランダムに初期化された重み行列でyを計算するだけでよい．



In [None]:
# 単語埋め込みの次元数: dw
# 畳み込みのフィルターのサイズ: 3 トークン
# 畳み込みのストライド: 1 トークン
# 畳み込みのパディング: あり
# 畳み込み演算後の各時刻のベクトルの次元数: dh

In [None]:
from torch.nn import functional as F

class CNN(nn.Module):
    """1-D convolutional text classifier over word-ID sequences.

    Pipeline: embedding -> Conv2d whose kernel spans `kernel_heights` tokens
    and the full embedding width -> ReLU -> max pooling over all time steps
    -> dropout (p=0.3) -> linear layer. Returns unnormalized class scores.

    Args:
        vocab_size: number of embedding rows (used when emb_weights is None).
        emb_size: embedding dimension.
        padding_idx: ID treated as padding by nn.Embedding.
        output_size: number of classes.
        out_channels: number of convolution filters.
        kernel_heights: number of consecutive tokens each filter covers.
        stride: convolution stride.
        padding: zero padding added on both ends of the token axis.
        emb_weights: optional pretrained embedding tensor. It is loaded with
            nn.Embedding.from_pretrained, whose default keeps it frozen.
    """

    def __init__(self, vocab_size, emb_size, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights=None):
        super().__init__()
        # `is not None`: comparing a tensor with `!=` is not a reliable None test.
        if emb_weights is not None:
            self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
        else:
            self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        self.conv = nn.Conv2d(1, out_channels, (kernel_heights, emb_size), stride, (padding, 0))
        self.drop = nn.Dropout(0.3)
        self.fc = nn.Linear(out_channels, output_size)

    def forward(self, x):
        """Return class scores of shape (batch, output_size) for IDs `x` of shape (batch, seq_len)."""
        # (batch, seq_len, emb) -> (batch, 1, seq_len, emb): one input channel.
        emb = self.emb(x).unsqueeze(1)
        conv = self.conv(emb)
        # The kernel covers the whole embedding width, so the last axis has
        # size 1 and is dropped.
        act = F.relu(conv.squeeze(3))
        # Pool over the entire time axis: one value per filter.
        max_pool = F.max_pool1d(act, act.size()[2])
        out = self.fc(self.drop(max_pool.squeeze(2)))
        return out

In [None]:
# Derive the sizes from the embedding matrix that is actually passed to the
# model, so the padding ID always points at its last row and can never fall
# outside the table.
vocab_size = weights.shape[0]
emb_size = 300
padding_idx = weights.shape[0] - 1
output_size = 4
# CNN parameters
out_channels =100
kernel_heights = 3
stride = 1
padding = 1

model = CNN(vocab_size, emb_size, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights=weights)

In [None]:
# Class probabilities for the first 10 training examples.
# eval() switches the model's dropout layer off so the printed values are
# deterministic; no_grad() skips building the autograd graph.
model.eval()
with torch.no_grad():
    for num in range(10):
        X = dataset_train[num]['inputs']
        print(torch.softmax(model(X.unsqueeze(0)), dim=-1))

In [None]:
# 参考https://qiita.com/shu_marubo/items/70b20c3a6c172aaeb8de
# https://qiita.com/mathlive/items/8e1f9a8467fff8dfd03c
# https://exture-ri.com/2021/01/11/pytorch-cnn/
# https://qiita.com/m__k/items/6c39cfe7dfa99102fa8e
# https://kento1109.hatenablog.com/entry/2019/09/30/115139

# 確率的勾配降下法によるCNNの学習
## 確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題86で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

In [None]:
# Derive the sizes from the embedding matrix that is actually passed to the
# model, so the padding ID always points at its last row and can never fall
# outside the table.
vocab_size = weights.shape[0]
emb_size = 300
padding_idx = weights.shape[0] - 1
output_size = 4
# CNN parameters
out_channels =100
kernel_heights = 3
stride = 1
padding = 1
learning_rate = 1e-3
batch_size = 50
num_epochs = 3  # kept small for a quick test run

model = CNN(vocab_size, emb_size, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights=weights)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Train the model.
log = train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=PadSequence(padding_idx), device=device)

# パラメータチューニング
## 問題85や問題87のコードを改変し，ニューラルネットワークの形状やハイパーパラメータを調整しながら，高性能なカテゴリ分類器を構築せよ．

In [None]:
class RNN(nn.Module):
    """Multi-layer, optionally bidirectional RNN classifier over word IDs.

    Pipeline: embedding -> nn.RNN (ReLU, batch_first) -> linear layer applied
    to the final hidden state of the top layer. For a bidirectional RNN the
    final forward and final backward states are concatenated.
    Returns unnormalized class scores.

    Args:
        vocab_size: number of embedding rows (used when emb_weights is None).
        emb_size: embedding dimension.
        padding_idx: ID treated as padding by nn.Embedding.
        output_size: number of classes.
        hidden_size: hidden state dimension per direction.
        num_layers: number of stacked RNN layers.
        emb_weights: optional pretrained embedding tensor. It is loaded with
            nn.Embedding.from_pretrained, whose default keeps it frozen.
        bidirectional: run the RNN in both directions when True.
    """

    def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size, num_layers, emb_weights=None, bidirectional=False):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1

        # `is not None`: comparing a tensor with `!=` is not a reliable None test.
        if emb_weights is not None:
            self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
        else:
            self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)

        self.rnn = nn.RNN(emb_size, hidden_size, num_layers, nonlinearity='relu', bidirectional=bidirectional, batch_first=True)
        self.fc = nn.Linear(hidden_size * self.num_directions, output_size)

    def forward(self, x):
        """Return class scores of shape (batch, output_size) for IDs `x` of shape (batch, seq_len)."""
        self.batch_size = x.size(0)
        # Allocate h0 on the same device as the input; a CPU h0 breaks GPU runs.
        hidden = self.init_hidden(device=x.device)
        emb = self.emb(x)
        _, hidden = self.rnn(emb, hidden)
        # `hidden` is (num_layers * num_directions, batch, hidden_size); the
        # top layer's states are the last entries. Unidirectional: hidden[-1]
        # equals the output at the last time step. Bidirectional: the output
        # at the last time step only holds the backward direction's first
        # step, so its final state (hidden[-1]) is used instead.
        # NOTE(review): with right-padded batches the forward state still
        # runs over padding positions of shorter sequences.
        if self.num_directions == 2:
            final = torch.cat([hidden[-2], hidden[-1]], dim=1)
        else:
            final = hidden[-1]
        return self.fc(final)

    def init_hidden(self, device=None):
        """Return a zero initial hidden state for the current batch size.

        Uses `self.batch_size`, which forward() sets on every call.

        Args:
            device: device to allocate on; None uses the torch default.
        """
        hidden = torch.zeros(self.num_layers * self.num_directions, self.batch_size, self.hidden_size, device=device)
        return hidden

In [None]:
# Derive the sizes from the embedding matrix that is actually passed to the
# model, so the padding ID always points at its last row and can never fall
# outside the table.
vocab_size = weights.shape[0]
emb_size = 300
padding_idx = weights.shape[0] - 1
output_size = 4
hidden_size = 50
num_layers = 10  # open question: are more layers better?
learning_rate = 1e-3  # open question: is a smaller value better?
batch_size = 128  # moderately larger batch
num_epochs = 25  # train for longer

model = RNN(vocab_size, emb_size, padding_idx, output_size, hidden_size, num_layers, emb_weights=weights, bidirectional=True)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
device = "cuda" if torch.cuda.is_available() else "cpu"

log = train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=PadSequence(padding_idx), device=device)

In [None]:
# 参考https://qiita.com/nyanko-box/items/a6f50e28383a5bd0a432
# https://cpp-learning.com/optuna-pytorch/
# https://qiita.com/Yushi1958/items/cd22ade638f7e292e520
# https://dreamer-uma.com/pytorch-optuna-hyperparameter-tuning/
# http://maruo51.com/2020/08/07/optuna_pytorch/
# https://ichi.pro/optuna-o-shiyoshita-pytorch-haipa-parame-ta-no-chosei-4883072668892

# 事前学習済み言語モデルからの転移学習
## 事前学習済み言語モデル（例えばBERTなど）を出発点として，ニュース記事見出しをカテゴリに分類するモデルを構築せよ．

In [None]:
# !pip install -q transformers
# from transformers import BertTokenizer, BertModel
# from torch import cuda

In [None]:
# https://note.nkmk.me/python-pytorch-device-to-cuda-cpu/
# https://qiita.com/yamaru/items/63a342c844cff056a549
# https://qiita.com/m__k/items/e312ddcf9a3d0ea64d72
# https://scrapbox.io/miyamonz/pytorch,_transformers%E3%82%92%E4%BD%BF%E3%81%A3%E3%81%9FBERT%E3%81%AEfine-tuning%E3%81%AE%E6%96%B9%E6%B3%95