# 第9章: RNNとCNN
*深層学習フレームワークを用い，再帰型ニューラルネットワーク（RNN）や畳み込みニューラルネットワーク（CNN）を実装します*

### データの前処理(記号と数字の削除)

In [1]:
#文字列(str)を受け取って記号と数字をすべて削除する関数
import re
def remove_symbols(text):
    text = re.sub(r'[^a-zA-Z\s]',r'',text).lower()
    return text

In [2]:
!pwd
!wc -l ./data/NewsAggregatorDataset/newsCorpora.csv
# 読込時のエラー回避のためダブルクォーテーションをシングルクォーテーションに置換
!sed -e 's/"/'\''/g' ./data/NewsAggregatorDataset/newsCorpora.csv > ./work/newsCorpora_re.csv

#pandasのDataFrameに格納
import pandas as pd
df = pd.read_csv('./work/newsCorpora_re.csv', header=None, sep='\t', names=['TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']) #dataframe型に格納する
df = df[(df['PUBLISHER'] == 'Reuters') | (df['PUBLISHER'] == 'Huffington Post') | (df['PUBLISHER'] == 'Businessweek') | (df['PUBLISHER'] == 'Contactmusic.com') | (df['PUBLISHER'] == 'Daily Mail')] #特定のpublisherの記事を抽出
# df = df.sample(frac=1,random_state=0) #sampleはランダムにサンプリングするメソッド　fracで指定した割合を母集団から取ってくる、１にするとすべて取ってくる、つまりすべてランダムに並べ替える

#記号と数字の削除
df['TITLE'] = df['TITLE'].apply(lambda x:remove_symbols(x))

#train_test_splitを使って分割
from sklearn.model_selection import train_test_split
train, valid_test = train_test_split(df,test_size = 0.2)
valid, test = train_test_split(valid_test,test_size = 0.5)

train = train.reset_index()
valid = valid.reset_index()
test = test.reset_index()

#データの保存
train['TITLE'].to_csv('train.txt',sep='\t',index=False,header=False)#locとつけないとエラーになる、完全には理解できてない
valid['TITLE'].to_csv('valid.txt',sep='\t',index=False,header=False)#複数要素(title,category)で抽出するときにはリストにして渡す
test['TITLE'].to_csv('test.txt',sep='\t',index=False,header=False)#カテゴリ名と記事見出しのタブ区切り形式なのでsep=\t


/home/kai/100knock-crash-2023/trainee_kai/chapter09
422937 ./data/NewsAggregatorDataset/newsCorpora.csv


## 80 ID番号への変換

In [3]:
from collections import defaultdict
from nltk.tokenize import word_tokenize

#テキストファイルから１行ずつトークン化して単語の出現回数を数える
words = defaultdict(int)
with open('/home/kai/100knock-crash-2023/trainee_kai/chapter09/train.txt','r') as f:
    for line in f:
        tokens = word_tokenize(line)
        for token in tokens:
            #出現回数を数えていく。初登場であれば１が入る
            words[token] += 1

#出現頻度の高い順に並び替え
words_sorted = sorted(words.items(), key = lambda x: x[1], reverse=True)

#出現頻度の高い順にIDを付与、出現回数が１のものにはID'0'を付与
word_ID_dict = {word[0]:ID+1 for ID,word in enumerate(words_sorted) if word[1]>1}

In [4]:
# 与えられた単語列に対して，ID番号の列を返す関数
def word2id(words):
    return [word_ID_dict.get(word.lower(),0) for word in words.split()]

# 確認
print(word2id('A Brief History of American Apparels Dov Charney Allegedly Doing  Up'))  

[11, 3749, 1192, 4, 142, 4002, 2394, 1918, 2398, 1542, 19]


## 81 RNNによる予測

### クラスの定義

In [5]:
import torch
from torch import nn

# パラメータの設定
# VOCAB_SIZE = len(set(word_ID_dict.values())) + 1  # 単語の種類数＋パディングの１ ダミーIDのために単語IDが1つ余分に消費されるため，宣言時の語彙サイズも1つ大きくする必要がある
# EMB_SIZE = 300
# PADDING_IDX = len(set(word_ID_dict.values()))
# OUTPUT_SIZE = 4
# HIDDEN_SIZE = 50

class RNN(nn.Module):
    def __init__(self,vocab_size,emb_size,num_layers=1,padding_idx=0,output_size=4,hidden_size=50):
        '''
        vocab_size: 入力単語の種類数
        emb_size: 埋め込みベクトルの次元数
        num_layers: RNNの層の数
        padding_idx: パディングのインデックス
        output_size: ラベルの種類。カテゴリ４種類
        hidden_size: 隠れ状態ベクトルの次元数
        '''
        super().__init__()
        self.hidden_size = hidden_size
        # nn.Embedding(埋め込みに使う辞書の大きさ、埋め込みベクトルの次元):単語埋め込みをする
        # (系列長) -> (系列長, 埋め込み次元)
        self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        # emb_size : The number of expected features in the input x
        # hidden_size : The number of features in the hidden state h
        # nonlinearity : 活性化関数 tanhかrelu
        # batch_first : True -> output is (batch,seq,feature)
        self.rnn = nn.RNN(emb_size, hidden_size, nonlinearity='tanh', batch_first=True)
        # in_features : 入力次元数
        # out_features :　出力次元数
        self.fc = nn.Linear(hidden_size, output_size)
        self.num_layers = num_layers

    def forward(self, x):
        self.batch_size = x.size()[0]
        hidden = self.init_hidden(x.device)  # h0のゼロベクトルを作成
        emb = self.emb(x)
        # emb.size() = (batch_size, seq_len, emb_size)
        out, hidden = self.rnn(emb, hidden)
        # out.size() = (batch_size, seq_len, hidden_size)
        out = self.fc(out[:, -1, :])
        # out.size() = (batch_size, output_size)
        return out

    def init_hidden(self, device):
        hidden = torch.zeros(1, self.batch_size, self.hidden_size, device=device)
        return hidden

from torch.utils.data import Dataset

class CreateDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer

    def __len__(self):  # len(Dataset)で返す値を指定
        return len(self.y)

    def __getitem__(self, index):  # Dataset[index]で返す値を指定
        text = self.X[index]
        inputs = self.tokenizer(text)

        return {
        'inputs': torch.tensor(inputs, dtype=torch.int64),
        'labels': torch.tensor(self.y[index], dtype=torch.int64)
        }

### データセットの作成

In [23]:
# ラベルベクトルの作成
category_dict = {'b': 0, 't': 1, 'e':2, 'm':3}
y_train = train['CATEGORY'].map(lambda x: category_dict[x]).values
y_valid = valid['CATEGORY'].map(lambda x: category_dict[x]).values
y_test = test['CATEGORY'].map(lambda x: category_dict[x]).values

# Datasetの作成
dataset_train = CreateDataset(train['TITLE'], y_train, word2id)
dataset_valid = CreateDataset(valid['TITLE'], y_valid, word2id)
dataset_test = CreateDataset(test['TITLE'], y_test, word2id)

print(f'len(Dataset): {len(dataset_train)}')
print('Dataset[index]:')
for var in dataset_train[1]:
    print(f'  {var}: {dataset_train[1][var]}') 
    
# パラメータの設定
VOCAB_SIZE = len(set(word_ID_dict.values())) + 1  # 単語の種類数＋パディングの１
EMB_SIZE = 300
NUM_LAYERS = 1
PADDING_IDX = len(set(word_ID_dict.values()))
OUTPUT_SIZE = 4
HIDDEN_SIZE = 50

# モデルの定義
model = RNN(VOCAB_SIZE, EMB_SIZE, NUM_LAYERS, PADDING_IDX, OUTPUT_SIZE, HIDDEN_SIZE)

# 先頭10件の予測値取得
for i in range(10):
    X = dataset_train[i]['inputs']
    print(X)
    #print(torch.softmax(model(X.unsqueeze(0)), dim=-1))

len(Dataset): 10684
Dataset[index]:
  inputs: tensor([  65, 2453, 1143,  601,    1, 2177,  146,  715,  107,  882,  622])
  labels: 2
tensor([ 266,  881,  266,  881,   14, 2176,  245])
tensor([  65, 2453, 1143,  601,    1, 2177,  146,  715,  107,  882,  622])
tensor([ 193,   17,  177,    1,  748,   62,    7,   39, 1621,   16,  983])
tensor([   0,   18,  458,    7,  154,   18,    3, 1959,  458, 1144])
tensor([ 425,    1,  199, 1960,  749,   98, 2178,  188,   29,  387,  194])
tensor([ 267,  275,   28, 3690,  174,    8,  550,  113,    0,  148])
tensor([1784, 1961, 3691,    3, 3175,   13, 1211,   91,    2, 1377])
tensor([ 573,    4, 5656,    3,  145, 1622, 2454, 1065,    1,   11, 2455,    7,
         438, 2772])
tensor([1785, 1623,  276, 1962, 1145,   10, 4476, 1486, 2456,  159,   13])
tensor([ 104,  459, 5657,   11,  749,  357,    2,  149,    1,  574, 4477, 4478])


## 82 確率的勾配降下法による学習

In [6]:
# W&B の初期設定
import wandb
import numpy as np
# 保存したいハイパーパラメータを指定
config_dict = {
        # "input_dim": train['TITLE'].shape[1],
        # "output_dim": len(np.unique(y_train)),
        "lr": 0.1,
        "epoch": 100,
        "optimizer": "SGD",
        "loss": "CrossEntropyLoss",
        "metric": "accuracy",
    }
# W&B の実行を初期化
wandb.init(
    project="nlp100knock-ch08",
    name='q82',
    config=config_dict
    )
# 保存したハイパーパラメータを取得(dict形式)
config = wandb.config

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkai-satou-r8[0m ([33mkaikaikaikai[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [34]:
#パディングの処理をしていないはず?なのにちゃんと動いているっぽい理由が知りたい
#バッチサイズが1だといけちゃうっぽい
from torch import optim
import numpy as np
from torch.utils.data import DataLoader
import time

#学習率
lr = 0.01
#エポック数
num_epochs = 10
history = np.zeros((0,3))
#損失関数
criterion = nn.CrossEntropyLoss()
#最適化関数 SGD: stochastic gradient descent 確率的勾配降下法
optimizer = optim.SGD(model.parameters(), lr=lr)

base_epochs = len(history)
#デバイスを切り替えてる、なくても動きはする？
device = 'cpu'
model = model.to(device)

#wandbに関するところ
config = wandb.config
wandb.watch(model, criterion, log="all", log_freq=10)

batch_size = 1

train_loader = DataLoader(dataset_train,batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset_valid,batch_size=batch_size, shuffle=False)

#ここから学習の本体
for epoch in range(base_epochs, base_epochs+num_epochs):
    start_time = time.time()
    train_loss, train_acc = 0,0
    valid_loss, valid_acc = 0,0

    '''訓練フェーズ'''
    model.train()
    count = 0
    for data in train_loader:
        
        inputs = data['inputs']#.to(torch.float32)
        labels = data['labels']#.to(torch.int64)
        count += len(labels)
        #デバイスの割り当て
        inputs = inputs.to(device)
        labels = labels.to(device)
        #勾配の初期化
        optimizer.zero_grad()
        #予測計算
        outputs = model(inputs)
        #損失計算
        loss = criterion(outputs, labels)
        train_loss += loss.item()
        #勾配計算
        loss.backward()
        #パラメータ修正
        optimizer.step()
        #予測値算出
        predicted = torch.max(outputs, 1)[1]
        #正解した数を数えて、長さで割る
        train_acc += (predicted == labels).sum().item()
        #損失と精度の計算
        avg_train_loss = train_loss / count
        avg_train_acc = train_acc / count
        
    model.eval()
    count = 0
    with torch.no_grad():
        for data in valid_loader:
            #型を揃えないとエラー
            inputs = data['inputs']#.to(torch.float32)
            labels = data['labels']#.to(torch.int64)
            count += len(labels)
            #デバイスの割り当て
            inputs = inputs.to(device)
            labels = labels.to(device)
            #勾配の初期化
            optimizer.zero_grad()
            #予測計算
            outputs = model(inputs)
            #損失計算
            loss = criterion(outputs, labels)
            valid_loss += loss.item()
            #予測値算出
            predicted = torch.max(outputs, 1)[1]
            #正解した数を数えて、長さで割る
            valid_acc += (predicted == labels).sum().item()
            #損失と精度の計算
            avg_valid_loss = valid_loss / count
            avg_valid_acc = valid_acc / count
    
    print (f'Epoch 【{epoch+1}/{num_epochs+base_epochs}】,train_loss: {avg_train_loss}, train_acc:{avg_train_acc}, valid_loss: {avg_valid_loss}, valid_acc:{avg_valid_acc}')
    wandb.log({"train_loss":avg_train_loss,"train_acc":avg_train_acc,"valid_loss":avg_valid_loss,"valid_acc":avg_valid_acc})
wandb.finish()
    

Epoch 【1/10】,train_loss: 0.317773833017505, train_acc:0.884032197678772, valid_loss: 0.8183775551981602, valid_acc:0.7470059880239521
Epoch 【2/10】,train_loss: 0.28510128084897846, train_acc:0.8989142643204793, valid_loss: 0.8185837631677928, valid_acc:0.7612275449101796
Epoch 【3/10】,train_loss: 0.33225998961571895, train_acc:0.8849681767128417, valid_loss: 0.844965943890485, valid_acc:0.7447604790419161
Epoch 【4/10】,train_loss: 0.3239056474150063, train_acc:0.8829090228378884, valid_loss: 0.7651861743899486, valid_acc:0.7574850299401198
Epoch 【5/10】,train_loss: 0.2960176357231746, train_acc:0.8931111943092475, valid_loss: 0.8657488579580213, valid_acc:0.7597305389221557
Epoch 【6/10】,train_loss: 0.2673523836671166, train_acc:0.9025645825533508, valid_loss: 0.8517741279124146, valid_acc:0.7724550898203593
Epoch 【7/10】,train_loss: 0.2950545200405892, train_acc:0.8933919880194684, valid_loss: 0.8547314057305345, valid_acc:0.7514970059880239
Epoch 【8/10】,train_loss: 0.3042068553754022, trai

0,1
train_acc,▁▅▂▁▄▆▄▃▄█
train_loss,▆▃█▇▄▁▄▅▄▁
valid_acc,▅▆▄▆▆█▅▁▄▅
valid_loss,▃▃▅▁▆▅▅█▄▇

0,1
train_acc,0.90846
train_loss,0.26881
valid_acc,0.74925
valid_loss,0.89108


## 83 ミニバッチ化・GPU上での学習

In [19]:
import time
from torch import optim
import numpy as np
from torch.utils.data import DataLoader

def fit(
        model,
        optimizer,
        criterion,
        num_epochs,
        train_loader,
        valid_loader,
        history,# : NDArray[Shape["*, 5"], Float64]
        wandb=wandb,
        device="cpu"
        ) : #-> NDArray[Shape["*, 5"], Float64]
    """
    docstring:
    the function for training neural network models
    
    Args:
        model: Model instances defined in pytorch. This function will train this model.
        optimizer: Optimization function.
        criterion: Loss funciton.
        num_epochs: Number of epochs.
        train_loader: Dataloader for trianing data.
        valid_loader: Dataloader for valid data.
        history: NDArray containing the loss and accuracy for each epoch.
                np.array[<number of epochs>,<loss against training data>,<accuracy against training data>]
        wandb: Inistialized wandb instance.
        device: Haedware(CPU or GPU) to run the training.
        
    Return:
        history: NDArray containing the loss and accuracy for each epoch
    """

    
    base_epochs = len(history)
    model = model.to(device)

    #wandbに関するところ
    config = wandb.config
    wandb.watch(model, criterion, log="all", log_freq=10)

    #ここから学習の本体
    for epoch in range(base_epochs, base_epochs+num_epochs):
        start_time = time.time()
        train_loss, train_acc = 0,0
        valid_loss, valid_acc = 0,0

        '''訓練フェーズ'''
        model.train()
        count = 0
        for data in train_loader:
            
            inputs = data['inputs']#.to(torch.float32)
            labels = data['labels']#.to(torch.int64)
            count += len(labels)
            #デバイスの割り当て
            inputs = inputs.to(device)
            labels = labels.to(device)
            #勾配の初期化
            optimizer.zero_grad()
            #予測計算
            outputs = model(inputs)
            #損失計算
            loss = criterion(outputs, labels)
            train_loss += loss.item()
            #勾配計算
            loss.backward()
            #パラメータ修正
            optimizer.step()
            #予測値算出
            predicted = torch.max(outputs, 1)[1]
            #正解した数を数えて、長さで割る
            train_acc += (predicted == labels).sum().item()
            #損失と精度の計算
            avg_train_loss = train_loss / count
            avg_train_acc = train_acc / count
            
        model.eval()
        count = 0
        with torch.no_grad():
            for data in valid_loader:
                #型を揃えないとエラー
                inputs = data['inputs']#.to(torch.float32)
                labels = data['labels']#.to(torch.int64)
                count += len(labels)
                #デバイスの割り当て
                inputs = inputs.to(device)
                labels = labels.to(device)
                #勾配の初期化
                optimizer.zero_grad()
                #予測計算
                outputs = model(inputs)
                #損失計算
                loss = criterion(outputs, labels)
                valid_loss += loss.item()
                #予測値算出
                predicted = torch.max(outputs, 1)[1]
                #正解した数を数えて、長さで割る
                valid_acc += (predicted == labels).sum().item()
                #損失と精度の計算
                avg_valid_loss = valid_loss / count
                avg_valid_acc = valid_acc / count
        
        #10回ごとに表示
        if ((epoch+1)%10 == 0):
            print (f'Epoch 【{epoch+1}/{num_epochs+base_epochs}】,train_loss: {train_loss}, train_acc:{train_acc}')
            wandb.log({"train_loss":avg_train_loss,"train_acc":avg_train_acc,"valid_loss":avg_valid_loss,"valid_acc":avg_valid_acc})
    wandb.finish()
    return history
        

In [12]:
# W&B の初期設定
import wandb
# 保存したいハイパーパラメータを指定
config_dict = {
        "lr": 0.1,
        "epoch": 100,
        "optimizer": "SGD",
        "loss": "CrossEntropyLoss",
        "metric": "accuracy",
    }
# W&B の実行を初期化
wandb.init(
    project="nlp100knock-ch08",
    name='q83',
    config=config_dict
    )
# 保存したハイパーパラメータを取得(dict形式)
config = wandb.config

In [65]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

lr=0.01
optimizer = optim.SGD(model.parameters(), lr=lr)

#損失関数
criterion = nn.CrossEntropyLoss()

#エポック数
num_epochs = 10

#バッチサイズ
batch_size = 1

print(dataset_train[0])

#ミニバッチを取り出して長さを揃える関数
def My_collate_func(batch):
    xs, ys = [], []
    print(batch)
    print(batch[0]['inputs'])
    x = batch[0]['inputs']
    y = batch[0]['labels']
    xs.append(torch.LongTensor(x))
    ys.append(torch.LongTensor(y))
    #データ長を揃える処理
    xs = pad_sequence(xs, batch_first=True)
    print(xs)
    ys = pad_sequence(ys, batch_first=True, padding_value=-1.0)
    print(ys)
    return xs, ys

def padding(dataset):
    xs, ys = [], []
    for i in range(len(dataset)):
        xs.append(dataset[i]['inputs'])
        ys.append(dataset[i]['labels'])
        xs = pad_sequence(xs, batch_first=True)
    return Dataset(xs, ys)

train_loader = DataLoader(padding(dataset_train),batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(padding(dataset_valid),batch_size=batch_size, shuffle=False)
test_loader = DataLoader(padding(dataset_test),batch_size=batch_size, shuffle=False)

for data in train_loader:
    print(data)

#     inputs = data['inputs']#.to(torch.float32)
#     labels = data['labels']#.to(torch.int64)
#     print(inputs)
#history = np.zeros((0,3))

#最適化関数 SGD: stochastic gradient descent 確率的勾配降下法
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

history = fit(model,optimizer,criterion,num_epochs,train_loader,valid_loader,history,wandb,device)

{'inputs': tensor([ 266,  881,  266,  881,   14, 2176,  245]), 'labels': tensor(2)}


AttributeError: 'Tensor' object has no attribute 'append'

In [None]:
# class CreateDataset(Dataset):
#     def __init__(self, X, y, tokenizer):
#         self.X = X
#         self.y = y
#         self.tokenizer = tokenizer

#     def __len__(self):  
#         return len(self.y)

#     def __getitem__(self, index):  
#         text = self.X[index]
#         inputs = self.tokenizer(text)

#         return {
#         'inputs': torch.tensor(inputs, dtype=torch.int64),
#         'labels': torch.tensor(self.y[index], dtype=torch.int64)
#         }
        
# dataset_train = CreateDataset(train['TITLE'], y_train, word2id)
# dataset_valid = CreateDataset(valid['TITLE'], y_valid, word2id)

# def My_collate_func(batch):
#     xs, ys = [], []
#     for x,y in batch:
#         xs.append(torch.LongTensor(x))
#         ys.append(torch.LongTensor(y))
#     #データ長を揃える処理
#     xs = pad_sequence(xs, batch_first=True)
#     print(xs)
#     ys = pad_sequence(ys, batch_first=True, padding_value=-1.0)
#     print(ys)
#     return xs, ys

# train_loader = DataLoader(dataset_train,batch_size=batch_size, shuffle=True, collate_fn=My_collate_func)
# valid_loader = DataLoader(dataset_valid,batch_size=batch_size, shuffle=False, collate_fn=My_collate_func)

# for data in train_loader:
#     print(data)

## 84 単語ベクトルの導入

## 85 双方向RNN・多層化

## 86 畳み込みニューラルネットワーク (CNN)

## 87 確率的勾配降下法によるCNNの学習

## 88 パラメータチューニング

## 89 事前学習済み言語モデルからの転移学習