In [None]:
# ====================
# Install libraries
# ====================
# torch / torchtext / pytorch-lightning are pinned to the versions this notebook was written against.
# pip reports dependency conflicts with the runtime's preinstalled torchvision / torchtext / torchaudio
# (they require torch 1.10.0); the next cell expects the runtime to be restarted after this one finishes.
! pip install --quiet torch==1.6.0
! pip install --quiet torchtext==0.7.0
! pip install --quiet pytorch-lightning==1.0.8
# torchwordemb is built from source (setup.py), so it is installed after torch.
# NOTE(review): torchwordemb and optuna are unpinned — consider pinning them for reproducibility.
! pip install --quiet torchwordemb
! pip install --quiet optuna

[K     |████████████████████████████████| 748.8 MB 18 kB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.11.1+cu111 requires torch==1.10.0, but you have torch 1.6.0 which is incompatible.
torchtext 0.11.0 requires torch==1.10.0, but you have torch 1.6.0 which is incompatible.
torchaudio 0.10.0+cu111 requires torch==1.10.0, but you have torch 1.6.0 which is incompatible.[0m
[K     |████████████████████████████████| 4.5 MB 3.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 54.0 MB/s 
[K     |████████████████████████████████| 561 kB 3.0 MB/s 
[K     |████████████████████████████████| 134 kB 55.4 MB/s 
[K     |████████████████████████████████| 829 kB 49.9 MB/s 
[K     |████████████████████████████████| 596 kB 55.6 MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for torchwordemb (setup.py

In [None]:
# ここでランタイムを再起動

# ライブラリの読み込み
import os
import string
import torch
import torchwordemb
import pandas as pd
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torchtext.vocab import FastText
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torchtext.data import Example, Field, Dataset, BucketIterator

# データセットのダウンロード
if os.path.isfile("/content/NewsAggregatorDataset.zip") == False:
    ! wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
    ! unzip NewsAggregatorDataset.zip
    # 読込時のエラー回避のためダブルクォーテーションをシングルクォーテーションに置換
    ! sed -e 's/"/'\''/g' ./newsCorpora.csv > ./newsCorpora_re.csv
df = pd.read_csv('/content/newsCorpora.csv', sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df1 = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]

# データの分割 stratifyを設定することで訓練データとテストデータの指定した中身の割合を同じにすることができる
train, temp = train_test_split(df1, test_size=0.2, shuffle=True, random_state=0, stratify=df1['CATEGORY'])
test, valid = train_test_split(temp, test_size=0.5, shuffle=True, random_state=0, stratify=temp['CATEGORY'])

# データの保存
! mkdir -p /content/data/
train.to_csv('/content/data/train.txt', sep="\t", index=False)
test.to_csv('/content/data/test.txt', sep="\t", index=False)
valid.to_csv('/content/data/valid.txt', sep="\t", index=False)

# Translation table that turns every punctuation character into a single space.
# string.punctuation is  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
table = str.maketrans(dict.fromkeys(string.punctuation, ' '))

# Category code -> integer class id
label2id = dict(b=0, t=1, e=2, m=3)

# torchtext fields: "x" is the token sequence (looked up in a vocabulary),
# "t" is the label, which is already numeric (no vocabulary) and marked as the target.
# Titles are split into word lists in load_corpus before being handed to these fields.
text_field = Field(use_vocab=True, sequential=True)
label_field = Field(is_target=True, use_vocab=False, sequential=False)
fields = [("x", text_field), ("t", label_field)]

# テキストの読み込み
def load_corpus(fname):
    """Build a torchtext Dataset from a tab-separated split file.

    Args:
        fname: Path to a TSV file with a header row containing at least the
            "TITLE" and "CATEGORY" columns.

    Returns:
        A torchtext ``Dataset`` whose examples carry ``x`` (the title with
        punctuation replaced by spaces and split on whitespace) and ``t``
        (the integer id looked up in ``label2id``).

    Raises:
        KeyError: If a CATEGORY value is not a key of ``label2id``.
    """
    # pandas opens and closes the file itself; no separate file handle is needed
    df = pd.read_csv(fname, sep='\t')
    examples = []
    for sentence, label in zip(df["TITLE"], df["CATEGORY"]):
        word_list = sentence.translate(table).split()
        # Example.fromlist converts the (tokens, label) pair into the form torchtext expects
        examples.append(Example.fromlist([word_list, label2id[label]], fields))
    return Dataset(examples, fields)

# Build one torchtext Dataset per split file
dataset_train, dataset_val, dataset_test = map(
    load_corpus,
    ("/content/data/train.txt", "/content/data/valid.txt", "/content/data/test.txt"),
)

# Register the vocabulary from the training data only:
# every word that occurs at least twice (min_freq=2) gets its own id.
text_field.build_vocab(dataset_train, min_freq=2)



In [None]:
# ============
# 88. パラメータチューニング
# ============

class RNN(pl.LightningModule):
    """LSTM text classifier: embedding -> (bi)LSTM -> linear layer.

    Args:
        n_input: Vocabulary size (number of rows of the embedding table).
        n_embed: Dimension of each word vector.
        n_hidden: Hidden size of the LSTM (per direction).
        n_layers: Number of stacked LSTM layers.
        n_output: Number of classes.
        dropout: Dropout probability passed to ``nn.LSTM``.
        bidirectional: Whether the LSTM runs in both directions.
        lr: Learning rate for the Adam optimizer.
    """

    def __init__(self, n_input, n_embed, n_hidden, n_layers, n_output, dropout, bidirectional, lr):
        super(RNN, self).__init__()
        self.lr = lr
        # Embedding layer: maps word ids to n_embed-dimensional vectors.
        # Index 1 is treated as padding (presumably the <pad> entry of the torchtext vocab — confirm).
        self.embed = nn.Embedding(num_embeddings=n_input, embedding_dim=n_embed, padding_idx=1)
        # LSTM layer (sequence-first input, since batch_first is left at its default)
        self.lstm = nn.LSTM(input_size=n_embed, hidden_size=n_hidden, num_layers=n_layers, dropout=dropout, bidirectional=bidirectional)
        # Fully connected layer; a bidirectional LSTM yields twice the features
        self.fc = nn.Linear(in_features=n_hidden * (2 if bidirectional else 1), out_features=n_output)

    def forward(self, x):
        """Return class logits of shape (batch, n_output) for token ids ``x`` of shape (seq_len, batch).

        The sentence vector is the final hidden state of the top LSTM layer.
        Reading ``output[-1]`` instead would use the last *padded* time step:
        the backward direction has then seen only one (usually padding) token,
        and the forward direction has been run across trailing padding.
        """
        embedded = self.embed(x)
        # Count the real tokens per sequence (padding is assumed to be trailing);
        # clamp so an all-padding sequence still has a valid length of 1.
        lengths = (x != self.embed.padding_idx).sum(dim=0).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)
        _, (h, _) = self.lstm(packed)
        # h is (n_layers * n_directions, batch, n_hidden); the last entries belong to the top layer
        if self.lstm.bidirectional:
            sentence = torch.cat([h[-2], h[-1]], dim=1)
        else:
            sentence = h[-1]
        return self.fc(sentence)

    def training_step(self, batch, batch_idx):
        """Compute and log the loss for one training batch."""
        x, t = batch
        # y holds the predicted logits
        y = self(x)
        loss = self.lossfun(y, t)
        self.log("train_loss", loss)
        # Returning the loss is enough; Lightning runs the backward pass itself
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the loss for one validation batch."""
        x, t = batch
        y = self(x)
        loss = self.lossfun(y, t)
        self.log("val_loss", loss)

    def test_step(self, batch, batch_idx):
        """Compute and log the classification accuracy of one test batch."""
        x, t = batch
        y = self(x)
        y = torch.argmax(y, dim=1)

        accuracy = torch.sum(t == y).item() / (len(y) * 1.0)
        self.log("test_acc", accuracy)

    def lossfun(self, y, t):
        """Cross-entropy between logits ``y`` and integer targets ``t``."""
        return F.cross_entropy(y, t)

    def configure_optimizers(self):
        """Use Adam with the learning rate given at construction time."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
import optuna

def objective(trial):
    """Optuna objective: train one RNN and return its best validation loss.

    Args:
        trial: Optuna trial used to sample the embedding size, hidden size,
            number of LSTM layers, batch size and learning rate.

    Returns:
        The lowest ``val_loss`` recorded by the checkpoint callback as a plain
        float, or None when the callback recorded no score.
    """
    import shutil  # local import so this function does not rely on another cell's imports

    # Hyperparameters to tune
    n_embed = int(trial.suggest_discrete_uniform('emb_size', 300,  500, 100))
    n_hidden = int(trial.suggest_discrete_uniform('out_channels', 300, 400, 100))
    n_layers = int(trial.suggest_discrete_uniform('n_layers', 2, 3, 1))
    batch_size = int(trial.suggest_discrete_uniform('batch_size', 32, 64, 32))
    lr = trial.suggest_loguniform('lr', 1e-4, 1e-1)

    # Only the train and validation splits are needed during the search
    dataloader_train = BucketIterator(dataset_train, batch_size=batch_size, shuffle=True)
    dataloader_val = BucketIterator(dataset_val, batch_size=batch_size, shuffle=False)

    # Fixed parameters
    n_input = len(text_field.vocab)
    n_output = len(label2id)
    dropout = 0.1
    num_epochs = 20
    bidirectional = True

    model = RNN(n_input, n_embed, n_hidden, n_layers, n_output, dropout, bidirectional, lr)

    # Drop checkpoints and logs left by the previous trial; missing directories are not an error
    for stale_dir in ("model", "lightning_logs"):
        shutil.rmtree(stale_dir, ignore_errors=True)

    # Save the model during training
    checkpoint = pl.callbacks.ModelCheckpoint(
        # keep only the model with the lowest validation loss
        monitor="val_loss", mode="min", save_top_k=1,
        # store weights only, in the "model" directory
        save_weights_only=True, dirpath="model/"
    )

    early_stopping = pl.callbacks.EarlyStopping(
        monitor="val_loss", mode="min", patience=5
    )

    # Training
    trainer = pl.Trainer(gpus=1, max_epochs=num_epochs, callbacks=[checkpoint, early_stopping])
    trainer.fit(model, dataloader_train, dataloader_val)

    # Validation loss of the best model; convert the tensor to a plain float so the study
    # does not hold on to device tensors. None is passed through unchanged.
    best_score = checkpoint.best_model_score
    return None if best_score is None else float(best_score)

In [None]:
# Hyperparameter search: minimise the validation loss returned by `objective`
study = optuna.create_study(direction="minimize")
# Turn off Optuna's default log handler before the trials start
optuna.logging.disable_default_handler()
study.optimize(func=objective, n_trials=100)

In [None]:
# Show the best trial: its objective value followed by one line per tuned parameter
print('Best trial:')
trial = study.best_trial  # kept as a global: the final training cell reads trial.params
summary = ['  Value: {:.3f}'.format(trial.value), '  Params: ']
summary += ['    {}: {}'.format(name, setting) for name, setting in trial.params.items()]
print('\n'.join(summary))

Best trial:
  Value: 0.325
  Params: 
    emb_size: 500.0
    out_channels: 300.0
    n_layers: 2.0
    batch_size: 32.0
    lr: 0.008889831308193414


In [None]:
# 学習済み単語ベクトルの読み込み
vocab, vec = torchwordemb.load_word2vec_bin("/content/drive/MyDrive/Colab Notebooks/nlp100/chapter8/GoogleNews-vectors-negative300.bin")
text_field.vocab.set_vectors(stoi=vocab, vectors=vec, dim=300)

# バッチサイズ
batch_size = trial.params['batch_size']

# データセットオブジェクトからデータローダーを作成  BucketIteratorはdataloaderに変換できる
dataloader_train = BucketIterator(dataset_train, batch_size=batch_size, shuffle=True)
dataloader_val = BucketIterator(dataset_val, batch_size=batch_size, shuffle=False)
dataloader_test = BucketIterator(dataset_test, batch_size=batch_size, shuffle=False)

# 以下同じように学習
! rm -r model
! rm -r lightning_logs
# 単語の種類
n_input = len(text_field.vocab)
# 単語ベクトルの次元
n_embed = int(trial.params['emb_size'])
n_hidden = int(trial.params['out_channels'])
n_layers = int(trial.params['n_layers'])
n_output = len(label2id)
dropout = 0.1
lr = trial.params['lr']
bidirectional = True

'''
Best trial:
  Value: 0.325
  Params: 
    emb_size: 500.0
    out_channels: 300.0
    n_layers: 2.0
    batch_size: 32.0
    lr: 0.008889831308193414
'''


model = RNN(n_input, n_embed, n_hidden, n_layers, n_output, dropout, bidirectional, lr)

# 訓練中にモデルを保存するための設定
checkpoint = pl.callbacks.ModelCheckpoint(
    # 検証用データにおける損失が最も小さいモデルを保存する
    monitor="val_loss", mode="min", save_top_k=1,
    # モデルファイル（重みのみ）を "model" というディレクトリに保存する
    save_weights_only=True, dirpath="model/"
)

early_stopping = pl.callbacks.EarlyStopping(
    monitor="val_loss", mode="min", patience=5
)

# 訓練
trainer = pl.Trainer(gpus=1, max_epochs=20, callbacks=[checkpoint, early_stopping])
trainer.fit(model, dataloader_train, dataloader_val)

# ベストモデルの確認
print("ベストモデル: ", checkpoint.best_model_path)
print("ベストモデルの検証用データにおける損失: ", checkpoint.best_model_score)

# 評価
test = trainer.test(test_dataloaders=dataloader_test)
print("Test accuracy = %.3f" % (test[0]["test_acc"]))

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type      | Params
------------------------------------
0 | embed | Embedding | 4.7 M 
1 | lstm  | LSTM      | 4.1 M 
2 | fc    | Linear    | 2.4 K 


Validation sanity check: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

ベストモデル:  /content/model/epoch=2.ckpt
ベストモデルの検証用データにおける損失:  tensor(0.4253, device='cuda:0')


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.8577),
 'train_loss': tensor(0.0483, device='cuda:0'),
 'val_loss': tensor(0.6138, device='cuda:0')}
--------------------------------------------------------------------------------
Test accuracy = 0.858
