In [1]:
# ====================
# ライブラリのインストール
# ====================
! pip install --quiet torch==1.6.0

[K     |████████████████████████████████| 748.8 MB 17 kB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.11.1+cu111 requires torch==1.10.0, but you have torch 1.6.0 which is incompatible.
torchtext 0.11.0 requires torch==1.10.0, but you have torch 1.6.0 which is incompatible.
torchaudio 0.10.0+cu111 requires torch==1.10.0, but you have torch 1.6.0 which is incompatible.[0m
[?25h

In [6]:
# ここでランタイムを再起動

# ライブラリの読み込み
import os
import time
import string
import torch
import pandas as pd
import torch.nn as nn
from torch import optim
from torch.nn import functional as F
from collections import defaultdict
from gensim.models import KeyedVectors
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split


# データセットのダウンロード
if os.path.isfile("/content/NewsAggregatorDataset.zip") == False:
    ! wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
    ! unzip NewsAggregatorDataset.zip
    # 読込時のエラー回避のためダブルクォーテーションをシングルクォーテーションに置換
    ! sed -e 's/"/'\''/g' ./newsCorpora.csv > ./newsCorpora_re.csv
df = pd.read_csv('/content/newsCorpora.csv', sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df1 = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]

# データの分割 stratifyを設定することで訓練データとテストデータの指定した中身の割合を同じにすることができる
train, temp = train_test_split(df1, test_size=0.2, shuffle=True, random_state=0, stratify=df1['CATEGORY'])
test, valid = train_test_split(temp, test_size=0.5, shuffle=True, random_state=0, stratify=temp['CATEGORY'])

# データの保存
! mkdir -p /content/data/
train.to_csv('/content/data/train.txt', sep="\t", index=False)
test.to_csv('/content/data/test.txt', sep="\t", index=False)
valid.to_csv('/content/data/valid.txt', sep="\t", index=False)

In [2]:
# 辞書作成の関数
def get_word2id(fname):
    my_dict = defaultdict(int)
    train = pd.read_table(fname)
    table = str.maketrans(string.punctuation, ' '*len(string.punctuation)) 
    for text in train['TITLE']:
        for word in text.translate(table).split():
            my_dict[word] += 1
    my_dict = sorted(my_dict.items(), key=lambda x:x[1], reverse=True)
    # 単語ID辞書の作成
    word2id = {word: i + 1 for i, (word, cnt) in enumerate(my_dict) if cnt > 1}  # 出現頻度が2回以上の単語を登録
    return word2id

word2id = get_word2id("/content/data/train.txt")

# ラベル列を返す関数
def tokenizer(text, word2id=word2id, unknown=0):
    table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    return [word2id.get(word, unknown) for word in text.translate(table).split()] 

In [3]:
# データセットの作成
class CreateDataset(Dataset):
    def __init__(self, X, y, tokenizer):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer

    def __len__(self):  # len(Dataset)で返す値を指定
        return len(self.y)

    def __getitem__(self, index):  # Dataset[index]で返す値を指定
        text = self.X[index]
        inputs = self.tokenizer(text)

        return {
            'inputs': torch.tensor(inputs, dtype=torch.int64),
            'labels': torch.tensor(self.y[index], dtype=torch.int64)
        }
    
def make_dataset(input):
    label2id = {'b': 0, 't': 1, 'e':2, 'm':3}
    df = pd.read_table(input)
    y = df['CATEGORY'].map(lambda x: label2id[x]).values
    dataset = CreateDataset(df['TITLE'], y, tokenizer)
    return dataset

dataset_train = make_dataset("/content/data/train.txt")
dataset_valid = make_dataset("/content/data/valid.txt")
dataset_test  = make_dataset("/content/data/test.txt")

In [4]:
# 学習済み単語ベクトルの取得
def get_emb_weights(word2id):
    wv = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Colab Notebooks/nlp100/chapter8/GoogleNews-vectors-negative300.bin.gz', binary=True)
    vocab_size = len(word2id) + 1
    emb_size = wv.vector_size
    emb_weights = torch.zeros(vocab_size, emb_size)
    for i, word in enumerate(word2id.keys()):
        try:
            emb_weights[i] = torch.tensor(wv[word])
        except KeyError:
            emb_weights[i] = torch.rand(emb_size)

    return emb_size, emb_weights
EMB_SIZE, emb_weights = get_emb_weights(word2id)

  if __name__ == '__main__':


In [24]:
# ============
# 86. 畳み込みニューラルネットワーク(CNN)
# ============
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=False)

class CNN(nn.Module):
    def __init__(self, vocab_size, emb_size, padding_idx, output_size, out_channels, kernel_heights, stride, padding, emb_weights=None):
        super().__init__()
        if emb_weights is None:
            self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
        else:
            self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
        self.conv = nn.Conv2d(1, out_channels, (kernel_heights, emb_size), stride, (padding, 0))
        self.fc = nn.Linear(out_channels, output_size)

    def forward(self, x):
        emb = self.emb(x).unsqueeze(1)
        conv = self.conv(emb)
        act = F.relu(conv.squeeze(3))
        max_pool = F.max_pool1d(act, act.size()[2])
        out = self.fc(max_pool.squeeze(2))
        return out

VOCAB_SIZE = len(word2id) + 1
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
OUT_CHANNELS = 100
KERNEL_HEIGHTS = 3
STRIDE = 1
PADDING = 1

model = CNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, OUT_CHANNELS, KERNEL_HEIGHTS, STRIDE, PADDING, emb_weights=emb_weights)

# 10件の予測値のみ表示
for data, i in zip(dataloader_train, range(10)):
    print(torch.softmax(model(data['inputs']), dim=-1))

tensor([[0.2945, 0.2494, 0.2807, 0.1753]], grad_fn=<SoftmaxBackward>)
tensor([[0.3026, 0.2432, 0.2758, 0.1784]], grad_fn=<SoftmaxBackward>)
tensor([[0.3018, 0.2358, 0.2729, 0.1895]], grad_fn=<SoftmaxBackward>)
tensor([[0.2864, 0.2275, 0.2796, 0.2065]], grad_fn=<SoftmaxBackward>)
tensor([[0.2992, 0.2506, 0.2786, 0.1716]], grad_fn=<SoftmaxBackward>)
tensor([[0.2764, 0.2533, 0.2927, 0.1777]], grad_fn=<SoftmaxBackward>)
tensor([[0.2906, 0.2283, 0.2861, 0.1950]], grad_fn=<SoftmaxBackward>)
tensor([[0.3018, 0.2419, 0.2633, 0.1930]], grad_fn=<SoftmaxBackward>)
tensor([[0.2960, 0.2747, 0.2646, 0.1647]], grad_fn=<SoftmaxBackward>)
tensor([[0.2988, 0.2569, 0.2768, 0.1675]], grad_fn=<SoftmaxBackward>)


In [18]:
def calculate_loss_and_accuracy(model, dataset, device=None, criterion=None):
    """損失・正解率を計算"""
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False)
    loss = 0.0
    total = 0
    correct = 0
    with torch.no_grad():
        for data in dataloader: 
            # デバイスの指定
            inputs = data['inputs'].to(device)
            labels = data['labels'].to(device)

            # 順伝播
            outputs = model(inputs)

            # 損失計算
            if criterion != None:
                loss += criterion(outputs, labels).item()

            # 正解率計算
            pred = torch.argmax(outputs, dim=-1)
            total += len(inputs)
            correct += (pred == labels).sum().item()

        return loss / len(dataset), correct / total

In [19]:
class Padsequence():
    """Dataloaderからミニバッチを取り出すごとに最大系列長でパディング"""
    def __init__(self, padding_idx):
        self.padding_idx = padding_idx

    def __call__(self, batch):
        sorted_batch = sorted(batch, key=lambda x: x['inputs'].shape[0], reverse=True)
        sequences = [x['inputs'] for x in sorted_batch]
        sequences_padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=self.padding_idx)
        labels = torch.LongTensor([x['labels'] for x in sorted_batch])

        return {'inputs': sequences_padded, 'labels': labels}

In [20]:
def train_model(dataset_train, dataset_valid, batch_size, model, criterion, optimizer, num_epochs, collate_fn=None, device=None):
    """モデルの学習を実行し、損失・正解率のログを返す"""
    # デバイスの指定
    model.to(device)

    # dataloaderの作成
    dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    dataloader_valid = DataLoader(dataset_valid, batch_size=1, shuffle=False)

    # スケジューラの設定
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs, eta_min=1e-5, last_epoch=-1)

    # 学習
    log_train = []
    log_valid = []
    for epoch in range(num_epochs):
        # 開始時刻の記録
        s_time = time.time()

        # 訓練モードに設定
        model.train()
        for data in dataloader_train:
            # 勾配をゼロで初期化
            optimizer.zero_grad()

            # 順伝播 + 誤差逆伝播 + 重み更新
            inputs = data['inputs'].to(device)
            labels = data['labels'].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        # 評価モードに設定
        model.eval()

        # 損失と正解率の算出
        loss_train, acc_train = calculate_loss_and_accuracy(model, dataset_train, device, criterion=criterion)
        loss_valid, acc_valid = calculate_loss_and_accuracy(model, dataset_valid, device, criterion=criterion)
        log_train.append([loss_train, acc_train])
        log_valid.append([loss_valid, acc_valid])

        # チェックポイントの保存
        torch.save({'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, f'checkpoint{epoch + 1}.pt')

        # 終了時刻の記録
        e_time = time.time()

        # ログを出力
        print(f'epoch: {epoch + 1}, loss_train: {loss_train:.4f}, accuracy_train: {acc_train:.4f}, loss_valid: {loss_valid:.4f}, accuracy_valid: {acc_valid:.4f}, {(e_time - s_time):.4f}sec') 

        # 検証データの損失が3エポック連続で低下しなかった場合は学習終了
        if epoch > 2 and log_valid[epoch - 3][0] <= log_valid[epoch - 2][0] <= log_valid[epoch - 1][0] <= log_valid[epoch][0]:
            break

        # スケジューラを1ステップ進める
        scheduler.step()

    return {'train': log_train, 'valid': log_valid}

In [26]:
# ============
# 87. 確率的勾配降下法によるCNNの学習
# ============
# パラメータの設定
VOCAB_SIZE = len(set(word2id.values())) + 1
EMB_SIZE = 300
PADDING_IDX = len(set(word2id.values()))
OUTPUT_SIZE = 4
OUT_CHANNELS = 100
KERNEL_HEIGHTS = 3
STRIDE = 1
PADDING = 1
LEARNING_RATE = 0.1
BATCH_SIZE = 64
NUM_EPOCHS = 30

# モデルの定義
model = CNN(VOCAB_SIZE, EMB_SIZE, PADDING_IDX, OUTPUT_SIZE, OUT_CHANNELS, KERNEL_HEIGHTS, STRIDE, PADDING, emb_weights=emb_weights)

# 損失関数の定義
criterion = nn.CrossEntropyLoss()

# オプティマイザの定義
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

# デバイスの指定
device = torch.device('cuda')

# モデルの学習
log = train_model(dataset_train, dataset_valid, BATCH_SIZE, model, criterion, optimizer, NUM_EPOCHS, collate_fn=Padsequence(PADDING_IDX), device=device)

# 正解率の算出
_, acc_train = calculate_loss_and_accuracy(model, dataset_train, device)
_, acc_test = calculate_loss_and_accuracy(model, dataset_test, device)
print(f'正解率（学習データ）：{acc_train:.3f}')
print(f'正解率（評価データ）：{acc_test:.3f}')

epoch: 1, loss_train: 1.0348, accuracy_train: 0.6089, loss_valid: 1.0568, accuracy_valid: 0.5652, 7.4740sec
epoch: 2, loss_train: 0.8415, accuracy_train: 0.7003, loss_valid: 0.8865, accuracy_valid: 0.6717, 6.9949sec
epoch: 3, loss_train: 0.7158, accuracy_train: 0.7554, loss_valid: 0.8024, accuracy_valid: 0.7151, 7.0725sec
epoch: 4, loss_train: 0.6053, accuracy_train: 0.7894, loss_valid: 0.7224, accuracy_valid: 0.7391, 7.0970sec
epoch: 5, loss_train: 0.5314, accuracy_train: 0.8081, loss_valid: 0.6914, accuracy_valid: 0.7406, 7.0678sec
epoch: 6, loss_train: 0.4598, accuracy_train: 0.8407, loss_valid: 0.6696, accuracy_valid: 0.7541, 7.0824sec
epoch: 7, loss_train: 0.4458, accuracy_train: 0.8381, loss_valid: 0.7013, accuracy_valid: 0.7339, 7.2000sec
epoch: 8, loss_train: 0.3583, accuracy_train: 0.8897, loss_valid: 0.6495, accuracy_valid: 0.7661, 7.0206sec
epoch: 9, loss_train: 0.2905, accuracy_train: 0.9195, loss_valid: 0.5896, accuracy_valid: 0.7819, 7.0035sec
epoch: 10, loss_train: 0.259