<a href="https://colab.research.google.com/github/hashk1/nlp-100-knock-2020-rev2/blob/main/08-%E3%83%8B%E3%83%A5%E3%83%BC%E3%83%A9%E3%83%AB%E3%83%8D%E3%83%83%E3%83%88.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第8章: ニューラルネット

## 準備

In [None]:
# ライブラリ読み込み
from gensim.models import KeyedVectors
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the data (wget -c resumes a partially downloaded file instead of restarting).
# First the News Aggregator dataset archive, then the GoogleNews word2vec vectors.
! wget -c https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
! wget -c https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
# Extract the archive into NewsAggregatorDataset/ (-o overwrites existing files without prompting).
! unzip -o -d NewsAggregatorDataset NewsAggregatorDataset.zip

In [None]:
# Load the raw corpus. The file has no header row, so column names are assigned explicitly.
df = pd.read_table("NewsAggregatorDataset/newsCorpora.csv", header=None)
df.columns = ["ID", "TITLE", "URL", "PUBLISHER", "CATEGORY", "STORY", "HOSTNAME", "TIMESTAMP"]

# Keep articles from the five target publishers and only the label / headline columns.
# .copy() makes the selection an independent frame, so the label assignment below
# modifies it directly instead of triggering pandas' SettingWithCopyWarning.
df = df.query('PUBLISHER in ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]')
df = df[["CATEGORY", "TITLE"]].copy()
# Encode the category letters as integer class ids.
df["CATEGORY"] = df["CATEGORY"].map({"b": 0, "t": 1, "e": 2, "m": 3})

# The saved files keep CATEGORY next to TITLE, so the "features" frame is the whole frame.
X = df
y = df["CATEGORY"]
# 80% train; the remaining 20% is halved into validation and test. Both splits are
# stratified by label and use a fixed seed so they are reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, stratify=y, random_state=0)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, train_size=0.5, stratify=y_valid, random_state=0)

# Write each split as tab-separated text without index or header row.
X_train.to_csv("train.txt", sep="\t", index=False, header=False)
X_valid.to_csv("valid.txt", sep="\t", index=False, header=False)
X_test.to_csv("test.txt", sep="\t", index=False, header=False)

## ここから本番

In [None]:
# ライブラリ読み込み
import time
import pandas as pd
import numpy as np
import joblib
from gensim.models import KeyedVectors
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
%matplotlib inline

### 70. 単語ベクトルの和による特徴量

In [None]:
def _read_split(path):
  """Read one tab-separated split file (no header row) and name its two columns."""
  frame = pd.read_table(path, header=None)
  frame.columns = ["CATEGORY", "TITLE"]
  return frame

# Data: the three splits written by the preparation section.
X_train = _read_split("train.txt")
X_valid = _read_split("valid.txt")
X_test = _read_split("test.txt")

# Pretrained word vectors in binary word2vec format.
# NOTE: calc_docvec_from_row reads this global `model`, and later cells rebind the
# name `model` to a network, so feature extraction has to run before those cells.
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [None]:
def calc_docvec_from_row(row, wv=None):
    """Return the average word vector of the headline stored in ``row["TITLE"]``.

    The title is split on whitespace and every token known to the word-vector
    model contributes its vector; unknown tokens are ignored.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a string ``"TITLE"`` entry.
        wv: Word-vector lookup supporting ``token in wv``, ``wv[token]`` and
            ``wv.vector_size``. Defaults to the notebook-level ``model``.

    Returns:
        A 1-D array of length ``vector_size``: the element-wise mean of the token
        vectors, or all zeros when no token of the title is in the vocabulary.
    """
    # Fall back to the notebook-level word-vector model when none is passed in.
    vectors = model if wv is None else wv
    # Membership is tested on the lookup object itself rather than on its `.vocab`
    # attribute, which newer gensim releases no longer provide.
    token_vectors = [vectors[token] for token in row["TITLE"].split() if token in vectors]
    if not token_vectors:
        return np.zeros(shape=(vectors.vector_size, ))
    return np.nanmean(token_vectors, axis=0)

In [None]:
# Build one document vector per row for each split (train, then valid, then test);
# progress_apply shows a tqdm progress bar while applying the function row-wise.
docvec_train, docvec_valid, docvec_test = (
    split_frame.progress_apply(calc_docvec_from_row, axis=1)
    for split_frame in (X_train, X_valid, X_test)
)

In [None]:
# Save the data.
# Each docvec_* is a Series holding one vector per headline; stacking the list of
# vectors gives a single array with one row per headline.
features_train = np.array(docvec_train.tolist())
features_valid = np.array(docvec_valid.tolist())
features_test = np.array(docvec_test.tolist())

# Integer class labels of each split.
labels_train = np.array(X_train["CATEGORY"])
labels_valid = np.array(X_valid["CATEGORY"])
labels_test = np.array(X_test["CATEGORY"])

joblib.dump(features_train, "X_train.joblib")
joblib.dump(features_valid, "X_valid.joblib")
joblib.dump(features_test, "X_test.joblib")
joblib.dump(labels_train, "y_train.joblib")
joblib.dump(labels_valid, "y_valid.joblib")
joblib.dump(labels_test, "y_test.joblib")

### 71. 単層ニューラルネットワークによる予測

In [None]:
# The tensors, datasets and loaders created here are used by the cells below.

def _load_tensor(path, dtype):
  """Load a joblib-saved NumPy array and return it as a torch tensor of ``dtype``."""
  return torch.from_numpy(joblib.load(path).astype(dtype)).clone()

# Training data: float32 features, int64 class labels, served one example at a
# time in shuffled order.
X_train = _load_tensor("X_train.joblib", np.float32)
y_train = _load_tensor("y_train.joblib", np.int64)
dataset_train = TensorDataset(X_train, y_train)
dataloader_train = DataLoader(dataset_train, batch_size=1, shuffle=True)

# Evaluation data: this is the test split (not the validation split), served as
# one full-size, unshuffled batch.
X_test = _load_tensor("X_test.joblib", np.float32)
y_test = _load_tensor("y_test.joblib", np.int64)
dataset_test = TensorDataset(X_test, y_test)
dataloader_test = DataLoader(dataset_test, batch_size=len(dataset_test), shuffle=False)

In [None]:
class SLPNet(nn.Module):
  """Single-layer network: one bias-free linear map from the input to class scores."""

  def __init__(self, input_size, output_size):
    """Create the linear layer and redraw its weights from N(0, 1).

    Args:
      input_size: Number of input features.
      output_size: Number of output scores (classes).
    """
    super().__init__()
    linear = nn.Linear(input_size, output_size, bias=False)
    # Replace the layer's default initialisation with a standard normal draw.
    nn.init.normal_(linear.weight, mean=0.0, std=1.0)
    self.fc = linear

  def forward(self, x):
    """Return the raw (pre-softmax) scores for ``x``."""
    return self.fc(x)

In [None]:
# Freshly initialised (untrained) network: one input per feature column, four classes.
model = SLPNet(X_train.shape[1], 4)

# Class probabilities for the first training example (softmax over the class axis).
logits_first = model(X_train[:1])
y_hat_1 = logits_first.softmax(dim=-1)
print("y-hat-1: \n{}".format(y_hat_1))

# Same for the first four training examples, one row of probabilities per example.
logits_head = model(X_train[:4])
Y_hat = logits_head.softmax(dim=-1)
print("Y-hat: \n{}".format(Y_hat))

### 72. 損失と勾配の計算

In [None]:
model = SLPNet(X_train.shape[1], 4)
criterion = nn.CrossEntropyLoss()

# Loss and weight gradient for the first training example only.
logits_1 = model(X_train[:1])
loss_1 = criterion(logits_1, y_train[:1])
model.zero_grad()  # start from empty gradients
loss_1.backward()
print("cross entropy loss from x_1: {}".format(loss_1))
print("gradient from x_1: {}".format(model.fc.weight.grad))

# Loss and weight gradient for the first four training examples.
logits_4 = model(X_train[:4])
loss_4 = criterion(logits_4, y_train[:4])
model.zero_grad()  # discard the gradient left over from the single-example pass
# NOTE(review): retain_graph=True keeps the graph alive after backward; nothing in
# this cell runs backward on loss_4 again, so it looks unnecessary - confirm.
loss_4.backward(retain_graph=True)
print("cross entropy loss from X_[1:4]: {}".format(loss_4))
print("gradient from X_[1:4]: {}".format(model.fc.weight.grad))

### 73. 確率的勾配降下法による学習

In [None]:
model = SLPNet(X_train.shape[1], 4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

n_epoch = 100
for epoch in range(n_epoch):

  # One pass over the training loader, updating the weights after every batch.
  model.train()
  for X_batch, y_batch in dataloader_train:
    optimizer.zero_grad()
    batch_loss = criterion(model(X_batch), y_batch)
    batch_loss.backward()
    optimizer.step()

  # Report the loss over the full training set, without tracking gradients.
  model.eval()
  with torch.no_grad():
    epoch_train_loss = criterion(model(X_train), y_train).item()
    print("epoch: {}\ttrain_loss: {}".format(epoch+1, epoch_train_loss))

### 74. 正解率の計測

In [None]:
# Accuracy of the current `model` on the training and test sets.
model.eval()
with torch.no_grad():
    # Predicted class = index of the largest score in each row.
    train_pred = model(X_train).argmax(dim=1)
    test_pred = model(X_test).argmax(dim=1)
    # Averaging the float-cast hit mask avoids dividing an integer tensor, whose
    # meaning differs between PyTorch releases; .item() prints a plain number
    # instead of a tensor repr.
    print("accuracy for train: {}".format((train_pred == y_train).float().mean().item()))
    print("accuracy for test: {}".format((test_pred == y_test).float().mean().item()))

### 75. 損失と正解率のプロット

In [None]:
model = SLPNet(X_train.size()[1], 4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

n_epoch = 100
# Per-epoch history; index i holds the value measured after epoch i+1.
train_losses = []
test_losses = []
train_accuracies = []
test_accuracies = []
for epoch in range(n_epoch):

  # One pass over the training loader, updating the weights after every batch.
  model.train()
  for X_batch, y_batch in dataloader_train:
    optimizer.zero_grad()
    loss = criterion(model(X_batch), y_batch)
    loss.backward()
    optimizer.step()

  model.eval()
  with torch.no_grad():
    # Run each split through the model once and reuse the scores for both metrics.
    train_logits = model(X_train)
    test_logits = model(X_test)
    train_losses.append(criterion(train_logits, y_train).item())
    test_losses.append(criterion(test_logits, y_test).item())
    # Mean of the float-cast hit mask: does not rely on integer tensor division.
    train_accuracies.append((train_logits.argmax(dim=1) == y_train).float().mean().item())
    test_accuracies.append((test_logits.argmax(dim=1) == y_test).float().mean().item())
    print("epoch: {}\ttrain_loss: {}\ttest_loss: {}\ttrain_accuracy: {}\ttest_accuracy: {}".format(epoch+1, train_losses[-1], test_losses[-1], train_accuracies[-1], test_accuracies[-1]))

In [None]:
# Plot against 1-based epoch numbers so the x axis matches the printed training log.
epochs = range(1, len(train_losses) + 1)

# Loss curves.
fig, ax = plt.subplots()
ax.plot(epochs, train_losses, label="train loss")
ax.plot(epochs, test_losses, label="test loss")
ax.set(xlabel="epoch", ylabel="cross entropy loss", title="Loss per epoch")
ax.legend()
plt.show()

# Accuracy curves.
fig, ax = plt.subplots()
ax.plot(epochs, train_accuracies, label="train accuracy")
ax.plot(epochs, test_accuracies, label="test accuracy")
ax.set(xlabel="epoch", ylabel="accuracy", title="Accuracy per epoch")
ax.legend()
plt.show()

### 76. チェックポイント

In [None]:
model = SLPNet(X_train.size()[1], 4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

n_epoch = 100
# Per-epoch history; index i holds the value measured after epoch i+1.
train_losses = []
test_losses = []
train_accuracies = []
test_accuracies = []
for epoch in range(n_epoch):

  # One pass over the training loader, updating the weights after every batch.
  model.train()
  for X_batch, y_batch in dataloader_train:
    optimizer.zero_grad()
    loss = criterion(model(X_batch), y_batch)
    loss.backward()
    optimizer.step()

  model.eval()
  with torch.no_grad():
    # Checkpoint after every epoch: epoch number plus model and optimizer state,
    # written to a file named after the 1-based epoch.
    torch.save({"epoch": epoch+1, "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict()}, "checkpoint-{}.pytorch".format(epoch+1))
    # Run each split through the model once and reuse the scores for both metrics.
    train_logits = model(X_train)
    test_logits = model(X_test)
    train_losses.append(criterion(train_logits, y_train).item())
    test_losses.append(criterion(test_logits, y_test).item())
    # Mean of the float-cast hit mask: does not rely on integer tensor division.
    train_accuracies.append((train_logits.argmax(dim=1) == y_train).float().mean().item())
    test_accuracies.append((test_logits.argmax(dim=1) == y_test).float().mean().item())
    print("epoch: {}\ttrain_loss: {}\ttest_loss: {}\ttrain_accuracy: {}\ttest_accuracy: {}".format(epoch+1, train_losses[-1], test_losses[-1], train_accuracies[-1], test_accuracies[-1]))

### 77. ミニバッチ化

In [None]:
# Train a fresh SLPNet for every batch size and report the mean training time per
# epoch together with the metrics measured after the last epoch.
batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
for batch_size in tqdm(batch_sizes):
  dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)

  model = SLPNet(X_train.size()[1], 4)
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=0.01)

  n_epoch = 100
  train_losses = []
  test_losses = []
  train_accuracies = []
  test_accuracies = []
  elapsed_time = 0
  for epoch in range(n_epoch):

    model.train()
    # Only the parameter-update loop is timed; evaluation below is excluded.
    # perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
    s_time = time.perf_counter()
    for X_batch, y_batch in dataloader_train:
      optimizer.zero_grad()
      loss = criterion(model(X_batch), y_batch)
      loss.backward()
      optimizer.step()

    e_time = time.perf_counter()
    elapsed_time += e_time - s_time

    model.eval()
    with torch.no_grad():
      # Run each split through the model once and reuse the scores for both metrics.
      train_logits = model(X_train)
      test_logits = model(X_test)
      train_losses.append(criterion(train_logits, y_train).item())
      test_losses.append(criterion(test_logits, y_test).item())
      # Mean of the float-cast hit mask: does not rely on integer tensor division.
      train_accuracies.append((train_logits.argmax(dim=1) == y_train).float().mean().item())
      test_accuracies.append((test_logits.argmax(dim=1) == y_test).float().mean().item())

  # Average training time per epoch for this batch size.
  elapsed_time /= n_epoch
  print("\nbatch_size: {}\telapsed_time: {}\ttrain_loss: {}\ttest_loss: {}\ttrain_accuracy: {}\ttest_accuracy: {}".format(batch_size, elapsed_time, train_losses[-1], test_losses[-1], train_accuracies[-1], test_accuracies[-1]))

### 78. GPU上での学習

In [None]:
# Use the first GPU when one is present; otherwise fall back to the CPU so the cell
# still runs (the timings then describe CPU training, not GPU training).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the full data set to the device once; batches drawn from it are then
# already on the device.
X_train = X_train.to(device)
X_test = X_test.to(device)
y_train = y_train.to(device)
y_test = y_test.to(device)
dataset_train = TensorDataset(X_train, y_train)

batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256]
for batch_size in tqdm(batch_sizes):
  dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)

  model = SLPNet(X_train.size()[1], 4).to(device)
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=0.01)

  n_epoch = 100
  train_losses = []
  test_losses = []
  train_accuracies = []
  test_accuracies = []
  elapsed_time = 0
  for epoch in range(n_epoch):

    model.train()
    # CUDA work is queued asynchronously, so wait for the device before reading
    # the clock on both sides of the loop; otherwise unfinished kernels would be
    # left out of the measurement.
    if device.type == "cuda":
      torch.cuda.synchronize(device)
    s_time = time.perf_counter()
    for X_batch, y_batch in dataloader_train:
      optimizer.zero_grad()
      loss = criterion(model(X_batch), y_batch)
      loss.backward()
      optimizer.step()

    if device.type == "cuda":
      torch.cuda.synchronize(device)
    e_time = time.perf_counter()
    elapsed_time += e_time - s_time

    model.eval()
    with torch.no_grad():
      # Run each split through the model once and reuse the scores for both metrics.
      train_logits = model(X_train)
      test_logits = model(X_test)
      train_losses.append(criterion(train_logits, y_train).item())
      test_losses.append(criterion(test_logits, y_test).item())
      # Mean of the float-cast hit mask: does not rely on integer tensor division.
      train_accuracies.append((train_logits.argmax(dim=1) == y_train).float().mean().item())
      test_accuracies.append((test_logits.argmax(dim=1) == y_test).float().mean().item())

  # Average training time per epoch for this batch size.
  elapsed_time /= n_epoch
  print("\nbatch_size: {}\telapsed_time: {}\ttrain_loss: {}\ttest_loss: {}\ttrain_accuracy: {}\ttest_accuracy: {}".format(batch_size, elapsed_time, train_losses[-1], test_losses[-1], train_accuracies[-1], test_accuracies[-1]))

### 79. 多層ニューラルネットワーク

In [None]:
class MLPNet(nn.Module):
  """Three-layer perceptron: input -> 100 -> 25 -> output, with PReLU and batch norm
  after each of the two hidden linear layers."""

  def __init__(self, input_size, output_size):
    """Build the layer stack and redraw every linear weight from N(0, 1).

    Args:
      input_size: Number of input features.
      output_size: Number of output scores (classes).
    """
    super().__init__()
    layers = [
        nn.Linear(input_size, 100, bias=True),
        nn.PReLU(),
        nn.BatchNorm1d(100),
        nn.Linear(100, 25, bias=True),
        nn.PReLU(),
        nn.BatchNorm1d(25),
        nn.Linear(25, output_size, bias=True),
    ]
    self.fc = nn.Sequential(*layers)

    # Only the weights of the linear layers are re-initialised; their biases and
    # the other layers keep their default initialisation.
    for layer in self.fc:
      if isinstance(layer, nn.Linear):
        nn.init.normal_(layer.weight, mean=0.0, std=1.0)

  def forward(self, x):
    """Return the raw (pre-softmax) scores for ``x``."""
    return self.fc(x)

In [None]:
# Use the first GPU when one is present; otherwise fall back to the CPU so the cell
# still runs (the timings then describe CPU training, not GPU training).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the full data set to the device once; batches drawn from it are then
# already on the device.
X_train = X_train.to(device)
X_test = X_test.to(device)
y_train = y_train.to(device)
y_test = y_test.to(device)
dataset_train = TensorDataset(X_train, y_train)

# Train a fresh MLPNet for every batch size and report the mean training time per
# epoch together with the metrics measured after the last epoch.
batch_sizes = [64, 128, 256]
for batch_size in tqdm(batch_sizes):
  dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)

  model = MLPNet(X_train.size()[1], 4).to(device)
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=0.01)

  n_epoch = 100
  train_losses = []
  test_losses = []
  train_accuracies = []
  test_accuracies = []
  elapsed_time = 0
  for epoch in range(n_epoch):

    model.train()
    # CUDA work is queued asynchronously, so wait for the device before reading
    # the clock on both sides of the loop; otherwise unfinished kernels would be
    # left out of the measurement.
    if device.type == "cuda":
      torch.cuda.synchronize(device)
    s_time = time.perf_counter()
    for X_batch, y_batch in dataloader_train:
      optimizer.zero_grad()
      y_pred = model(X_batch)
      loss = criterion(y_pred, y_batch)
      loss.backward()
      optimizer.step()

    if device.type == "cuda":
      torch.cuda.synchronize(device)
    e_time = time.perf_counter()
    elapsed_time += e_time - s_time

    # eval() switches the batch-norm layers to their running statistics for scoring.
    model.eval()
    with torch.no_grad():
      # Run each split through the model once and reuse the scores for both metrics.
      train_logits = model(X_train)
      test_logits = model(X_test)
      train_losses.append(criterion(train_logits, y_train).item())
      test_losses.append(criterion(test_logits, y_test).item())
      # Mean of the float-cast hit mask: does not rely on integer tensor division.
      train_accuracies.append((train_logits.argmax(dim=1) == y_train).float().mean().item())
      test_accuracies.append((test_logits.argmax(dim=1) == y_test).float().mean().item())

  # Average training time per epoch for this batch size.
  elapsed_time /= n_epoch
  print("\nbatch_size: {}\telapsed_time: {}\ttrain_loss: {}\ttest_loss: {}\ttrain_accuracy: {}\ttest_accuracy: {}".format(batch_size, elapsed_time, train_losses[-1], test_losses[-1], train_accuracies[-1], test_accuracies[-1]))