In [1]:
import torch
#gpuの確認
print(torch.cuda.is_available())

True


In [2]:
!pip install janome

Collecting janome
[?25l  Downloading https://files.pythonhosted.org/packages/79/f0/bd7f90806132d7d9d642d418bdc3e870cfdff5947254ea3cab27480983a7/Janome-0.3.10-py2.py3-none-any.whl (21.5MB)
[K     |████████████████████████████████| 21.5MB 1.4MB/s 
[?25hInstalling collected packages: janome
Successfully installed janome-0.3.10


In [3]:
!pip install jaconv

Collecting jaconv
  Downloading https://files.pythonhosted.org/packages/b0/9e/cf1353fb3e81a177bb52ca59a0ebee425f084b7298039a7965c5414d2d62/jaconv-0.2.4.tar.gz
Building wheels for collected packages: jaconv
  Building wheel for jaconv (setup.py) ... [?25l[?25hdone
  Created wheel for jaconv: filename=jaconv-0.2.4-cp36-none-any.whl size=12285 sha256=729eb343ae79564933df546a9ebbddea79ed48866b1ef77d8ccdfb1c3483c708
  Stored in directory: /root/.cache/pip/wheels/e1/46/f7/85a7f89bd3263423c8530dfed16083f9a142cc0fc78c81ff32
Successfully built jaconv
Installing collected packages: jaconv
Successfully installed jaconv-0.2.4


In [0]:
import jaconv
from janome.tokenizer import Tokenizer
import re

j_t = Tokenizer()

def clean_text(text):
  text = jaconv.h2z(text)
  result = text.lower()
  result = re.sub(r'[【】]', '', result)                  # 【】の除去
  result = re.sub(r'[（）()]', '', result)                # （）の除去
  result = re.sub(r'[［］\[\]]', '', result)              # ［］の除去
  result = re.sub(r'[@＠]\w+', '', result)               # メンションの除去
  result = re.sub(r'https?:\/\/.*?[\r\n ]', '', result)  # URLの除去
  result = re.sub(r'http\S+', '', result)                #urlの除去
  result = re.sub(r'pic.\S+', '', result)                #画像リンクの除去
  result = re.sub(r'#\S+', '', result)                   #twitterのハッシュタグの除去
  result = re.sub(r'03-\S+', '', result)                 #電話番号の除去
  result = re.sub(r'0120-\S+', '', result)               #電話番号の除去
  result = re.sub(r'@\S+', '', result)                   #リツイート?みたいなやつ
  result = re.sub(r'[\r]', '', result)
  result = re.sub(r'　', ' ', result)                    #全角空白の除去
  return result

def tokenizer_janome(text):
  return [tok for tok in j_t.tokenize(text, wakati=True)]

def tokenize_preprocessing(text):
  text = clean_text(text)
  text = tokenizer_janome(text)
  return text


In [5]:
#データの読みこみ
import torchtext

max_length = 140
TEXT = torchtext.data.Field(sequential=True, use_vocab=True, tokenize=tokenize_preprocessing,
                            lower=True, include_lengths=True, batch_first=True,
                            fix_length=max_length, init_token='<cls>',
                            eos_token='<eos>')

LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

#pandasでcsvを保存するときに、labelをintでキャストしておかないとエラーでるから注意
train_val_ds, test_ds = torchtext.data.TabularDataset.splits(
    path='drive/My Drive/', train='tweets_train.csv', test='tweets_test.csv',
    format='csv', fields=[('text', TEXT), ('Label', LABEL)]
)

#分かち書きや正規化などが終了している
print(len(train_val_ds))
print(vars(train_val_ds[0]))

33094
{'text': ['群れ', 'を', 'なす', '犬', 'たち', ' ', '-'], 'Label': '0'}


In [6]:
import random

#検証データの確保
train_ds, val_ds = train_val_ds.split(split_ratio=0.9, random_state=random.seed(12))

print(len(train_ds))
print(len(val_ds))
print(len(test_ds))

29785
3309
3678


In [7]:
from torchtext.vocab import Vectors

japanese_word2vec_vectors = Vectors(
    name='drive/My Drive/tweets133_.vec')

  0%|          | 0/285495 [00:00<?, ?it/s]Skipping token b'285495' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 284588/285495 [00:28<00:00, 9633.26it/s]

In [8]:
print(japanese_word2vec_vectors.dim)
print(len(japanese_word2vec_vectors.itos))

300
285495


In [9]:
#ボキャブラリを作成
TEXT.build_vocab(train_ds, vectors=japanese_word2vec_vectors)
print(TEXT.vocab.vectors.shape)
print(TEXT.vocab.stoi)

torch.Size([20644, 300])
defaultdict(<function _default_unk_index at 0x7f6ec2c771e0>, {'<unk>': 0, '<pad>': 1, '<cls>': 2, '<eos>': 3, 'の': 4, '\n': 5, '。': 6, 'に': 7, 'を': 8, 'は': 9, '、': 10, 'て': 11, ' ': 12, 'が': 13, '\n\n': 14, 'で': 15, 'し': 16, 'た': 17, '！': 18, 'と': 19, 'ない': 20, 'な': 21, '「': 22, '」': 23, 'も': 24, '型': 25, 'だ': 26, 'こと': 27, '…': 28, 'ん': 29, 'です': 30, '・': 31, '？': 32, 'か': 33, 'いる': 34, '  ': 35, '名言': 36, 'ナビ': 37, 'ます': 38, 'から': 39, 'ある': 40, '人': 41, 'する': 42, 'う': 43, 'よ': 44, 'い': 45, '悲しみ': 46, '恐怖': 47, 'ﾟ': 48, '...': 49, 'よう': 50, 'b': 51, '-': 52, 'д': 53, '自分': 54, '#': 55, '中': 56, 'さ': 57, 'ﾟゴルァ': 58, '位': 59, '\xa0': 60, 'れ': 61, '!!': 62, '『': 63, 'や': 64, '』': 65, '相手': 66, 'ば': 67, '怒り': 68, 'み': 69, '→': 70, 'れる': 71, 'フォロー': 72, 'ね': 73, '人類': 74, '滅亡': 75, 'てる': 76, '喜び': 77, '   ': 78, '方': 79, '時': 80, '効果': 81, 'せ': 82, 'まし': 83, 'ませ': 84, 'もの': 85, 'なら': 86, '血液': 87, 'ab': 88, 'たい': 89, 'これ': 90, 'いい': 91, '女': 92, 'なる': 93, 'なっ': 94,

In [10]:
train_dl = torchtext.data.Iterator(train_ds, batch_size=64, train=True)
val_dl = torchtext.data.Iterator(val_ds, batch_size=64, train=False, sort=False)
test_dl = torchtext.data.Iterator(test_ds, batch_size=64, train=False, sort=False)

batch = next(iter(val_dl))
print(batch.text)
print(batch.Label)

(tensor([[   2,  837, 1536,  ...,    1,    1,    1],
        [   2,  107,   39,  ...,    1,    1,    1],
        [   2,   22,  365,  ...,    1,    1,    1],
        ...,
        [   2, 1352,    4,  ...,    1,    1,    1],
        [   2,  385,   24,  ...,    1,    1,    1],
        [   2,  221,  110,  ...,    1,    1,    1]]), tensor([39, 24, 21, 19, 40, 21, 20, 26, 35, 21, 43, 53, 21, 18, 21, 47, 66, 16,
        19, 10, 44, 16, 53, 32, 35, 36, 62, 55,  8, 22,  5, 27, 42, 42, 42, 14,
        42, 32, 69, 19, 36, 30, 26, 36, 39, 27, 15, 29, 22, 35, 71, 27, 34, 45,
        29, 24, 14, 26,  7, 31, 29, 35, 42, 39]))
tensor([4, 3, 2, 0, 2, 0, 2, 2, 3, 2, 2, 2, 0, 0, 2, 3, 0, 3, 2, 0, 0, 0, 0, 2,
        2, 3, 1, 0, 3, 3, 2, 2, 4, 2, 3, 0, 2, 3, 2, 3, 2, 0, 3, 0, 4, 0, 4, 2,
        4, 2, 2, 3, 3, 4, 0, 0, 4, 2, 0, 2, 2, 0, 4, 4])


In [0]:
# パッケージのimport
import numpy as np
import random
import math 

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 

torch.manual_seed(12)
np.random.seed(12)

#埋め込み層
class Embedder(nn.Module):
    def __init__(self, text_embedding_vectors):
        super(Embedder, self).__init__()
        
        #更新はしない
        self.embeddings = nn.Embedding.from_pretrained(
            embeddings=text_embedding_vectors, freeze=True)

    def forward(self, x):
        x_vec = self.embeddings(x)

        return x_vec

#PositonalEncoding
class PositionalEncoder(nn.Module):

    def __init__(self, d_model=300, max_seq_len=140):
        super().__init__()

        self.d_model = d_model

        pe = torch.zeros(max_seq_len, d_model)

        # GPUが使える場合はGPUへ送る、ここでは省略。実際に学習時には使用する
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        pe = pe.to(device)

        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/d_model)))
                pe[pos, i + 1] = math.cos(pos /
                                          (10000 ** ((2 * (i + 1))/d_model)))

        self.pe = pe.unsqueeze(0)

        self.pe.requires_grad = False

    def forward(self, x):
        ret = math.sqrt(self.d_model)*x + self.pe
        return ret


class MultiheadAttention(nn.Module):
    def __init__(self, d_model, head_num, dropout_rate):
        super().__init__()
        """
        d_model：出力層の次元(head_bumの倍数)
        head_num：ヘッドの数
        dropout_rate
        """
        self.d_model = d_model
        self.head_num = head_num
        self.dropout_rate = dropout_rate
    
        #特徴量変換
        self.q_linear = nn.Linear(d_model, d_model) 
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        #出力の全結合層
        self.out = nn.Linear(d_model, d_model)
        self.attention_dropout_layer = nn.Dropout(dropout_rate)   
    
    def forward(self, q, k, v, mask):
        #key, query, valueを生成
        q = self.q_linear(q) # [batch_size, max_seq_len, d_model]
        k = self.q_linear(k) 
        v = self.q_linear(v)
        
        #head_numに分割
        q = self._split_head(q) # [batch_size, head_num, max_seq_len, d_model/head_num]
        k = self._split_head(k)
        v = self._split_head(v)
        
        #queryとkeyの関連度の計算と、Scaled Dot-production
        weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(self.d_model)
        
        #maskをかける
        mask = mask.unsqueeze(1).unsqueeze(1)
        weights = weights.masked_fill(mask==0, -1e9)# [batch_size, head_num, max_seq_len, max_seq_len]

        #AttentionWeightを計算
        attention_weight = F.softmax(weights, dim=-1)# [batch_size, head_num, q_length, k_length]
        
        #AttentionWeightよりvalueから情報を引き出す
        attention_output = torch.matmul(attention_weight, v)# [batch_size, head_num, q_length, d_model/head_num]
        attention_output = self._combine_head(attention_output)
        output = self.out(attention_output)
        
        
        return output, attention_weight
        
    def _split_head(self, x):
        """
        x.size:[batch_size, length, d_model]
        """
        batch_size, length, d_model = x.size()
        x = x.view(batch_size, length, self.head_num, self.d_model//self.head_num) #reshape
        return x.permute(0, 2, 1, 3)
    
    #outputする前に分割したheadを戻す。
    def _combine_head(self, x):
        """
        x.size:[batch_size, head_num, length, d_model//head_num]
        """
        batch_size, _, length, _  = x.size()
        x = x.permute(0, 2, 1, 3)
        return x.reshape(batch_size, length, self.d_model)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=1024, dropout=0.1):
        super().__init__()

        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """
        x size=[batch_size, length, d_model]
        return size=[batch_size, length, d_model]
        """
        x = self.linear_1(x)
        x = self.dropout(F.relu(x))
        x = self.linear_2(x)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, d_model, head_num, dropout=0.1):
        super().__init__()

        # LayerNormalization
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        # Attention
        self.attn = MultiheadAttention(d_model, head_num, dropout)
        # FFN
        self.ff = FeedForward(d_model)
        # Dropout
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        # SelfAttention
        x_normlized = self.norm_1(x)
        output, normlized_weights = self.attn(
            x_normlized, x_normlized, x_normlized, mask)
        x2 = x + self.dropout_1(output)
        # FFN
        x_normlized2 = self.norm_2(x2)
        output = x2 + self.dropout_2(self.ff(x_normlized2))

        return output, normlized_weights

class ClassificationHead(nn.Module):
    '''Transformer_Blockの出力を使用し、最後にクラス分類させる'''

    def __init__(self, d_model=300, output_dim=5):
        super().__init__()

        # 全結合層
        self.linear = nn.Linear(d_model, output_dim)  

        # 重み初期化処理
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.normal_(self.linear.bias, 0)

    def forward(self, x):
        x0 = x[:, 0, :]  # <cls>の結果を用いる
        out = self.linear(x0)

        return out

# 最終的なTransformerモデルのクラス


class TransformerEncoderClassification(nn.Module):

    def __init__(self, text_embedding_vectors, head_num, dropout=0.1, d_model=300, max_seq_len=140, output_dim=5):
        super().__init__()

        # モデル構築
        self.net1 = Embedder(text_embedding_vectors)
        self.net2 = PositionalEncoder(d_model=d_model, max_seq_len=max_seq_len)
        self.net3 = nn.Dropout(dropout)
        self.net4_1 = TransformerBlock(d_model=d_model, head_num=head_num, dropout=dropout)
        self.net4_2 = TransformerBlock(d_model=d_model, head_num=head_num, dropout=dropout)
        self.net5 = ClassificationHead(output_dim=output_dim, d_model=d_model)

    def forward(self, x, mask):
        x1 = self.net1(x)  #Embedding
        x2 = self.net2(x1) #PositinalEncoding
        x3 = self.net3(x2) #Dropout
        x4_1, normlized_weights_1 = self.net4_1(x3, mask) #self-Attention+FFN 
        x4_2, normlized_weights_2 = self.net4_2(x4_1, mask)  #self-Attention+FFN
        x5 = self.net5(x4_2)  #linear
        return x5, normlized_weights_1, normlized_weights_2



In [12]:
# 辞書オブジェクトにまとめる
dataloaders_dict = {"train": train_dl, "val": val_dl}
# モデル構築
net = TransformerEncoderClassification(
    text_embedding_vectors=TEXT.vocab.vectors, head_num=5, dropout=0.1, d_model=300, max_seq_len=140, output_dim=5)

# ネットワークの初期化を定義


def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        # Liner層の初期化
        nn.init.kaiming_normal_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.0)


# 訓練モードに設定
net.train()

# TransformerBlockモジュールを初期化実行
net.net4_1.apply(weights_init)
net.net4_2.apply(weights_init)


print('ネットワーク設定完了')

100%|█████████▉| 284588/285495 [00:40<00:00, 9633.26it/s]

ネットワーク設定完了


In [0]:
#損失関数を定義
criterion = nn.CrossEntropyLoss()

#最適化手法
learning_rate = 2e-5
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

In [0]:
#モデルを訓練して、訓練したモデルをreturnする
#モデル、辞書型で定義したdataloder(イテレータ)、損失関数、オプティマイザ、エポック数を渡す
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):

    # GPUが使えるかを確認
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("使用デバイス：", device)
    print('-----start-------')
    # モデルをGPUへ渡す
    net.to(device)

    # ネットワークがある程度固定であれば、高速化させる
    torch.backends.cudnn.benchmark = True

    # 各epoch
    for epoch in range(num_epochs):
        # 学習と検証
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # モデルを訓練モードに
            else:
                net.eval()   # モデルを検証モードに

            epoch_loss = 0.0  # epochの損失和
            epoch_corrects = 0  # epochの正解数

            # 各バッチ
            for batch in (dataloaders_dict[phase]):
                # batchはTextとLableの辞書オブジェクト

                # GPUが使えるならGPUにデータを送る
                inputs = batch.text[0].to(device)  # 文章
                labels = batch.Label.to(device)  # ラベル

                # optimizerを初期化
                optimizer.zero_grad()

                # 順伝搬（forward）計算
                with torch.set_grad_enabled(phase == 'train'):

                    # mask作成
                    input_pad = 1  # 単語のIDにおいて、'<pad>': 1 なので
                    input_mask = (inputs != input_pad) #mask部分がFalseに

                    # モデルに入力
                    outputs, _, _ = net(inputs, input_mask)
                    loss = criterion(outputs, labels)  # 損失を計算

                    _, preds = torch.max(outputs, 1)  # ラベルを予測

                    # 訓練時はバックプロパゲーション
                    if phase == 'train':
                        #勾配を計算
                        loss.backward()
                        #パラメータの更新
                        optimizer.step()

                    # 結果の計算
                    epoch_loss += loss.item() * inputs.size(0)  # lossの合計を更新
                    # 正解数の合計を更新
                    epoch_corrects += torch.sum(preds == labels.data)

            # epochごとのlossと正解率
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = epoch_corrects.double(
            ) / len(dataloaders_dict[phase].dataset)

            print('Epoch {}/{} | {:^5} |  Loss: {:.4f} Acc: {:.4f}'.format(epoch+1, num_epochs,
                                                                           phase, epoch_loss, epoch_acc))

    return net

In [15]:
import torch.nn.functional as F 
num_epochs = 10
net_trained = train_model(net, dataloaders_dict,
                          criterion, optimizer, num_epochs=num_epochs)

使用デバイス： cuda:0
-----start-------
Epoch 1/10 | train |  Loss: 1.4647 Acc: 0.3380
Epoch 1/10 |  val  |  Loss: 1.2613 Acc: 0.4902
Epoch 2/10 | train |  Loss: 1.1634 Acc: 0.5062
Epoch 2/10 |  val  |  Loss: 0.8127 Acc: 0.6987
Epoch 3/10 | train |  Loss: 0.7381 Acc: 0.7280
Epoch 3/10 |  val  |  Loss: 0.6071 Acc: 0.7894
Epoch 4/10 | train |  Loss: 0.5812 Acc: 0.7980
Epoch 4/10 |  val  |  Loss: 0.4625 Acc: 0.8398
Epoch 5/10 | train |  Loss: 0.4971 Acc: 0.8299
Epoch 5/10 |  val  |  Loss: 0.4040 Acc: 0.8640
Epoch 6/10 | train |  Loss: 0.4491 Acc: 0.8464
Epoch 6/10 |  val  |  Loss: 0.3882 Acc: 0.8637
Epoch 7/10 | train |  Loss: 0.4225 Acc: 0.8550
Epoch 7/10 |  val  |  Loss: 0.3668 Acc: 0.8701
Epoch 8/10 | train |  Loss: 0.3964 Acc: 0.8630
Epoch 8/10 |  val  |  Loss: 0.3502 Acc: 0.8797
Epoch 9/10 | train |  Loss: 0.3824 Acc: 0.8687
Epoch 9/10 |  val  |  Loss: 0.3360 Acc: 0.8800
Epoch 10/10 | train |  Loss: 0.3650 Acc: 0.8710
Epoch 10/10 |  val  |  Loss: 0.3264 Acc: 0.8852


In [23]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net_trained.eval()
net_trained.to(device)

y_true = np.array([])
y_pred = np.array([])

epoch_corrects = 0

for batch in (test_dl):
  inputs = batch.text[0].to(device)
  labels = batch.Label.to(device)

  with torch.set_grad_enabled(False):
    input_pad = 1
    input_mask = (inputs != input_pad)

    outputs, _, _ = net_trained(inputs, input_mask)
    _, preds = torch.max(outputs, 1)
    
    y_true = np.concatenate([y_true, labels.to("cpu", torch.double).numpy()])
    y_pred = np.concatenate([y_pred, preds.to("cpu", torch.double).numpy()])

    epoch_corrects += torch.sum(preds == labels.data)

# 正解率
epoch_acc = epoch_corrects.double() / len(test_dl.dataset)

print('テストデータ{}個での正解率：{:.4f}'.format(len(test_dl.dataset),epoch_acc))

テストデータ3678個での正解率：0.8812


In [28]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.90      0.91      1155
         1.0       1.00      0.77      0.87        69
         2.0       0.89      0.92      0.91      1168
         3.0       0.85      0.82      0.83       743
         4.0       0.82      0.85      0.84       543

    accuracy                           0.88      3678
   macro avg       0.90      0.85      0.87      3678
weighted avg       0.88      0.88      0.88      3678

