In [None]:
import numpy as np
import pandas as pd
import time
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
# One seed value is shared by torch and by the sklearn shuffling done in DataLoader.
random_state = 3510
torch.manual_seed(random_state)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Read train.csv from the working directory and preview its first rows.
data = pd.read_csv("train.csv")
data.head()

In [None]:
# The first N_HOLDOUT rows become the evaluation split, the remaining rows the
# training split. The "qid" column is dropped from both.
N_HOLDOUT = 130612

test = data.iloc[:N_HOLDOUT].drop(columns=["qid"])
train = data.iloc[N_HOLDOUT:].drop(columns=["qid"])

In [None]:
# Reserved vocabulary ids: padding, unknown word, sentence start, sentence end.
PAD, UNK, BOS, EOS = range(4)

# Surface strings for the reserved ids above (same order).
PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN = '<PAD>', '<UNK>', '<S>', '</S>'

In [None]:
# Vocabulary: token string -> integer id, starting with the reserved tokens.
word2id = {
    PAD_TOKEN: PAD,
    UNK_TOKEN: UNK,
    BOS_TOKEN: BOS,
    EOS_TOKEN: EOS,
}

# Assign the next free id to every whitespace-separated token of the training
# questions, in order of first appearance. Only unseen tokens get an id.
for s in train["question_text"]:
    for w in s.split():
        if w not in word2id:
            word2id[w] = len(word2id)

# Build the reverse mapping once, after the vocabulary is complete, so it is
# exactly the inverse of word2id. Updating it inside the loop wrote repeated
# words under the next unused id and could leave a bogus trailing entry.
id2word = {v: k for k, v in word2id.items()}

In [None]:
def sentence_to_ids(char2id, sentence):
    """
    Convert a whitespace-delimited sentence into a list of vocabulary ids.

    :param char2id: dict mapping token string -> integer id
    :param sentence: str, raw sentence; tokens are obtained with str.split()
    :return: list of int of the form [BOS, id_1, ..., id_n, EOS]; tokens
        missing from char2id are mapped to UNK
    """
    encoded = []
    for token in sentence.split():
        encoded.append(char2id.get(token, UNK))
    # Frame the sequence with the start and end markers.
    return [BOS, *encoded, EOS]

In [None]:
# Replace every raw question string with its id sequence, in both splits.
for frame in (train, test):
    frame["question_text"] = [sentence_to_ids(word2id, text) for text in frame["question_text"]]

In [None]:
# Preview the training frame after question_text has been converted to id lists.
train.head()

In [None]:
def preprocess_seqs(seqs):
    """
    Right-pad variable-length id sequences with PAD to a common length and
    stack them into one LongTensor on `device`.

    :param seqs: sequence of list of int (iterated twice, so not a one-shot iterator)
    :return: torch.LongTensor of shape (len(seqs), length of the longest sequence)
    """
    longest = max(len(seq) for seq in seqs)
    padded = []
    for seq in seqs:
        fill = [PAD] * (longest - len(seq))
        padded.append(seq + fill)
    return torch.tensor(padded, dtype=torch.long, device=device)

In [None]:
class DataLoader(object):
    """
    Iterator over class-balanced mini-batches of a labelled DataFrame.

    One pass walks through the negative rows (target == 0) in consecutive
    slices of `batch_size`; each slice is paired with `batch_size` randomly
    sampled positive rows (target == 1). A batch therefore holds up to
    2 * batch_size rows, positives first.
    """

    def __init__(self, data, batch_size, shuffle=True):
        """
        :param data: DataFrame with a "question_text" column (lists of token ids)
            and a "target" column (0 or 1)
        :param batch_size: int, number of rows drawn from EACH class per batch
        :param shuffle: bool, whether to shuffle the negative rows at the start of every pass
        """
        self.positive = data[data.target==1]
        self.negative = data[data.target==0]
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.start_index = 0
        # Dedicated seeded generator so that positive sampling is reproducible
        # from run to run while still differing from batch to batch.
        self._rng = np.random.RandomState(random_state)

        self.reset()

    def reset(self):
        """Rewind to the first batch, reshuffling the negatives if shuffling is enabled."""
        if self.shuffle:
            # `shuffle` here is the module-level sklearn.utils.shuffle function.
            self.negative = shuffle(self.negative, random_state=random_state)
        self.start_index = 0

    def __iter__(self):
        return self

    def __next__(self):
        """
        :return: (minibatch_X, minibatch_y) where minibatch_X is the padded
            LongTensor built by preprocess_seqs and minibatch_y the LongTensor
            of targets, both on `device`
        :raises StopIteration: once every negative row has been served
        """
        # When the pointer has reached the end, rewind for the next pass and stop.
        if self.start_index >= len(self.negative):
            self.reset()
            raise StopIteration()

        negatives = self.negative[self.start_index:self.start_index+self.batch_size]
        # Sample with replacement only when there are fewer positives than
        # batch_size, which would otherwise make DataFrame.sample raise.
        positives = self.positive.sample(
            self.batch_size,
            replace=len(self.positive) < self.batch_size,
            random_state=self._rng,
        )
        minibatch = pd.concat([positives, negatives], axis=0)

        # Pad the id sequences and move both inputs and labels to the device.
        minibatch_X = preprocess_seqs(minibatch["question_text"].values)
        minibatch_y = torch.tensor(minibatch["target"].values, dtype=torch.long, device=device)

        self.start_index += self.batch_size

        return minibatch_X, minibatch_y

In [None]:
class TextCNN(nn.Module):
    """
    Convolutional text classifier: embedding -> parallel convolutions over
    word windows of several sizes -> max-over-time pooling -> dropout -> linear.
    """

    def __init__(self, vocab_size, embedding_size, class_num, kernel_num, kernel_sizes, dropout, static):
        """
        :param vocab_size: int, number of entries in the input vocabulary
        :param embedding_size: int, dimensionality of the embedding vectors
        :param class_num: int, number of output classes
        :param kernel_num: int, number of output channels of each convolution
        :param kernel_sizes: list of int, window sizes (in words) of the convolutions
        :param dropout: float, dropout probability
        :param static: bool, if True no gradient flows into the embedding table
        """
        super(TextCNN, self).__init__()

        self.static = static
        # Shortest sequence length every convolution can still process.
        self._min_length = max(kernel_sizes, default=1)

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        # Each kernel spans (kernel_size words) x (the full embedding width), so
        # the layers must be 2-D convolutions over a single input channel.
        # nn.Conv1d rejects the 4-D input built in forward().
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, kernel_num, (kernel_size, embedding_size)) for kernel_size in kernel_sizes]
            )
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(len(kernel_sizes)*kernel_num, class_num)

    def forward(self, x):
        """
        :param x: LongTensor of token ids, shape (batch_size, max_length)
        :return: FloatTensor of unnormalised class scores, shape (batch_size, class_num)
        """
        x = self.embedding(x)  # (batch_size, max_length, embedding_size)

        if self.static:
            # Cut the graph here so the embedding weights receive no gradient.
            x = x.detach()

        # Zero-pad along the sequence axis when the batch is shorter than the
        # widest kernel; otherwise that convolution has no valid position.
        shortfall = self._min_length - x.size(1)
        if shortfall > 0:
            x = F.pad(x, (0, 0, 0, shortfall))

        x = x.unsqueeze(1)  # (batch_size, 1, max_length, embedding_size)

        # [(batch_size, kernel_num, max_length-kernel_size+1), ...] * len(kernel_sizes)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]

        # Max over time: [(batch_size, kernel_num), ...] * len(kernel_sizes)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]

        x = torch.cat(x, 1)  # (batch_size, len(kernel_sizes)*kernel_num)

        x = self.dropout(x)
        logit = self.out(x)  # (batch_size, class_num)
        return logit

In [None]:
# Optimisation settings.
batch_size = 64
num_epochs = 30
lr = 0.001
ckpt_path = 'cnn.pth'

# Keyword arguments forwarded to TextCNN(**model_args).
model_args = dict(
    vocab_size=len(id2word),
    embedding_size=128,
    class_num=2,
    kernel_num=64,
    kernel_sizes=[3, 4, 5],
    dropout=0.5,
    static=False,
)

In [None]:
# Build the classifier on the selected device.
model = TextCNN(**model_args).to(device)

# Loss, optimizer and the balanced mini-batch iterator over the training split.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
train_dataloader = DataLoader(train, batch_size)

In [None]:
# Materialise the whole test split as padded inputs and labels on `device`.
test_sequences = test["question_text"].values
test_targets = test["target"].values

test_X = preprocess_seqs(test_sequences)
test_y = torch.tensor(test_targets, dtype=torch.long, device=device)

In [None]:
log_train_loss = []
log_valid_loss = []

# Number of test rows scored per forward pass during validation. Chunking keeps
# peak memory bounded instead of pushing the whole test set through at once.
eval_batch_size = 1024

# Training
for epoch in range(1, num_epochs+1):
    # ---- train ----
    model.train()  # enable dropout
    train_loss = 0.
    num_batches = 0
    for batch_X, batch_Y in train_dataloader:
        pred_Y = model(batch_X)
        loss = criterion(pred_Y, batch_Y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() yields a plain float, so no autograd history is kept alive
        # across batches.
        train_loss += loss.item()
        num_batches += 1
    # criterion already averages within a batch, so average over batches
    # (not over the number of training rows).
    train_loss = train_loss / max(num_batches, 1)

    # ---- validate ----
    model.eval()  # disable dropout so the validation loss is deterministic
    valid_loss = 0.
    with torch.no_grad():  # no graph is needed for evaluation
        for start in range(0, len(test_X), eval_batch_size):
            chunk_X = test_X[start:start+eval_batch_size]
            chunk_y = test_y[start:start+eval_batch_size]
            chunk_loss = criterion(model(chunk_X), chunk_y.view(-1))
            # Weight each chunk mean by its size to recover the exact mean
            # over the whole test set.
            valid_loss += chunk_loss.item() * len(chunk_X)
    valid_loss = valid_loss / max(len(test_X), 1)

    print("train:",train_loss, ", valid:", valid_loss)
    log_train_loss.append(train_loss)
    log_valid_loss.append(valid_loss)

    # Overwrite the checkpoint with the latest weights after every epoch.
    ckpt = model.state_dict()
    torch.save(ckpt, ckpt_path)

In [None]:
# Convert every logged value to a plain float first. The logs may contain
# 0-dim tensors (possibly on the GPU or attached to a graph), which matplotlib
# cannot convert by itself.
train_curve = [float(v) for v in log_train_loss]
valid_curve = [float(v) for v in log_valid_loss]

fig, ax = plt.subplots()
ax.plot(range(1, len(train_curve) + 1), train_curve, label="train")
ax.plot(range(1, len(valid_curve) + 1), valid_curve, label="valid")
ax.set_xlabel("epoch")
ax.set_ylabel("cross-entropy loss")
ax.set_title("TextCNN training curve")
ax.legend()
fig.savefig('cnn.png')