## Text Preprocessing

In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [5]:
# Load the train and test CSVs and print their (rows, columns) shapes.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
print("Train shape : ",train.shape)
print("Test shape : ",test.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [6]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often each word occurs across all sentences.

    :param sentences: iterable of tokenised sentences (each an iterable of words)
    :param verbose: when False the tqdm progress bar is disabled
    :return: dictionary mapping each word to its number of occurrences
    """
    counts = {}
    for tokens in tqdm(sentences, disable = (not verbose)):
        for token in tokens:
            # First sighting starts from 0, later sightings increment the stored count.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [7]:
# Split every question on whitespace, count word frequencies over the whole
# training set, and print the first five vocabulary entries as a sanity check.
sentences = train["question_text"].progress_apply(lambda x: x.split()).values
vocab = build_vocab(sentences)
print({k: vocab[k] for k in list(vocab)[:5]})

100%|██████████| 1306122/1306122 [00:04<00:00, 323358.82it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 384595.03it/s]

{'How': 261930, 'did': 33489, 'Quebec': 97, 'nationalists': 91, 'see': 9003}





In [9]:
from gensim.models import KeyedVectors

# Load the binary word2vec-format vectors with gensim. `embeddings_index` is
# later indexed by word (embeddings_index[word]) inside check_coverage.
news_path = '../embedding/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [10]:
import operator 

def check_coverage(vocab,embeddings_index):
    """
    Report how much of a vocabulary is covered by an embedding lookup.

    Prints the share of distinct words that have an embedding and the share
    of all word occurrences that have one, then returns the uncovered words.

    :param vocab: dict mapping word -> occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]`` and
        raising ``KeyError`` for unknown words (a dict or gensim KeyedVectors)
    :return: list of ``(word, count)`` pairs for the out-of-vocabulary words,
        sorted by count in descending order
    """
    known_words = 0   # number of distinct words with an embedding
    known_count = 0   # total occurrences of those words
    oov = {}
    oov_count = 0     # total occurrences of words without an embedding
    for word in tqdm(vocab):
        try:
            # Only the lookup is of interest; the vector itself is not kept,
            # which avoids holding a second reference to every embedding.
            embeddings_index[word]
        except KeyError:
            # Catch only "word not present"; other errors (and Ctrl-C) propagate.
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            known_words += 1
            known_count += vocab[word]

    total_count = known_count + oov_count
    # Guard the ratios so an empty vocabulary reports 0% instead of raising.
    vocab_share = known_words / len(vocab) if len(vocab) else 0.0
    text_share = known_count / total_count if total_count else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))

    # Ascending sort followed by a reversal keeps the original tie ordering.
    return sorted(oov.items(), key=operator.itemgetter(1))[::-1]

In [11]:
# Coverage of the vocabulary built from the uncleaned text; `oov` receives the
# (word, count) pairs that have no embedding, most frequent first.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 508823/508823 [00:01<00:00, 416027.92it/s]


Found embeddings for 24.31% of vocab
Found embeddings for  78.75% of all text


In [12]:
def clean_text(x):
    """
    Normalise punctuation in a piece of text.

    ``/``, ``-`` and ``'`` become a single space, ``&`` is padded with a space
    on each side, and the remaining listed punctuation (including the curly
    quotes) is removed. The input is converted with ``str`` first.

    :param x: value to clean (any object; ``str(x)`` is what gets cleaned)
    :return: cleaned string
    """
    to_space = "/-'"
    to_pad = '&'
    to_drop = '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'

    # Build one translation table. Later updates win, so the characters that
    # appear both in `to_drop` and `to_space` end up mapped to a space.
    table = {ord(ch): None for ch in to_drop}
    table.update({ord(ch): f' {ch} ' for ch in to_pad})
    table.update({ord(ch): ' ' for ch in to_space})
    return str(x).translate(table)

In [13]:
# Clean every question in place, then rebuild the vocabulary from the result.
train["question_text"] = train["question_text"].progress_apply(clean_text)
# clean_text always returns str, so str.split can be applied directly.
sentences = train["question_text"].apply(str.split)
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:07<00:00, 175009.03it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 380490.54it/s]


In [14]:
# Re-measure embedding coverage on the vocabulary rebuilt after clean_text.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 253623/253623 [00:00<00:00, 330316.57it/s]


Found embeddings for 57.38% of vocab
Found embeddings for  89.99% of all text


In [15]:
import re
def clean_numbers(x):
    """
    Mask multi-digit numbers with '#' characters.

    Runs of five or more digits become '#####'; runs of four, three and two
    digits become the same number of '#'. Single digits are left unchanged.

    :param x: input string
    :return: string with digit runs masked
    """
    # Order matters: the longest pattern is applied first.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in masks:
        x = re.sub(pattern, mask, x)
    return x

In [16]:
# Mask digit runs in every question, then rebuild the vocabulary.
train["question_text"] = train["question_text"].progress_apply(clean_numbers)
# clean_numbers returns str (re.sub output), so str.split can be used directly.
sentences = train["question_text"].progress_apply(str.split)
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:11<00:00, 118537.47it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 355320.15it/s]
100%|██████████| 1306122/1306122 [00:02<00:00, 455991.19it/s]


In [17]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Replacement table applied by replace_typical_misspell: each key found in a
# question is substituted by its value.
# NOTE(review): the key 'citicise' may have been meant as 'criticise' - confirm.
# NOTE(review): the three app names map to the generic 'social medium',
# presumably because they are missing from the embeddings - confirm.
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'

                }
# Module-level lookup table and compiled pattern read by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """
    Replace every occurrence of a known misspelling in ``text``.

    Uses the module-level ``mispellings_re`` to find matches and the
    module-level ``mispellings`` dict to look up each replacement.

    :param text: input string
    :return: string with all matched keys substituted by their replacements
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

In [18]:
# Fix the listed misspellings in place, re-tokenise, then drop a few very
# frequent words from the token lists before rebuilding the vocabulary.
# The stop-word removal affects `sentences` only; the DataFrame column keeps them.
train["question_text"] = train["question_text"].progress_apply(replace_typical_misspell)
sentences = train["question_text"].progress_apply(str.split)
to_remove = ['a','to','of','and']
sentences = [[word for word in sentence if word not in to_remove] for sentence in tqdm(sentences)]
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:04<00:00, 311545.83it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 377658.24it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 356958.94it/s]
100%|██████████| 1306122/1306122 [00:03<00:00, 397285.33it/s]


In [19]:
# Final coverage check after misspelling fixes and stop-word removal. This `oov`
# list is what the later oov_dict cell is built from.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 242935/242935 [00:00<00:00, 364479.49it/s]


Found embeddings for 60.43% of vocab
Found embeddings for  98.96% of all text


## Prepare for training

In [35]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import time
import sys
import re

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
import torch.utils.data
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
data_dir = '../input/'

# Sample submission indexed by its first column; predictions are written into it later.
submission = pd.read_csv(data_dir+'sample_submission.csv',index_col=[0])

# Maximum sequence length (number of tokens kept per question)
max_length = 40

In [36]:
# Reload the untouched CSVs for the training pipeline.
raw_train = pd.read_csv(data_dir+"train.csv")
raw_test = pd.read_csv(data_dir+"test.csv")
# Sentence length in whitespace-separated tokens, capped at max_length.
raw_train['sen_len'] = raw_train['question_text'].apply(lambda x: min(len(x.split()), max_length))
raw_test['sen_len'] = raw_test['question_text'].apply(lambda x: min(len(x.split()), max_length))
# Only the question text of the test set is needed from here on.
test_X = raw_test['question_text']
del raw_test

# Split the data into train and validation sets
def train_valid_extract(raw_train,n_train1,n_train0,n_valid0,random_state=0):
    """
    Sample class-wise training sets and build a validation set from the rest.

    :param raw_train: DataFrame with columns 'question_text' and 'target' (0/1)
    :param n_train1: number of target==1 rows sampled for training
    :param n_train0: number of target==0 rows sampled for training
    :param n_valid0: number of target==0 rows sampled (from those not used for
        training) for validation
    :param random_state: seed passed to every ``DataFrame.sample`` call
        (default 0, the value that used to be hard-coded)
    :return: tuple ``(train1_X, train0_X, valid_X, valid_y)`` where the first
        three are 'question_text' Series and ``valid_y`` is the 'target'
        Series aligned with ``valid_X``
    :raises ValueError: raised by pandas when a requested sample size exceeds
        the number of available rows
    """
    df_1 = raw_train[raw_train['target']==1]
    df_0 = raw_train[raw_train['target']==0]

    train_1 = df_1.sample(n=n_train1, random_state=random_state)
    train_0 = df_0.sample(n=n_train0, random_state=random_state)
    # The combined training frame was never used, so it is no longer built
    # (it copied every sampled row for nothing).

    # Validation: ALL remaining positives plus a sample of remaining negatives.
    valid_1 = df_1[~df_1.index.isin(train_1.index)]
    valid_0 = df_0[~df_0.index.isin(train_0.index)].sample(n=n_valid0, random_state=random_state)
    valid_df = pd.concat([valid_1,valid_0])

    train1_X = train_1['question_text']
    train0_X = train_0['question_text']
    valid_X = valid_df['question_text']
    valid_y = valid_df['target']

    return train1_X,train0_X,valid_X,valid_y

In [37]:
# data sampling parameters
n_train1 = 75000
n_train0 = 1000000
n_valid0 = 60000 # tuned so that class 1 makes up roughly 5-10% of the validation set
train1_X,train0_X,valid_X,valid_y = train_valid_extract(raw_train,n_train1,n_train0,n_valid0)
# The full frame is no longer needed once the splits are extracted.
del raw_train

In [45]:
# Scratch check of the tokenising regex also used in sentence_to_id: split a
# sample sentence on space/punctuation and keep at most 40 pieces. The double
# space in the sample produces an empty-string token.
text = np.array(re.split("[ ,.:;!?-]","He  is not a man")[:40])

In [63]:
# Turn the (word, count) pairs returned by check_coverage into a lookup dict.
# NOTE(review): sentence_to_id uses these values as token ids although they are
# occurrence counts of out-of-vocabulary words - confirm this is intended.
oov_dict = dict(oov)

In [65]:
# Spot check: value stored for the out-of-vocabulary word "bitcoin" (its count in the vocabulary).
oov_dict["bitcoin"]

987

In [66]:
def sentence_to_id(text,max_length,word_index=None):
    """
    Convert a sentence into a fixed-length array of integer ids.

    The text is split on spaces and the characters ``, . : ; ! ? -``. Empty
    tokens are dropped BEFORE truncating to ``max_length``, so repeated
    separators no longer use up positions. Tokens missing from the lookup map
    to 1 and the sequence is right-padded with 0.

    :param text: str, the sentence to encode
    :param max_length: int, length of the returned array
    :param word_index: optional dict mapping token -> integer id; defaults to
        the module-level ``oov_dict``
    :return: numpy array of dtype int64 and shape ``(max_length,)``

    NOTE(review): the default ``oov_dict`` is built from check_coverage output,
    i.e. it maps out-of-vocabulary words to their counts - confirm that these
    values are really meant to be used as ids.
    """
    if word_index is None:
        word_index = oov_dict
    # Filter first, then truncate: keeps up to max_length real tokens.
    tokens = [tok for tok in re.split("[ ,.:;!?-]", text) if tok != ''][:max_length]
    # Unknown words are mapped to id 1.
    ids = [word_index.get(tok, 1) for tok in tokens]
    # Pad with id 0 up to the fixed length.
    ids += [0] * (max_length - len(ids))
    return np.array(ids, dtype=np.int64)

In [67]:
def make_dataloader(X,y=None,batch_size=2000,len_X=None,shuffle=True):
    '''
    Wrap features (and optionally labels / sentence lengths) in a DataLoader.

    X: torch.tensor of features
    y: array-like of labels; when None (test data, or a set whose labels are
       not needed) an all-zero float array of len(X) is used as pseudo labels
    batch_size: number of samples per batch
    len_X: optional array-like of sentence lengths; when given, each batch
       yields (X, y, len_X) instead of (X, y)
    shuffle: passed through to torch.utils.data.DataLoader
    '''
    labels = np.zeros(len(X)) if y is None else np.array(y)
    tensors = [X, torch.from_numpy(labels)]
    if len_X is not None:
        # Sentence lengths travel as a third tensor in the dataset.
        tensors.append(torch.from_numpy(np.array(len_X)))
    dataset = torch.utils.data.TensorDataset(*tensors)
    return torch.utils.data.DataLoader(dataset,batch_size=batch_size, shuffle=shuffle)

In [68]:
# Encode every split as a LongTensor of token ids (one row per question)
def _encode_texts(texts):
    """Stack the sentence_to_id encoding of each text into one LongTensor with max_length columns."""
    return torch.stack([torch.LongTensor(sentence_to_id(X_text,max_length)) for X_text in tqdm(texts)])

train1_ids = _encode_texts(train1_X)
train0_ids = _encode_texts(train0_X)
valid_ids = _encode_texts(valid_X)
test_ids = _encode_texts(test_X)

# `word2id` is no longer listed here: no cell in this notebook defines it, so
# deleting it raised NameError on a fresh kernel (after the other names had
# already been removed).
del train1_X,train0_X,valid_X,test_X

100%|██████████| 75000/75000 [00:02<00:00, 25420.29it/s]
100%|██████████| 1000000/1000000 [00:35<00:00, 28117.26it/s]
100%|██████████| 65810/65810 [00:03<00:00, 21168.13it/s]
100%|██████████| 56370/56370 [00:01<00:00, 31267.12it/s]


In [69]:
# Each training batch is assembled from a class-1 batch and a class-0 batch.
batch_size = 3750
#batch1_ratio = 0.3
batch1_size = 1250 #int(batch_size*batch1_ratio)
batch0_size = 2500 #int(batch_size*(1-batch1_ratio))

repeat_num =  n_train1//batch1_size # number of times the class-1 data is repeated

# Repeat the class-1 ids once and reuse the result for both features and labels.
train1_repeated = train1_ids.repeat(repeat_num,1)
train1_loader = make_dataloader(
    train1_repeated,
    np.ones(len(train1_repeated)),
    batch_size=batch1_size,
    len_X=None,
    shuffle=True,
)
train0_loader = make_dataloader(
    train0_ids,
    np.zeros(len(train0_ids)),
    batch_size=batch0_size,
    len_X=None,
    shuffle=True,
)
valid_loader = make_dataloader(valid_ids,valid_y,len_X=None,shuffle=True)
test_loader = make_dataloader(test_ids,len_X=None,shuffle=False)
# Drop the local names; the loaders keep their own references to the tensors.
del train1_ids,train0_ids,valid_ids,test_ids,train1_repeated

## Modeling

In [70]:
class LSTM(nn.Module):
    """
    Sentence classifier: frozen pretrained embeddings -> single-layer GRU ->
    linear + ReLU + dropout -> linear.

    NOTE: despite the class name, the recurrent layer is an ``nn.GRU``.
    """
    def __init__(self,vocab_size, embed_size, hidden_size,output_size,bidirectional,
                 num_layers=2,dropout=0,pretrained_embeddings=None):
        """
        :param vocab_size: int, vocabulary size (accepted for compatibility; the
            embedding table takes its shape from the pretrained weights)
        :param embed_size: int, embedding dimension (input size of the GRU)
        :param hidden_size: int, number of hidden units of the GRU
        :param output_size: int, number of output logits
        :param bidirectional: bool, whether the GRU runs in both directions
        :param num_layers: int, stored on the instance; the GRU itself is
            always built with a single layer
        :param dropout: float, dropout probability applied after the first
            linear layer (previously ignored: a fixed p=0.5 was used)
        :param pretrained_embeddings: optional 2-D float tensor of embedding
            weights; defaults to the module-level ``id2vec``
        """
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        if pretrained_embeddings is None:
            # Backward-compatible fallback to the notebook-level weight matrix.
            pretrained_embeddings = id2vec
        # freeze=True keeps the pretrained weights out of the optimiser updates.
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        self.gru = nn.GRU(input_size=embed_size, hidden_size=self.hidden_size,
                          num_layers=1,dropout=0,
                          bidirectional=bidirectional,batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        # A bidirectional GRU concatenates both directions, doubling the feature size.
        gru_out_size = self.hidden_size * (2 if bidirectional else 1)
        self.fc = nn.Linear(gru_out_size, self.hidden_size//2)
        self.fc2 = nn.Linear(self.hidden_size//2, self.output_size)

    def forward(self, seqs, hidden=None):
        """
        :param seqs: LongTensor of token ids, size=(batch_size, max_length)
            (the GRU is built with batch_first=True)
        :param hidden: tensor, initial hidden state of the GRU; zeros are used
            when None
        :return: tensor of class logits, size=(batch_size, output_size)
        """
        emb = self.embedding(seqs)
        output, hidden = self.gru(emb, hidden)
        # Keep only the GRU output at the last time step of each sequence.
        x = F.relu(self.fc(output[:,-1,:]))
        x = self.dropout(x)
        return self.fc2(x)

In [71]:
def multi_weighted_log_loss(input, target, class_weight=None):
    '''
    Class-balanced, optionally class-weighted negative log-likelihood.

    For every class i the mean of -ln(p_i) over the samples whose true class
    is i is computed; the per-class means are then combined into a weighted
    average. A class with no sample in the batch contributes 0 to the
    numerator (its weight still counts in the denominator).

    input: tensor of class probabilities, size=(batch_size, n_classes)
    target: integer tensor of true class indices, size=(batch_size,)
    class_weight: np.array([a1,a2,...]) (or any sequence) of per-class
        weights; equal weights when None
    returns: scalar tensor on the same device as ``input``
    '''
    # The number of classes is taken from the input instead of a global.
    n_classes = input.shape[1]
    target = target.to(input.device).long()

    # One-hot encoding of the true classes.
    t_onehot = torch.zeros(len(target), n_classes, dtype=input.dtype, device=input.device)
    t_onehot.scatter_(1, target.reshape(-1, 1), 1)

    if class_weight is None:
        w = torch.ones(n_classes, dtype=input.dtype, device=input.device)
    else:
        w = torch.as_tensor(class_weight, dtype=input.dtype, device=input.device)

    # Clamp before the log: log(0) = -inf and -inf * 0 would turn the loss into NaN.
    log_p = torch.log(input.clamp(min=torch.finfo(input.dtype).tiny))
    t_y = log_p * t_onehot  # y_ij * ln(p_ij)

    # minlength keeps one count per class even if a class is missing from the
    # batch; clamp(min=1) avoids dividing by zero for such a class.
    counts = torch.bincount(target, minlength=n_classes).clamp(min=1).to(input.dtype)
    sum_t = -1 * (torch.sum(t_y, dim=0) / counts)  # -1/N_i * sum(t_y)
    return torch.dot(sum_t, w) / w.sum()

def compute_loss(batch_X, batch_Y, model, criterion,
                 class_weight=None,
                 optimizer=None, is_train=True):
    """
    Run one batch through the model and, when training, take one optimiser step.

    :param batch_X: input batch passed to ``model``
    :param batch_Y: tensor of true class indices for the batch
    :param model: torch module returning logits of size (batch_size, n_classes)
    :param criterion: callable ``criterion(probabilities, batch_Y)``; it is
        called with ``class_weight=...`` as well when ``class_weight`` is given
    :param class_weight: optional per-class weights forwarded to ``criterion``
    :param optimizer: torch optimiser; required when ``is_train`` is True
    :param is_train: True for a training step, False for evaluation
    :return: tuple ``(loss_value, true_labels, predictions)`` - a Python float,
        a CPU tensor with the batch labels, and a list of predicted class indices
    :raises ValueError: if ``is_train`` is True and no optimiser is supplied
    """
    if is_train and optimizer is None:
        raise ValueError("optimizer is required when is_train=True")

    # Switch dropout etc. between training and evaluation behaviour.
    model.train(is_train)

    if is_train:
        optimizer.zero_grad()

    # No autograd graph is built during evaluation, which saves memory.
    with torch.set_grad_enabled(is_train):
        y = model(batch_X)
        # The criterion expects probabilities, not logits.
        y = F.softmax(y, dim=-1)
        if class_weight is not None:  # loss functions that support class weighting
            loss = criterion(y, batch_Y,class_weight=class_weight)
        else:  # any other loss function
            loss = criterion(y, batch_Y)

    if is_train:
        loss.backward()
        optimizer.step()
    pred = y.argmax(dim=1).tolist()
    # Labels are returned on the CPU so callers can convert them to numpy
    # even when the batch lives on a GPU.
    return loss.item(), batch_Y.detach().cpu(), pred

In [72]:
# Hyper-parameters, model, optimiser and bookkeeping lists for training.
# NOTE(review): `id2vec` is not defined in any cell of this notebook; this cell
# raises NameError on a fresh kernel until an embedding matrix is provided.
model_args = {
    'vocab_size':len(id2vec),
    'embed_size':300,
    'hidden_size':300,
    'output_size':2,
    'num_layers':2,
    'dropout':0.25,
    'bidirectional':False
}
num_classes=2
# Per-class loss weights, indexed by class id (class 0 -> 10, class 1 -> 1).
class_weight = np.array([10,1])
lr = 0.005
num_epochs = 2
# File that receives the state_dict of the best-scoring epoch.
ckpt_path = 'lstm_online_embedding.pth'

# model
model = LSTM(**model_args).to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = multi_weighted_log_loss

# Evaluation metric and per-epoch history of losses and scores.
score = f1_score
loss_epoch_train_ls = []
loss_epoch_valid_ls = []
score_epoch_train_ls = []
score_epoch_valid_ls = []
best_valid_score = 0

NameError: name 'id2vec' is not defined

In [73]:
# Train for num_epochs epochs; after each epoch evaluate on the validation set,
# record losses and F1 scores, and save the weights when the validation score improves.
start = time.time()
for epoch in range(1, num_epochs+1):
    train_losses = []
    valid_losses = []
    preds_train = []
    preds_valid = []
    trues_train = []
    trues_valid = []
    # train
    # zip stops as soon as the shorter of the two loaders is exhausted.
    for i, ((batch1_X,batch1_y), (batch0_X,batch0_y)) in enumerate(zip(train1_loader,train0_loader)):
        batch_X = torch.cat([batch1_X, batch0_X], dim=0).numpy()
        batch_Y = torch.cat([batch1_y, batch0_y], dim=0).numpy()
        # Randomly mix the class-0 and class-1 samples
        # NOTE(review): the RNG is re-seeded with the same value before each
        # permutation, presumably so features and labels are shuffled in the
        # same order - confirm.
        np.random.seed(i)
        batch_X = torch.LongTensor(np.random.permutation(batch_X)).to(device)
        np.random.seed(i)
        batch_Y = torch.LongTensor(np.random.permutation(batch_Y)).to(device)
        # Training step
        train_loss, true,pred = compute_loss(
            batch_X, batch_Y, model ,criterion, class_weight ,optimizer, is_train=True
            )
        train_losses.append(train_loss)
        trues_train.extend(true)
        preds_train.extend(pred)
        #if i % 50 == 0: print(i)
    # valid
    for batch in valid_loader:
        batch_X, batch_Y = batch
        batch_X = batch_X.to(device)
        batch_Y = batch_Y.to(device)
        valid_loss, true,pred = compute_loss(
            batch_X, batch_Y, model, criterion, class_weight ,is_train=False
            )
        valid_losses.append(valid_loss)
        trues_valid.extend(true)
        preds_valid.extend(pred)

    # Epoch-level loss is the mean of the per-batch losses.
    loss_epoch_train = np.mean(train_losses)
    loss_epoch_valid = np.mean(valid_losses)
    loss_epoch_train_ls.append(loss_epoch_train)
    loss_epoch_valid_ls.append(loss_epoch_valid)

    # `score` is sklearn's f1_score, computed over all batches of the epoch.
    train_score = score(np.array(trues_train), np.array(preds_train))
    valid_score = score(np.array(trues_valid), np.array(preds_valid))
    score_epoch_train_ls.append(train_score)
    score_epoch_valid_ls.append(valid_score)

    # Checkpoint the weights whenever the validation score improves.
    if valid_score > best_valid_score:
        ckpt = model.state_dict()
        torch.save(ckpt, ckpt_path)
        best_valid_score = valid_score

    print('Time:{:.1f}, Epoch:{}, train_loss: {:.3f}  train_score: {:.3f}  valid_loss: {:.3f}  valid_score: {:.3f}'.format(
            time.time()-start,epoch, 
            loss_epoch_train, 
            train_score,
            loss_epoch_valid,
            valid_score
    ))

NameError: name 'num_epochs' is not defined

## Prediction

In [None]:
# Apply the model to the test dataset
# NOTE(review): the model is used in its current state; no cell in this notebook
# reloads the checkpoint saved at ckpt_path - confirm that this is intended.
#del train_loader,valid_loader
preds_test = []
for batch_X,batch_Y in test_loader:
    batch_X = batch_X.to(device)
    # The test loader carries float pseudo labels; cast them to integer class ids.
    batch_Y = batch_Y.type(torch.LongTensor).to(device)
    test_loss, true,pred = compute_loss(
        batch_X, batch_Y, model, criterion, is_train=False) 
    
    # Collect the model output as scalar class predictions
    preds_test.extend(pred)

In [None]:
# Store the predictions in the sample submission frame and write it to disk;
# the final sum() displays the column totals (the number of predicted 1s).
submission['prediction'] = preds_test
submission.to_csv('submission.csv')
submission.sum()

In [None]:
# Per-epoch training history (one row per epoch) for inspection.
df_loss_score = pd.DataFrame({'train_loss': loss_epoch_train_ls,
                           'valid_loss':loss_epoch_valid_ls,
                           'train_score': score_epoch_train_ls,
                           'valid_score':score_epoch_valid_ls
                          })
df_loss_score