## submission ##

# ISSUES #
## validation set에 대해 계산한 Columnwise mean ROC AUC가 실제 테스트셋에 대해 제출했을 때 값과 차이가 많이 남

## Requirements

- pytorch
- torchtext
- pandas
- scikit-learn
- numpy
- tqdm
- gensim


In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader, Dataset, TensorDataset
import pandas as pd 
import random
import numpy as np

## Model 설명

- embedding layer
- |
- convolutional layer (kernel = 5 x embedding dim)
- |
- relu
- |
- dropout
- |
- average pool (kernel = 2) w.r.t. time axis
- |
- fcn1 
- | 
- fcn2 ( -> label output )
- | 
- sigmoid - BinaryCrossEntropyLoss

In [2]:
class Net(nn.Module):
    """Word-level CNN producing one probability per label.

    Pipeline: embedding -> Conv2d whose kernel spans 5 tokens and the whole
    embedding width -> ReLU -> dropout -> average pooling (window 2) over the
    time axis -> flatten -> optional concat with extra per-sample features ->
    linear -> ReLU -> linear -> sigmoid.

    Args:
        vocab_size: Number of vocabulary entries; the embedding table holds
            ``vocab_size + 2`` rows (presumably room for the special
            unknown/padding tokens -- TODO confirm against the vocab builder).
        embedding_dim: Width of each word vector.
        len_sentence: Fixed number of tokens per input sentence.
        channel_size: Number of convolution output channels.
        x2_size: Number of extra per-sample features concatenated to the
            flattened convolution output (0 disables the concat).
        fc_dim: Width of the hidden fully connected layer.
        padding_idx: Index of the padding token in the embedding table.
        dropout: Dropout probability applied after the ReLU.
        num_labels: Number of output labels.
        batch_size: Stored on the instance only; not used by the layers.
        is_cuda: Accepted for interface compatibility; unused.

    Raises:
        ValueError: If ``len_sentence`` is too short to leave at least one
            position after the convolution and pooling.
    """

    # Number of consecutive tokens covered by one convolution window.
    _CONV_HEIGHT = 5
    # Window (and stride) of the temporal average pooling.
    _POOL_SIZE = 2

    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 len_sentence,
                 channel_size=4,
                 x2_size=1,  # additional data - cap ratio
                 fc_dim=128,
                 padding_idx=1,
                 dropout=0.3,
                 num_labels=7,
                 batch_size=32,
                 is_cuda=False
                 ):
        super(Net, self).__init__()

        # Length of the time axis after the (unpadded, stride-1) convolution
        # and after the pooling. The pooling floors an odd length, so floor
        # division is required here; a plain "/ 2" over-counts whenever
        # (len_sentence - 4) is odd.
        conv_len = len_sentence - self._CONV_HEIGHT + 1
        pooled_len = conv_len // self._POOL_SIZE
        if pooled_len < 1:
            raise ValueError(
                "len_sentence=%s is too short: need at least %s tokens"
                % (len_sentence, self._CONV_HEIGHT + self._POOL_SIZE - 1)
            )

        self.embedding = nn.Embedding(vocab_size + 2, embedding_dim=embedding_dim, padding_idx=padding_idx)
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.channel_size = channel_size
        self.len_sentence = len_sentence
        self.batch_size = batch_size
        self.x2_size = x2_size

        self.conv2d = nn.Conv2d(1, out_channels=channel_size,
                                kernel_size=(self._CONV_HEIGHT, embedding_dim), stride=1)
        # output : batch x channel x (len_sentence - 4) x 1

        # -> squeeze : batch x channel x (len_sentence - 4)
        self.relu = nn.ReLU(inplace=True)
        self.dropout1d = nn.Dropout(p=dropout)
        self.pool1d = nn.AvgPool1d(kernel_size=self._POOL_SIZE)
        # output : batch x channel x floor((len_sentence - 4) / 2)

        # Input width of the first linear layer: flattened pooled features
        # plus the extra per-sample features.
        self.bottleneck_size = channel_size * pooled_len + self.x2_size

        self.fcn1 = nn.Linear(self.bottleneck_size, fc_dim)
        self.relu1 = nn.ReLU(inplace=True)
        self.fcn2 = nn.Linear(fc_dim, num_labels)
        self.sigmoid = nn.Sigmoid()

        self.fc_dim = fc_dim
        self.num_labels = num_labels

    def forward(self, sentence, other_features):
        """Score a batch of sentences.

        Args:
            sentence: Integer token indices, batch x len_sentence.
            other_features: Extra features, batch x x2_size; only read when
                ``x2_size > 0``.

        Returns:
            Tensor of shape batch x num_labels with sigmoid outputs in [0, 1].
        """
        # batch x len x emb -> batch x 1 x len x emb (single input channel).
        image = self.embedding(sentence).unsqueeze(1)

        # The kernel spans the full embedding width, so the last axis is 1.
        bottleneck = self.conv2d(image).squeeze(3)
        bottleneck = self.relu(bottleneck)  # batch x channel x features
        bottleneck = self.dropout1d(bottleneck)
        bottleneck = self.pool1d(bottleneck)

        # Flatten per sample. Using the explicit batch size (not -1) makes a
        # wrong sentence length fail loudly instead of mixing samples.
        bottleneck = bottleneck.view(bottleneck.size(0), self.bottleneck_size - self.x2_size)
        if self.x2_size > 0:
            bottleneck = torch.cat([bottleneck, other_features], dim=1)

        fcn = self.relu1(self.fcn1(bottleneck))
        fcn = self.fcn2(fcn)
        # Despite the name this is already a probability (post-sigmoid).
        logit = self.sigmoid(fcn)

        return logit

In [3]:
class config:
    """Hyper-parameters shared by the cells of this notebook."""

    # Reproducibility
    seed = 0

    # Vocabulary and input text
    vocab_size = 20000
    min_freq = 1
    len_sentence = 100

    # Model
    embedding_dim = 100  # TODO: max 300
    channel_size = 128
    dropout = 0.5  # TODO: replacing this with batch norm is recommended
    x2_size = 1
    num_labels = 6

    # Batching
    batch_size = 64

In [4]:

# Fix the seed of every random number generator used by the notebook
# (Python, NumPy, PyTorch CPU and all CUDA devices).
random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)
torch.cuda.manual_seed_all(config.seed)

In [5]:
def get_pd_data(path: str):
    """Read the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

In [6]:
# Load the labelled training comments.
train = get_pd_data('./data/train.csv')

In [7]:
# Load the test comments to be scored for the submission.
test = get_pd_data('./data/test.csv')

In [8]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Preprocess (1)
----
### Set capital character ratio
- 문장 내의 대문자 비율을 뉴럴넷의 input으로 줌

In [9]:
def set_capital_ratio(df : pd.DataFrame):
    """Add capital-letter statistics of ``comment_text`` to ``df`` in place.

    Three columns are written:

    - ``alphas``: number of alphabetic characters in the comment.
    - ``capitals``: number of upper-case characters in the comment.
    - ``cap_ratio``: ``capitals / (alphas + 1)``; the ``+ 1`` keeps the
      ratio defined for comments without any alphabetic character.

    Args:
        df: Frame with a string column named ``comment_text``.

    Returns:
        None. ``df`` is modified in place.
    """
    comments = df['comment_text']
    df['alphas'] = comments.apply(lambda comment: sum(1 for c in comment if c.isalpha()))
    df['capitals'] = comments.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Column arithmetic instead of a row-wise apply: same values, no Python
    # loop over rows, and still a single column when the frame is empty.
    df['cap_ratio'] = df['capitals'].astype(float) / (df['alphas'].astype(float) + 1)


In [10]:
# Add the alphas / capitals / cap_ratio columns to both frames in place;
# each call returns None, hence the (None, None) cell output.
set_capital_ratio(train), set_capital_ratio(test)

(None, None)

In [11]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,alphas,capitals,cap_ratio
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,203,17,0.083333
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,73,8,0.108108
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,186,4,0.02139
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,486,11,0.022587
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,50,2,0.039216


## Preprocess(2)
-----
### Word tokenize
- gensim의 tokenize function

In [12]:
from gensim.utils import simple_tokenize

In [13]:
def tokenizer(string: str):
    """Tokenize ``string`` with gensim's ``simple_tokenize`` and return a list.

    The lazy result of ``simple_tokenize`` is materialised so callers always
    receive a concrete list of tokens.
    """
    return list(simple_tokenize(string))

In [14]:
# Lower-case every comment, then split it into a list of word tokens.
tk_train = train['comment_text'].str.lower().apply(tokenizer)
tk_test = test['comment_text'].str.lower().apply(tokenizer)

In [15]:
tk_train[:5]

0    [explanation, why, the, edits, made, under, my...
1    [d, aww, he, matches, this, background, colour...
2    [hey, man, i, m, really, not, trying, to, edit...
3    [more, i, can, t, make, any, real, suggestions...
4    [you, sir, are, my, hero, any, chance, you, re...
Name: comment_text, dtype: object

In [16]:
# Target columns predicted by the model (one binary label each).
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Extra hand-made feature columns fed to the network next to the text.
x_features = ['cap_ratio']

## Preprocess(3)
----
### Add Normal column label
- toxic하지 않은 label로 분류되는 것에, normal=1 의 새로운 라벨 추가 (하지않음)

In [17]:
# train['normal'] = 0
# train.loc[train[labels].sum(axis=1) == 0, 'normal'] = 1

In [18]:
# labels.append('normal')

In [19]:
# Label matrix of the training set: one column per entry of `labels`.
y_labels = train[labels]
y_labels.head(n=10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [20]:
# Select the manually engineered feature columns of the training set
# (e.g. the capital-letter ratio of each comment).
x2 = train[x_features]
x2.head(n=10)

Unnamed: 0,cap_ratio
0,0.083333
1,0.108108
2,0.02139
3,0.022587
4,0.039216
5,0.021277
6,0.973684
7,0.043478
8,0.019391
9,0.033333


In [21]:
# Extra-feature frames for training and for test-set inference.
x2_train = x2
x2_test = test[x_features]

In [22]:
from torchtext import data, datasets

## Preprocess(4)
---
### torchtext.data.Field
- word dictionary, word to index 구현

In [23]:
# Field describing how tokenized comments are turned into index tensors.
TEXT = data.Field(sequential=True,  
                  # Is the input data sequential? We handle sequences of tokenized words, so True. (True is also the default.)
                  tokenize=tokenizer, 
                  # Function used to tokenize the data. We use the gensim library's tokenize function here,
                  # but a hand-written function or something like str.split would work as well.
                  # :: Correction: not every tokenize function works; it must return a tokenized list, not a generator.
                  # :::: (Not sure this is actually true either.)
                  fix_length=config.len_sentence,
                 # Probably a limit on the tokenized length (to be confirmed). Sequences longer than this are cut, shorter ones are padded.
                  # :: Or rather, it is probably the length limit after conversion to index vectors. To be confirmed.
                  pad_first=True,
                  # Whether the padding is added at the front or at the back.
                  tensor_type=torch.cuda.LongTensor
                  # A CUDA tensor type may be used here.
                 )

In [24]:
# Build the word-to-index vocabulary from the train AND test tokens, limited
# by config.vocab_size and config.min_freq.
TEXT.build_vocab(tk_train, tk_test, max_size=config.vocab_size, min_freq=config.min_freq)

In [25]:
def batchify(tk_train, x2, y_labels=None, batch_size=32):
    """Yield consecutive mini-batches sliced from the given sequences.

    Args:
        tk_train: Sliceable sequence of tokenized samples; its length sets
            the number of batches.
        x2: Sliceable sequence of extra features, sliced with the same bounds.
        y_labels: Optional sliceable sequence of labels.
        batch_size: Maximum number of samples per batch; the last batch may
            be shorter.

    Yields:
        ``(tokens, x2)`` tuples, or ``(tokens, x2, labels)`` tuples when
        ``y_labels`` is given.
    """
    total = len(tk_train)
    for start in range(0, total, batch_size):
        window = slice(start, min(start + batch_size, total))
        batch = (tk_train[window], x2[window])
        if y_labels is not None:
            batch += (y_labels[window],)
        yield batch

In [26]:
# Instantiate the model from the shared config and move it to the GPU.
net = Net(vocab_size=config.vocab_size, embedding_dim=config.embedding_dim, len_sentence=config.len_sentence,
         x2_size=config.x2_size, channel_size=config.channel_size, dropout=config.dropout, num_labels=config.num_labels, batch_size=config.batch_size).cuda()

In [27]:
net

Net(
  (embedding): Embedding(20002, 100, padding_idx=1)
  (conv2d): Conv2d (1, 128, kernel_size=(5, 100), stride=(1, 1))
  (relu): ReLU(inplace)
  (dropout1d): Dropout(p=0.5)
  (pool1d): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,), ceil_mode=False, count_include_pad=True)
  (fcn1): Linear(in_features=6145, out_features=128)
  (relu1): ReLU(inplace)
  (fcn2): Linear(in_features=128, out_features=6)
  (sigmoid): Sigmoid()
)

In [28]:
# Adam with its default settings over all model parameters.
optimizer = optim.Adam(net.parameters())
# Binary cross entropy on the sigmoid outputs of the network.
criterion = nn.BCELoss()
# criterions = [nn.CrossEntropyLoss() for i in range(config.num_labels)]
# -> Binary

In [29]:
from tqdm import tqdm

In [31]:
# Single pass over the training data, in its stored order.
# train_corrects / train_loss are initialised here but not updated in this cell.
train_corrects = [0 for i in range(config.num_labels)]
train_loss = 0
net.train(True)  # training mode
for step, (batch, x2, y_label) in tqdm(enumerate(batchify(tk_train, x2_train.values, y_labels.values, batch_size=config.batch_size))):
    # Pad and index the token lists; dims 0 and 1 are then swapped
    # (presumably to obtain batch x len_sentence -- TODO confirm).
    var_batch = TEXT.process(batch, device=0, train=True).transpose(dim0=0, dim1=1)
    var_y = Variable(torch.cuda.FloatTensor(y_label))
    var_x2 = Variable(torch.cuda.FloatTensor(x2))
    pred_score = net(var_batch, var_x2)
    
    # Clear gradients left over from the previous step before backward().
    net.zero_grad()
    y_loss = criterion(pred_score, var_y)
    #print(y_loss.data[0])
    y_loss.backward()
    optimizer.step()
    

2494it [00:11, 209.68it/s]


In [39]:
# Inference over the TEST set (tk_test / x2_test); despite the val_* names no
# validation split is involved in this cell.
net.train(False)  # evaluation mode
val_score = None
for val_step, (batch_val, x2_val) in enumerate(batchify(tk_test, x2_test.values, y_labels=None, batch_size=config.batch_size)):
    var_batch = TEXT.process(batch_val, device=0, train=False).transpose(dim0=0, dim1=1)
    var_x2 = Variable(torch.cuda.FloatTensor(x2_val))
    pred_score = net(var_batch, var_x2)
    # Append this batch's predictions, keeping the input order.
    # NOTE(review): the running tensor is re-concatenated on every batch;
    # collecting the batches in a list and concatenating once may be cheaper.
    if val_score is None:
        val_score = pred_score
    else:
        val_score = torch.cat([val_score, pred_score])
net.train(True)  # back to training mode

Net(
  (embedding): Embedding(20002, 100, padding_idx=1)
  (conv2d): Conv2d (1, 128, kernel_size=(5, 100), stride=(1, 1))
  (relu): ReLU(inplace)
  (dropout1d): Dropout(p=0.5)
  (pool1d): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,), ceil_mode=False, count_include_pad=True)
  (fcn1): Linear(in_features=6145, out_features=128)
  (relu1): ReLU(inplace)
  (fcn2): Linear(in_features=128, out_features=6)
  (sigmoid): Sigmoid()
)

In [40]:
from sklearn.metrics import roc_auc_score


## TODO
---
### roc_auc_score w.r.t. validation set's score
- Kaggle form에 맞추어 column-wise roc auc score 계산

In [41]:
# Start the submission frame from the test frame without the comment text and
# the capital-ratio helper columns.
test_submission = test.drop(['comment_text', 'alphas', 'capitals', 'cap_ratio'], axis=1)

In [42]:
# Copy the predictions to host memory as a NumPy array.
test_score = val_score.data.cpu().numpy()
test_score

array([[  9.70897853e-01,   3.64708513e-01,   7.26424515e-01,
          9.72823054e-02,   7.90768623e-01,   3.08161736e-01],
       [  2.26571434e-03,   1.07036988e-06,   1.39281823e-04,
          3.06097827e-05,   3.43758496e-04,   8.46987750e-05],
       [  2.59651113e-02,   1.96309717e-04,   5.40274475e-03,
          1.37544947e-03,   8.35924037e-03,   2.90093198e-03],
       ..., 
       [  1.80142088e-05,   4.30685376e-10,   3.83072302e-06,
          1.88833269e-08,   1.54129521e-06,   2.26104291e-08],
       [  3.87673572e-05,   2.15635620e-10,   2.87453213e-06,
          6.91871733e-08,   1.43058980e-06,   5.07096516e-08],
       [  9.31639731e-01,   2.37068068e-02,   4.90417719e-01,
          6.19889144e-03,   5.59331059e-01,   5.63138649e-02]], dtype=float32)

In [43]:
test_score.shape

(153164, 6)

In [44]:
# Wrap the scores in a DataFrame with one column per label.
df_test_score = pd.DataFrame(data=test_score, columns=labels)
df_test_score.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.970898,0.3647085,0.726425,0.097282,0.790769,0.308162
1,0.002266,1.07037e-06,0.000139,3.1e-05,0.000344,8.5e-05
2,0.025965,0.0001963097,0.005403,0.001375,0.008359,0.002901
3,0.001801,3.47759e-08,0.00016,3e-06,0.000175,6e-06
4,0.011329,1.172845e-05,0.001475,0.000103,0.00207,0.000207


In [45]:
# Put the score columns next to the remaining test columns.
# NOTE(review): the column-wise concat pairs rows by index; this assumes both
# frames share the same default index -- confirm.
test_submission = pd.concat([test_submission, df_test_score], axis=1)
test_submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.970898,0.3647085,0.726425,0.097282,0.790769,0.308162
1,0000247867823ef7,0.002266,1.07037e-06,0.000139,3.1e-05,0.000344,8.5e-05
2,00013b17ad220c46,0.025965,0.0001963097,0.005403,0.001375,0.008359,0.002901
3,00017563c3f7919a,0.001801,3.47759e-08,0.00016,3e-06,0.000175,6e-06
4,00017695ad8997eb,0.011329,1.172845e-05,0.001475,0.000103,0.00207,0.000207


In [46]:
# Write the submission file without the DataFrame index column.
test_submission.to_csv("./submission_wlcnn_bce.csv", index=False)