In [1]:
!pip install transformers

Collecting transformers
  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
Installing collected packages: transformers
Successfully installed transformers-2.3.0


You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
import time
import logging
from sklearn.model_selection import StratifiedKFold
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from sklearn.metrics import accuracy_score, f1_score

from transformers import *

I0126 18:48:25.140106 13856 file_utils.py:35] PyTorch version 1.3.1 available.


In [2]:
# Set Seed Everything
def seed_everything(seed=623):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [3]:
train = pd.read_csv('C:\\Users\\bokhy\\Python\\pytorch\\pytorch\\data\\nlp-getting-started\\train.csv')
test = pd.read_csv('C:\\Users\\bokhy\\Python\\pytorch\\pytorch\\data\\nlp-getting-started\\test.csv')
submit = pd.read_csv('C:\\Users\\bokhy\\Python\\pytorch\\pytorch\\data\\nlp-getting-started\\sample_submission.csv')

In [4]:
print(train.shape)
print(test.shape)
train.head()

(7613, 5)
(3263, 4)


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Tokenize our train and test data

In [6]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, id, text, label=None):
        """Constructs a InputExample.
        Args:
            id: Unique id for the example.
            text: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.id = id
        self.text = text
        self.label = label


class InputFeatures(object):
    def __init__(self,example_id,choices_features,label):
        self.example_id = example_id
        _, input_ids, input_mask, segment_ids = choices_features[0]
        self.choices_features = {
            'input_ids': input_ids,
            'input_mask': input_mask,
            'segment_ids': segment_ids
        }
        self.label = label

In [7]:
def read_examples(df, is_training):
    if not is_training:
        df['target'] = np.zeros(len(df), dtype=np.int64)
    examples = []
    for val in df[['id', 'text', 'target']].values:
        examples.append(InputExample(id=val[0], text=val[1], label=val[2]))
    return examples, df


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.

    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [8]:
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 is_training):
    features = []
    for example_index, example in enumerate(examples):

        text = tokenizer.tokenize(example.text)
        MAX_TEXT_LEN = max_seq_length - 2 
        text = text[:MAX_TEXT_LEN]

        choices_features = []

        tokens = ["[CLS]"] + text + ["[SEP]"]  
        segment_ids = [0] * (len(text) + 2) 
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        padding_length = max_seq_length - len(input_ids)
        input_ids += ([0] * padding_length)
        input_mask += ([0] * padding_length)
        segment_ids += ([0] * padding_length)
        choices_features.append((tokens, input_ids, input_mask, segment_ids))

        label = example.label
        if example_index < 1 and is_training:
            logger.info("*** Example ***")
            logger.info("idx: {}".format(example_index))
            logger.info("id: {}".format(example.id))
            logger.info("tokens: {}".format(' '.join(tokens).replace('\u2581', '_')))
            logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
            logger.info("input_mask: {}".format(len(input_mask)))
            logger.info("segment_ids: {}".format(len(segment_ids)))
            logger.info("label: {}".format(label))

        features.append(
            InputFeatures(
                example_id=example.id,
                choices_features=choices_features,
                label=label
            )
        )
    return features


def select_field(features, field):
    return [
        feature.choices_features[field] for feature in features
    ]

def metric(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return acc, f1

In [9]:
# hyperparameters
max_seq_length = 128
learning_rate = 1e-5  
num_epochs = 3  
batch_size = 8  
patience = 2  
file_name = 'model'  

In [10]:
logger = logging.getLogger('mylogger')
logger.setLevel(logging.DEBUG)
timestamp = time.strftime("%Y.%m.%d_%H.%M.%S", time.localtime())
fh = logging.FileHandler('log_model.txt')
fh.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('[%(asctime)s][%(levelname)s] ## %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(ch)

In [11]:
# BERT expects three kinds of input: input_ids(of tokens), segment_ids(to distinguish different sentences), 
# and input_mask(to indicate which elements in the sequence are tokens and which are padding elements). 
# The code below gets all three inputs for train set
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

I0126 18:48:49.879570 13856 tokenization_utils.py:398] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\bokhy\.cache\torch\transformers\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [12]:
# Train set
train_examples, train_df = read_examples(train, is_training=True)
labels = train_df['target'].astype(int).values
train_features = convert_examples_to_features(
    train_examples, tokenizer, max_seq_length, True)
all_input_ids = np.array(select_field(train_features, 'input_ids'))
all_input_mask = np.array(select_field(train_features, 'input_mask'))
all_segment_ids = np.array(select_field(train_features, 'segment_ids'))
all_label = np.array([f.label for f in train_features])

[2020-01-26 18:48:51,736][INFO] ## *** Example ***
I0126 18:48:51.736677 13856 <ipython-input-8-d8a8c87a5f7e>:25] *** Example ***
[2020-01-26 18:48:51,737][INFO] ## idx: 0
I0126 18:48:51.737674 13856 <ipython-input-8-d8a8c87a5f7e>:26] idx: 0
[2020-01-26 18:48:51,739][INFO] ## id: 1
I0126 18:48:51.739673 13856 <ipython-input-8-d8a8c87a5f7e>:27] id: 1
[2020-01-26 18:48:51,742][INFO] ## tokens: [CLS] our deeds are the reason of this # earthquake may allah forgive us all [SEP]
I0126 18:48:51.742662 13856 <ipython-input-8-d8a8c87a5f7e>:28] tokens: [CLS] our deeds are the reason of this # earthquake may allah forgive us all [SEP]
[2020-01-26 18:48:51,743][INFO] ## input_ids: 101 2256 15616 2024 1996 3114 1997 2023 1001 8372 2089 16455 9641 2149 2035 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I0126 18:48:51.743

In [13]:
# Test-set Similarly
test_examples, test_df = read_examples(test, is_training=False)
test_features = convert_examples_to_features(
    test_examples, tokenizer, max_seq_length, True)
test_input_ids = torch.tensor(select_field(test_features, 'input_ids'), dtype=torch.long)
test_input_mask = torch.tensor(select_field(test_features, 'input_mask'), dtype=torch.long)
test_segment_ids = torch.tensor(select_field(test_features, 'segment_ids'), dtype=torch.long)

[2020-01-26 18:48:56,144][INFO] ## *** Example ***
I0126 18:48:56.144277 13856 <ipython-input-8-d8a8c87a5f7e>:25] *** Example ***
[2020-01-26 18:48:56,150][INFO] ## idx: 0
I0126 18:48:56.150223 13856 <ipython-input-8-d8a8c87a5f7e>:26] idx: 0
[2020-01-26 18:48:56,153][INFO] ## id: 0
I0126 18:48:56.153217 13856 <ipython-input-8-d8a8c87a5f7e>:27] id: 0
[2020-01-26 18:48:56,155][INFO] ## tokens: [CLS] just happened a terrible car crash [SEP]
I0126 18:48:56.155211 13856 <ipython-input-8-d8a8c87a5f7e>:28] tokens: [CLS] just happened a terrible car crash [SEP]
[2020-01-26 18:48:56,160][INFO] ## input_ids: 101 2074 3047 1037 6659 2482 5823 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I0126 18:48:56.160197 13856 <ipython-input-8-d8a8c87a5f7e>:29] input_ids: 101 2074 3047 1037 6659 2482 5823 102 0 0 

In [14]:
# We will be using bert-base-uncased as our base model
class NeuralNet(nn.Module):
    def __init__(self, hidden_size=768, num_class=2):
        super(NeuralNet, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased',  
                                        output_hidden_states=True,
                                        output_attentions=True)
        for param in self.bert.parameters():
            param.requires_grad = True
        self.weights = nn.Parameter(torch.rand(13, 1))
        self.dropouts = nn.ModuleList([
            nn.Dropout(0.5) for _ in range(5)
        ])
        self.fc = nn.Linear(hidden_size, num_class)

    def forward(self, input_ids, input_mask, segment_ids):
        all_hidden_states, all_attentions = self.bert(input_ids, token_type_ids=segment_ids,
                                                                attention_mask=input_mask)[-2:]
        batch_size = input_ids.shape[0]
        ht_cls = torch.cat(all_hidden_states)[:, :1, :].view(
            13, batch_size, 1, 768)
        atten = torch.sum(ht_cls * self.weights.view(
            13, 1, 1, 1), dim=[1, 3])
        atten = F.softmax(atten.view(-1), dim=0)
        feature = torch.sum(ht_cls * atten.view(13, 1, 1, 1), dim=[0, 2])
        for i, dropout in enumerate(self.dropouts):
            if i == 0:
                h = self.fc(dropout(feature))
            else:
                h += self.fc(dropout(feature))
        h = h / len(self.dropouts)
        return h

In [15]:
# We will use StratifiedKFold to split our data into 7 folds. 
# Multifold splitting is a popular validation strategy in kaggle competitions.
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=623)
# off: out-of-fold
oof_train = np.zeros((len(train_df), 2), dtype=np.float32)
oof_test = np.zeros((len(test_df), 2), dtype=np.float32)
oof_train
oof_test

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [0., 0.],
       [0., 0.]], dtype=float32)

In [16]:
import fastai
from fastai.vision import *
from fastai.callbacks import *
from fastai.utils.mem import *

In [17]:
gc.collect()

33

In [21]:
torch.cuda.set_device(0)
torch.cuda.empty_cache()

In [16]:
for fold, (train_index, valid_index) in enumerate(skf.split(all_label, all_label)):
    if fold == 2:
        break # due to kernel time limit

    logger.info('================     fold {}        ==============='.format(fold))

    train_input_ids = torch.tensor(all_input_ids[train_index], dtype=torch.long)
    train_input_mask = torch.tensor(all_input_mask[train_index], dtype=torch.long)
    train_segment_ids = torch.tensor(all_segment_ids[train_index], dtype=torch.long)
    train_label = torch.tensor(all_label[train_index], dtype=torch.long)

    valid_input_ids = torch.tensor(all_input_ids[valid_index], dtype=torch.long)
    valid_input_mask = torch.tensor(all_input_mask[valid_index], dtype=torch.long)
    valid_segment_ids = torch.tensor(all_segment_ids[valid_index], dtype=torch.long)
    valid_label = torch.tensor(all_label[valid_index], dtype=torch.long)

    train = torch.utils.data.TensorDataset(train_input_ids, train_input_mask, train_segment_ids, train_label)
    valid = torch.utils.data.TensorDataset(valid_input_ids, valid_input_mask, valid_segment_ids, valid_label)
    test = torch.utils.data.TensorDataset(test_input_ids, test_input_mask, test_segment_ids)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    model = NeuralNet()
    model.cuda()
    loss_fn = torch.nn.CrossEntropyLoss()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)
    model.train()

    best_f1 = 0.
    valid_best = np.zeros((valid_label.size(0), 2))

    early_stop = 0
    for epoch in range(num_epochs):
        train_loss = 0.
        for batch in tqdm(train_loader):
            batch = tuple(t.cuda() for t in batch)
            x_ids, x_mask, x_sids, y_truth = batch
            y_pred = model(x_ids, x_mask, x_sids)
            loss = loss_fn(y_pred, y_truth)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item() / len(train_loader)
    
        model.eval()
        val_loss = 0.
        valid_preds_fold = np.zeros((valid_label.size(0), 2))
        with torch.no_grad():
            for i, batch in tqdm(enumerate(valid_loader)):
                batch = tuple(t.cuda() for t in batch)
                x_ids, x_mask, x_sids, y_truth = batch
                y_pred = model(x_ids, x_mask, x_sids).detach()
                val_loss += loss_fn(y_pred, y_truth).item() / len(valid_loader)
                valid_preds_fold[i * batch_size:(i + 1) * batch_size] = F.softmax(y_pred, dim=1).cpu().numpy()
    
        acc, f1 = metric(all_label[valid_index], np.argmax(valid_preds_fold, axis=1))
        if best_f1 < f1:
            early_stop = 0
            best_f1 = f1
            valid_best = valid_preds_fold
            torch.save(model.state_dict(), 'model_fold_{}.bin'.format(fold))
        else:
            early_stop += 1
        logger.info(
            'epoch: %d, train loss: %.8f, valid loss: %.8f, acc: %.8f, f1: %.8f, best_f1: %.8f\n' %
            (epoch, train_loss, val_loss, acc, f1, best_f1))
        torch.cuda.empty_cache()  
    
        if early_stop >= patience:
            break

    test_preds_fold = np.zeros((len(test_df), 2))
    valid_preds_fold = np.zeros((valid_label.size(0), 2))
    model.load_state_dict(torch.load('model_fold_{}.bin'.format(fold)))
    model.eval()
    with torch.no_grad():
        for i, batch in tqdm(enumerate(valid_loader)):
            batch = tuple(t.cuda() for t in batch)
            x_ids, x_mask, x_sids, y_truth = batch
            y_pred = model(x_ids, x_mask, x_sids).detach()
            valid_preds_fold[i * batch_size:(i + 1) * batch_size] = F.softmax(y_pred, dim=1).cpu().numpy()
    with torch.no_grad():
        for i, batch in tqdm(enumerate(test_loader)):
            batch = tuple(t.cuda() for t in batch)
            x_ids, x_mask, x_sids = batch
            y_pred = model(x_ids, x_mask, x_sids).detach()
            test_preds_fold[i * batch_size:(i + 1) * batch_size] = F.softmax(y_pred, dim=1).cpu().numpy()
    valid_best = valid_preds_fold
    oof_train[valid_index] = valid_best
    acc, f1 = metric(all_label[valid_index], np.argmax(valid_best, axis=1))
    logger.info('epoch: best, acc: %.8f, f1: %.8f, best_f1: %.8f\n' %
                (acc, f1, best_f1))
    
    
    #oof_test += test_preds_fold / 7 # uncomment this for 7 folds
    oof_test += test_preds_fold / 2 # comment this line when training for 7 folds

I0126 18:49:08.318372 13856 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at C:\Users\bokhy\.cache\torch\transformers\4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0126 18:49:08.320368 13856 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": true,
  "output_hidden_states": true,
  "output_past": true,
  "pruned_heads": {}

HBox(children=(FloatProgress(value=0.0, max=816.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[2020-01-26 19:00:54,805][INFO] ## epoch: 0, train loss: 0.44981819, valid loss: 0.37909862, acc: 0.83471074, f1: 0.82362659, best_f1: 0.82362659

I0126 19:00:54.805861 13856 <ipython-input-16-4acce12899b9>:75] epoch: 0, train loss: 0.44981819, valid loss: 0.37909862, acc: 0.83471074, f1: 0.82362659, best_f1: 0.82362659



HBox(children=(FloatProgress(value=0.0, max=816.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




[2020-01-26 19:13:06,571][INFO] ## epoch: 1, train loss: 0.31929401, valid loss: 0.40869437, acc: 0.83471074, f1: 0.83129299, best_f1: 0.83129299

I0126 19:13:06.571895 13856 <ipython-input-16-4acce12899b9>:75] epoch: 1, train loss: 0.31929401, valid loss: 0.40869437, acc: 0.83471074, f1: 0.83129299, best_f1: 0.83129299



HBox(children=(FloatProgress(value=0.0, max=816.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

[2020-01-26 19:25:18,279][INFO] ## epoch: 2, train loss: 0.19407260, valid loss: 0.44171160, acc: 0.82920110, f1: 0.82456660, best_f1: 0.83129299

I0126 19:25:18.279090 13856 <ipython-input-16-4acce12899b9>:75] epoch: 2, train loss: 0.19407260, valid loss: 0.44171160, acc: 0.82920110, f1: 0.82456660, best_f1: 0.83129299






Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

[2020-01-26 19:27:40,460][INFO] ## epoch: best, acc: 0.83471074, f1: 0.83129299, best_f1: 0.83129299

I0126 19:27:40.460875 13856 <ipython-input-16-4acce12899b9>:101] epoch: best, acc: 0.83471074, f1: 0.83129299, best_f1: 0.83129299






I0126 19:27:40.880117 13856 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at C:\Users\bokhy\.cache\torch\transformers\4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0126 19:27:40.882111 13856 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": true,
  "output_hidden_states": true,
  "output_past": true,
  "pruned_heads": {}

HBox(children=(FloatProgress(value=0.0, max=816.0), HTML(value='')))

RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 4.00 GiB total capacity; 2.93 GiB already allocated; 2.47 MiB free; 131.13 MiB cached)

In [17]:
logger.info(f1_score(labels, np.argmax(oof_train, axis=1)))
train_df['pred_target'] = np.argmax(oof_train, axis=1)

[2020-01-26 20:28:56,492][INFO] ## 0.20176612255820175
I0126 20:28:56.492817 13856 <ipython-input-17-986c7bf9c857>:1] 0.20176612255820175


In [18]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target,pred_target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1
1,4,,,Forest fire near La Ronge Sask. Canada,1,0
2,5,,,All residents asked to 'shelter in place' are ...,1,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0


In [19]:
test_df['target'] = np.argmax(oof_test, axis=1)
logger.info(test_df['target'].value_counts())

[2020-01-26 20:59:19,591][INFO] ## 0    1889
1    1374
Name: target, dtype: int64
I0126 20:59:19.591239 13856 <ipython-input-19-4d11e337f759>:2] 0    1889
1    1374
Name: target, dtype: int64


In [20]:
submit['target'] = np.argmax(oof_test, axis=1)
submit.to_csv('submission_3fold.csv', index=False)

In [21]:
submit.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
