# install required libraries

## NOTE: This environment already has basic libraries such as torch, jupyter, pandas, and numpy installed.

In [1]:
!pip3 install opendatasets transformers pandas-profiling -q

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


# prepare data files and authorize kaggle

In [2]:
import opendatasets as od

# Download the Kaggle competition data (prompts for Kaggle credentials).
competition_url = 'https://www.kaggle.com/c/nlp-getting-started'
od.download(competition_url, force=True)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: lamttic
Your Kaggle Key: ········
Downloading nlp-getting-started.zip to ./nlp-getting-started


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 593k/593k [00:00<00:00, 51.6MB/s]


Extracting archive ./nlp-getting-started/nlp-getting-started.zip to ./nlp-getting-started





In [3]:
# Copy the extracted competition files into the working directory so the
# CSVs can be read by bare filename in the following cells.
!cp ./nlp-getting-started/* .

# load data files

In [None]:
import pandas as pd
import numpy as np
import torch

# Read the three competition CSVs from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_df = pd.read_csv('sample_submission.csv')

# Show the expected submission layout.
sample_df

In [None]:
# Seed torch's global random number generator for reproducibility.
torch.manual_seed(42)

# preprocess

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer, BertModel

In [None]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Pull the raw text and the labels out of the frames as NumPy arrays.
train_sentences = train_df['text'].to_numpy()
test_sentences = test_df['text'].to_numpy()
train_labels = train_df['target'].to_numpy()

## show distribution of sentence lengths (in characters)

In [None]:
import matplotlib.pyplot as plt

# Histogram of character counts per training sentence.
plt.hist(list(map(len, train_sentences)))

In [None]:
# Histogram of character counts per test sentence.
plt.hist(list(map(len, test_sentences)))

# tokenize

In [None]:
# `max_length` is handed to the tokenizer as a number of *tokens* (special
# tokens included), so measure the tokenized length of each sentence rather
# than its character count. This gives the exact padded width needed to fit
# every train and test sentence without truncation.
max_length = max(
    len(tokenizer.encode(sentence, add_special_tokens=True))
    for sentence in np.concatenate([train_sentences, test_sentences], axis=0)
)

In [None]:
def encode_sentences(sentences, max_length):
    """Tokenize sentences into fixed-width BERT input tensors.

    Uses the notebook-level ``tokenizer``.

    Args:
        sentences: iterable of raw text strings.
        max_length: number of tokens per row (special tokens included).
            Shorter sentences are padded and longer ones are truncated to
            this width.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of tensors, each with one row
        per sentence and ``max_length`` columns.
    """
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        encode_dict = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            # Without truncation a sentence longer than max_length produces a
            # wider row and torch.cat below fails on the shape mismatch.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encode_dict['input_ids'])
        attention_masks.append(encode_dict['attention_mask'])

    if not input_ids:
        # torch.cat raises on an empty list; return empty tensors that still
        # carry the expected width.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

In [None]:
# Tokenize every training sentence into padded id and attention-mask tensors.
train_input_ids, train_attention_masks = encode_sentences(train_sentences, max_length)

In [None]:
# Build the label tensor straight from the DataFrame column so this cell is
# idempotent: re-running it no longer wraps an existing tensor in torch.tensor.
# int64 (long) is the dtype the cross-entropy loss expects for class indices.
train_labels = torch.tensor(train_df['target'].values, dtype=torch.long)

# set dataset and dataloader for train

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Bundle ids, masks and labels so each dataset item is an
# (input_ids, attention_mask, label) triple.
dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)

In [None]:
# Keep 99% of the examples for training; the remainder is the validation set.
n_examples = len(dataset)
train_size = int(n_examples * 0.99)
val_size = n_examples - train_size

In [None]:
# Randomly partition the dataset into training and validation subsets of
# train_size and val_size items respectively.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [None]:
batch_size = 16

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

# Validation batches are read in dataset order.
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler)

# set pretrained model

In [None]:
import torch.nn as nn

In [None]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        num_hidden: size of the last dimension of the inputs.
        variance_epsilon: constant added to the variance before the square
            root, for numerical stability.
    """

    def __init__(self, num_hidden: int, variance_epsilon: float = 1e-12):
        super().__init__()
        self.variance_epsilon = variance_epsilon
        # Scale starts at one and shift at zero.
        self.gamma = nn.Parameter(torch.ones(num_hidden))
        self.beta = nn.Parameter(torch.zeros(num_hidden))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` along its last axis, then apply ``gamma`` and ``beta``."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # Biased variance: the mean of the squared deviations.
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.gamma * normalized + self.beta

In [None]:
from transformers import BertPreTrainedModel

In [None]:
class CustomBert(BertPreTrainedModel):
    """BERT classifier that mixes the [CLS] vector of every hidden layer.

    The first-token vector of each hidden state returned by the encoder is
    combined with softmax-normalized learnable weights. The mix is classified
    by a linear head whose logits are averaged over several dropout samples.
    """

    def __init__(self, config):
        # The forward pass needs every layer's hidden state, not just the last.
        config.output_hidden_states = True
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(p=0.2)
        self.high_dropout = nn.Dropout(p=0.5)

        # One mixing logit per returned hidden state: -3 everywhere except the
        # final entry, which starts at 0 and so dominates the softmax initially.
        layer_logits = torch.full(
            (config.num_hidden_layers + 1,), -3.0, dtype=torch.float32
        )
        layer_logits[-1] = 0.0
        self.layer_weights = nn.Parameter(layer_logits)

        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
    ):
        """Return classification logits for a batch of encoded sentences."""
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # Index 2 of the encoder output holds the per-layer hidden states.
        all_hidden_states = encoder_outputs[2]

        # First-token vector of each hidden state, with light dropout,
        # stacked along a new trailing "layer" axis.
        per_layer_cls = []
        for hidden_state in all_hidden_states:
            per_layer_cls.append(self.dropout(hidden_state[:, 0, :]))
        cls_outputs = torch.stack(per_layer_cls, dim=2)

        # Weighted sum over the layer axis.
        mixing_weights = torch.softmax(self.layer_weights, dim=0)
        cls_output = (mixing_weights * cls_outputs).sum(-1)

        # multisample dropout (wut): https://arxiv.org/abs/1905.09788
        sampled_logits = []
        for _ in range(5):
            sampled_logits.append(self.classifier(self.high_dropout(cls_output)))
        logits = torch.stack(sampled_logits, dim=0).mean(dim=0)

        return logits

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load pretrained 'bert-base-uncased' weights into the custom classifier,
# configured for two output labels.
model = CustomBert.from_pretrained('bert-base-uncased', num_labels=2)

# Move the model's parameters to the selected device.
model.to(device)

# set optimizer

In [None]:
from transformers import AdamW

In [None]:
# Optimize all model parameters with AdamW.
# NOTE(review): AdamW is imported from transformers in this notebook; newer
# transformers releases may no longer ship it (torch.optim.AdamW is the usual
# replacement) — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# set learning rate scheduler

In [None]:
from transformers import get_linear_schedule_with_warmup

In [None]:
epochs = 1
# scheduler.step() is called once per batch in the training loop, so the
# schedule horizon is the number of batches per epoch times the number of
# epochs. Using the number of samples would stretch the decay by a factor of
# batch_size, leaving the learning rate almost flat.
total_step = len(train_dataloader) * epochs

In [None]:
# Linear learning-rate schedule over total_step scheduler steps, with no
# warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_step
)

In [None]:
def get_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of integer class labels (flattened before comparison).
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    n_correct = (predicted_classes == true_classes).sum()
    return n_correct / len(predicted_classes)

# fine-tuning model

In [None]:
# Cross-entropy loss applied to the model's logits. The (misspelled) name
# `creterion` is kept because the training loop refers to it.
creterion = nn.CrossEntropyLoss()

In [None]:
# Lowest validation loss seen so far; drives the early-stopping check below.
current_val_loss = 0.0

for epoch in range(epochs):
    total_train_loss = 0.0
    total_val_loss = 0.0
    total_val_accuracy = 0.0

    # from_pretrained returns the model in evaluation mode, so dropout has to
    # be switched on explicitly before training (and again after each
    # validation pass).
    model.train()

    for batch in train_dataloader:
        input_ids, attention_mask, labels = tuple(el.to(device) for el in batch)

        model.zero_grad()

        output = model(input_ids,
                       token_type_ids=None,
                       attention_mask=attention_mask)

        loss = creterion(output, labels)
        # .item() extracts a plain float. Accumulating the tensor itself would
        # keep every batch's autograd graph alive for the whole epoch.
        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    print(f'Train loss: {total_train_loss / len(train_dataloader)}')

    # Disable dropout so the validation metrics are deterministic.
    model.eval()

    for batch in val_dataloader:
        input_ids, attention_mask, labels = tuple(el.to(device) for el in batch)

        with torch.no_grad():
            output = model(input_ids,
                           token_type_ids=None,
                           attention_mask=attention_mask)

            loss = creterion(output, labels)

        total_val_loss += loss.item()

        logits = output.detach().cpu().numpy()
        label_ids = labels.detach().cpu().numpy()

        total_val_accuracy += get_accuracy(logits, label_ids)

    val_loss = total_val_loss / len(val_dataloader)
    print(f'Validation loss: {val_loss}')
    print(f'Validation accuracy: {total_val_accuracy / len(val_dataloader)}')

    # Early stopping: keep going only while the validation loss improves on
    # the best value so far. The baseline is updated on every improvement
    # rather than staying frozen at the first epoch's loss.
    if epoch == 0 or val_loss < current_val_loss:
        current_val_loss = val_loss
    else:
        print(f'Early stop: {epoch} epoch')
        break

# Test

In [None]:
# Encode the test sentences with the same padded width used for the training data.
test_input_ids, test_attention_masks = encode_sentences(test_sentences, max_length)

In [None]:
# Placeholder labels (all ones, float64) so the test set fits the same
# three-tensor dataset layout as the training data.
test_labels = torch.ones(len(test_input_ids), dtype=torch.float64)

In [None]:
# Wrap the encoded test set and iterate over it in order, so predictions line
# up with the rows of the test frame.
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [None]:
predictions = []

# Pin evaluation mode so dropout is disabled and predictions are
# deterministic, whatever mode an earlier cell left the model in.
model.eval()

for batch in test_dataloader:
    # The third tensor in each batch holds placeholder labels and is not used.
    input_ids, attention_mask = (el.to(device) for el in batch[:2])

    with torch.no_grad():
        output = model(input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=None)

    logits = output.detach().cpu().numpy()
    # The predicted class is the index of the larger logit in each row.
    pred_flatten = np.argmax(logits, axis=1).flatten()

    predictions.extend(pred_flatten.tolist())

# calculate accuracy

In [None]:
# NOTE(review): 'submission.csv' is only written by a later cell in this
# notebook, so on a fresh run this reads a file left by an earlier run (or
# fails if none exists).
correct_df = pd.read_csv('submission.csv')

In [None]:
# Display the frame loaded from 'submission.csv'.
correct_df

In [None]:
# Fraction of rows whose 'target' in correct_df equals the new prediction.
agreement_mask = correct_df['target'].values == predictions
agreeing_rows = correct_df[agreement_mask]
len(agreeing_rows) / len(correct_df)

# save submission file and submit predictions 

import csv

# Write one (id, target) row per test example under the header the
# competition expects. newline='' lets the csv module control line endings;
# without it, blank rows can appear on platforms that translate '\n'.
with open('submission.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'target'])
    writer.writerows(zip(test_df['id'].values, predictions))

#!kaggle competitions submit -c nlp-getting-started -f submission.csv -m "My third try"

import pandas_profiling