In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
import random
import time
import datetime

import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler


# fix seeds
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)



In [2]:
# Loading the train and test data for visualization & exploration.

train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

display(train.sample(5))

Unnamed: 0,id,keyword,location,text,target
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,191,aftershock,,Aftershock back to school kick off was great. ...,0
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0


In [3]:
df = pd.concat([train, test])

texts = train['text'].values
labels = train['target'].values

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
attention_masks = []
for text in texts:
    encode_dic = tokenizer.encode_plus(
        text,
        max_length=128,
        pad_to_max_length=True,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='pt'
        
        
    )
    input_ids.append(encode_dic['input_ids'])
    attention_masks.append(encode_dic['attention_mask'])
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [5]:
data = TensorDataset(input_ids, attention_masks, labels)

train_size = int(0.9 * len(data))
val_size = len(data) - train_size

train_data, val_data = random_split(data, [train_size, val_size])
train_dataloader = DataLoader(train_data, batch_size=64, sampler=RandomSampler(train_data))
val_dataloader = DataLoader(val_data, batch_size=64, sampler=SequentialSampler(val_data))

In [6]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [12]:
epochs = 2

optimizer = AdamW(model.parameters(), lr=2e-5, eps=2e-8)

total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [13]:
# Format as hh:mm:ss
def format_time(elapsed):    
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)

def flat_f1(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, pred_flat)

In [15]:
for epoch in range(epochs):
    
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')
    start_train = time.time()
    model.train()
    total_train_loss = 0
    for step, batch in enumerate(train_dataloader):
        
        if step % 50 ==0:
            elapsed = format_time(time.time() - start_train)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            
        batch_input_ids = batch[0].cuda()
        batch_attention_masks = batch[1].cuda()
        batch_labels = batch[2].cuda()
        
        loss, logit = model(batch_input_ids, token_type_ids=None, 
                            attention_mask=batch_attention_masks, labels=batch_labels)
        
        model.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        
        total_train_loss += loss.item()
        
    average_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - start_train)

    print("  Average training loss: {0:.2f}".format(average_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    
    print('Running Validation...')
    start_eval = time.time()
    model.eval()
    total_val_loss = 0
    total_eval_accuracy = 0
    total_eval_f1 = 0
    for step, batch in enumerate(val_dataloader):
        
        batch_input_ids = batch[0].cuda()
        batch_attention_masks = batch[1].cuda()
        batch_labels = batch[2].cuda()
        
        with torch.no_grad():
            loss, logits = model(batch_input_ids, token_type_ids=None, 
                                attention_mask=batch_attention_masks, labels=batch_labels)
            
            total_val_loss += loss.item()
            
    average_val_loss = total_val_loss / len(val_dataloader)
    
    logits = logits.detach().cpu().numpy()
    label = batch_labels.to('cpu').numpy()
    
    total_eval_accuracy += flat_accuracy(logits, label)
    total_eval_f1 += flat_f1(logits, label)
    
    validation_time = format_time(time.time() - start_eval)
    print('  Validation took: {:}'.format(validation_time))
print('Completed')

Training...
  Batch     0  of    108.    Elapsed: 0:00:00.
  Batch    50  of    108.    Elapsed: 0:00:36.
  Batch   100  of    108.    Elapsed: 0:01:11.
  Average training loss: 0.33
  Training epcoh took: 0:01:16
Running Validation...
  Validation took: 0:00:03
Training...
  Batch     0  of    108.    Elapsed: 0:00:00.
  Batch    50  of    108.    Elapsed: 0:00:36.
  Batch   100  of    108.    Elapsed: 0:01:11.
  Average training loss: 0.31
  Training epcoh took: 0:01:16
Running Validation...
  Validation took: 0:00:03
Completed


In [17]:
texts = test['text'].values

input_ids = []
attention_masks = []
for text in texts:
    encode_dic = tokenizer.encode_plus(
        text,
        max_length=128,
        add_special_tokens=True,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    input_ids.append(encode_dic['input_ids'])
    attention_masks.append(encode_dic['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
    
predict_data = TensorDataset(input_ids, attention_masks)
predict_dataloader = DataLoader(predict_data, batch_size=64, 
                                sampler=SequentialSampler(predict_data))

In [18]:
predictions = []
for step, batch in enumerate(predict_dataloader):
    
    pre_input_ids = batch[0].cuda()
    pre_attention_mask = batch[1].cuda()
    
    with torch.no_grad():
        
        outputs = model(pre_input_ids, 
                      token_type_ids=None, attention_mask=pre_attention_mask)
        
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    
    predictions.append(logits)
    
print('    DONE.')

    DONE.


In [22]:
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

test['target'] = 0
test['target'] = flat_predictions
test[['text', 'target']].head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1
