# Data Normalization/Analysis

In [1]:
import torch
import pandas as pd
import re
import csv 
import string
import nltk
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler
from tqdm.notebook import tqdm, trange
from tqdm import tqdm

# Fetch the NLTK corpora used by the cleaning helpers defined below.
for nltk_package in ('stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Shared text-processing objects read as globals by clean_text. The tweet
# tokenizer is configured with strip_handles / reduce_len.
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)

# Ask PyTorch to release any cached CUDA memory before the heavy cells run.
torch.cuda.empty_cache()

    
def normalise_text(text):
    """Normalise raw CSV text and parse it into rows of punctuation-free fields.

    The whole file content is cleaned as one string *before* CSV parsing:

    1. ``%20`` becomes a space.
    2. A newline that is not followed by a digit is replaced by a space.
    3. Thousands separators are removed from numbers ("1,234,567" ->
       "1234567"); a comma that merely follows the number is left alone.
    4. The mojibake sequences ``ûª`` and ``Û_`` are dropped.
    5. URLs are removed and the text is lower-cased.

    The result is split on newlines, parsed with ``csv.reader`` and every
    field is stripped of characters that are neither word characters nor
    whitespace.

    Args:
        text: Full CSV content as a single string.

    Returns:
        A list of rows, each a list of cleaned field strings. Empty rows are
        skipped.
    """
    html_space = re.compile("%20")
    newline_pattern = re.compile(r"\n([^0-9])")
    # Leading digit run followed by one or more ",ddd" groups. The repeated
    # groups are captured *together*; the previous pattern captured only the
    # last repetition, so "1,234,567" lost its middle digits ("1567").
    numeric_pattern = re.compile(r"([0-9]+)((?:,[0-9]{3})+)")
    random_patterns = re.compile("Û_")
    punctuation_marks = re.compile("ûª")
    # Raw string: "\w" / "\s" are invalid escapes in a normal string literal.
    punctuation_pattern = re.compile(r"[^\w\s]")
    url_pattern = re.compile(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")

    def strip_separators(match):
        # Keep every digit of the number, dropping only the separators.
        return match.group(1) + match.group(2).replace(',', '')

    rewrites = (
        (html_space, ' '),
        (newline_pattern, r' \1'),
        (numeric_pattern, strip_separators),
        (punctuation_marks, ''),
        (random_patterns, ''),
    )

    normalized_text = text
    for pattern, replacement in rewrites:
        # Re-apply until stable: one substitution pass can expose a new match
        # (e.g. consecutive newlines, or a sequence nested inside another).
        while pattern.search(normalized_text):
            normalized_text = pattern.sub(replacement, normalized_text)

    normalized_text = url_pattern.sub('', normalized_text)
    normalized_text = normalized_text.lower()

    lines = normalized_text.split('\n')
    rows = [row for row in csv.reader(lines, quotechar='"', delimiter=',',
                                      quoting=csv.QUOTE_ALL, skipinitialspace=True)
            if len(row) > 0]

    return [[punctuation_pattern.sub('', field) for field in row] for row in rows]



def clean_text(text):
    """Tokenize a text field, drop English stopwords and lemmatize the rest.

    Uses the module-level ``tokenizer`` and ``lemmatizer`` objects and the
    NLTK English stopword list.

    Args:
        text: The string to clean. Any falsy value (``''``, ``None``) yields
            an empty token list.

    Returns:
        A list of lemmatized tokens with stopwords removed.
    """
    # Guard first so empty fields never touch the tokenizer or the corpus.
    if not text:
        return []

    tokens = tokenizer.tokenize(text)
    if not tokens:
        return tokens

    # NOTE(review): the previous comment here said stopword removal should be
    # avoided because prepositions are useful for BERT, yet the code has always
    # removed stopwords. The behaviour is kept as-is; confirm which is intended.
    # A set gives O(1) membership tests instead of scanning a list per token.
    blacklist = set(stopwords.words('english'))
    return [lemmatizer.lemmatize(token) for token in tokens if token not in blacklist]


max_token_len = 0
# Cleaned tweet texts already emitted by clean_data. This list lives at module
# level, so it is shared by every clean_data call made after this cell runs.
seen_values = []


def clean_data(dataframe):
    """Build BERT-style input sentences from a frame of normalised tweets.

    For every row the keyword and text are cleaned with ``clean_text`` and
    combined into ``[CLS] <location> [SEP] <keywords> [SEP] <text> [SEP]``
    (empty location / keyword segments are omitted). Every marker is separated
    from its neighbours by a single space; the previous string concatenation
    fused markers with adjacent words (e.g. ``[SEP]fire``).

    Rows whose cleaned text has already been seen are skipped.
    NOTE(review): because ``seen_values`` is module-level, this also drops rows
    in a later call (e.g. the test split) that repeat text from an earlier
    call - confirm this is wanted for data that needs one output row per id.

    Args:
        dataframe: Frame with ``id``, ``keyword``, ``location`` and ``text``
            columns and, optionally, ``target``.

    Returns:
        A new DataFrame with columns ``id, keyword, location, text`` (plus
        ``target`` when the input has it), where ``text`` holds the marked-up
        sentence and ``keyword`` the ``[SEP]``-terminated keyword segment.
    """
    has_target = 'target' in dataframe
    columns = ['id', 'keyword', 'location', 'text']
    if has_target:
        columns.append('target')

    # Mirror the shared list in a set so each duplicate check is O(1).
    seen_lookup = set(seen_values)
    rows = []

    progress = tqdm(dataframe.iterrows(), total=len(dataframe), position=0, leave=True)
    for _, row_data in progress:
        keyword_tokens = clean_text(row_data.keyword)
        text_tokens = clean_text(row_data.text)
        location = row_data.location

        sent = ' '.join(text_tokens)
        if sent in seen_lookup:
            continue
        seen_lookup.add(sent)
        seen_values.append(sent)

        keywords = ' [SEP] '.join(keyword_tokens)
        if keywords:
            keywords = keywords + ' [SEP]'

        segments = ['[CLS]']
        if location:
            segments.extend((location, '[SEP]'))
        if keywords:
            segments.append(keywords)
        segments.extend((sent, '[SEP]'))
        sent = ' '.join(segments)

        new_row = [row_data.id, keywords, row_data.location, sent]
        if has_target:
            new_row.append(row_data.target)
        rows.append(new_row)

    return pd.DataFrame(rows, columns=columns)

def load_text(file, encoding='utf-8'):
    """Read a CSV file, normalise its text and return the cleaned DataFrame.

    The file is read in one go, passed through ``normalise_text`` (whose first
    row is used as the column names) and then through ``clean_data``.

    Args:
        file: Path of the CSV file to load.
        encoding: Text encoding used to read the file. Set explicitly so the
            result does not depend on the platform's default encoding.

    Returns:
        The DataFrame produced by ``clean_data``.

    Raises:
        ValueError: If the file contains no rows (not even a header).
    """
    with open(file, encoding=encoding) as f:
        raw_text = f.read()

    rows = normalise_text(raw_text)
    if not rows:
        raise ValueError("No rows found in %r" % (file,))

    header, records = rows[0], rows[1:]
    data = pd.DataFrame(records, columns=header)
    return clean_data(data)

# Load and clean both splits. The test split is kept whole: a leftover `[1:3]`
# debugging slice previously reduced it to two rows, which would have produced
# a two-line submission file.
train_data = load_text('./data/train.csv')
test_data = load_text('./data/test.csv')
train_data.to_csv('./data/normalized_train_data.csv', index=False)
 

[nltk_data] Downloading package stopwords to /home/jack/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jack/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
100%|██████████| 7613/7613 [00:03<00:00, 2268.20it/s]
100%|██████████| 3263/3263 [00:01<00:00, 2675.91it/s]


In [2]:
# Class balance of the cleaned training set. Targets are compared as the
# strings "1" (disaster) and "0" (non-disaster).
target_column = train_data['target']
no_train_disasters = int((target_column == "1").sum())
no_train_nondisasters = int((target_column == "0").sum())

total_train_rows = no_train_disasters + no_train_nondisasters
pct_train_disasters = round(no_train_disasters / total_train_rows, 2)
pct_no_train_disasters = round(1 - pct_train_disasters, 2)
print("Balance of data  %s : %s " % (pct_train_disasters, pct_no_train_disasters))


Balance of data  0.41 : 0.59 


- The classes are slightly imbalanced (41% disaster vs. 59% non-disaster tweets), which we will correct by downsampling the majority (non-disaster) class.
- Data seems to be ordered by category, so we'll shuffle it for good measure.

In [3]:
import random

# One fixed seed drives both the downsampling and the shuffle so that a
# Restart & Run All reproduces the same training set. 2018 matches the
# random_state already used for train_test_split in this notebook.
SAMPLING_SEED = 2018


def _count_targets(frame):
    """Return (disaster, non-disaster) row counts; targets are the strings "1"/"0"."""
    targets = frame['target']
    return int((targets == "1").sum()), int((targets == "0").sum())


# Recount here instead of trusting values left behind by an earlier cell.
no_train_disasters, no_train_nondisasters = _count_targets(train_data)

# Drop a random subset of non-disaster rows so both classes have equal size.
no_indices_to_remove = max(0, no_train_nondisasters-no_train_disasters)
indices = train_data.index[train_data['target'] == "0"].tolist()
indices_to_remove = random.Random(SAMPLING_SEED).sample(indices, no_indices_to_remove)
train_data = train_data.drop(indices_to_remove)

no_train_disasters, no_train_nondisasters = _count_targets(train_data)
pct_train_disasters = round(no_train_disasters/(no_train_disasters+no_train_nondisasters), 2)
pct_no_train_disasters = round(1-pct_train_disasters, 2)
print("Balance of data  %s : %s " % (pct_train_disasters, pct_no_train_disasters))

# Shuffle the rows (frac=1 keeps every row) with the same fixed seed.
train_data = train_data.sample(frac=1, random_state=SAMPLING_SEED)

Balance of data  0.5 : 0.5 


In [4]:
# Only a cell's final expression is rendered automatically, so the shape and
# the random sample are displayed explicitly instead of being discarded.
display(train_data.shape)
display(train_data.sample(10))
len(train_data)

5622

In [5]:
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Model Training
# NOTE: this rebinds the module-level `tokenizer`, which the first cell set to a
# TweetTokenizer and which clean_text reads as a global. Re-run the first cell
# before calling load_text / clean_text again.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode every training sentence as a fixed-length (64) sequence of token ids;
# longer sequences are truncated and shorter ones padded at the end.
labels = [int(x) for x in train_data['target'].tolist()]
sents = [tokenizer.tokenize(sent) for sent in train_data['text'].tolist()]
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in sents]
input_ids = pad_sequences(input_ids, maxlen=64, dtype="long", truncating="post", padding="post")

# Attention mask: 1.0 for positions with a positive token id, 0.0 otherwise.
attention_weights = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

# A single call splits ids, labels and masks with the same shuffled indices,
# so the three stay aligned without repeating the seed in a second call.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids, labels, attention_weights, random_state=2018, test_size=0.25)

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

batch_size = 16
# Named `train_dataset` so the `train_data` DataFrame is not overwritten;
# rebinding it made this cell fail when run a second time.
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# The validation set is served as one batch, in its original order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=len(validation_data))


In [6]:

# Report how many examples ended up in the training and validation splits.
for split_inputs in (train_inputs, validation_inputs):
    print(len(split_inputs))


4216
1406


In [None]:
import numpy as np
from tqdm import tqdm, trange
from sklearn.metrics import f1_score
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

def flat_accuracy(preds, labels):
    """Score a batch of predictions with the F1 metric.

    Despite the name, the value returned is an F1 score, not an accuracy.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is the arg-max over axis 1.
        labels: NumPy array of true labels. It is flattened, and any non-zero
            label counts as the positive class.

    Returns:
        The value of ``f1_score`` computed on the boolean-converted labels
        and predictions.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    y_pred = [bool(predicted) for predicted in predicted_classes]
    y_true = [bool(actual) for actual in labels.flatten()]
    return f1_score(y_true=y_true, y_pred=y_pred)


def train_model():
    """Fine-tune a fresh ``bert-base-uncased`` classifier and return its best epoch.

    Trains for five epochs on the module-level ``train_dataloader`` and scores
    each epoch on ``validation_dataloader`` with ``flat_accuracy`` (an F1
    score). The weights of the best-scoring epoch are snapshotted and restored
    before returning; previously ``best_model`` was only an alias of the live
    model, so the last epoch was always returned.

    Returns:
        The model, loaded with the best epoch's weights, moved to the CPU and
        left in eval mode.
    """
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    # Use the shared `device` so the function also runs without a GPU
    # (an unconditional .cuda() fails on CPU-only machines).
    model.to(device)

    param_optimizer = list(model.named_parameters())
    # Both spellings are listed on purpose. Older pytorch_pretrained_bert
    # releases named LayerNorm parameters gamma/beta and read the
    # `weight_decay_rate` group key; newer ones use LayerNorm.weight/bias and
    # read `weight_decay`. Supplying only the old spelling on a newer release
    # silently applies the default decay to every parameter.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01, 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0, 'weight_decay_rate': 0.0}
    ]
    # NOTE(review): no t_total is passed, so the library reports that the
    # warmup schedule is not applied (see the recorded cell output). Confirm
    # whether a constant learning rate is intended.
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=2e-6,
                         warmup=.1)
    epochs = 5

    best_val_accuracy = 0
    best_train_loss = 1
    best_state = None
    for epoch in trange(epochs, desc="Epoch"):
        torch.cuda.empty_cache()
        # Set model to train mode
        model.train()

        tr_loss = 0
        nb_tr_steps = 0

        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)

            # Clear gradients left over from the previous batch
            optimizer.zero_grad()

            # Forward pass: with labels supplied, the model output is used as the loss
            loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            # Backward pass and gradient step
            loss.backward()
            optimizer.step()

            # Track the lowest running-mean loss seen so far (across all epochs)
            tr_loss += loss.item()
            nb_tr_steps += 1
            best_train_loss = min(best_train_loss, tr_loss/nb_tr_steps)

        # Validation set
        model.eval()

        eval_accuracy = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)

            # No gradients are needed for evaluation
            with torch.no_grad():
                logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            eval_accuracy += flat_accuracy(logits, label_ids)
            nb_eval_steps += 1

        # Mean F1 over the validation batches of this epoch
        val_accuracy = eval_accuracy/nb_eval_steps if nb_eval_steps else 0

        # Snapshot the weights whenever the epoch beats the best so far. The
        # first epoch is always stored so a checkpoint exists even at F1 == 0.
        if best_state is None or val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}

        # The reported values are best-so-far figures, not per-epoch ones.
        print("Epoch {}".format(epoch))
        print("Validation F1: {}".format(best_val_accuracy))
        print("Train loss: {}".format(best_train_loss))

    if best_state is not None:
        model.load_state_dict(best_state)
    model.to('cpu')
    model.eval()

    return model


# Train three independent models; they are combined by voting further down.
trained_models = []
for banner in ("Training model 1", "Training model 2", "Training model 3"):
    print(banner)
    trained_models.append(train_model())

model_a, model_b, model_c = trained_models


Training model 1


t_total value of -1 results in schedule not being applied
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Epoch:  20%|██        | 1/5 [00:38<02:32, 38.13s/it]

Epoch 0
Validation F1: 0.7926136363636362
Train loss: 0.5199915481110414


Epoch:  40%|████      | 2/5 [01:16<01:55, 38.46s/it]

Epoch 1
Validation F1: 0.7969924812030076
Train loss: 0.342666357755661


Epoch:  60%|██████    | 3/5 [01:56<01:17, 38.86s/it]

Epoch 2
Validation F1: 0.7969924812030076
Train loss: 0.22512169182300568


Epoch:  80%|████████  | 4/5 [02:34<00:38, 38.81s/it]

Epoch 3
Validation F1: 0.7969924812030076
Train loss: 0.09553971886634827


Epoch: 100%|██████████| 5/5 [03:13<00:00, 38.69s/it]

Epoch 4
Validation F1: 0.7969924812030076
Train loss: 0.09553971886634827





Training model 2


t_total value of -1 results in schedule not being applied
Epoch:  20%|██        | 1/5 [00:38<02:35, 38.85s/it]

Epoch 0
Validation F1: 0.7886676875957122
Train loss: 0.5141381154570616


Epoch:  40%|████      | 2/5 [01:18<01:57, 39.08s/it]

Epoch 1
Validation F1: 0.8005657708628006
Train loss: 0.360504408677419


Epoch:  60%|██████    | 3/5 [01:56<01:17, 38.87s/it]

Epoch 2
Validation F1: 0.8005657708628006
Train loss: 0.35335104167461395


Epoch:  80%|████████  | 4/5 [02:35<00:38, 38.78s/it]

Epoch 3
Validation F1: 0.8005657708628006
Train loss: 0.21859297156333923


Epoch: 100%|██████████| 5/5 [03:13<00:00, 38.80s/it]

Epoch 4
Validation F1: 0.8005657708628006
Train loss: 0.13553287088871002





Training model 3


t_total value of -1 results in schedule not being applied
Epoch:  20%|██        | 1/5 [00:39<02:37, 39.47s/it]

Epoch 0
Validation F1: 0.7919366450683946
Train loss: 0.5139452720794714


In [None]:
# Test predictions
# Encode the test sentences the same way as the training sentences:
# fixed-length (64) id sequences plus a mask over positive token ids.
sents = [tokenizer.tokenize(sent) for sent in test_data['text'].tolist()]
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in sents]
input_ids = pad_sequences(input_ids, maxlen=64, dtype="long", truncating="post", padding="post")
attention_weights = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

input_ids = torch.tensor(input_ids)
test_masks = torch.tensor(attention_weights)
test_dataset = TensorDataset(input_ids, test_masks)
test_sampler = SequentialSampler(test_dataset)
# Bounded batches keep memory use flat however large the test split is.
TEST_BATCH_SIZE = 32
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=TEST_BATCH_SIZE)

ensemble_models = (model_a, model_b, model_c)
for ensemble_model in ensemble_models:
    # Make sure dropout is disabled for inference.
    ensemble_model.eval()

# Collect the logits of every batch. Assigning inside the loop (as before)
# keeps only the last batch whenever there is more than one.
batched_logits = [[] for _ in ensemble_models]
for batch in test_dataloader:
    b_input_ids, b_input_mask = batch

    # Informing torch not to store gradients
    with torch.no_grad():
        for collected, ensemble_model in zip(batched_logits, ensemble_models):
            collected.append(
                ensemble_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask))

# One (num_examples, num_labels) tensor per model, in test-set order.
logits1, logits2, logits3 = (torch.cat(chunks) for chunks in batched_logits)

In [None]:
from scipy.stats import mode
def _predicted_labels(logits):
    """Return the arg-max class of each row as a NumPy array (tensor or array input)."""
    if hasattr(logits, 'detach'):
        logits = logits.detach().cpu().numpy()
    return np.argmax(logits, axis=1)


results1 = _predicted_labels(logits1)
results2 = _predicted_labels(logits2)
results3 = _predicted_labels(logits3)

# Majority vote across the three models, one column per test example.
# NumPy does the stacking; TensorFlow was only being used for this one call.
votes = np.stack([results1, results2, results3])
# Depending on the SciPy release, mode(...)[0] has shape (1, n) or (n,);
# reshape(-1) yields a flat vector of n labels in both cases, whereas the old
# `[0][0]` indexing returns a single scalar on releases that drop the axis.
test_preds = np.asarray(mode(votes, axis=0)[0]).reshape(-1)
# Kept from the original cell; the submission cell below does not use it.
input_ids = b_input_ids.flatten()

In [None]:

import csv 

submission_ids = test_data['id'].tolist()
submission_targets = test_preds.tolist()
# zip() would silently truncate to the shorter list, hiding a broken
# prediction step behind a valid-looking file.
if len(submission_ids) != len(submission_targets):
    raise ValueError("Got %d ids but %d predictions" % (len(submission_ids), len(submission_targets)))

# newline='' lets the csv module control line endings, as its documentation
# requires. The header goes through the writer too, so every line ends the same way.
with open('submission.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'target'])
    writer.writerows(zip(submission_ids, submission_targets))