# Data Normalization/Analysis

In [1]:
import torch
import pandas as pd
import re
import csv 
import string
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler
nltk.download('stopwords')
nltk.download('wordnet')

def normalise_text(text): 
    html_space = re.compile("%20")
    newline_pattern =  re.compile("\\n([^0-9])")
    numeric_pattern = re.compile("([0-9]+),([0-9]{3},?)+")
    punctuation_pattern = re.compile("[^\w\s]")
    url_pattern = re.compile(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
    normalized_text = text
     
    while bool(html_space.search(normalized_text)):
        normalized_text = re.sub(html_space, r' ', normalized_text)
        
    while bool(newline_pattern.search(normalized_text)):
        normalized_text = re.sub(newline_pattern, r' \1', normalized_text)

    while bool(numeric_pattern.search(normalized_text)):
        normalized_text = re.sub(numeric_pattern, r'\1\2', normalized_text)
        
    normalized_text = re.sub(url_pattern, '', normalized_text)
    normalized_text = str.lower(normalized_text)
    
    lines = normalized_text.split('\n')

    lines = [x for x in csv.reader(lines, quotechar='"', delimiter=',',
               quoting=csv.QUOTE_ALL, skipinitialspace=True) if len(x) > 0]
    
    normalized_lines = []
    for line in lines:
        normalized_lines.append([re.sub(punctuation_pattern, '', x) for x in line])

    return normalized_lines



def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    # Avoid stopword removal - prepositional words are useful for BERT
    blacklist = [] # stopwords.words('english')
    tokenizer = TweetTokenizer()
    tokens = []
    if not text:
        return tokens
    
    tokens = tokenizer.tokenize(text)
    if any(tokens):
        tokens = [x for x in tokens if x not in blacklist]
        tokens = [lemmatizer.lemmatize(x) for x in tokens]
    return tokens


max_token_len = 0 
    
def clean_data(dataframe):
    for row in dataframe.iterrows():
        row_data = row[1]
        keywords = clean_text(row_data.keyword)
        text = clean_text(row_data.text) 
            
        sent = ' '.join(text)
        sent = ' [CLS] ' + sent + ' [SEP] '
        if 'target' in dataframe:
            new_row = [row_data.id, keywords, row_data.location, sent, row_data.target]
        else:
            new_row = [row_data.id, keywords, row_data.location, sent]
        dataframe.iloc[row[0]] = new_row
    return dataframe

def load_text(file):
    with open(file) as f:
        lines = f.read()
    normalised_text = normalise_text(lines)

    data = pd.DataFrame(normalised_text[1:], columns=normalised_text[0])
    data = clean_data(data)
    return data

train_data = load_text('./data/train.csv')
test_data = load_text('./data/test.csv')
train_data.to_csv('./data/normalized_train_data.csv', index=False)
 

[nltk_data] Downloading package stopwords to /Users/jack/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jack/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  arr_value = np.array(value)


In [5]:
no_train_disasters = len(train_data.loc[train_data['target'] == "1"])
no_train_nondisasters = len(train_data.loc[train_data['target'] == "0"])
pct_train_disasters = round(no_train_disasters/(no_train_disasters+no_train_nondisasters), 2)
pct_no_train_disasters = round(1-pct_train_disasters, 2)
print("Balance of data  %s : %s " % (pct_train_disasters, pct_no_train_disasters))


Balance of data  0.43 : 0.57 


- Demonstrates a slight data augmentation inbalance, which we will attempt to correct by downsampling.
- Data seems to be ordered by category, so we'll shuffle it for good measure.

In [6]:
import random

no_indices_to_remove = no_train_nondisasters-no_train_disasters
indices = [x for x in train_data.loc[train_data['target'] == "0"].index]
indices_to_remove = random.sample(indices, no_indices_to_remove)
train_data = train_data.drop(indices_to_remove)

no_train_disasters = len(train_data.loc[train_data['target'] == "1"])
no_train_nondisasters = len(train_data.loc[train_data['target'] == "0"])
pct_train_disasters = round(no_train_disasters/(no_train_disasters+no_train_nondisasters), 2)
pct_no_train_disasters = round(1-pct_train_disasters, 2)
print("Balance of data  %s : %s " % (pct_train_disasters, pct_no_train_disasters))

train_data = train_data.sample(frac=1)

Balance of data  0.5 : 0.5 


In [9]:
train_data.shape
train_data.sample(10)

Unnamed: 0,id,keyword,location,text,target
6381,9119,"[suicide, bomb]",homs syria,"[11, soldier, killed, isi, suicide, bomb, air,...",1
3143,4517,[emergency],kuwait,"[plane, new, york, kuwait, diverts, uk, declar...",1
6966,9991,[tsunami],in the word of god,"[author_mike, amen, today, day, salvation, thx...",1
159,229,"[airplane, accident]",,"[expert, france, begin, examining, airplane, d...",1
2071,2973,[dead],,"[beforeitsnews, hundred, feared, dead, libyan,...",1
6477,9264,[sunk],london england,"[still, hasnt, sunk, ive, actually, met, idol]",0
6406,9157,"[suicide, bomber]",,"[news, islamic, state, claim, suicide, bombing...",1
6253,8935,[snowstorm],in the spirit world,"[photo, mothernaturenetwork, thundersnow, hear...",1
5073,7231,"[natural, disaster]",,"[top, insurer, blast, lack, australian, govt, ...",1
1695,2446,[collide],,"[devia, ler, collide, wattys, 2015, wattpad, t...",0


In [14]:
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Model Training
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

labels = [int(x) for x in train_data['target'].tolist()]
sents = [tokenizer.tokenize(sent) for sent in train_data['text'].tolist()]
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in sents]
input_ids = pad_sequences(input_ids, maxlen=64, dtype="long", truncating="post", padding="post")
attention_weights = []
for seq in input_ids:
  weights = [float(i>0) for i in seq]
  attention_weights.append(weights)
  
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_weights, input_ids,
                                             random_state=2018, test_size=0.1)
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [16]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
print('Model loaded!')




100%|██████████| 407873900/407873900 [14:07<00:00, 481348.51B/s] 


Model loaded!


In [None]:
import numpy as np
from tqdm import tqdm, trange

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

t = []  
train_loss_set = []
epochs = 2

for _ in trange(epochs, desc="Epoch"):
  # Set model to train mode
  model.train()
  
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  for step, batch in enumerate(train_dataloader):
    b_input_ids, b_input_mask, b_labels = batch
      
    # Clear dataloader gradients from previous batch
    optimizer.zero_grad()
      
    # Forward pass
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())
      
    # Backward pass
    loss.backward()
    
    # Gradient step.
    optimizer.step()
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
    
print("Train loss: {}".format(tr_loss/nb_tr_steps))

# Validation
# Put model in evaluation
model.eval()

# Tracking variables 
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# Evaluate data for one epoch
for batch in validation_dataloader:
    b_input_ids, b_input_mask, b_labels = batch
    
    # Informing torch not to store gradients
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        
    tmp_eval_accuracy = flat_accuracy(logits, b_labels)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
    
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

t_total value of -1 results in schedule not being applied
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1005.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
