In [2]:
import numpy as np
import pandas as pd

# PyTorch
import torch
from torchtext import data

# SpaCy
import spacy 
# python -m spacy download en

In [3]:
# (0) set seed for reproduciability
torch.manual_seed(2020)

<torch._C.Generator at 0x7fa299bcdd10>

In [8]:
data_dir = '/home/lefko/personal/deep-learning/data/'

def loadTrain():
    return ''.join([data_dir, 'train.csv'])

def loadTest():
    return ''.join([data_dir, 'test.csv'])

#print('path to train.csv:', ''.join([data_dir, 'train.csv']))
train_df = pd.read_csv(loadTrain())
test_df = pd.read_csv(loadTest())

In [9]:
train_df.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [14]:
# number of rows
len(train_df)

7613

In [21]:
# How many unique locations?
print(len(train_df['location'].unique()), 'unique locations found \n')
# How many unique keywords?
print(len(train_df['keyword'].unique()), 'unique keywords found \n')
# Average tweet length?
print(np.mean(train_df['text'].str.len()), 'average length of a tweet \n')

3342 unique locations found 

222 unique keywords found 

101.03743596479706 average length of a tweet 



In [10]:
from spacy.lang.en import English
parser = English()
spacy.load('en')

# declare tokenizers
TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype = torch.float, batch_first = True)

# declare fields for the dataloader - PyTorch thing
train_fields = [(None, None), (None, None), (None, None), ('text', TEXT), ('label', LABEL)] # (id), (keyword), (location), (text), (label)
test_fields = [(None, None), (None, None), (None, None), ('text', TEXT)] # (id), (keyword), (location), (text)


# custom PyTorch dataset
train_loader = data.TabularDataset(path=loadTrain(), format='csv', fields=train_fields, skip_header=True) # labelled train/validation data
test_loader = data.TabularDataset(path=loadTest(), format='csv', fields=test_fields, skip_header=True) # unlabelled test data

In [11]:
print(vars(train_loader.examples[0]))
print(vars(test_loader.examples[0]))

{'text': ['Our', 'Deeds', 'are', 'the', 'Reason', 'of', 'this', '#', 'earthquake', 'May', 'ALLAH', 'Forgive', 'us', 'all'], 'label': '1'}
{'text': ['Just', 'happened', 'a', 'terrible', 'car', 'crash']}


In [12]:
# split train into train-val (80:20)
train_data, valid_data = train_loader.split(split_ratio=.2)

In [13]:
# build training vocabulary
min_word_freq = 3 # word must occure at least x times
glove_ver = 'glove.6B.100d' # https://nlp.stanford.edu/projects/glove/

TEXT.build_vocab(train_data, min_freq = min_word_freq, vectors = glove_ver) # build word vectors using pre-trained embeddings
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:51, 2.10MB/s]
100%|█████████▉| 399390/400000 [00:15<00:00, 24388.79it/s]

In [15]:
# show some statistics

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
#print(TEXT.vocab.stoi)   

Size of TEXT vocabulary: 1471
Size of LABEL vocabulary: 2
[('#', 672), ('.', 601), ('?', 551), ('the', 514), (':', 386), ('a', 381), ('to', 376), ('in', 355), ('of', 334), ('I', 320)]


In [31]:
# prepare training batches

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

#set batch size
BATCH_SIZE = 64

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [32]:
import torch.nn as nn

class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

In [33]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

In [34]:
#architecture
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(1471, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)
The model has 206,557 trainable parameters
torch.Size([1471, 100])


In [35]:
import torch.optim as optim

#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()

#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc
    
#push to cuda if available
#model = model.to(device)
#criterion = criterion.to(device)

In [36]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [38]:
N_EPOCHS = 15
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Train Loss: 0.404 | Train Acc: 82.55%
	 Val. Loss: 0.511 |  Val. Acc: 77.12%
	Train Loss: 0.344 | Train Acc: 86.18%
	 Val. Loss: 0.544 |  Val. Acc: 74.37%
	Train Loss: 0.290 | Train Acc: 88.46%
	 Val. Loss: 0.553 |  Val. Acc: 76.73%
	Train Loss: 0.247 | Train Acc: 90.40%
	 Val. Loss: 0.623 |  Val. Acc: 76.09%
	Train Loss: 0.230 | Train Acc: 91.29%
	 Val. Loss: 0.652 |  Val. Acc: 75.24%
	Train Loss: 0.187 | Train Acc: 93.08%
	 Val. Loss: 0.690 |  Val. Acc: 74.55%
	Train Loss: 0.152 | Train Acc: 94.94%
	 Val. Loss: 0.779 |  Val. Acc: 72.80%
	Train Loss: 0.151 | Train Acc: 94.60%
	 Val. Loss: 0.757 |  Val. Acc: 73.73%
	Train Loss: 0.126 | Train Acc: 95.70%
	 Val. Loss: 0.778 |  Val. Acc: 74.63%
	Train Loss: 0.117 | Train Acc: 96.29%
	 Val. Loss: 0.881 |  Val. Acc: 72.62%
	Train Loss: 0.094 | Train Acc: 96.48%
	 Val. Loss: 0.909 |  Val. Acc: 74.22%
	Train Loss: 0.082 | Train Acc: 97.26%
	 Val. Loss: 0.959 |  Val. Acc: 73.47%
	Train Loss: 0.072 | Train Acc: 97.61%
	 Val. Loss: 1.022 |  Val.

In [41]:
#load weights
path='./saved_weights.pt'
model.load_state_dict(torch.load(path));
model.eval();

#inference 
import spacy
nlp = spacy.load('en')

def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    prediction = model(tensor, length_tensor)                  #prediction 
    return prediction.item()    

In [55]:
t1 = vars(test_loader.examples[0])


'Just happened a terrible car crash'

In [56]:
print(predict(model, ' '.join(t1['text'])))

0.7229582667350769


In [65]:
preds = []
for i in range(len(test_loader.examples)):
    text = ' '.join(vars(test_loader.examples[i])['text'])
    print(text)
    if predict(model, text) > .7:
        preds.append(1)
    else:
        preds.append(0)

preds

your city in a Û _ http://t.co/S6Rc7dbAL1
RT @calgarysun : Sun photographer Stuart Dryden spotted this vortex spun off a violent storm in Sylvan Lake . # abstorm http://t.co/w0BZ0JNGvB
Holy crap ! BRAVO Sir ! Amazing ! Dramatic Video Shows Plane Landing During Violent Storm http://t.co/xB0bw8h8Ur
Violent Forces Radio : Now Playing Torture - Storm Alert 
 TuneIn Player @ http://t.co/XsSgEdSbH4
Circus tent collapses in violent storm killing 2 kids in New Hampshire http://t.co/j0saXEKBTa
Storm is here ! Violent winds and pounding rains in Evergreen . # yyc
Stay inside for the next little while kids . We 're having a bit of a violent storm right now . -kjc
Watch : Violent Storm Causes Deadly Accident at New Hampshire Circus http://t.co/jpPiR3ZzKA # GMA http://t.co/nV5GCDpIBA
Violent Forces Radio : Now Playing Axegressor - Psalm Before the Storm 
 TuneIn Player @ http://t.co/XsSgEdSbH4
# calgaryweather   It would be nice if they would fix radar before another violent storm   Uninformed cit

[1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,


In [66]:
len(preds)

3263

In [68]:
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')

In [69]:
sample_submission['target'] = preds

In [70]:
sample_submission.head(10)

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,0
3,9,0
4,11,1
5,12,0
6,21,0
7,22,0
8,27,0
9,29,0


In [72]:
sample_submission.to_csv(data_dir + "submission.csv", index=False)