Methods used:
    - Data cleaning: space tokenize, lower, remove punctuation, remove stopwords, lemmatize,  remove urls
    
Methods have yet used:
    - Data cleaning: remove htmls, convert abbreviations, remove emojis (Regex)
    - Data using: keyword, location
    

In [1]:
import torch
import torchtext
import torch.nn as nn
import torchtext.data as data
import pandas as pd
import csv
from IPython.display import display, HTML
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import random
import torch.nn.functional as F

In [2]:
device = torch.device("cpu: 0" if torch.cuda.is_available() else "cpu")
print(device)

cpu:0


In [3]:
SEED = 2020
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
DATA_DIR = "data/train.csv"

In [5]:
training_data = pd.read_csv(DATA_DIR)

In [6]:
def preprocess(sentence):
    result = []
    punctuations = string.punctuation
    lemmatizer = WordNetLemmatizer()
    stopwords_list = stopwords.words("english")
    
    def remove_punctuations(sentence):
        result = "".join([c for c in sentence if c not in punctuations])
        return result
    
    def word_lemmatizer(sentence):
        result = lemmatizer.lemmatize(sentence)
        return result
    
    def word_lowercase(sentence):
        return sentence.lower()
    
    def excluding_word(sentence):
        if sentence in stopwords_list:
            return None
        if len(sentence) <= 1:
            return None
        return sentence
    
    def remove_URL(text):
        url = re.compile(r'https?://\S+|www\.\S+')
        return url.sub(r'',text)
    
    def generate_bigrams(x):
        n_grams = set(zip(*[x[i:] for i in range(2)]))
        for n_gram in n_grams:
            x.append(' '.join(n_gram))
        return x
    
    for word in sentence:
        word = remove_URL(word)
        word = word_lowercase(word)
        word = remove_punctuations(word)
        word = word_lemmatizer(word)
        word = excluding_word(word)
        if word is not None:
            result.append(word)
            
    result = generate_bigrams(result)       
    return result

In [7]:
display(HTML(training_data.to_html()))

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


In [8]:
print(training_data.isnull().sum())

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64


In [9]:
print(f"Number of rows: {len(training_data)}")

Number of rows: 7613


In [10]:
TEXT = data.Field(tokenize="spacy", preprocessing=preprocess)
ID = data.Field(dtype=torch.float64)
LABEL = data.LabelField(dtype=torch.float64)

In [11]:
fields = [['id',ID],[None,None],[None,None],['text',TEXT],['label',LABEL]]

In [12]:
dataset = data.TabularDataset(DATA_DIR, format="csv", fields = fields, skip_header=True)

In [13]:
#print(vars(dataset.examples[432]))

In [14]:
trainset, valset = dataset.split(split_ratio=0.9, random_state=random.seed(SEED))
print(f"Number of training samples: {len(trainset)}")
print(f"Number of validating samples: {len(valset)}")

Number of training samples: 6852
Number of validating samples: 761


In [15]:
MAX_VOCAB_SIZE = 50000
TEXT.build_vocab(trainset,
                max_size = MAX_VOCAB_SIZE,
                vectors= "glove.6B.100d",
                unk_init = torch.Tensor.normal_)
LABEL.build_vocab(trainset)
ID.build_vocab(trainset)

In [16]:
print(len(TEXT.vocab.freqs))
print(len(ID.vocab))
print(len(ID.vocab))

58915
6854
6854


In [17]:
BATCH_SIZE = 128

train_iter, val_iter = data.BucketIterator.splits(
    (trainset, valset),
    sort=False,
    batch_size = BATCH_SIZE,
    device = device)

In [18]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_dim, pad_idx):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx= pad_idx)
        self.fc = nn.Linear(embed_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(1,0,2)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze()
        output = self.fc(pooled)
        return output

In [19]:
INPUT_DIM = len(TEXT.vocab)
EMBED_DIM = 100
OUTPUT_DIM = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, EMBED_DIM, OUTPUT_DIM, PAD_IDX).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 5e-4, weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss().to(device)

In [20]:
pretrained_embedding = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embedding)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBED_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBED_DIM)

In [21]:
def binary_accuracy(predictions, labels):
    predictions = torch.round(torch.sigmoid(predictions))
    correct = (predictions==labels).sum().float()
    acc = correct / len(labels)
    return acc

In [22]:
from tqdm import tqdm
def train_step(model, optimizer, criterion, train_iter):
    total_loss = 0
    total_acc = 0
    model.train()
    for batch in train_iter:
        optimizer.zero_grad()
        text = batch.text
        ids = batch.id
        label = batch.label
        
        output = model(text).squeeze()
        loss = criterion(output, label)
        acc = binary_accuracy(output, label)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_acc += acc.item()
    return total_loss/ len(train_iter), total_acc/ len(train_iter)

In [23]:
def evaluate_step(model, optimizer, criterion, val_iter):
    total_loss = 0
    total_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in val_iter:
            text = batch.text
            ids = batch.id
            label = batch.label

            output = model(text).squeeze()
            loss = criterion(output, label)
            acc = binary_accuracy(output, label)

            total_loss += loss.item()
            total_acc += acc.item()
        return total_loss/ len(val_iter), total_acc/ len(val_iter)

In [24]:
TRAIN_DATA = 1

loss_list = {
    "train": [],
    "val" : []
}

acc_list = {
    "train": [],
    "val" : []
}

if TRAIN_DATA:
    EPOCHS = 30
    best_val_loss = 100
    for epoch in range(EPOCHS):
        train_loss, train_acc = train_step(model,optimizer,criterion, train_iter)
        val_loss, val_acc = evaluate_step(model,  optimizer, criterion, val_iter)
        loss_list["train"].append(train_loss)
        loss_list["val"].append(val_loss)
        acc_list["train"].append(train_loss)
        acc_list["val"].append(val_loss)
        
        print(f'Epoch: {epoch+1:02} ')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'model/model.pt')
            print("Model saved!")
        
        
    print("Training Completed!")

Epoch: 01 
	Train Loss: 0.683 | Train Acc: 62.27%
	 Val. Loss: 0.671 |  Val. Acc: 63.68%
Model saved!
Epoch: 02 
	Train Loss: 0.668 | Train Acc: 67.69%
	 Val. Loss: 0.654 |  Val. Acc: 64.49%
Model saved!
Epoch: 03 
	Train Loss: 0.652 | Train Acc: 70.28%
	 Val. Loss: 0.637 |  Val. Acc: 66.18%
Model saved!
Epoch: 04 
	Train Loss: 0.631 | Train Acc: 73.71%
	 Val. Loss: 0.620 |  Val. Acc: 67.48%
Model saved!
Epoch: 05 
	Train Loss: 0.608 | Train Acc: 75.90%
	 Val. Loss: 0.604 |  Val. Acc: 68.96%
Model saved!
Epoch: 06 
	Train Loss: 0.583 | Train Acc: 77.60%
	 Val. Loss: 0.590 |  Val. Acc: 70.56%
Model saved!
Epoch: 07 
	Train Loss: 0.553 | Train Acc: 79.96%
	 Val. Loss: 0.576 |  Val. Acc: 71.60%
Model saved!
Epoch: 08 
	Train Loss: 0.525 | Train Acc: 81.71%
	 Val. Loss: 0.561 |  Val. Acc: 72.24%
Model saved!
Epoch: 09 
	Train Loss: 0.499 | Train Acc: 82.97%
	 Val. Loss: 0.544 |  Val. Acc: 73.57%
Model saved!
Epoch: 10 
	Train Loss: 0.473 | Train Acc: 84.38%
	 Val. Loss: 0.528 |  Val. Acc: 

In [25]:
print(TEXT.vocab.stoi["jamaicaobserver cnewslive"])

8121


In [26]:
print(preprocess(["Help", "i've","been","coding","for","a","year!"]))

['help', 'ive', 'coding', 'year', 'ive coding', 'coding year', 'help ive']


In [27]:
import spacy
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = generate_bigrams([tok.text for tok in nlp.tokenizer(sentence)])
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()