In [1]:
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import spacy
import re
import pandas as pd

from sklearn.model_selection import train_test_split
from collections import defaultdict

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from torch.utils.data import Dataset, DataLoader

## Functions for Cleaning Text

In [2]:
# Matches HTML line-break tags such as <br>, <br/> or < BR /> (case-insensitive).
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return `x` with every HTML line-break tag replaced by a newline."""
    return re_br.sub("\n", x)

# Loaded once when the cell runs; only its tokenizer is used below.
# NOTE(review): the 'en' shortcut name is not accepted by spaCy v3+ — confirm the
# installed spaCy version, or switch to an explicit package such as 'en_core_web_sm'.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize `x` with spaCy after converting <br> tags to newlines.

    Returns the list of token strings.
    """
    doc = my_tok.tokenizer(sub_br(x))
    return [token.text for token in doc]

## Functions for Encoding Sentences

In [246]:
def get_vocab(content):
    """Count word occurrences over an iterable of text lines.

    Each line is split on whitespace and every resulting token is tallied,
    so a word that appears several times in one line is counted each time.

    Returns:
        A ``defaultdict(float)`` mapping word -> total number of occurrences.
    """
    counts = defaultdict(float)
    tokens = (token for text in content for token in text.split())
    for token in tokens:
        counts[token] += 1
    return counts

def encode_sentence(x, vocab2index, N=26, padding_start=True):
    """Encode a sentence as a fixed-length array of word ids.

    Args:
        x: Sentence string; it is tokenized by splitting on whitespace.
        vocab2index: Mapping word -> integer id. Must contain the key "UNK",
            whose id is used for words missing from the mapping.
        N: Length of the returned array. Sentences with more than N tokens
            are truncated to their first N tokens.
        padding_start: If True the ids are right-aligned (zeros on the left);
            if False they are left-aligned (zeros on the right).

    Returns:
        ``np.ndarray`` of shape ``(N,)`` and dtype int32, padded with 0.
    """
    enc = np.zeros(N, dtype=np.int32)
    unk_id = vocab2index["UNK"]
    ids = [vocab2index.get(w, unk_id) for w in x.split()][:N]
    l = len(ids)
    if l == 0:
        # Nothing to write; return the all-padding array.
        return enc
    if padding_start:
        enc[N - l:] = ids
    else:
        # Write into the padded array so the result always has length N
        # (previously this branch returned a variable-length Python list).
        enc[:l] = ids
    return enc

In [247]:
class TweetDataset(Dataset):
    """Dataset of encoded sentences paired with their labels.

    Every text in `X` is encoded up front with `encode_sentence`, using the
    module-level `vocab2index` mapping (it is read as a global, so it must
    exist before an instance is created). `y` is stored as given.
    """

    def __init__(self, X, y, N = 26, padding_start = True):
        encoded = [encode_sentence(text, vocab2index, N, padding_start) for text in X]
        self.X = np.array(encoded)
        self.y = y

    def __len__(self):
        # The number of samples is taken from the labels.
        return len(self.y)

    def __getitem__(self, idx):
        # Returns (encoded sentence, label) for position `idx`.
        return self.X[idx], self.y[idx]

## Gather Pre-Trained Embeddings

In [9]:
def unpack_glove():
    """Download and unzip the GloVe 6B word vectors via IPython shell escapes.

    The `!` lines are shell commands, so this function only works when the
    cell is run by IPython/Jupyter.
    """
    # Fetch the archive into the current working directory.
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    # Creates ./data; note that the unzip below does not extract into it.
    ! mkdir data
    # No target directory is given, so files land in the current working directory.
    ! unzip glove.6B.zip
    
def delete_rare_words(word_vecs, word_count, min_df=4):
    """Drop rare words that have no pre-trained vector.

    A word is removed when its count is below `min_df` AND it is absent from
    `word_vecs`. `word_count` is modified in place and also returned.
    """
    doomed = [
        word for word, count in word_count.items()
        if count < min_df and word not in word_vecs
    ]
    for word in doomed:
        del word_count[word]
    return word_count

def loadGloveModel(gloveFile="glove.6B.50d.txt"):
    """Load word vectors from a GloVe text file into a dictionary.

    Each non-empty line is expected to be ``word v1 v2 ... vd`` separated by
    whitespace.

    Args:
        gloveFile: Path to the GloVe text file.

    Returns:
        dict mapping word -> ``np.ndarray`` of floats.
    """
    word_vecs = {}
    # Explicit UTF-8 so loading does not depend on the platform default
    # encoding; the context manager guarantees the file is closed.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Skip blank lines instead of failing on splitLine[0].
                continue
            word = splitLine[0]
            word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

def create_embedding_matrix(word_vecs, word_count, min_df=4, emb_size=50):
    """Build an embedding matrix and vocabulary from pre-trained vectors.

    `word_count` is first pruned with `delete_rare_words` (in place). Row 0 of
    the matrix is the all-zero padding vector and row 1 is a random vector for
    "UNK". Each remaining word gets its vector from `word_vecs` when present,
    otherwise a random vector drawn uniformly from [-0.25, 0.25).

    Returns:
        (W, vocab, vocab2index): the float32 matrix of shape
        ``(len(word_count) + 2, emb_size)``, an array of the words by row
        (row 0 is the empty string), and a dict word -> row index (which
        contains "UNK" but no entry for the padding row).
    """
    word_count = delete_rare_words(word_vecs, word_count, min_df)
    n_rows = len(word_count) + 2
    # Row 0 (padding) keeps the zeros it is initialised with.
    W = np.zeros((n_rows, emb_size), dtype="float32")
    W[1] = np.random.uniform(-0.25, 0.25, emb_size)
    vocab = ["", "UNK"]
    vocab2index = {"UNK": 1}
    for row, word in enumerate(word_count, start=2):
        if word in word_vecs:
            W[row] = word_vecs[word]
        else:
            W[row] = np.random.uniform(-0.25,0.25, emb_size)
        vocab2index[word] = row
        vocab.append(word)
    return W, np.array(vocab), vocab2index

## CNN Model 

In [293]:
class Tweet_CNN(nn.Module):
    """1-D CNN text classifier with three parallel convolution branches.

    Token ids are embedded, passed through convolutions with kernel sizes
    1, 2 and 3 (100 output channels each), max-pooled over the sequence axis,
    concatenated into 300 features, batch-normalised, passed through dropout
    and projected to a single logit.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Embedding dimension.
        glove_weights: Optional array/tensor of shape (vocab_size, emb_size)
            used to initialise the embedding table.
    """

    def __init__(self, vocab_size, emb_size, glove_weights=None):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_size, padding_idx = 0)
        if glove_weights is not None:
            # as_tensor accepts NumPy arrays as well as tensors; copy_ casts
            # to the embedding's dtype.
            self.embed.weight.data.copy_(torch.as_tensor(glove_weights))

        self.conv1 = nn.Conv1d(emb_size, 100, kernel_size=1)
        self.conv2 = nn.Conv1d(emb_size, 100, kernel_size=2)
        self.conv3 = nn.Conv1d(emb_size, 100, kernel_size=3)

        # Global max over the sequence axis. For length-26 inputs this equals
        # MaxPool1d with kernel sizes 26/25/24 on the three branches, but it
        # also works for other sequence lengths.
        self.pool = nn.AdaptiveMaxPool1d(1)

        self.bn = nn.BatchNorm1d(300)

        self.dropout = nn.Dropout(p=0.2)
        self.fc = nn.Linear(300, 1)

    def forward(self, x):
        """Map integer token ids of shape (batch, seq_len) to logits of shape (batch, 1)."""
        x = self.embed(x)
        # Conv1d expects (batch, channels, seq_len).
        x = x.transpose(1,2)
        x1 = self.pool(F.relu(self.conv1(x)))
        x2 = self.pool(F.relu(self.conv2(x)))
        x3 = self.pool(F.relu(self.conv3(x)))
        # Each branch is (batch, 100, 1); stack them to (batch, 100, 3),
        # then flatten to (batch, 300).
        out = torch.cat([x1, x2, x3], 2)
        out = out.view(out.size(0), -1)
        out = self.dropout(self.bn(out))

        return self.fc(out)

## Training Functions

In [116]:
def train_epocs(model, optimizer, epochs=10):
    """Train `model` for `epochs` passes over the global `train_dl` loader.

    Each batch is optimised with binary cross-entropy on logits. After every
    epoch the sample-weighted mean training loss is printed together with the
    two values returned by `valid_metrics(model)`.

    NOTE(review): `train_dl` is a notebook global and `valid_metrics` is not
    defined in the visible part of this notebook — both must exist before
    this function is called.
    """
    for _ in range(epochs):
        model.train()
        loss_sum, n_seen = 0, 0
        for x, y in train_dl:
            inputs = x.long()
            targets = y.float().unsqueeze(1)
            loss = F.binary_cross_entropy_with_logits(model(inputs), targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Weight each batch loss by its size so the epoch mean is per sample.
            n_batch = inputs.size(0)
            loss_sum += n_batch*loss.item()
            n_seen += n_batch
        train_loss = loss_sum/n_seen
        val_loss, val_accuracy = valid_metrics(model)

        print("train_loss %.3f val_loss %.3f val_accuracy %.3f" % (
            train_loss, val_loss, val_accuracy))

In [229]:
# Encode the train/validation texts (default N=26, left-padded) and wrap them
# in DataLoaders with batch size 2; only the training loader shuffles.
# NOTE(review): X_train, X_val, y_train, y_val and vocab2index are defined in
# the "Running everything" cells further down — those must be run first.
train_ds = TweetDataset(X_train, y_train)
valid_ds = TweetDataset(X_val, y_val)
train_dl = DataLoader(train_ds, batch_size=2, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=2)

In [230]:
# Pull a single batch from the training loader to walk through the layer shapes.
x,y=next(iter(train_dl))

In [231]:
# Display the encoded batch: ids are left-padded with 0, and 1 is the "UNK" id.
x

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,  357, 2356,    1, 2136,    1, 1331,  602,    1,    1,
            1,    1],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0, 2090,    3,  418,
          419,  420,  421,  422,   51,  423,  424,    9, 1100,   99, 1735, 1423,
            1,    1]], dtype=torch.int32)

In [232]:
# Vocabulary size, counting the "<PAD>" and "UNK" entries of `words`.
V = len(words)
# Small embedding width used for the shape walkthrough cells below.
emb_size = 3

In [233]:
# Embedding table for the walkthrough; index 0 is the padding index.
embed = nn.Embedding(V, emb_size, padding_idx=0)

In [None]:
# Kernel size = 1

In [278]:
# Kernel-size-1 branch: embed the batch, move channels to axis 1 for Conv1d,
# convolve + ReLU, then max-pool over the whole (length-26) sequence axis.
conv_1 = nn.Conv1d(in_channels=emb_size, out_channels=100, kernel_size=1)
pool1 = nn.MaxPool1d(kernel_size = 26)
x1 = embed(x.long()).transpose(1,2)
x1.size()
x1 = pool1(F.relu(conv_1(x1)))

In [None]:
# Bare reference to the pooling class; evaluating it only displays the class object.
nn.MaxPool1d

In [265]:
## Kernel size = 2

In [279]:
# Kernel-size-2 branch: the convolution shortens the length-26 sequence to 25,
# so the pooling window of 25 covers the whole remaining axis.
conv_2 = nn.Conv1d(in_channels=emb_size, out_channels=100, kernel_size=2)
pool2 = nn.MaxPool1d(kernel_size = 25)
x2 = embed(x.long()).transpose(1,2)
x2.size()
x2 = pool2(F.relu(conv_2(x2)))

In [254]:
## Kernel size = 3

In [280]:
# Kernel-size-3 branch: the convolution shortens the length-26 sequence to 24,
# so the pooling window of 24 covers the whole remaining axis.
conv_3 = nn.Conv1d(in_channels=emb_size, out_channels=100, kernel_size=3)
pool3 = nn.MaxPool1d(kernel_size = 24)
x3 = embed(x.long()).transpose(1,2)
x3.size()
x3 = pool3(F.relu(conv_3(x3)))

In [282]:
# Concatenate the three pooled branches along the last axis: each is
# (batch, 100, 1), so the result is (batch, 100, 3).
out = torch.cat([x1, x2, x3], 2)

In [283]:
# Display the concatenated features (one row per channel, one column per kernel size).
out

tensor([[[0.3934, 0.4180, 0.1096],
         [1.6549, 0.6368, 0.6135],
         [0.1761, 0.5728, 0.4253],
         [0.9770, 0.7884, 0.3343],
         [0.8955, 0.8736, 0.9640],
         [0.2138, 0.5802, 0.8174],
         [0.8102, 0.7806, 0.0000],
         [0.4834, 0.7983, 0.5853],
         [0.4859, 0.7846, 0.2344],
         [0.0000, 0.3353, 0.3590],
         [0.6423, 0.9788, 0.5589],
         [0.5552, 0.5459, 0.6589],
         [0.9799, 0.9322, 0.6591],
         [0.6362, 0.8670, 0.9610],
         [0.0000, 0.6415, 0.9623],
         [1.1013, 0.0000, 0.4231],
         [1.6403, 1.0163, 0.8996],
         [0.5591, 0.6180, 1.0517],
         [0.4567, 0.7392, 0.4556],
         [0.3990, 1.3885, 0.3466],
         [0.0000, 0.8018, 0.6271],
         [0.0000, 0.6095, 0.0000],
         [0.2778, 0.5612, 0.2590],
         [0.2340, 0.1504, 0.2391],
         [0.0000, 0.5153, 0.9233],
         [0.3141, 0.4403, 0.5615],
         [0.8181, 0.7354, 0.8874],
         [0.0000, 1.0957, 0.4820],
         [1.3228, 0.

In [285]:
# Flatten everything after the batch axis: (batch, 100, 3) -> (batch, 300).
out = out.view(out.size(0), -1)

In [291]:
# Create the batch-norm layer in this cell so that it exists before it is
# used and the notebook runs top to bottom on a fresh kernel.
bn = nn.BatchNorm1d(300)
dropout = nn.Dropout(p=0.2)
# Normalise the 300 flattened features, then apply dropout.
out = dropout(bn(out))

In [290]:
# Batch normalisation over the 300 flattened features (3 kernel sizes x 100 channels).
bn = nn.BatchNorm1d(300)

## Running everything

In [10]:
# Load the training CSV; its `text` and `target` columns are used below.
disaster = pd.read_csv('nlp-getting-started/train.csv')

In [15]:
# Inputs (the `text` column) and targets (the `target` column) as NumPy arrays.
X = np.array(disaster.text)
y = np.array(disaster.target)

In [16]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
# Count whitespace-separated tokens over the training texts, then discard
# every token that occurs fewer than 5 times.
word_count = get_vocab(X_train)
rare_words = [token for token in word_count if word_count[token] < 5]
for word in rare_words:
    del word_count[word]

In [31]:
# Index 0 is reserved for padding and index 1 for unknown words; every
# remaining word in `word_count` gets the next free index, in iteration order.
words = ["<PAD>", "UNK"]
vocab2index = {token: position for position, token in enumerate(words)}
for word in word_count:
    next_index = len(words)
    words.append(word)
    vocab2index[word] = next_index