In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm

from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

%matplotlib inline

In [2]:
# Load the first worksheet of the training workbook.
# `sheet_name` is the supported keyword; the older `sheetname` spelling is deprecated.
d_train = pd.read_excel('data/train-dataset.xlsx', sheet_name=0)

  return func(*args, **kwargs)


In [None]:
# Work on the first 1000 rows only.
d_train = d_train.head(1000)

In [None]:
def get_flag_space(sentence):
    """Encode where spaces occur as one flag per non-space character.

    Every non-space character contributes a '0'. When a space is met, the
    flag of the most recent non-space character is switched to '1', i.e.
    "this character is followed by a space". Runs of consecutive spaces
    therefore mark the same character only once.

    Spaces that occur before any non-space character have no character to
    attach to and are ignored (previously they raised IndexError).

    Parameters
    ----------
    sentence : str
        Text whose spaces should be encoded.

    Returns
    -------
    str
        String of '0'/'1' with one entry per non-space character of
        ``sentence``; empty when ``sentence`` has no non-space characters.
    """
    flag_space = []
    for char in sentence:
        if char != ' ':
            flag_space.append('0')
        elif flag_space:
            # Mark the previous character as "followed by a space".
            flag_space[-1] = '1'

    return ''.join(flag_space)

In [None]:
def cleansing(sentence):
    """Return `sentence` reduced to its alphanumeric tokens, separated by single spaces.

    The text is split with nltk's ``word_tokenize``; tokens for which
    ``str.isalnum()`` is false are dropped.
    """
    alnum_tokens = filter(str.isalnum, word_tokenize(sentence))
    return " ".join(alnum_tokens)

In [None]:
def cleansing_raw(sentence):
    """Return the alphanumeric tokens of `sentence` concatenated without separators.

    Tokenisation is done by nltk's ``word_tokenize``; any token that fails
    ``str.isalnum()`` is discarded before joining.
    """
    tokens = word_tokenize(sentence)
    return "".join(token for token in tokens if token.isalnum())

In [None]:
# Derive both text variants from the `clean` column:
#   clean_2 - alphanumeric tokens joined with spaces
#   raw_2   - the same tokens joined with nothing in between
d_train['clean_2'] = d_train['clean'].apply(cleansing)
d_train['raw_2'] = d_train['clean'].apply(cleansing_raw)

In [None]:
# One '0'/'1' flag per non-space character of clean_2 ('1' = followed by a space).
d_train['label'] = d_train['clean_2'].apply(get_flag_space)

In [None]:
# Keep only rows whose cleaned text is longer than 10 characters.
d_train = d_train.loc[d_train['clean_2'].str.len().gt(10)]

In [None]:
# Distinct characters that occur anywhere in the space-free texts.
chars = set("".join(d_train['raw_2']))

In [None]:
# Joining with ' ' puts the space character into `text` whenever there is more than one character.
text = ' '.join(chars)
# Sorted vocabulary, then lookup tables in both directions.
chars = sorted(set(text))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [None]:
def prepare_sequence(sentence, char2idx):
    """Convert `sentence` into a 1-D ``torch.long`` tensor of vocabulary indices.

    Each character is looked up in `char2idx`; a character missing from the
    mapping raises ``KeyError``.
    """
    indices = []
    for symbol in sentence:
        indices.append(char2idx[symbol])

    return torch.tensor(indices, dtype=torch.long)

In [None]:
def transform_label(input_):
    """Turn a string of digit flags into a float column tensor.

    Each character of `input_` is parsed with ``int`` and the values are
    arranged as a ``(len(input_), 1)`` ``torch.FloatTensor``.
    """
    digits = [int(flag) for flag in input_]
    column = np.array(digits).reshape(len(input_), 1)

    return torch.FloatTensor(column)

In [None]:
# (space-free text, flag string) pairs, one per remaining row.
train_sent = [pair for pair in zip(d_train['raw_2'], d_train['label'])]

In [None]:
class Classifier(nn.Module):
    """Per-character tagger: embedding -> LSTM -> four linear layers -> sigmoid.

    Parameters
    ----------
    embedding_dim : int
        Size of each character embedding vector.
    hidden_dim : int
        Size of the LSTM hidden state.
    vocab_size : int
        Number of rows in the embedding table.
    tagset_size : int
        Number of output values produced per character.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(Classifier, self).__init__()
        self.hidden_dim = hidden_dim

        # Lookup table from vocabulary index to embedding vector. The inputs
        # in this notebook are character indices; the attribute name is kept.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes the embeddings as inputs and outputs hidden states
        # with dimensionality hidden_dim. batch_first is left at its default
        # (False), so it expects input of shape (seq_len, batch, embedding_dim).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Linear layers mapping from hidden-state space down to tag space.
        self.hidden2tag = nn.Linear(hidden_dim, 100)
        self.fc2 = nn.Linear(100, 80)
        self.fc3 = nn.Linear(80, 60)
        self.fc4 = nn.Linear(60, tagset_size)

    def forward(self, sentence):
        """Score every position of one sequence.

        Parameters
        ----------
        sentence : torch.Tensor
            Expected to be a 1-D tensor of vocabulary indices, as produced by
            ``prepare_sequence``.

        Returns
        -------
        torch.Tensor
            Tensor of shape ``(len(sentence), tagset_size)`` with values in
            (0, 1); a sigmoid has already been applied.
        """
        embeds = self.word_embeddings(sentence)

        # Lay the characters out along the *sequence* axis with a batch of 1.
        # The previous (1, len, -1) layout made the LSTM see len(sentence)
        # independent sequences of length 1, so no state was carried from one
        # character to the next.
        lstm_in = embeds.view(len(sentence), 1, -1)

        lstm_out, _ = self.lstm(lstm_in)

        # Drop the batch axis again: (len, 1, hidden_dim) -> (len, hidden_dim).
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))

        tag_space = self.fc2(tag_space)
        tag_space = self.fc3(tag_space)
        tag_space = self.fc4(tag_space)

        tag_scores = torch.sigmoid(tag_space)

        return tag_scores

In [None]:
# Seed torch's RNG so the randomly initialised weights are the same on every run.
torch.manual_seed(42)
classifier = Classifier(
    embedding_dim=300,
    hidden_dim=100,
    vocab_size=len(char_indices),
    tagset_size=1,
)

In [None]:
optimizer = optim.Adam(classifier.parameters(), lr = 0.001)
# Classifier.forward already ends with torch.sigmoid, so its outputs are
# probabilities. nn.BCEWithLogitsLoss would apply a second sigmoid internally;
# nn.BCELoss is the criterion that takes probabilities directly.
loss_func = nn.BCELoss()

In [None]:
# Initial value only; the training loop sets running_loss back to 0 at the start of each epoch.
running_loss = 0

In [None]:
# Smoke test: run the untrained model once on the first training text.
sent_in = prepare_sequence(train_sent[0][0], char_indices)
with torch.no_grad():
    output = classifier(sent_in)

In [None]:
# Shape of the smoke-test output from the previous cell.
output.shape

In [None]:
n_epochs = 2  # short toy run; increase for real training
loss_list = []
for epoch in range(n_epochs):
    running_loss = 0
    for sent_index, (sentence, tags) in enumerate(train_sent[:150], 1):
        # Step 1. PyTorch accumulates gradients, so clear them before each instance.
        optimizer.zero_grad()

        # Step 2. Turn the text into a tensor of character indices and the
        # flag string into a float column tensor.
        sentence_in = prepare_sequence(sentence, char_indices)
        label = transform_label(tags)

        # Step 3. Run the forward pass.
        tag_scores = classifier(sentence_in)

        # Step 4. Compute the loss and gradients, then update the parameters.
        loss = loss_func(tag_scores, label)
        loss.backward()
        optimizer.step()

        # Incremental mean of the per-sentence losses seen so far in this epoch.
        running_loss += (loss.item() - running_loss) / sent_index

    # Report 1-based progress against the real epoch count.
    print("epoch {}/{} loss: {}".format(epoch + 1, n_epochs, running_loss))

    # Track the epoch mean (the value printed above), not just the last sentence's loss.
    loss_list.append(running_loss)

In [None]:
# `reshape` exists only inside Classifier.forward, so referencing it here raised NameError.
# Show the shape of the last input fed to the model instead.
sentence_in.shape

In [None]:
# Shape of the model output for the last sentence processed by the training loop.
tag_scores.shape

In [None]:
sns.set_style("white")
# One point per entry of loss_list (one entry is appended per epoch).
ax = sns.lineplot(x=list(range(len(loss_list))), y=loss_list)
# Label the figure so it can be read on its own; the trailing ';' hides the repr.
ax.set(xlabel="epoch", ylabel="loss", title="Training loss per epoch");

In [None]:
# `char2idx` is only the parameter name inside prepare_sequence; the mapping
# built in this notebook is `char_indices`.
tmp = prepare_sequence("andreaschandra", char_indices)

In [None]:
# Display the encoded index tensor for the example string.
tmp

In [None]:
# Inference only: skip autograd bookkeeping, as in the earlier smoke-test cell.
with torch.no_grad():
    y_pred = classifier(tmp)

In [None]:
# Per-character model scores for the example string.
y_pred