# Approach

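1. Load the datasets, collapse the five sentiment classes into two (positive or negative; neutral tweets are counted as positive), and keep only the tweet text and sentiment columns
2. Sklearn pipeline: label-encode the sentiment category and vectorise the tweet text with TF-IDF
3. Torch pipeline: originally planned as an MLP with sigmoid; the current code uses a bidirectional LSTM classifier (`BiRNN`) trained with cross-entropy loss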

In [61]:
import pandas as pd
import numpy as np
import scipy
import torch

import time

from torch import nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from torch.utils.data import DataLoader

#from d2l import torch as d2l

In [3]:
# Use the first CUDA GPU when one is visible to torch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [4]:
def data_transformer(df):
    '''
    Change multiclass to binary class: positive or negative tweets only
    ('Neutral' is counted as 'Positive'), drop the user/location/date
    columns, and add an integer-encoded copy of the Sentiment column.

    The integer codes are fixed (Negative -> 0, Positive -> 1) rather than
    fitted per dataframe, so the train and test splits always share the same
    encoding even when one of them happens to contain a single class.
    The input dataframe is left unmodified.

    Param: Dataframe to transform (needs the columns Sentiment, UserName,
           ScreenName, Location and TweetAt)
    Returns: Transformed dataframe (new object) with the binary 'Sentiment'
             column and an int64 'encoded_sentiment' column
    Raises: ValueError if Sentiment holds a missing or unrecognised label
    '''
    binary_sentiment = {'Positive':'Positive', 'Extremely Positive':'Positive',
                        'Negative':'Negative', 'Extremely Negative':'Negative',
                        'Neutral':'Positive'
                       }
    # Same codes an alphabetically sorted label encoder would assign when
    # both classes are present.
    sentiment_codes = {'Negative': 0, 'Positive': 1}

    collapsed = df['Sentiment'].map(binary_sentiment)

    # Labels outside the mapping come back as NaN; fail loudly instead of
    # letting them leak into the encoded column.
    unmapped = collapsed.isna()
    if unmapped.any():
        offending = sorted(df.loc[unmapped, 'Sentiment'].astype(str).unique())
        raise ValueError('Unrecognised Sentiment value(s): {}'.format(offending))

    # drop() returns a new frame, so the caller's dataframe is never mutated.
    out = df.drop(['UserName','ScreenName','Location','TweetAt'], axis=1)
    out['Sentiment'] = collapsed

    # Encode sentiment values
    out['encoded_sentiment'] = collapsed.map(sentiment_codes).astype('int64')

    return out

In [5]:
# Read the raw train and test splits; only the training file is decoded as latin-1.
df_train, df_test = (
    pd.read_csv('./Data/Corona_NLP_train.csv', encoding='latin-1'),
    pd.read_csv('./Data/Corona_NLP_test.csv'),
)

In [6]:
# Apply the same binary-sentiment transformation to both splits.
df_train, df_test = (data_transformer(frame) for frame in (df_train, df_test))

### Sklearn pipeline

In [69]:
# Raw tweet text (features) and integer sentiment codes (labels) per split.
x_train, y_train = df_train['OriginalTweet'], df_train['encoded_sentiment']
x_test, y_test = df_test['OriginalTweet'], df_test['encoded_sentiment']

# Perform tf-idf on OriginalTweets: the vocabulary and idf weights are learned
# from the training tweets only, then applied to the test tweets.
tf_idf = TfidfVectorizer()
tf_idf.fit(x_train)
x_test = tf_idf.transform(x_test)

# (label, raw tweet) pairs consumed by the DataLoader. Built in one pass over
# the underlying arrays instead of two positional `.iloc` lookups per row.
train_set = list(zip(y_train.values, x_train.values))


def collate_batch(batch, xfrmer, feature_dtype=torch.long):
    '''
    Turn a list of (label, raw text) samples into a pair of tensors.

    Param batch: iterable of (label, text) tuples
    Param xfrmer: fitted vectoriser; its `transform` must accept a list of
                  strings and return a scipy sparse matrix
    Param feature_dtype: dtype of the returned feature tensor. The default
                  (torch.long) keeps the historical behaviour, which truncates
                  every fractional TF-IDF weight towards zero; pass
                  torch.float32 to keep the actual weights.
    Returns: (labels, features) on the module-level `device`; labels are
             int64 with one entry per sample, features have one row per sample
    '''
    label_list = [_label for (_label, _text) in batch]
    unprocessed_text = [_text for (_label, _text) in batch]

    labels = torch.tensor(label_list, dtype=torch.int64)

    # Densify only the current batch; `toarray()` yields a plain ndarray.
    dense_features = xfrmer.transform(unprocessed_text).toarray()
    features = torch.tensor(dense_features).to(feature_dtype)

    return labels.to(device), features.to(device)

def _collate_with_tfidf(batch):
    '''Collate one batch using the notebook-level fitted `tf_idf` vectoriser.'''
    return collate_batch(batch, tf_idf)


# Shuffled mini-batches of 256 (label, tweet) pairs, vectorised per batch in
# the main process (no worker subprocesses).
train_dataloader = DataLoader(
    dataset=train_set,
    batch_size=256,
    shuffle=True,
    num_workers=0,
    collate_fn=_collate_with_tfidf,
)

In [70]:
# Pull one batch from the loader to inspect what the collate function yields.
next(iter(train_dataloader))

(tensor([0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
         0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
         0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
         1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
         1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
         0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
         0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
         1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
         1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,
         1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0,

### PyTorch pipeline

In [71]:
class BiRNN(nn.Module):
    '''
    Bidirectional LSTM classifier over sequences of token indices.

    Param vocab_size: number of rows in the embedding table
    Param embed_size: dimension of each token embedding
    Param num_hiddens: hidden units per LSTM direction
    Param num_layers: number of stacked LSTM layers
    Extra keyword arguments are forwarded to nn.Module.
    '''
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Set `bidirectional` to True to get a bidirectional RNN.
        # `batch_first=True` means inputs and outputs are laid out as
        # (batch size, no. of time steps, features).
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers,
                               bidirectional=True, batch_first=True)
        # First and last time step, each 2 * num_hiddens wide -> 2 classes.
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        '''
        Param inputs: integer tensor of shape (batch size, no. of time steps)
        Returns: logits of shape (batch size, 2)
        '''
        # Shape: (batch size, no. of time steps, word vector dimension).
        # No transpose is needed because the LSTM is batch-first.
        embeddings = self.embedding(inputs)
        self.encoder.flatten_parameters()
        # Hidden states of the last layer at every time step. Shape:
        # (batch size, no. of time steps, 2 * no. of hidden units)
        outputs, _ = self.encoder(embeddings)
        # Concatenate the hidden states of the initial and final time step
        # for every sample. The time axis is dimension 1, so it must be
        # indexed with `[:, t]`; `outputs[t]` would select a sample instead.
        # Shape: (batch size, 4 * no. of hidden units)
        encoding = torch.cat((outputs[:, 0], outputs[:, -1]), dim=1)
        outs = self.decoder(encoding)
        return outs

In [72]:
def train(dataloader):
    '''
    Run one training epoch over `dataloader` and log to TensorBoard.

    Uses the notebook-level globals `model`, `optimizer`, `criterion`,
    `writer` and `epoch`. Every scalar is logged with an explicit global step
    so points from successive batches and epochs do not all land on the same
    x-position. The step assumes `epoch` counts from 1, as the training loop
    in this notebook does.

    Param: dataloader yielding (label, text) batches; must support len()
    Returns: None
    '''
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    num_batches = len(dataloader)

    for idx, (label, text) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)

        # Monotonically increasing step across epochs for TensorBoard.
        global_step = (epoch - 1) * num_batches + idx

        # log in tensorboard
        writer.add_scalar("Loss/train", loss.item(), global_step)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)

        # Running accuracy since the last console report (counters are reset below).
        writer.add_scalar("accuracy/train", total_acc/total_count, global_step)

        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, num_batches,
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    '''
    Compute classification accuracy of the notebook-level `model`.

    Runs in eval mode with gradients disabled. The loss is not computed here
    because only the accuracy is returned.

    Param: dataloader yielding (label, text) batches
    Returns: fraction of correctly classified samples as a float;
             0.0 when the dataloader yields no samples
    '''
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    # Guard against an empty loader instead of raising ZeroDivisionError.
    if total_count == 0:
        return 0.0
    return total_acc/total_count


In [73]:
# Size of the second dimension of the feature tensor in one collated batch.
next(iter(train_dataloader))[1].shape[1]

80424

In [74]:
    # Build the BiRNN, then train for EPOCHS epochs, logging to TensorBoard.
    # NOTE(review): `writer` is not defined in any visible cell of this
    # notebook -- presumably a TensorBoard SummaryWriter created in a cell that
    # was removed; confirm before running on a fresh kernel.
    #num_class = len(set([label for (label, text) in train_iter]))
    vocab_size = len(tf_idf.vocabulary_) + 1
    emsize = 100#next(iter(train_dataloader))[1].shape[1]
    # Positional arguments: vocab_size, embed_size, num_hiddens=100, num_layers=2.
    model = BiRNN(vocab_size, emsize, 100, 2).to(device)

    # Hyperparameters
    EPOCHS = 20 # epoch
    LR = 6.0  # learning rate
    # NOTE(review): BATCH_SIZE is not referenced anywhere in this cell; the
    # DataLoader defined earlier hard-codes its own batch size.
    BATCH_SIZE = 128 # batch size for training

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=LR)
    # Each scheduler.step() call multiplies the learning rate by gamma (step size 1).
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
    # Accuracy reference for the decay rule below; None until the first epoch ends.
    total_accu = None

    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        
        train(train_dataloader)
        writer.flush()
        
        # NOTE(review): this "validation" accuracy is measured on
        # train_dataloader, i.e. the same data the model was just trained on.
        accu_val = evaluate(train_dataloader)
        writer.add_scalar("Acc/val", accu_val, epoch)
        
        # Decay the learning rate only when accuracy dropped below the stored
        # reference; otherwise update the reference to the current accuracy.
        if total_accu is not None and total_accu > accu_val:
            scheduler.step()
        else:
            total_accu = accu_val
        print('-' * 59)
        print('| end of epoch {:3d} | time: {:5.2f}s | '
            'valid accuracy {:8.3f} '.format(epoch,
                                            time.time() - epoch_start_time,
                                            accu_val))
        print('-' * 59)
        
    
    writer.close()

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:73] data. DefaultCPUAllocator: not enough memory: you tried to allocate 32941670400 bytes. Buy new RAM!

In [9]:
# Hyperparameters for the BiRNN instance built from these names.
# One extra row on top of the fitted TF-IDF vocabulary, matching the
# vocabulary size used by the training cell of this notebook.
vocab_size = len(tf_idf.vocabulary_) + 1
#output_size = 1
# `x_train` is a 1-D Series of raw tweets, so `x_train.shape[1]` does not
# exist; use the same embedding width (100) as the training cell's `emsize`.
embedding_dim = 100
hidden_dim = 64
n_layers = 1

In [18]:
# Dense float tensors for the TF-IDF features are intentionally not built here
# (the earlier attempt is kept for reference):
#x_train = torch.tensor(scipy.sparse.csr_matrix.todense(x_train.astype('float32'))).float()
#x_test  = torch.tensor(scipy.sparse.csr_matrix.todense(x_test.astype('float32'))).float()
#x_val = x_test

# Convert both label Series to int64 tensors in one pass.
y_train, y_test = (
    torch.tensor(labels.values, dtype=torch.long)
    for labels in (y_train, y_test)
)
#y_val = y_test

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:73] data. DefaultCPUAllocator: not enough memory: you tried to allocate 13240042272 bytes. Buy new RAM!

In [23]:
# Earlier nn.Sequential prototype, kept for reference only (not executed):
# model = nn.Sequential(nn.LSTM(x_train.shape[1], 128, num_layers=1,
#                                bidirectional=True),
#                       nn.ReLU(),
#                       nn.Linear(128,64),
#                       #BiRNN(64, 100, 100, 2),
#                       #LSTM1(1,5,2,1,x_train.shape[1])
#                       nn.Dropout(0.1),
#                       nn.Linear(64, 2), # There are 2 output classes = +ve & -ve
#                       nn.Sigmoid())  #nn.LogSoftmax(dim=1)) # The tutorial website used logsoftmax for binary class

# Positional arguments, in order: vocab_size, embed_size, num_hiddens, num_layers.
# NOTE(review): unlike the training cell, the model is not moved to `device` here.
model = BiRNN(vocab_size, embedding_dim, hidden_dim, n_layers)

# Define the loss
criterion = nn.CrossEntropyLoss() #NLLLoss() # The tutorial website used NLLLoss for binary class

# Forward pass, get our logits
# NOTE(review): BiRNN feeds its input straight into nn.Embedding, so `x_train`
# must be an integer index tensor here; the traceback recorded under this cell
# shows it was a DataLoader when the cell last ran -- confirm what `x_train`
# holds before executing.
output = model(x_train)

# Calculate the loss with the logits and the labels
loss = criterion(output, y_train)

loss.backward()

# Optimizers require the parameters to optimize and a learning rate
# NOTE(review): the optimizer is created after backward() and no step() is
# taken in this cell, so no parameters are updated here.
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not DataLoader

In [21]:
# from torch.utils.data import DataLoader
# import numpy as np

# batch_size = 400

# train_loader = DataLoader((x_train, y_train), shuffle=True, batch_size=batch_size)
# val_loader = DataLoader(x_val, shuffle=True, batch_size=batch_size)
# test_loader = DataLoader(x_test, shuffle=True, batch_size=batch_size)

In [17]:
# epochs = 2
# counter = 0
# print_every = 1000
# clip = 5
# valid_loss_min = np.Inf

# model.train()
# for i in range(epochs):
#     h = model.init_hidden(batch_size)
    
#     for inputs, labels in train_loader:
#         counter += 1
#         h = tuple([e.data for e in h])
#         inputs, labels = inputs.to(device), labels.to(device)
#         model.zero_grad()
#         output, h = model(inputs, h)
#         loss = criterion(output.squeeze(), labels.float())
#         loss.backward()
#         nn.utils.clip_grad_norm_(model.parameters(), clip)
#         optimizer.step()
        
#         if counter%print_every == 0:
#             val_h = model.init_hidden(batch_size)
#             val_losses = []
#             model.eval()
#             for inp, lab in val_loader:
#                 val_h = tuple([each.data for each in val_h])
#                 inp, lab = inp.to(device), lab.to(device)
#                 out, val_h = model(inp, val_h)
#                 val_loss = criterion(out.squeeze(), lab.float())
#                 val_losses.append(val_loss.item())
                
#             model.train()
#             print("Epoch: {}/{}...".format(i+1, epochs),
#                   "Step: {}...".format(counter),
#                   "Loss: {:.6f}...".format(loss.item()),
#                   "Val Loss: {:.6f}".format(np.mean(val_losses)))
#             if np.mean(val_losses) <= valid_loss_min:
#                 torch.save(model.state_dict(), './state_dict.pt')
#                 print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
#                 valid_loss_min = np.mean(val_losses)


ValueError: too many values to unpack (expected 2)

In [None]:
# Full-batch training loop: one optimiser step per epoch on the whole training
# set, followed by an evaluation pass on the test set.
# NOTE(review): BiRNN passes its input straight to nn.Embedding, so `x_train`
# and `x_test` must be integer index tensors of shape (samples, time steps)
# by the time this cell runs -- confirm against the cells above.
train_losses = []
test_losses = []
test_accuracies = []

epochs = 200

for e in range(epochs):
    model.train()
    optimizer.zero_grad()

    # BiRNN.forward takes only the inputs and the class has no `init_hidden`;
    # the LSTM creates its own zero initial state on every call.
    output = model(x_train)
    loss = criterion(output, y_train)
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)

    optimizer.step()

    model.eval()
    with torch.no_grad():
        log_ps = model(x_test)
        # Store plain floats so the history lists do not hold tensors.
        test_loss = criterion(log_ps, y_test).item()
        test_losses.append(test_loss)

        # The highest output per row is the predicted class.
        top_class = log_ps.argmax(dim=1)
        test_accuracy = (top_class == y_test).float().mean().item()
        test_accuracies.append(test_accuracy)

    print(f"Epoch: {e+1}/{epochs}.. ",
          f"Training Loss: {train_loss:.3f}.. ",
          f"Test Loss: {test_loss:.3f}.. ",
          f"Test Accuracy: {test_accuracy:.3f}")

# Sources

1. PyTorch TF-IDF:
https://medium.com/swlh/text-classification-using-scikit-learn-pytorch-and-tensorflow-a3350808f9f7

2. PyTorch loss function for binary class:
https://discuss.pytorch.org/t/runtimeerror-expected-object-of-scalar-type-long-but-got-scalar-type-float-when-using-crossentropyloss/30542
