# Approach

1. Load datasets, change multiclass to binary class (positive or negative only), limit to text column
2. Sklearn pipeline: encode text (sentiment category and tweet), tf-idf
3. Torch pipeline: MLP with sigmoid

In [1]:
import pandas as pd
import scipy
import torch

from torch import nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from d2l import torch as d2l

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print (device)

cuda:0


In [3]:
def data_transformer(df):
    '''
    Change multiclass to binary class: positive or negative tweets only
    Apply sklearn encoding on Sentiment column
    
    Param: Dataframe to transform
    Returns: Transformed dataframe
    '''
    df['Sentiment'] = df['Sentiment'].map({'Positive':'Positive', 'Extremely Positive':'Positive', 
                                           'Negative':'Negative', 'Extremely Negative':'Negative',
                                           'Neutral':'Positive'
                                          })
    df = df.drop(['UserName','ScreenName','Location','TweetAt'], axis=1)
    
    # Encode sentiment values
    df_le = LabelEncoder().fit(df['Sentiment'])
    df['encoded_sentiment'] = df_le.transform(df['Sentiment'])
    
    return df

In [4]:
df_train = pd.read_csv('./Data/Corona_NLP_train.csv', encoding='latin-1')
df_test = pd.read_csv('./Data/Corona_NLP_test.csv')

In [5]:
df_train = data_transformer(df_train)
df_test = data_transformer(df_test)

### Sklearn pipeline

In [17]:
x_train, x_test, y_train, y_test = df_train['OriginalTweet'], df_test['OriginalTweet'], df_train['encoded_sentiment'], df_test['encoded_sentiment']

# Perform tf-idf on OriginalTweets

tf_idf = TfidfVectorizer()
x_train = tf_idf.fit_transform(x_train)
x_test = tf_idf.transform(x_test)

### PyTorch pipeline

In [30]:
# batch_size = 64
# vocab = d2l.load_data_imdb(batch_size)

In [31]:
# vocab, len(vocab)

In [44]:
# x_train = torch.tensor(scipy.sparse.csr_matrix.todense(x_train)).long()

In [18]:
x_train = torch.tensor(scipy.sparse.csr_matrix.todense(x_train)).long()
x_test  = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).long()
#x_val = x_test

y_train = torch.tensor(y_train.values).long()
y_test = torch.tensor(y_test.values).long()
#y_val = y_test

In [22]:
class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Set `bidirectional` to True to get a bidirectional RNN
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers,
                               bidirectional=True, batch_first=True)
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        # The shape of `inputs` is (batch size, no. of time steps). Because
        # LSTM requires its input's first dimension to be the temporal
        # dimension, the input is transposed before obtaining token
        # representations. The output shape is (no. of time steps, batch size,
        # word vector dimension)
        embeddings = self.embedding(inputs)
        self.encoder.flatten_parameters()
        # Returns hidden states of the last hidden layer at different time
        # steps. The shape of `outputs` is (no. of time steps, batch size,
        # 2 * no. of hidden units)
        outputs, _ = self.encoder(embeddings)
        # Concatenate the hidden states of the initial time step and final
        # time step to use as the input of the fully connected layer. Its
        # shape is (batch size, 4 * no. of hidden units)
        encoding = torch.cat((outputs[0], outputs[-1]), dim=1)
        # Concatenate the hidden states at the initial and final time steps as
        # the input of the fully-connected layer. Its shape is (batch size,
        # 4 * no. of hidden units)
        outs = self.decoder(encoding)
        return outs

In [11]:
# class LSTM1(nn.Module):
#     def __init__(self, num_classes, input_size, hidden_size, num_layers, seq_length):
#         super(LSTM1, self).__init__()
#         self.num_classes = num_classes #number of classes
#         self.num_layers = num_layers #number of layers
#         self.input_size = input_size #input size
#         self.hidden_size = hidden_size #hidden state
#         self.seq_length = seq_length #sequence length

#         self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
#                           num_layers=num_layers, batch_first=True) #lstm
#         self.fc_1 =  nn.Linear(hidden_size, 128) #fully connected 1
#         self.fc = nn.Linear(128, num_classes) #fully connected last layer

#         self.relu = nn.ReLU()
    
#     def forward(self,x):
#         h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #hidden state
#         c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #internal state
#         # Propagate input through LSTM
#         output, (hn, cn) = self.lstm(x, (h_0, c_0)) #lstm with input, hidden, and internal state
#         hn = hn.view(-1, self.hidden_size) #reshaping the data for Dense layer next
#         out = self.relu(hn)
#         out = self.fc_1(out) #first Dense
#         out = self.relu(out) #relu
#         out = self.fc(out) #Final Output
#         return out

In [40]:
# class SentimentNet(nn.Module):
#     def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
#         super(SentimentNet, self).__init__()
#         self.output_size = output_size
#         self.n_layers = n_layers
#         self.hidden_dim = hidden_dim
        
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
#         self.dropout = nn.Dropout(drop_prob)
#         self.fc = nn.Linear(hidden_dim, output_size)
#         self.sigmoid = nn.Sigmoid()
        
#     def forward(self, x, hidden):
#         batch_size = x.size(0)
#         x = x.long()
#         embeds = self.embedding(x)
#         lstm_out, hidden = self.lstm(embeds, hidden)
#         lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
#         out = self.dropout(lstm_out)
#         out = self.fc(out)
#         out = self.sigmoid(out)
        
#         out = out.view(batch_size, -1)
#         out = out[:,-1]
#         return out, hidden
    
#     def init_hidden(self, batch_size):
#         weight = next(self.parameters()).data
#         hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
#                       weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
#         return hidden

In [19]:
vocab_size = len(tf_idf.vocabulary_) + 1
#output_size = 1
embedding_dim = x_train.shape[1]
hidden_dim = 64
n_layers = 1

# model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
# model.to(device)

# lr=0.005
# criterion = nn.BCELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [20]:
x_train  = torch.utils.data.DataLoader(x_train, batch_size=512, shuffle=True)
x_test  = torch.utils.data.DataLoader(x_test, batch_size=512, shuffle=True)

In [23]:
# model = nn.Sequential(nn.LSTM(x_train.shape[1], 128, num_layers=1,
#                                bidirectional=True),
#                       nn.ReLU(),
#                       nn.Linear(128,64),
#                       #BiRNN(64, 100, 100, 2),
#                       #LSTM1(1,5,2,1,x_train.shape[1])
#                       nn.Dropout(0.1),
#                       nn.Linear(64, 2), # There are 2 output classes = +ve & -ve
#                       nn.Sigmoid())  #nn.LogSoftmax(dim=1)) # The tutorial website used logsoftmax for binary class

# vocab_size, embed_size, num_hiddens, num_layers,
model = BiRNN(vocab_size, embedding_dim, hidden_dim, n_layers)

# Define the loss
criterion = nn.CrossEntropyLoss() #NLLLoss() # The tutorial website used NLLLoss for binary class

# Forward pass, get our logits
output = model(x_train)

# Calculate the loss with the logits and the labels
loss = criterion(output, y_train)

loss.backward()

# Optimizers require the parameters to optimize and a learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not DataLoader

In [21]:
# from torch.utils.data import DataLoader
# import numpy as np

# batch_size = 400

# train_loader = DataLoader((x_train, y_train), shuffle=True, batch_size=batch_size)
# val_loader = DataLoader(x_val, shuffle=True, batch_size=batch_size)
# test_loader = DataLoader(x_test, shuffle=True, batch_size=batch_size)

In [17]:
# epochs = 2
# counter = 0
# print_every = 1000
# clip = 5
# valid_loss_min = np.Inf

# model.train()
# for i in range(epochs):
#     h = model.init_hidden(batch_size)
    
#     for inputs, labels in train_loader:
#         counter += 1
#         h = tuple([e.data for e in h])
#         inputs, labels = inputs.to(device), labels.to(device)
#         model.zero_grad()
#         output, h = model(inputs, h)
#         loss = criterion(output.squeeze(), labels.float())
#         loss.backward()
#         nn.utils.clip_grad_norm_(model.parameters(), clip)
#         optimizer.step()
        
#         if counter%print_every == 0:
#             val_h = model.init_hidden(batch_size)
#             val_losses = []
#             model.eval()
#             for inp, lab in val_loader:
#                 val_h = tuple([each.data for each in val_h])
#                 inp, lab = inp.to(device), lab.to(device)
#                 out, val_h = model(inp, val_h)
#                 val_loss = criterion(out.squeeze(), lab.float())
#                 val_losses.append(val_loss.item())
                
#             model.train()
#             print("Epoch: {}/{}...".format(i+1, epochs),
#                   "Step: {}...".format(counter),
#                   "Loss: {:.6f}...".format(loss.item()),
#                   "Val Loss: {:.6f}".format(np.mean(val_losses)))
#             if np.mean(val_losses) <= valid_loss_min:
#                 torch.save(model.state_dict(), './state_dict.pt')
#                 print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
#                 valid_loss_min = np.mean(val_losses)


ValueError: too many values to unpack (expected 2)

In [None]:
train_losses = []
test_losses = []
test_accuracies = []

epochs = 200

for e in range(epochs):
    optimizer.zero_grad()
    h = model.init_hidden(batch_size)

    output = model.forward(x_train, h)
    loss = criterion(output, y_train)
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    
    optimizer.step()

    with torch.no_grad():
        model.eval()
        log_ps = model(x_test)
        test_loss = criterion(log_ps, y_test)
        test_losses.append(test_loss)

        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)
        equals = top_class == y_test.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float())
        test_accuracies.append(test_accuracy)

    model.train()

    print(f"Epoch: {e+1}/{epochs}.. ",
          f"Training Loss: {train_loss:.3f}.. ",
          f"Test Loss: {test_loss:.3f}.. ",
          f"Test Accuracy: {test_accuracy:.3f}")

# Sources

1. PyTorch TF-IDF:
https://medium.com/swlh/text-classification-using-scikit-learn-pytorch-and-tensorflow-a3350808f9f7

2. PyTorch loss function for binary class:
https://discuss.pytorch.org/t/runtimeerror-expected-object-of-scalar-type-long-but-got-scalar-type-float-when-using-crossentropyloss/30542
