# Approach

1. Load datasets, change multiclass to binary class (positive or negative only), limit to text column
2. Sklearn pipeline: encode text (sentiment category and tweet), tf-idf
3. Torch pipeline: MLP with sigmoid

In [1]:
import pandas as pd
import scipy
import torch

from torch import nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print (device)

cuda:0


In [3]:
def data_transformer(df):
    '''
    Change multiclass to binary class: positive or negative tweets only
    Apply sklearn encoding on Sentiment column
    
    Param: Dataframe to transform
    Returns: Transformed dataframe
    '''
    df['Sentiment'] = df['Sentiment'].map({'Positive':'Positive', 'Extremely Positive':'Positive', 
                                           'Negative':'Negative', 'Extremely Negative':'Negative',
                                           'Neutral':'Positive'
                                          })
    df = df.drop(['UserName','ScreenName','Location','TweetAt'], axis=1)
    
    # Encode sentiment values
    df_le = LabelEncoder().fit(df['Sentiment'])
    df['encoded_sentiment'] = df_le.transform(df['Sentiment'])
    
    return df

In [4]:
df_train = pd.read_csv('./Data/Corona_NLP_train.csv')
df_test = pd.read_csv('./Data/Corona_NLP_test.csv')

In [5]:
df_train = data_transformer(df_train)
df_test = data_transformer(df_test)

### Sklearn pipeline

In [6]:
x_train, x_test, y_train, y_test = df_train['OriginalTweet'], df_test['OriginalTweet'], df_train['encoded_sentiment'], df_test['encoded_sentiment']

# Perform tf-idf on OriginalTweets

tf_idf = TfidfVectorizer()
x_train = tf_idf.fit_transform(x_train)
x_test = tf_idf.transform(x_test)

### PyTorch pipeline

In [7]:
x_train = torch.tensor(scipy.sparse.csr_matrix.todense(x_train)).float()
x_test = torch.tensor(scipy.sparse.csr_matrix.todense(x_test)).float()

y_train = torch.tensor(y_train.values).long()
y_test = torch.tensor(y_test.values).long()

In [8]:
# new_test_shape = int((x_test.shape[1]*x_test.shape[0]) / 18) # You can try: 6, 9, 18

# x_test = x_test.reshape(18, new_test_shape)

# x_test = x_test.flatten()

In [13]:
model = nn.Sequential(nn.Linear(x_train.shape[1],64),
                      nn.ReLU(),
                      nn.Dropout(0.1),
                      nn.Linear(64, 2), # There are 2 output classes = +ve & -ve
                      nn.Sigmoid())  #nn.LogSoftmax(dim=1)) # The tutorial website used logsoftmax for binary class

# Define the loss
criterion = nn.CrossEntropyLoss() #NLLLoss() # The tutorial website used NLLLoss for binary class

# Forward pass, get our logits
output = model(x_train)

# Calculate the loss with the logits and the labels
loss = criterion(output, y_train)

loss.backward()

# Optimizers require the parameters to optimize and a learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)

In [14]:
train_losses = []
test_losses = []
test_accuracies = []

epochs = 200

for e in range(epochs):
    optimizer.zero_grad()

    output = model.forward(x_train)
    loss = criterion(output, y_train)
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    
    optimizer.step()

    with torch.no_grad():
        model.eval()
        log_ps = model(x_test)
        test_loss = criterion(log_ps, y_test)
        test_losses.append(test_loss)

        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)*
        equals = top_class == y_test.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float())
        test_accuracies.append(test_accuracy)

    model.train()

    print(f"Epoch: {e+1}/{epochs}.. ",
          f"Training Loss: {train_loss:.3f}.. ",
          f"Test Loss: {test_loss:.3f}.. ",
          f"Test Accuracy: {test_accuracy:.3f}")

Epoch: 1/200..  Training Loss: 0.699..  Test Loss: 0.696..  Test Accuracy: 0.430
Epoch: 2/200..  Training Loss: 0.698..  Test Loss: 0.695..  Test Accuracy: 0.430
Epoch: 3/200..  Training Loss: 0.697..  Test Loss: 0.695..  Test Accuracy: 0.430
Epoch: 4/200..  Training Loss: 0.695..  Test Loss: 0.694..  Test Accuracy: 0.430
Epoch: 5/200..  Training Loss: 0.694..  Test Loss: 0.693..  Test Accuracy: 0.460
Epoch: 6/200..  Training Loss: 0.692..  Test Loss: 0.692..  Test Accuracy: 0.612
Epoch: 7/200..  Training Loss: 0.690..  Test Loss: 0.691..  Test Accuracy: 0.635
Epoch: 8/200..  Training Loss: 0.688..  Test Loss: 0.690..  Test Accuracy: 0.628
Epoch: 9/200..  Training Loss: 0.686..  Test Loss: 0.689..  Test Accuracy: 0.625
Epoch: 10/200..  Training Loss: 0.683..  Test Loss: 0.687..  Test Accuracy: 0.621
Epoch: 11/200..  Training Loss: 0.681..  Test Loss: 0.686..  Test Accuracy: 0.619
Epoch: 12/200..  Training Loss: 0.678..  Test Loss: 0.685..  Test Accuracy: 0.618
Epoch: 13/200..  Training

# Sources

1. PyTorch TF-IDF:
https://medium.com/swlh/text-classification-using-scikit-learn-pytorch-and-tensorflow-a3350808f9f7

2. PyTorch loss function for binary class:
https://discuss.pytorch.org/t/runtimeerror-expected-object-of-scalar-type-long-but-got-scalar-type-float-when-using-crossentropyloss/30542
