In [1]:
import torch
import torch.nn.functional as F
import torchtext
import time
import random 
import pandas as pd

torch.backends.cudnn.deterministic = True

In [2]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

LEARNING_RATE = 0.005
BATCH_SIZE = 128
NUM_EPOCHS = 15
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 5

In [3]:
import pandas as pd

train_set = pd.read_csv('/content/Corona_NLP_train.csv', encoding='latin', usecols=['OriginalTweet', 'Sentiment'])
test_set = pd.read_csv('/content/Corona_NLP_test.csv', encoding='latin', usecols=['OriginalTweet', 'Sentiment'])

# training set
train_set.loc[train_set['Sentiment'] == 'Extremely Negative', 'Sentiment'] = -1
train_set.loc[train_set['Sentiment'] == 'Negative', 'Sentiment'] = -1
train_set.loc[train_set['Sentiment'] == 'Neutral', 'Sentiment'] = 0
train_set.loc[train_set['Sentiment'] == 'Positive', 'Sentiment'] = 1
train_set.loc[train_set['Sentiment'] == 'Extremely Positive', 'Sentiment'] = 1
print(train_set.head(5))

# test set
test_set.loc[test_set['Sentiment'] == 'Extremely Negative', 'Sentiment'] = -1
test_set.loc[test_set['Sentiment'] == 'Negative', 'Sentiment'] = -1
test_set.loc[test_set['Sentiment'] == 'Neutral', 'Sentiment'] = 0
test_set.loc[test_set['Sentiment'] == 'Positive', 'Sentiment'] = 1
test_set.loc[test_set['Sentiment'] == 'Extremely Positive', 'Sentiment'] = 1
print(test_set.head(5))

                                       OriginalTweet Sentiment
0  advice Talk to your neighbours family to excha...         1
1  Coronavirus Australia: Woolworths to give elde...         1
2  My food stock is not the only one which is emp...         1
3  Me, ready to go at supermarket during the #COV...        -1
4  As news of the regionÂs first confirmed COVID...         1
                                       OriginalTweet Sentiment
0  TRENDING: New Yorkers encounter empty supermar...        -1
1  When I couldn't find hand sanitizer at Fred Me...         1
2  Find out how you can protect yourself and love...         1
3  #Panic buying hits #NewYork City as anxious sh...        -1
4  #toiletpaper #dunnypaper #coronavirus #coronav...         0


In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import re

def cleaning_text(text):
  stop_word = stopwords.words("english")

  text = re.sub(r'http\S+', " ", text)    # remove urls
  text = re.sub(r'@\w+',' ',text)         # remove mentions
  text = re.sub(r'#\w+', ' ', text)       # remove hastags
  text = re.sub('r<.*?>',' ', text)       # remove html tags

  # remove stopwords 
  text = text.split()
  text = " ".join([word for word in text if not word in stop_word])

  for punctuation in string.punctuation:
    text = text.replace(punctuation, "")
    
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
train_set['OriginalTweet'] = train_set['OriginalTweet'].apply(lambda x: cleaning_text(x))
test_set['OriginalTweet'] = test_set['OriginalTweet'].apply(lambda x: cleaning_text(x))

print(train_set.head(5))
print(test_set.head(5))

train_set.to_csv('train_set.csv', index=None)
test_set.to_csv('test_set.csv', index=None)

                                       OriginalTweet Sentiment
0  advice Talk neighbours family exchange phone n...         1
1  Coronavirus Australia Woolworths give elderly ...         1
2  My food stock one empty PLEASE panic THERE WIL...         1
3  Me ready go supermarket outbreak Not Im parano...        -1
4  As news regionÂs first confirmed COVID19 case...         1
                                       OriginalTweet Sentiment
0  TRENDING New Yorkers encounter empty supermark...        -1
1  When I find hand sanitizer Fred Meyer I turned...         1
2                          Find protect loved ones           1
3  buying hits City anxious shoppers stock foodam...        -1
4  One week everyone buying baby milk powder next...         0


In [6]:
from torchtext.legacy.data import Field, LabelField, TabularDataset, BucketIterator

TEXT = Field(tokenize='spacy', tokenizer_language='en_core_web_sm')

LABEL = LabelField(dtype=torch.long)

fields = [('TEXT_COLUMN_NAME', TEXT), ('LABEL_COLUMN_NAME', LABEL)]

train = TabularDataset(path='train_set.csv', format='csv', skip_header=True, fields=fields)
test = TabularDataset(path='test_set.csv', format='csv', skip_header=True, fields=fields)

In [7]:
train_data, valid_data = train.split(split_ratio=[0.85, 0.25], random_state=random.seed(RANDOM_SEED))

print(f'Num Train: {len(train_data)}')
print(f'Num Validation: {len(valid_data)}')

print(vars(train_data.examples[0]))

Num Train: 31802
Num Validation: 9354
{'TEXT_COLUMN_NAME': ['We', 'd', '3', '18', 'store', 'fine', 'Take', 'deep', 'breath', 'Yeah', 'shelves', 'picked', 'plenty', 'good', 'food', 'Also', 'please', 'stop', 'panic', 'buying', 'toilet', 'paper', 'poopy', 'weirdos'], 'LABEL_COLUMN_NAME': '-1'}


In [8]:
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

print(TEXT.vocab.freqs.most_common(20))

Vocabulary size: 46195
Number of classes: 3
[(' ', 7342), ('I', 7325), ('prices', 5588), ('store', 4964), ('supermarket', 4504), ('food', 4477), ('COVID19', 4179), ('grocery', 4123), ('19', 3800), ('amp', 3769), ('people', 3702), ('The', 3169), ('shopping', 2350), ('consumer', 2283), ('online', 2183), ('get', 2048), ('need', 1958), ('pandemic', 1905), ('workers', 1853), ('We', 1723)]


In [9]:
train_loader = BucketIterator(train_data, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.TEXT_COLUMN_NAME), sort_within_batch=False, device=DEVICE)
valid_loader = BucketIterator(valid_data, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.TEXT_COLUMN_NAME), sort_within_batch=False, device=DEVICE)
test_loader = BucketIterator(test, batch_size=BATCH_SIZE, sort_key=lambda x: len(x.TEXT_COLUMN_NAME), sort_within_batch=False, device=DEVICE)

In [10]:
print('Train')
for batch in train_loader:
    print(f'Text matrix size: {batch.TEXT_COLUMN_NAME.size()}')
    print(f'Target vector size: {batch.LABEL_COLUMN_NAME.size()}')
    break
    
print('\nValid:')
for batch in valid_loader:
    print(f'Text matrix size: {batch.TEXT_COLUMN_NAME.size()}')
    print(f'Target vector size: {batch.LABEL_COLUMN_NAME.size()}')
    break
    
print('\nTest:')
for batch in test_loader:
    print(f'Text matrix size: {batch.TEXT_COLUMN_NAME.size()}')
    print(f'Target vector size: {batch.LABEL_COLUMN_NAME.size()}')
    break

Train
Text matrix size: torch.Size([39, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([41, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([36, 128])
Target vector size: torch.Size([128])


In [11]:
class TextClassificationModel(torch.nn.Module):

  def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
    super().__init__()

    self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
    self.rnn = torch.nn.RNN(embedding_dim, hidden_dim, nonlinearity='relu', bidirectional=True)
    self.fc = torch.nn.Linear(hidden_dim, output_dim)

  def forward(self, text):
    embedded = self.embedding(text)
    output, (hidden, cell) = self.rnn(embedded)
    # hidden.squeeze_(0)
    output = self.fc(hidden)

    return output

In [12]:
torch.manual_seed(RANDOM_SEED)

model = TextClassificationModel(len(TEXT.vocab), EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES)

model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [13]:
def compute_accuracy(model, data_loader, device):

  with torch.no_grad():

    correct_pred, num_example = 0, 0 

    for i, (feature, target) in enumerate(data_loader):

      feature = feature.to(device)
      target = target.float().to(device)

      logit = model(feature)
      _, predicted_label = torch.max(logit, 1)

      num_example += target.size(0)
      correct_pred += (predicted_label == target).sum()
  return correct_pred.float()/num_example * 100


In [14]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        
        text = batch_data.TEXT_COLUMN_NAME.to(DEVICE)
        labels = batch_data.LABEL_COLUMN_NAME.to(DEVICE)

        ### FORWARD AND BACK PROP
        logits = model(text)
        loss = F.cross_entropy(logits, labels)
        optimizer.zero_grad()
        
        loss.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if not batch_idx % 50:
            print (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Loss: {loss:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_accuracy(model, train_loader, DEVICE):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_accuracy(model, valid_loader, DEVICE):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/015 | Batch 000/249 | Loss: 1.5778
Epoch: 001/015 | Batch 050/249 | Loss: 1.0369
Epoch: 001/015 | Batch 100/249 | Loss: 1.0567
Epoch: 001/015 | Batch 150/249 | Loss: 1.0650
Epoch: 001/015 | Batch 200/249 | Loss: 1.0376
training accuracy: 43.92%
valid accuracy: 43.86%
Time elapsed: 1.46 min
Epoch: 002/015 | Batch 000/249 | Loss: 1.0035
Epoch: 002/015 | Batch 050/249 | Loss: 0.9887
Epoch: 002/015 | Batch 100/249 | Loss: 1.0391
Epoch: 002/015 | Batch 150/249 | Loss: 1.0966
Epoch: 002/015 | Batch 200/249 | Loss: 1.0191
training accuracy: 43.98%
valid accuracy: 43.90%
Time elapsed: 2.98 min
Epoch: 003/015 | Batch 000/249 | Loss: 1.0504
Epoch: 003/015 | Batch 050/249 | Loss: 1.0161
Epoch: 003/015 | Batch 100/249 | Loss: 1.0463
Epoch: 003/015 | Batch 150/249 | Loss: 1.1172
Epoch: 003/015 | Batch 200/249 | Loss: 1.0613
training accuracy: 44.06%
valid accuracy: 44.00%
Time elapsed: 4.29 min
Epoch: 004/015 | Batch 000/249 | Loss: 1.0602
Epoch: 004/015 | Batch 050/249 | Loss: 1.0425
Ep

In [15]:
import spacy


nlp = spacy.blank("en")

def predict_sentiment(model, sentence):

    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(DEVICE)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.nn.functional.softmax(model(tensor), dim=1)
    return prediction[0][-1].item(), prediction[0][0].item(), prediction[0][1].item()

tweet1 = "Tonight, Flotus and I tested positive for COVID-19. We will begin our quarantine and recovery process immeidately. We will get through this TOGETHER!"

print('Probability Negative, Neutral, Positive:')
predict_sentiment(model, tweet1)

Probability Negative, Neutral, Positive:


(2.485761433929784e-13, 0.516394317150116, 0.4544287323951721)

In [16]:
tweet2 = "The first doses of the COVID-19 vaccine in San Francisco were given this morning to frontline healthcare workers at SF General Hospital! There is an end to this pandemic in sight. Let's do everything we can to keep each other safe until we get there."
print('Probability Extremely Negative, Negative, Neutral, Positive:')
predict_sentiment(model, tweet2)

Probability Extremely Negative, Negative, Neutral, Positive:


(3.1402910281030927e-06, 0.4221319258213043, 0.5380465984344482)

In [17]:
tweet3 = "The COVID-19 pandemic is finally over."
print('Probability Extremely Negative, Negative, Neutral, Positive:')
predict_sentiment(model, tweet3)

Probability Extremely Negative, Negative, Neutral, Positive:


(7.58814485379844e-06, 0.40283215045928955, 0.45040223002433777)