In [1]:
path_root = '/content/drive/My Drive/Colab Notebooks/DU/data/'
path_train = path_root + 'Corona_NLP_train.csv'
path_test = path_root + 'Corona_NLP_test.csv'

### Step 1: Data preparation

In [115]:
import re
import pandas as pd

def load_csv(path):
    df = pd.read_csv(path, encoding='latin')
    df = df.drop(columns=['UserName', 'ScreenName', 'Location', 'TweetAt'])
    df = df.rename(columns={'OriginalTweet':'x', 'Sentiment':'y'})

    df['y'] = df['y'].apply(lambda x: re.sub('Extremely ', '', x))

    return df

train, test = load_csv(path_train), load_csv(path_test)

In [76]:
import nltk

_ = nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:
!pip install unidecode

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/74/65/91eab655041e9e92f948cb7302e54962035762ce7b518272ed9d6b269e93/Unidecode-1.1.2-py2.py3-none-any.whl (239kB)
[K     |█▍                              | 10kB 22.0MB/s eta 0:00:01[K     |██▊                             | 20kB 19.1MB/s eta 0:00:01[K     |████                            | 30kB 12.4MB/s eta 0:00:01[K     |█████▌                          | 40kB 10.7MB/s eta 0:00:01[K     |██████▉                         | 51kB 11.1MB/s eta 0:00:01[K     |████████▏                       | 61kB 11.6MB/s eta 0:00:01[K     |█████████▋                      | 71kB 10.7MB/s eta 0:00:01[K     |███████████                     | 81kB 11.1MB/s eta 0:00:01[K     |████████████▎                   | 92kB 11.3MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 11.3MB/s eta 0:00:01[K     |███████████████                 | 112kB 11.3MB/s eta 0:00:01[K     |████████████████▍               | 12

In [116]:
%%time

import html
import unidecode
from string import ascii_lowercase
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords

def clean_text(df):
    df['x'] = [html.unescape(x) for x in df['x']]
    df['x'] = [re.sub(r'https?://\S+', '', x) for x in df['x']]
    df['x'] = [re.sub(r'[^\w\s]|\d+', '', x) for x in df['x']]
    df['x'] = [re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', x) for x in df['x']]
    df['x'] = [re.sub(r'\s\s+|_|\'', ' ', x) for x in df['x']]
    df['x'] = [x.strip().lower() for x in df['x']]
    df['x'] = [unidecode.unidecode(x) for x in df['x']]

    for c in ascii_lowercase:
        df['x'] = [re.sub(c+'{3,}', c+c, x) for x in df['x']]

    df['x'] = [regexp_tokenize(x, '\w+') for x in df['x']]
    df['x'] = [' '.join(w for w in x if not w in stopwords.words('english')) for x in df['x']]

clean_text(train), clean_text(test)

CPU times: user 24.5 ms, sys: 5.32 ms, total: 29.8 ms
Wall time: 29.9 ms


### Step 2: Data exploration

In [117]:
from collections import Counter
import matplotlib.pyplot as plt

y_count = Counter(df['y'])
plt.figure(figsize=(20, 5))
plt.pie(y_count.values(), labels=[class_trans[x] for x in y_count.keys()], autopct='%1.1f%%')
plt.show()

NameError: ignored

<Figure size 1440x360 with 0 Axes>

In [None]:
from wordcloud import WordCloud

for c in classes:
    x = df[df['y'] == class_trans[c]]['x'].to_string()
    plt.imshow(WordCloud().generate(x))
    plt.show()

### Classifier 1: Logistic regression

In [119]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

model = Pipeline([('vectorizer', TfidfVectorizer()),
                  ('clf', LogisticRegression(max_iter=500))])
model.fit(train['x'], train['y'])

y_pred = model.predict(test['x'])
print(classification_report(test['y'], y_pred))

CPU times: user 3min 53s, sys: 5min 13s, total: 9min 6s
Wall time: 4min 44s


### Classifier 2: BERT

In [None]:
!pip install transformers

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [6]:
data_train = tokenizer.batch_encode_plus(
    train['x'],
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=50,
    return_tensors='pt'
)

data_test = tokenizer.batch_encode_plus(
    test['x'],
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    max_length=50,
    return_tensors='pt'
)

In [55]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

train['y_encd'] = encoder.fit_transform(train['y'])
test['y_encd'] = encoder.fit_transform(test['y'])

In [10]:
from torch import tensor
from torch.utils.data import TensorDataset

dataset_train = TensorDataset(data_train['input_ids'],
                              data_train['attention_mask'],
                              torch.tensor(train['y_encd'].values))

dataset_test = TensorDataset(data_test['input_ids'],
                             data_test['attention_mask'],
                             torch.tensor(test['y_encd'].values))

In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=128)

dataloader_test = DataLoader(dataset_test,
                             sampler=SequentialSampler(dataset_test),
                             batch_size=128)

In [None]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

In [14]:
from transformers import AdamW, get_linear_schedule_with_warmup

epochs = 10
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [18]:
import numpy as np
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
%%time

device = torch.device('cuda')
model.to(device)

for epoch in range(epochs):
    model.train()
    loss_total = 0

    for i, batch in enumerate(dataloader_train):
        if i % 10 == 0:
            print(f'Batch: #{i+1}')

        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  'labels': batch[2].to(device)
                  }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    print(f'Epoch: #{epoch+1}')
    print(f'Loss: {loss_total}')

In [59]:
model.eval()

loss_total = 0
y_true, y_pred = [], []

for batch in dataloader_test:
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids':   batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]
              }

    with torch.no_grad():   
        outputs = model(**inputs)

    loss_total += outputs[0].item()
    label_ids = inputs['labels'].cpu().numpy()
    logits = outputs[1].detach().cpu().numpy()
    y_true.append(label_ids)
    y_pred.append(logits)

y_true = np.concatenate(y_true, axis=0)
y_pred = np.concatenate(y_pred, axis=0)

y_pred = [encoder.classes_[np.argmax(y)] for y in y_pred]
y_true = [encoder.classes_[y] for y in y_true]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

    Negative       0.07      0.02      0.04      1633
     Neutral       0.86      0.88      0.87      1546
    Positive       0.04      0.11      0.06       619

    accuracy                           0.39      3798
   macro avg       0.32      0.34      0.32      3798
weighted avg       0.39      0.39      0.38      3798

