In [None]:
// Keep-alive helper: every 60 seconds, look up the connect button inside the
// <colab-connect-button> element of the top toolbar and click it.
function ConnectButton() {
    var host = document.querySelector("#top-toolbar > colab-connect-button");
    // querySelector returns null when nothing matches and shadowRoot may be null,
    // so guard each step instead of letting the chained call throw a TypeError.
    var button = host && host.shadowRoot && host.shadowRoot.querySelector("#connect");
    if (!button) {
        console.warn("Connect button not found; skipping click");
        return;
    }
    button.click();
    // Log only after the click was actually issued.
    console.log("Connect pushed");
}
setInterval(ConnectButton, 60000);

In [1]:
# Every data/model location hangs off a single root folder; only the log
# directory is relative to the current working directory.
path_root = '/content/drive/My Drive/Colab Notebooks/DU/'
path_train, path_test, path_models = (
    path_root + suffix
    for suffix in ('data/Corona_NLP_train.csv', 'data/Corona_NLP_test.csv', 'models/')
)
path_logs = 'logs/'

In [2]:
# Start from a clean slate: delete the model and log directories, then recreate them.
# The paths are single-quoted because path_models contains spaces.
# NOTE(review): this removes everything already stored under path_models, including
# previously saved models — confirm that is intended before re-running this cell.
!rm -rf '{path_models}' '{path_logs}'
!mkdir '{path_models}' '{path_logs}'

In [3]:
# Register the %tensorboard line magic that later cells use to open TensorBoard.
%load_ext tensorboard

### Step 1: Data preparation

In [4]:
# Install unidecode, which the text-cleaning cell imports.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
!pip install unidecode

Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/74/65/91eab655041e9e92f948cb7302e54962035762ce7b518272ed9d6b269e93/Unidecode-1.1.2-py2.py3-none-any.whl (239kB)
[K     |█▍                              | 10kB 22.8MB/s eta 0:00:01[K     |██▊                             | 20kB 20.2MB/s eta 0:00:01[K     |████                            | 30kB 11.6MB/s eta 0:00:01[K     |█████▌                          | 40kB 9.6MB/s eta 0:00:01[K     |██████▉                         | 51kB 7.2MB/s eta 0:00:01[K     |████████▏                       | 61kB 7.6MB/s eta 0:00:01[K     |█████████▋                      | 71kB 8.1MB/s eta 0:00:01[K     |███████████                     | 81kB 8.1MB/s eta 0:00:01[K     |████████████▎                   | 92kB 8.0MB/s eta 0:00:01[K     |█████████████▊                  | 102kB 8.5MB/s eta 0:00:01[K     |███████████████                 | 112kB 8.5MB/s eta 0:00:01[K     |████████████████▍               | 122kB 8.5M

In [5]:
import nltk
import html
import unidecode
from string import ascii_lowercase
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize

# Fetch the NLTK stop-word corpus that clean_text() reads via stopwords.words('english').
nltk.download('stopwords')

def clean_text(df):
    """Normalise the raw tweets in ``df['x_orig']`` and store them in ``df['x']``.

    Each text goes through, in order: HTML-entity unescaping, URL removal,
    removal of punctuation and digits, splitting of camelCase boundaries,
    whitespace/underscore collapsing, stripping and lower-casing, transliteration
    with ``unidecode``, squeezing of runs of three or more identical lowercase
    ASCII letters down to two, tokenisation into word tokens, and removal of
    English stop words. The surviving tokens are joined with single spaces.

    Args:
        df: DataFrame with a string column ``'x_orig'``. It is modified in place:
            a column ``'x'`` holding the cleaned text is added or overwritten.

    Returns:
        None.
    """
    # Local import: the cell defining this function does not import `re` itself.
    import re

    # Build the stop-word set once; calling stopwords.words() for every token
    # re-reads the whole word list each time.
    stop_words = set(stopwords.words('english'))

    def _clean(text):
        """Apply the full cleaning pipeline to a single string."""
        text = html.unescape(text)
        text = re.sub(r'https?://\S+', '', text)
        text = re.sub(r'[^\w\s]|\d+', '', text)
        # Insert a space at lower->upper transitions, e.g. "StayHome" -> "Stay Home".
        text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
        text = re.sub(r'\s\s+|_|\'', ' ', text)
        text = unidecode.unidecode(text.strip().lower())
        # Squeeze runs of 3+ identical lowercase letters to exactly two; this is
        # one pass instead of one regex pass per letter of the alphabet.
        text = re.sub(r'([a-z])\1{2,}', r'\1\1', text)
        tokens = regexp_tokenize(text, r'\w+')
        return ' '.join(w for w in tokens if w not in stop_words)

    df['x'] = [_clean(text) for text in df['x_orig']]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
import re
import pandas as pd

def load_csv(path):
    """Load a tweet CSV and return a frame with raw text, cleaned text and label.

    The file is read with the 'latin' encoding, the UserName, ScreenName,
    Location and TweetAt columns are dropped, and OriginalTweet / Sentiment are
    renamed to ``x_orig`` / ``y``. The prefix 'Extremely ' is removed from every
    label, and ``clean_text`` then adds the cleaned-text column to the frame.

    Args:
        path: location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        The prepared DataFrame.
    """
    def _strip_intensity(label):
        # 'Extremely Positive' -> 'Positive', 'Extremely Negative' -> 'Negative'.
        return re.sub('Extremely ', '', label)

    frame = (
        pd.read_csv(path, encoding='latin')
        .drop(columns=['UserName', 'ScreenName', 'Location', 'TweetAt'])
        .rename(columns={'OriginalTweet':'x_orig', 'Sentiment':'y'})
    )
    frame['y'] = frame['y'].apply(_strip_intensity)

    clean_text(frame)

    return frame

### Step 2: Data exploration

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Pie chart of how often each value of df['y'] occurs.
# NOTE(review): neither `df` nor `class_trans` is defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel — confirm where they are
# supposed to come from (e.g. a removed cell) before relying on this plot.
y_count = Counter(df['y'])
plt.figure(figsize=(20, 5))
# Slice labels are looked up in `class_trans`, keyed by the values found in df['y'].
plt.pie(y_count.values(), labels=[class_trans[x] for x in y_count.keys()], autopct='%1.1f%%')
plt.show()

In [None]:
from wordcloud import WordCloud

# One word cloud per entry of `classes`, built from the cleaned texts (df['x']) of the
# rows whose label equals class_trans[c].
# NOTE(review): `classes`, `class_trans` and `df` are not defined in any cell visible
# in this notebook, and `plt` comes from the previous cell's import — this cell fails
# on a fresh kernel unless those names are provided elsewhere; confirm.
for c in classes:
    # to_string() renders the whole column as one text blob for WordCloud.
    x = df[df['y'] == class_trans[c]]['x'].to_string()
    plt.imshow(WordCloud().generate(x))
    plt.show()

### Step 3: Pipeline construction

In [None]:
# Install transformers, which provides the tokenizer, model and trainer classes
# imported in the next cell.
# NOTE(review): the version is not pinned; the code below relies on TFTrainer and
# TFTrainingArguments — confirm the installed release still ships them.
!pip install transformers

In [62]:
import torch
import logging
import numpy as np
import tensorflow as tf
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import TFTrainer, TFTrainingArguments
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [81]:
def run_train(config):
    """Fine-tune a TensorFlow sequence classifier on the training CSV and save it.

    The training tweets are loaded with ``load_csv(path_train)``, tokenised into
    fixed-length TensorFlow tensors, label-encoded with a ``LabelEncoder`` and
    handed to ``TFTrainer``. The trained model is written to
    ``path_models + config['name']``.

    Args:
        config: dict providing the keys read here: 'name', 'tokenizer', 'model',
            'token_add_special', 'token_max_length', 'num_labels', 'num_epochs',
            'batch_size', 'warmup_steps', 'weight_decay' and 'logging_steps'.

    Returns:
        None.
    """
    logging.basicConfig(level=logging.INFO)

    tokenizer = config['tokenizer'].from_pretrained(config['name'])

    if config['token_add_special']:
        # NOTE(review): the pad token is added to the tokenizer only; nothing in this
        # function resizes the model's embeddings — confirm that is handled.
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    train = load_csv(path_train)

    # Encode straight to TensorFlow tensors: the dataset and trainer below are
    # TensorFlow objects, so there is no reason to go through PyTorch tensors.
    data_train = tokenizer.batch_encode_plus(
        train['x'].tolist(),
        truncation=True,
        return_attention_mask=True,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        max_length=config['token_max_length'],
        add_special_tokens=config['token_add_special'],
        return_tensors='tf'
    )

    encoder = LabelEncoder()
    train['y_encd'] = encoder.fit_transform(train['y'])

    # from_tensor_slices expects plain containers, so unwrap the BatchEncoding.
    dataset_train = tf.data.Dataset.from_tensor_slices((
        dict(data_train),
        train['y_encd'].tolist()
    ))

    args = TFTrainingArguments(
        output_dir=path_models+config['name'],
        num_train_epochs=config['num_epochs'],
        per_device_train_batch_size=config['batch_size'],
        warmup_steps=config['warmup_steps'],
        weight_decay=config['weight_decay'],
        logging_dir=path_logs+config['name'],
        logging_steps=config['logging_steps']
    )

    # The model is created inside the distribution-strategy scope of the training
    # arguments. Bind it to `model` only; the tokenizer must not be overwritten.
    with args.strategy.scope():
        model = config['model'].from_pretrained(
            config['name'],
            num_labels=config['num_labels'])

    trainer = TFTrainer(
        model=model,
        args=args,
        train_dataset=dataset_train
    )

    trainer.train()

    model.save_pretrained(path_models+config['name'])

In [101]:
def run_test(config):
    """Evaluate a saved sequence classifier on the test CSV and print a report.

    The test tweets are loaded with ``load_csv(path_test)``, the model is loaded
    from ``path_models + config['name']``, every row is classified one at a time,
    and ``classification_report`` is printed for the true vs. predicted labels.

    Args:
        config: dict providing the keys read here: 'name', 'tokenizer', 'model',
            'token_add_special', 'num_labels', 'model_from_tf', 'model_to_cuda'
            and 'tensor_type' ('tf' or 'pt'). 'model_to_cuda' is only consulted
            when 'tensor_type' is 'pt'.

    Returns:
        None.

    Raises:
        ValueError: if ``config['tensor_type']`` is neither 'tf' nor 'pt'.
    """
    logging.basicConfig(level=logging.INFO)

    tensor_type = config['tensor_type']
    if tensor_type not in ('tf', 'pt'):
        raise ValueError(
            "config['tensor_type'] must be 'tf' or 'pt', got %r" % (tensor_type,))

    tokenizer = config['tokenizer'].from_pretrained(config['name'])

    if config['token_add_special']:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    test = load_csv(path_test)

    # NOTE(review): the encoder is re-fitted on the test labels; the class order
    # matches training only if both splits contain the same label set — confirm.
    encoder = LabelEncoder()
    test['y_encd'] = encoder.fit_transform(test['y'])

    load_kwargs = {'num_labels': config['num_labels']}
    if config['model_from_tf']:
        load_kwargs['from_tf'] = True
    model = config['model'].from_pretrained(
        path_models+config['name'], **load_kwargs)

    if tensor_type == 'pt':
        # Always define the device for the PyTorch path; fall back to the CPU
        # when CUDA was not requested instead of leaving `device` undefined.
        device = torch.device('cuda' if config['model_to_cuda'] else 'cpu')
        model.to(device)
        model.eval()

    true_ids, logits_per_row = [], []

    # NOTE(review): run_train tokenises with add_special_tokens=config['token_add_special']
    # and truncation, while the calls below use the tokenizer defaults — confirm the
    # two are meant to differ.
    for _, row in test.iterrows():
        if tensor_type == 'tf':
            inputs = tokenizer(row['x'], return_tensors='tf')
            logits = model(inputs).logits.numpy()
        else:
            inputs = tokenizer(row['x'], return_tensors='pt').to(device)
            with torch.no_grad():  # inference only, no gradients needed
                logits = model(**inputs).logits.detach().cpu().numpy()

        # Ground truth is the row's encoded label, never a constant placeholder.
        true_ids.append(row['y_encd'])
        logits_per_row.append(logits)

    # Stack the per-row (1, num_labels) logits and pick the best class per row.
    pred_ids = np.argmax(np.concatenate(logits_per_row, axis=0), axis=1)

    y_pred = encoder.classes_[pred_ids]
    y_true = encoder.classes_[np.asarray(true_ids, dtype=int)]

    print(classification_report(y_true, y_pred))

### Classifier 1: Logistic regression

In [13]:
%%time
# Baseline classifier: TF-IDF features fed into logistic regression, fitted on the
# cleaned training texts (train['x']) against their labels (train['y']).
train = load_csv(path_train)
model = Pipeline([('vectorizer', TfidfVectorizer()),
                  ('clf', LogisticRegression(max_iter=500))])
model.fit(train['x'], train['y'])

CPU times: user 1min 57s, sys: 30.5 s, total: 2min 27s
Wall time: 2min 12s


In [14]:
%%time
# Score the fitted pipeline on the cleaned test texts and print per-class metrics.
test = load_csv(path_test)
y_pred = model.predict(test['x'])
print(classification_report(test['y'], y_pred))

              precision    recall  f1-score   support

    Negative       0.79      0.76      0.78      1633
     Neutral       0.69      0.58      0.63       619
    Positive       0.77      0.84      0.80      1546

    accuracy                           0.77      3798
   macro avg       0.75      0.73      0.74      3798
weighted avg       0.76      0.77      0.76      3798

CPU times: user 10.2 s, sys: 1.32 s, total: 11.6 s
Wall time: 12.8 s


### Classifier 2: DistilBERT

In [104]:
# Settings consumed by run_train() and run_test() for the DistilBERT classifier.
config = {
    'name': 'distilbert-base-uncased',  # checkpoint name; also the sub-folder under path_models / path_logs
    'tokenizer': AutoTokenizer,  # class whose from_pretrained() builds the tokenizer
    'model': TFAutoModelForSequenceClassification,  # class whose from_pretrained() builds the model
    'batch_size': 16,  # per_device_train_batch_size
    'num_epochs': 5,  # num_train_epochs
    'num_labels': 3,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_steps': 10,
    'token_max_length': 50,  # max_length used when encoding the training texts
    'token_add_special': False,  # adds a '[PAD]' token and is passed as add_special_tokens in run_train
    'model_from_tf': False,  # pass from_tf=True when loading the saved model in run_test
    'model_to_cuda': False,  # move the model to a CUDA device in run_test
    'tensor_type': 'tf'  # which inference branch run_test takes: 'tf' or 'pt'
}

In [None]:
# Open TensorBoard on this model's log directory (path_logs followed by config['name']).
%tensorboard --logdir '{path_logs}'{config['name']}

In [None]:
%%time
# Fine-tune and save the model described by the current `config`.
run_train(config)

In [None]:
%%time
# Evaluate the saved model on the test set and print the classification report.
run_test(config)

Some layers from the model checkpoint at /content/drive/My Drive/Colab Notebooks/DU/data/models/distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['dropout_259']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/Colab Notebooks/DU/data/models/distilbert-base-uncased and are newly initialized: ['dropout_379']
You should probably TRAIN this model on a down-stream task 

### Classifier 3: DistilGPT2

In [None]:
# Settings consumed by run_train() and run_test() for the DistilGPT2 classifier.
# NOTE(review): 'model' is a TensorFlow class, yet 'tensor_type' is 'pt' and
# 'model_to_cuda' is True, which makes run_test call model.to(device) and load with
# from_tf=True — confirm which model class is intended for evaluation.
config = {
    'name': 'distilgpt2',  # checkpoint name; also the sub-folder under path_models / path_logs
    'tokenizer': AutoTokenizer,  # class whose from_pretrained() builds the tokenizer
    'model': TFAutoModelForSequenceClassification,  # class whose from_pretrained() builds the model
    'batch_size': 1,  # per_device_train_batch_size
    'num_epochs': 5,  # num_train_epochs
    'num_labels': 3,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_steps': 10,
    'token_max_length': 50,  # max_length used when encoding the training texts
    'token_add_special': True,  # adds a '[PAD]' token and is passed as add_special_tokens in run_train
    'model_from_tf': True,  # pass from_tf=True when loading the saved model in run_test
    'model_to_cuda': True,  # move the model to a CUDA device in run_test
    'tensor_type': 'pt'  # which inference branch run_test takes: 'tf' or 'pt'
}

In [None]:
# Open TensorBoard on this model's log directory (path_logs followed by config['name']).
%tensorboard --logdir '{path_logs}'{config['name']}

In [None]:
%%time
# Fine-tune and save the model described by the current `config`.
run_train(config)

In [None]:
%%time
# Evaluate the saved model on the test set and print the classification report.
run_test(config)