In [None]:
/**
 * Click the "connect" button nested in the shadow DOM of Colab's
 * <colab-connect-button> toolbar element.
 *
 * If the toolbar element, its shadow root, or the inner #connect button is
 * missing, a warning is logged and nothing is clicked (instead of throwing a
 * TypeError on every timer tick).
 *
 * NOTE(review): this is browser JavaScript, not Python — presumably meant to
 * be pasted into the browser's developer console rather than run by the
 * notebook kernel; confirm.
 */
function ConnectButton() {
    console.log("Connect pushed");

    const host = document.querySelector("#top-toolbar > colab-connect-button");
    const button = host && host.shadowRoot && host.shadowRoot.querySelector("#connect");

    if (!button) {
        console.warn("Connect button not found; skipping click");
        return;
    }
    button.click();
}
// Call ConnectButton repeatedly, once every 60000 ms (one minute).
setInterval(ConnectButton, 60000);

In [1]:
# Root data folder: the input CSVs live here and models/logs are written below it.
path_root = '/content/drive/My Drive/Colab Notebooks/DU/data/'

# Input files.
path_train = f'{path_root}Corona_NLP_train.csv'
path_test = f'{path_root}Corona_NLP_test.csv'

# Output folders. The trailing slash is kept on purpose: later code appends
# a model name directly to these strings.
path_models = f'{path_root}models/'
path_logs = f'{path_root}logs/'

In [None]:
# Start from empty model/log folders.
# WARNING: this permanently deletes any previously saved models and logs.
# Done in Python rather than via the shell so that paths containing quotes or
# other shell-special characters are handled safely, and so that missing
# parent folders are created instead of making `mkdir` fail.
import os
import shutil

for output_dir in (path_models, path_logs):
    shutil.rmtree(output_dir, ignore_errors=True)
    os.makedirs(output_dir, exist_ok=True)

### Step 1: Data preparation

In [None]:
!pip install unidecode

In [None]:
import nltk
import html
import unidecode
from string import ascii_lowercase
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize

# Fetch the NLTK 'stopwords' corpus; clean_text() reads the English list from it.
nltk.download('stopwords')

def clean_text(df):
    """Normalise the text in ``df['x']`` in place.

    Every entry goes through these steps, in order:

    1. HTML entities are unescaped.
    2. ``http://`` / ``https://`` URLs are removed.
    3. Punctuation and digit runs are removed.
    4. A space is inserted at lower-case -> upper-case boundaries (camelCase).
    5. Whitespace runs and underscores are replaced by a single space.
    6. The text is stripped, lower-cased and transliterated with ``unidecode``.
    7. Runs of three or more identical ASCII lower-case letters shrink to two.
    8. The text is tokenised on ``\\w+``; English stop words are dropped and
       the remaining tokens are joined with single spaces.

    Args:
        df: pandas DataFrame with a string column ``'x'``. The column is
            overwritten with the cleaned text.

    Returns:
        None. ``df`` is modified in place.
    """
    # Local import keeps this cell runnable on its own: the notebook otherwise
    # imports `re` only in a later cell.
    import re

    # Build the stop-word set once. Calling stopwords.words() per token, as a
    # membership test against a list, is extremely slow on a full dataset.
    stop_words = frozenset(stopwords.words('english'))

    def _clean_one(text):
        """Apply the whole cleaning pipeline to a single string."""
        text = html.unescape(text)
        text = re.sub(r'https?://\S+', '', text)
        text = re.sub(r'[^\w\s]|\d+', '', text)
        text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
        text = re.sub(r'\s\s+|_|\'', ' ', text)
        text = unidecode.unidecode(text.strip().lower())
        # One back-reference pass replaces 26 per-letter passes: "sooooo" -> "soo".
        text = re.sub(r'([a-z])\1{2,}', r'\1\1', text)
        tokens = regexp_tokenize(text, r'\w+')
        return ' '.join(token for token in tokens if token not in stop_words)

    df['x'] = [_clean_one(text) for text in df['x']]

In [4]:
import re
import pandas as pd

def load_csv(path):
    """Read one of the tweet CSVs and return it as a cleaned two-column frame.

    The file is read as latin-encoded text, the ``UserName``, ``ScreenName``,
    ``Location`` and ``TweetAt`` columns are dropped, and ``OriginalTweet`` /
    ``Sentiment`` are renamed to ``x`` / ``y``. The ``'Extremely '`` prefix is
    removed from the labels and the text column is cleaned with
    ``clean_text``.

    Args:
        path: location of the CSV file.

    Returns:
        pandas DataFrame with the cleaned text in ``x`` and the label in ``y``.
    """
    unused_columns = ['UserName', 'ScreenName', 'Location', 'TweetAt']
    short_names = {'OriginalTweet':'x', 'Sentiment':'y'}

    frame = (
        pd.read_csv(path, encoding='latin')
        .drop(columns=unused_columns)
        .rename(columns=short_names)
    )

    # Merge the "Extremely ..." labels into their base sentiment.
    frame['y'] = frame['y'].map(lambda label: re.sub('Extremely ', '', label))

    # clean_text rewrites frame['x'] in place.
    clean_text(frame)

    return frame

### Step 2: Data exploration

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Share of each sentiment label in the training CSV.
# The frame is loaded here so the cell runs on a fresh kernel, and the label
# strings are used directly as wedge labels (no separate lookup table needed).
df = load_csv(path_train)
y_count = Counter(df['y'])

plt.figure(figsize=(20, 5))
plt.pie(list(y_count.values()), labels=list(y_count.keys()), autopct='%1.1f%%')
plt.show()

In [None]:
from wordcloud import WordCloud

# One word cloud per sentiment label found in df['y'].
# assumes `df` is the frame returned by load_csv — TODO confirm it is loaded before this cell
for label in sorted(df['y'].unique()):
    # Join the raw strings; Series.to_string() would also render the row index
    # into the text handed to WordCloud.
    text = ' '.join(df.loc[df['y'] == label, 'x'])
    plt.imshow(WordCloud().generate(text))
    plt.axis('off')
    plt.title(label)
    plt.show()

### Step 3: Pipeline construction

In [None]:
import logging

import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from tensorflow.data import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DistilBertTokenizerFast,
    TFAutoModelForSequenceClassification,
    TFDistilBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)

In [None]:
!pip install transformers

In [None]:
def train(config):
    """Fine-tune a pretrained sequence classifier on the training CSV and save it.

    Args:
        config: dict of run settings. Keys read here: ``'name'`` (checkpoint
            name, also used as the output sub-folder), ``'tokenizer'`` and
            ``'model'`` (classes exposing ``from_pretrained``),
            ``'token_add_special'`` (register a ``[PAD]`` token on the
            tokenizer), ``'token_max_length'``, ``'num_labels'``,
            ``'num_epochs'``, ``'batch_size'``, ``'warmup_steps'``,
            ``'weight_decay'`` and ``'logging_steps'``.

    Returns:
        None. The trained model is saved to ``path_models + config['name']``
        and training logs are written to ``path_logs + config['name']``.
    """
    logging.basicConfig(level=logging.INFO)

    tokenizer = config['tokenizer'].from_pretrained(config['name'])

    if config['token_add_special']:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Named train_df so the frame does not shadow this function's own name.
    train_df = load_csv(path_train)

    # Plain Python lists (no return_tensors): the encodings are handed to
    # tf.data below, which cannot consume PyTorch tensors.
    encodings = tokenizer.batch_encode_plus(
        train_df['x'].tolist(),
        truncation=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=config['token_max_length'],
        # Always add the model's own special tokens (e.g. [CLS]/[SEP]).
        # test() encodes with the tokenizer default, which adds them too, so
        # tying this to the pad-token flag made training and evaluation
        # inputs differ.
        add_special_tokens=True,
    )

    encoder = LabelEncoder()
    labels = encoder.fit_transform(train_df['y'])

    # dict(...) is required: from_tensor_slices needs a real dict of features,
    # not the tokenizer's BatchEncoding wrapper.
    dataset_train = Dataset.from_tensor_slices((
        dict(encodings),
        labels.tolist()
    ))

    args = TFTrainingArguments(
        output_dir=path_models+config['name'],
        num_train_epochs=config['num_epochs'],
        per_device_train_batch_size=config['batch_size'],
        warmup_steps=config['warmup_steps'],
        weight_decay=config['weight_decay'],
        logging_dir=path_logs+config['name'],
        logging_steps=config['logging_steps']
    )

    with args.strategy.scope():
        model = config['model'].from_pretrained(
            config['name'],
            num_labels=config['num_labels'])

        if config['token_add_special']:
            # The freshly added [PAD] token needs its own embedding row, and
            # the model must know the pad id to ignore padded positions.
            model.resize_token_embeddings(len(tokenizer))
            model.config.pad_token_id = tokenizer.pad_token_id

    trainer = TFTrainer(
        model=model,
        args=args,
        train_dataset=dataset_train
    )

    trainer.train()
    trainer.save_model()

In [None]:
def test(config):
    """Evaluate the model saved by ``train(config)`` on the test CSV.

    The checkpoint in ``path_models + config['name']`` is loaded into a
    PyTorch model, every test row is classified one at a time, and a
    scikit-learn classification report is printed.

    Args:
        config: the same settings dict that was passed to ``train``. Keys read
            here: ``'name'``, ``'tokenizer'``, ``'model'``,
            ``'token_add_special'``, ``'token_max_length'`` and
            ``'num_labels'``.

    Returns:
        None. The report is printed.
    """
    logging.basicConfig(level=logging.INFO)

    tokenizer = config['tokenizer'].from_pretrained(config['name'])

    if config['token_add_special']:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Named test_df so the frame does not shadow this function's own name.
    test_df = load_csv(path_test)
    y_true = test_df['y'].tolist()

    # LabelEncoder orders classes by sorted label value, so fitting it again
    # here reproduces the ids used in train() as long as both CSVs contain the
    # same set of labels.
    encoder = LabelEncoder()
    encoder.fit(y_true)

    # Inference below uses PyTorch (.to / .eval / torch tensors). When the
    # configured model class is not a torch module (e.g. a TF* class used for
    # training), load the TensorFlow checkpoint through the PyTorch auto class.
    model_cls = config['model']
    if not (isinstance(model_cls, type) and issubclass(model_cls, torch.nn.Module)):
        model_cls = AutoModelForSequenceClassification

    model = model_cls.from_pretrained(
        path_models+config['name'],
        num_labels=config['num_labels'],
        from_tf=True)

    # Fall back to the CPU instead of failing when no GPU is attached.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    y_pred = []

    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for text in test_df['x']:
            # Same length cap as in training.
            # NOTE(review): a tweet that cleans down to an empty string may
            # yield no tokens for tokenizers that add no special tokens —
            # confirm how such rows should be handled.
            inputs = tokenizer(
                text,
                truncation=True,
                max_length=config['token_max_length'],
                return_tensors='pt').to(device)

            logits = model(**inputs).logits
            y_pred.append(encoder.classes_[int(logits.argmax(dim=-1))])

    print(classification_report(y_true, y_pred))

### Classifier 1: Logistic regression

In [None]:
%%time
# Baseline: TF-IDF features fed into logistic regression.
# The frame gets its own name so that the train() function defined earlier in
# the notebook is not overwritten by a DataFrame.
lr_train_df = load_csv(path_train)
model = Pipeline([('vectorizer', TfidfVectorizer()),
                  ('clf', LogisticRegression(max_iter=500))])
model.fit(lr_train_df['x'], lr_train_df['y'])

In [None]:
%%time
# Score the fitted pipeline on the test CSV.
# The frame gets its own name so that the test() function defined earlier in
# the notebook is not overwritten by a DataFrame.
lr_test_df = load_csv(path_test)
y_pred = model.predict(lr_test_df['x'])
print(classification_report(lr_test_df['y'], y_pred))

### Classifier 2: DistilBERT

In [None]:
# Settings for the DistilBERT run; train(config) and test(config) read them.
config = dict(
    # Checkpoint name plus the classes used to load tokenizer and model.
    name='distilbert-base-uncased',
    tokenizer=DistilBertTokenizerFast,
    model=TFDistilBertForSequenceClassification,
    # Training arguments.
    batch_size=16,
    num_epochs=5,
    num_labels=3,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    # Tokenisation.
    token_max_length=50,
    token_add_special=False,
)

In [None]:
%%time
# Fine-tune with the `config` dict defined in the cell above.
train(config)

In [None]:
%%time
# Evaluate with the same `config` dict that was used for training.
test(config)

### Classifier 3: DistilGPT2

In [None]:
# Settings for the DistilGPT2 run; train(config) and test(config) read them.
config = dict(
    # Checkpoint name plus the classes used to load tokenizer and model.
    name='distilgpt2',
    tokenizer=AutoTokenizer,
    model=TFAutoModelForSequenceClassification,
    # Training arguments.
    batch_size=1,
    num_epochs=5,
    num_labels=3,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=10,
    # Tokenisation; True makes train()/test() register a '[PAD]' token.
    token_max_length=50,
    token_add_special=True,
)

In [None]:
%%time
# Fine-tune with the `config` dict defined in the cell above.
train(config)

In [None]:
%%time
# Evaluate with the same `config` dict that was used for training.
test(config)