In [4]:
import torch
import random
import pandas as pd
from sklearn.metrics import accuracy_score
from simpletransformers.classification import ClassificationModel

from typing import List, Tuple, Dict, Union, Any

In [5]:
def get_data(
    full: bool,
    seed: int,
    test_size: float
) -> Tuple[Tuple[str], torch.Tensor, Tuple[str], torch.Tensor]:
    """
    Loads training and testing data

    :param full: whether to use the full dataset (True) or not (False)
    :param seed: a seed to determine the shuffling
    :param test_size: the relative size of the test set
    :return: a tuple (X_train, y_train, X_test, y_test)
    """
    prefix = 'twitter-datasets/train_'
    suffix = '_full' if full else ''
    with open(f'{prefix}neg{suffix}.txt', 'r') as f:
        neg = f.read().split('\n')[:-1]
    with open(f'{prefix}pos{suffix}.txt', 'r') as f:
        pos = f.read().split('\n')[:-1]

    data = list(zip(neg, [-1] * len(neg))) + list(zip(pos, [1] * len(pos)))
    random.Random(seed).shuffle(data)

    split_idx = int((1 - test_size) * len(data))
    train = data[:split_idx]
    test = data[split_idx:]

    X_train, y_train = list(zip(*train))
    X_test, y_test = list(zip(*test))
    y_train = torch.Tensor(y_train)
    y_test = torch.Tensor(y_test)

    return X_train, y_train, X_test, y_test

In [6]:
def calc_clf_acc(
    true: torch.Tensor,
    predicted: torch.Tensor
) -> float:
    """
    Calculates the classification accuracy.

    :param true: tensor of true labels (-1 for neg, 1 for pos)
    :param predicted: tensor of predicted labels (-1 for neg, 1 for pos)
    :return: the classification accuracy (n_correct / n_total)
    """
    return (true == predicted).float().mean().item()


def predict_holdout(
    clf: Any,
    out_path: str
) -> None:
    """
    Predicts on the holdout "test_data.txt" using provided classifier.

    :param clf: Classifier with .predict() method
    :param out_path: where to save output to
    """
    with open('twitter-datasets/test_data.txt', 'r') as f:
        X = [','.join(l.split(',')[1:]) for l in f.read().split('\n')[:-1]]

    ids = range(1, len(X) + 1)
    predictions = clf.predict(X).to(int).tolist()
    submission = pd.DataFrame([ids, predictions], index=['Id', 'Prediction']).T

    submission.to_csv(out_path, index=False)

In [8]:
def pipeline(
    full: bool,
    seed: int,
    test_size: float,
    model_class: Any,
    model_args: Dict[str, Any],
    out_path: str=None
) -> None:
    """
    Loads data, fits and evaluates the model and predicts on holdout set.

    :param full: whether to use the full dataset (True) or not (False)
    :param seed: a seed to determine the shuffling
    :param test_size: the relative size of the test set
    :param model_class: a class implementing .fit() and .predict() methods
    :param model_args: a dictionary of model arguments

    """
    X_train, y_train, X_test, y_test = get_data(full, seed, test_size)

    clf = model_class(model_args)
    clf.fit(X_train, y_train)
    
    y_train_hat = clf.predict(X_train)
    y_test_hat = clf.predict(X_test)

    train_acc = calc_clf_acc(y_train, y_train_hat)
    test_acc = calc_clf_acc(y_test, y_test_hat)
    print(f'Training accuracy: {train_acc * 100:.2f}%')
    print(f'Testing accuracy: {test_acc * 100:.2f}%')

    if out_path is not None:
        predict_holdout(clf, out_path)

In [9]:
class BertweetClassifier():
    """ Finetuning BERTweet-base via simpletransformers """
    def __init__(
        self,
        model_args: Dict[str, Any]
    ) -> None:
        self.model = ClassificationModel(
            model_type='bertweet',
            model_name='vinai/bertweet-base',
            args=model_args,
            num_labels=2
        )
    
    def fit(
        self,
        X: Tuple[str],
        y: torch.Tensor
    ) -> None:
        train_df = pd.DataFrame({'text': X, 'labels': (y + 1) / 2})
        self.model.train_model(train_df, acc=accuracy_score)
    
    def predict(
        self,
        X: Tuple[str]
    ) -> torch.Tensor:
        return 2 * torch.Tensor(self.model.predict(X)[0]) - 1

In [10]:
# parameters
FULL = False
SEED = 42
TEST_SIZE = 0.3
MODEL_CLASS = BertweetClassifier
MODEL_ARGS = {
    'manual_seed': 69,
    'num_train_epochs': 1,
    'train_batch_size': 16,
    'learning_rate': 2e-5,
    'dropout': 0.1,
    'weight_decay': 0.01,
    'warmup_ratio': 0.1,
    'optimizer': 'AdamW',
    'overwrite_output_dir': True,
}
OUT_PATH = 'submission.csv'

In [None]:
# run
pipeline(
    full=FULL,
    seed=SEED,
    test_size=TEST_SIZE,
    model_class=MODEL_CLASS,
    model_args=MODEL_ARGS,
    out_path=OUT_PATH
)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

  0%|          | 0/140000 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/8750 [00:00<?, ?it/s]

  0%|          | 0/140000 [00:00<?, ?it/s]

  0%|          | 0/17500 [00:00<?, ?it/s]

  0%|          | 0/60000 [00:00<?, ?it/s]

  0%|          | 0/7500 [00:00<?, ?it/s]

Training accuracy: 93.23%
Testing accuracy: 90.76%


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1250 [00:00<?, ?it/s]

In [98]:
class BaselineModel:
    def __init__(
        self,
        model_args: Dict[str, Any]
    ) -> None:
        """
        :param n: the length of the word tuple
        :param p: the power by which the frequency is weighted within the score
        """
        self.n = model_args['n']
        self.p = model_args['p']

    def fit(
        self,
        X: List[str],
        y: torch.Tensor
    ) -> None:
        """
        Computes an occurrence counter for each consecutive n-tuple of words
        for each sentiment. Then creates a score by scaling the count by a power.

        :param X: list of tweets
        :param y: tensor of labels
        """
        self.n_tuple_counters = {-1: dict(), 1: dict()}
        for tweet, label in zip(X, y):
            words, sentiment = tweet.split(' '), label.item()
            if len(words) >= self.n:
                words_offset = []
                for i in range(self.n):
                    words_offset.append(words[i:len(words) - self.n + i + 1])
                for n_tuple_tuple in zip(*words_offset):
                    n_tuple = ' '.join(n_tuple_tuple)
                    if n_tuple in self.n_tuple_counters[sentiment]:
                        self.n_tuple_counters[sentiment][n_tuple] += 1
                    else:
                        self.n_tuple_counters[sentiment][n_tuple] = 0
        self.scores = {-1: dict(), 1: dict()}
        for sentiment, counter in self.n_tuple_counters.items():
            for n_tuple, count in counter.items():
                self.scores[sentiment][n_tuple] = count ** self.p

    def predict_proba(
        self,
        X: List[str],
    ) -> torch.Tensor:
        """
        Creates a score for each tweet and each sentiment as the sum of scores for each
        consecutive n-tuple of words in the tweet of that sentiment. Calculates the
        probability for positive label as the share of positive score to overall score.

        :param X: List of tweets as strings
        :return: a tensor of predicted probabilities for positive label
        """
        pos_probabilities = []
        for tweet in X:
            neg_score, pos_score = 0, 0
            words = tweet.split(' ')
            if len(words) >= self.n:
                words_offset = []
                for i in range(self.n):
                    words_offset.append(words[i:len(words) - self.n + i + 1])
                for n_tuple_tuple in zip(*words_offset):
                    n_tuple = ' '.join(n_tuple_tuple)
                    if n_tuple in self.scores[-1]:
                        neg_score += self.scores[-1][n_tuple]
                    if n_tuple in self.scores[1]:
                        pos_score += self.scores[1][n_tuple]
            scores_sum = neg_score + pos_score
            if scores_sum == 0:
                pos_prob = 0.5
            else:
                pos_prob = pos_score / scores_sum
            pos_probabilities.append(pos_prob)
        return torch.Tensor(pos_probabilities)

    def predict(
        self,
        X: List[str]
    ) -> torch.Tensor:
        """
        predicts labels

        :param tweets: List of tweets as strings
        :return: a tensor of predicted probabilities for positive label
        """
        probabilities = self.predict_proba(X)
        predictions = probabilities.round() * 2 - 1
        return predictions

In [99]:
# baseline model
BASELINE_MODEL_CLASS = BaselineModel
BASELINE_MODEL_ARGS = {'n': 2, 'p': 0.25}
pipeline(
    full=FULL,
    seed=SEED,
    test_size=TEST_SIZE,
    model_class=BASELINE_MODEL_CLASS,
    model_args=BASELINE_MODEL_ARGS
)

Training accuracy: 88.13%
Testing accuracy: 80.04%
