## Initialize notebook

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive

import os

PROJECT_DIR = '/content/drive/MyDrive/afrisent/afrisent-semeval-2023'
PROJECT_GITHUB_URL = 'https://github.com/afrisenti-semeval/afrisent-semeval-2023.git'

if not os.path.isdir(PROJECT_DIR):
  !git clone {PROJECT_GITHUB_URL}
else:
  %cd {PROJECT_DIR}
  !git pull {PROJECT_GITHUB_URL}

/content/drive/MyDrive
/content/drive/MyDrive/afrisent/afrisent-semeval-2023
From https://github.com/afrisenti-semeval/afrisent-semeval-2023
 * branch            HEAD       -> FETCH_HEAD
Already up to date.


In [3]:
if os.path.isdir(PROJECT_DIR):
  #The requirements file should be in PROJECT_DIR
  if os.path.isfile(os.path.join(PROJECT_DIR, 'starter_kit/requirements.txt')):
    !pip install -r starter_kit/requirements.txt
  else:
    print('requirements.txt file not found')

else:
  print("Project directory not found, please check again.")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.2 MB/s 
Collecting accelerate
  Downloading accelerate-0.15.0-py3-none-any.whl (191 kB)
[K     |████████████████████████████████| 191 kB 76.5 MB/s 
[?25hCollecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 71.6 MB/s 
[?25hCollecting datasets>=1.8.0
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 65.7 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.7 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |██████████████

In [4]:
import pandas as pd
import numpy as np

# Please do not edit anything here
languages = ['am', 'dz', 'ha', 'ig', 'ma', 'pcm', 'pt', 'sw', 'yo']

Training data is already in /content/drive/MyDrive/afrisent/afrisent-semeval-2023/SubtaskA/train/splitted-train-dev-test

## Remove all but sentiment-bearing words from tweets


In [34]:
import os

# function to get a list of sentiment-bearing words
def sent_words(language):
  sent_words = []
  path = PROJECT_DIR + '/sentiment_lexicon/' + language
  if language in ['yo', 'ha', 'ig']:
    for filename in os.listdir(path):
      if filename != 'README.txt':
        f = os.path.join(path,filename)
        df = pd.read_csv(f, header=0)
        sent_words.extend(df.iloc[:,0])
  return sent_words

# function to remove words not in sentiment-bearing word list
def remove_words(tweet, sent_words):
  words = tweet.split()
  sentiment_tweet = ""
  for word in words:
    if word in sent_words:
      sentiment_tweet = sentiment_tweet + " " + word
  return sentiment_tweet

# creates a train-dev-test dataset with only sentiment-bearing words
# data is in '/content/drive/MyDrive/afrisent/afrisent-semeval-2023/SubtaskA/train/sent-split-train-dev-test/{lang}
def format_sent_word_data(language):
  sent_list = sent_words(language)
  path = PROJECT_DIR + '/SubtaskA/train/sent-split-train-dev-test/' + language
  if not os.path.isdir(path):
    os.makedirs(path)
  print(path + " created")
  path_to_orig_data = PROJECT_DIR + '/SubtaskA/train/splitted-train-dev-test/' + language
  for file in os.listdir(path_to_orig_data):
    f = os.path.join(path_to_orig_data, file)
    df = pd.read_csv(f, sep='\t', header=0)
    df['text'] = df['text'].apply(remove_words, sent_words=sent_list)
    df.to_csv(os.path.join(path,file), sep='\t', index=False)
    print(f + " created with sentiment words removed")

## Simple Baseline Experiments

(code from Thomas's baseline code)





In [42]:
LANGUAGE_CODE = 'ig'

# create files with sentiment words only for yoruba
format_sent_word_data(language=LANGUAGE_CODE)

/content/drive/MyDrive/afrisent/afrisent-semeval-2023/SubtaskA/train/sent-split-train-dev-test/ig created
/content/drive/MyDrive/afrisent/afrisent-semeval-2023/SubtaskA/train/splitted-train-dev-test/ig/train.tsv created with sentiment words removed
/content/drive/MyDrive/afrisent/afrisent-semeval-2023/SubtaskA/train/splitted-train-dev-test/ig/dev.tsv created with sentiment words removed
/content/drive/MyDrive/afrisent/afrisent-semeval-2023/SubtaskA/train/splitted-train-dev-test/ig/test.tsv created with sentiment words removed


In [43]:
from types import SimpleNamespace
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import warnings
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, balanced_accuracy_score, balanced_accuracy_score, average_precision_score
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim

import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
import pandas
import pandas as pd
from datasets import load_dataset

import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    PreTrainedTokenizerFast,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from tokenizers import SentencePieceBPETokenizer
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from datasets import Features, Value, ClassLabel, load_dataset, Dataset

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

np.random.seed(420)
torch.manual_seed(69);

In [44]:
PROJECT_DIR = '/content/drive/MyDrive/afrisent/afrisent-semeval-2023'
TRAINING_DATA_DIR = os.path.join(PROJECT_DIR, 'SubtaskA', 'train')
DATA_DIR = os.path.join(TRAINING_DATA_DIR, 'sent-split-train-dev-test', LANGUAGE_CODE)

In [45]:
print(LANGUAGE_CODE)

# Set seed before initializing model.
set_seed(42069)

# obtain train data
df = pd.read_csv(DATA_DIR + '/train.tsv', sep='\t')
df = df.dropna()
train_dataset = Dataset.from_pandas(df)
label_list = df['label'].unique().tolist()

# obtain dev data
df = pd.read_csv(DATA_DIR + '/dev.tsv', sep='\t') 
df = df.dropna()
eval_dataset = Dataset.from_pandas(df)

# obtain test data
df = pd.read_csv(DATA_DIR + '/test.tsv', sep='\t')
df = df.dropna()
test_dataset = Dataset.from_pandas(df)

# Labels
num_labels = len(label_list)
print(label_list)

ig
['positive', 'neutral', 'negative']


Tokenization

In [46]:
tokenizer = SentencePieceBPETokenizer()
tokenizer.train_from_iterator(
    train_dataset['text'] + eval_dataset['text'],
    vocab_size=100000,
    min_frequency=5,
    show_progress=True,
    limit_alphabet=500,
)

tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer
)
tokenizer.add_special_tokens({'eos_token': '[EOS]'})
tokenizer.pad_token = tokenizer.eos_token

In [47]:
# Preprocessing the datasets
# Padding strategy
padding = "max_length"

label_to_id = None
label_to_id = {v: i for i, v in enumerate(label_list)}

In [48]:
MAXIMUM_SEQUENCE_LENGTH = 500

def preprocess_function(examples):
    texts =(examples['text'],)
    result = tokenizer(*texts, padding=padding, max_length=MAXIMUM_SEQUENCE_LENGTH)
    
    if label_to_id is not None and "label" in examples:
        result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
    
    result['length'], result["tokenized"] = [], []
    for input_ids in result['input_ids']:
        toks = tokenizer.convert_ids_to_tokens(input_ids, skip_special_tokens=True)
        result['length'].append(len(toks)+2)
        result['tokenized'].append(' '.join(toks))
    return result

train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset",
)

eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on validation dataset",
)

Running tokenizer on train dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on validation dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [49]:
train_dataset, eval_dataset, test_dataset

(Dataset({
     features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'tokenized'],
     num_rows: 3188
 }), Dataset({
     features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'tokenized'],
     num_rows: 479
 }), Dataset({
     features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'length', 'tokenized'],
     num_rows: 885
 }))

In [50]:
train_text, train_labels = train_dataset['tokenized'], train_dataset['label']
eval_text, eval_labels = eval_dataset['tokenized'], eval_dataset['label']
test_text, test_labels = test_dataset['tokenized'], test_dataset['label']

Class proportions

In [51]:
for l in np.unique(test_labels):
  print(l, np.mean(np.array(test_labels) == l), 
        f1_score(test_labels, [l] * len(test_labels), average='weighted'), 
        balanced_accuracy_score(test_labels, [l] * len(test_labels)))

0 0.38305084745762713 0.21218012628780325 0.3333333333333333
1 0.3480225988700565 0.17969984987758153 0.3333333333333333
2 0.2689265536723164 0.11398845908105305 0.3333333333333333


Unigram, bigram, trigram BOW and TF-IDF



In [52]:
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=100000, min_df=7, max_df=0.8)
unigram_vectorizer.fit(train_text)
X_train_unigram = unigram_vectorizer.transform(train_text)
X_eval_unigram = unigram_vectorizer.transform(eval_text)
X_test_unigram = unigram_vectorizer.transform(test_text)

unigram_tf_idf_transformer = TfidfTransformer()
unigram_tf_idf_transformer.fit(X_train_unigram)
X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_train_unigram)
X_eval_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_eval_unigram)
X_test_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_test_unigram)

bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=100000, min_df=7, max_df=0.8)
bigram_vectorizer.fit(train_text)
X_train_bigram = bigram_vectorizer.transform(train_text)
X_eval_bigram = bigram_vectorizer.transform(eval_text)
X_test_bigram = bigram_vectorizer.transform(test_text)

bigram_tf_idf_transformer = TfidfTransformer()
bigram_tf_idf_transformer.fit(X_train_bigram)
X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)
X_eval_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_eval_bigram)
X_test_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_test_bigram)

trigram_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=100000, min_df=7, max_df=0.8)
trigram_vectorizer.fit(train_text)
X_train_trigram = trigram_vectorizer.transform(train_text)
X_eval_trigram = trigram_vectorizer.transform(eval_text)
X_test_trigram = trigram_vectorizer.transform(test_text)

trigram_tf_idf_transformer = TfidfTransformer()
trigram_tf_idf_transformer.fit(X_train_trigram)
X_train_trigram_tf_idf =trigram_tf_idf_transformer.transform(X_train_trigram)
X_eval_trigram_tf_idf =trigram_tf_idf_transformer.transform(X_eval_trigram)
X_test_trigram_tf_idf =trigram_tf_idf_transformer.transform(X_test_trigram)

MultinomialNB, SVM, Logistic Regression, Random Forest, AdaDTC, MLP

In [53]:
def AdaDTC(**params):
    base_params = {}
    for k, v in list(params.items()):
        if k[:16] == 'base_estimator__':
            base_params[k[16:]] = v
            del params[k]
    return AdaBoostClassifier(base_estimator=DecisionTreeClassifier(**base_params), **params)

classifiers = [
    (MultinomialNB, [
        {'alpha': 1e-3}, {'alpha': 0.1}, {'alpha': 1}, {'alpha': 10}, {'alpha': 1e3},
    ]), 
    (SVC, [
        {'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1000},
        {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf', 'max_iter': 1000},
        {'C': 10, 'gamma': 'auto', 'kernel': 'rbf', 'max_iter': 1000},
        {'C': 1, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': 1000},
        {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': 1000},
        {'C': 10, 'gamma': 'auto', 'kernel': 'linear', 'max_iter': 1000},
    ]), 
    (LogisticRegression, [
        {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2'},
        {'C': 1, 'max_iter': 1000, 'penalty': 'l2'},
        {'C': 10, 'max_iter': 1000, 'penalty': 'l2'}
    ]), 
    (RandomForestClassifier, [
        {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 500},
        {'criterion': 'gini', 'max_depth': 2, 'n_estimators': 1000},
    ]), 
    (AdaDTC, [
        {'base_estimator__criterion': 'gini', 'base_estimator__max_depth': 5, 
          'base_estimator__splitter': 'best', 'learning_rate': 0.1, 'n_estimators': 500},
        {'base_estimator__criterion': 'entropy', 'base_estimator__max_depth': 3, 
         'base_estimator__splitter': 'random', 'learning_rate': 1, 'n_estimators': 500},
        {'base_estimator__criterion': 'entropy', 'base_estimator__max_depth': 2, 
         'base_estimator__splitter': 'random', 'learning_rate': 1, 'n_estimators': 1000}
    ]),  
    (MLPClassifier, [
        {'activation': 'relu', 'alpha': 1, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'},
        {'activation': 'tanh', 'alpha': 1, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive'}
    ])
    ]

X_train_data = {'unigram counts': X_train_unigram,
                'unigram tf-idf': X_train_unigram_tf_idf,
                'bigram counts': X_train_bigram, 
                'bigram tf-idf': X_train_bigram_tf_idf,
                'trigram counts': X_train_trigram, 
                'trigram tf-idf': X_train_trigram_tf_idf}

X_eval_data = {'unigram counts': X_eval_unigram,
                'unigram tf-idf': X_eval_unigram_tf_idf,
                'bigram counts': X_eval_bigram, 
                'bigram tf-idf': X_eval_bigram_tf_idf,
                'trigram counts': X_eval_trigram, 
                'trigram tf-idf': X_eval_trigram_tf_idf}

X_test_data = {'unigram counts': X_test_unigram,
                'unigram tf-idf': X_test_unigram_tf_idf,
                'bigram counts': X_test_bigram, 
                'bigram tf-idf': X_test_bigram_tf_idf,
                'trigram counts': X_test_trigram, 
                'trigram tf-idf': X_test_trigram_tf_idf}

def train_and_show_scores(title, model, parameters, k=5):

    X_train, X_test = X_train_data[title], X_test_data[title]
    y_train, y_test = train_labels, test_labels
    X_eval, y_eval = X_eval_data[title], eval_labels
    
    
    best_train, best_test, best_f1, best_balanced_accuracy, best_params = 0, 0, 0, 0, None
    best_valid_f1 = 0
    for params in parameters:
        train_score, test_score, test_f1, test_balanced_accuracy = 0, 0, 0, 0
        valid_f1 = 0
        for _ in range(k):
            clf = model(**params)
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter("always")
                clf.fit(X_train, y_train)
                train_score += clf.score(X_train, y_train) / k
                test_score += clf.score(X_test, y_test) / k

                preds = clf.predict(X_test)
                test_f1 += f1_score(y_test, preds, average='weighted') / k
                test_balanced_accuracy += balanced_accuracy_score(y_test, preds) / k
            
                preds = clf.predict(X_eval)
                valid_f1 += f1_score(y_eval, preds, average='weighted') / k
        
        if valid_f1 > best_valid_f1:
            best_train, best_test, best_f1, best_balanced_accuracy, best_params, best_valid_f1 = \
                train_score, test_score, test_f1, test_balanced_accuracy, params, valid_f1
    
    return best_valid_f1, f"""
    {title}
    Train score: {best_train:.3}
    Eval score: {best_test:.3}
    Balanced Accuracy: {best_balanced_accuracy:.3}
    Weighted Test F1: {best_f1:.3}
    Params: {best_params}
    """

In [54]:
# Get scores for multiple different models
for model, parameters in classifiers:
    print(model)
    best_valid, best_scores = -1, ''
    for title in ['unigram counts', 'unigram tf-idf', 'bigram counts', 'bigram tf-idf', 'trigram counts', 'trigram tf-idf']:
        valid, scores = train_and_show_scores(title, model, parameters)
        if valid > best_valid:
            best_valid, best_scores = valid, scores
    print(best_scores)

<class 'sklearn.naive_bayes.MultinomialNB'>

    bigram counts
    Train score: 0.672
    Eval score: 0.645
    Balanced Accuracy: 0.641
    Weighted Test F1: 0.644
    Params: {'alpha': 1}
    
<class 'sklearn.svm._classes.SVC'>

    unigram tf-idf
    Train score: 0.715
    Eval score: 0.632
    Balanced Accuracy: 0.629
    Weighted Test F1: 0.631
    Params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': 1000}
    
<class 'sklearn.linear_model._logistic.LogisticRegression'>

    bigram counts
    Train score: 0.672
    Eval score: 0.641
    Balanced Accuracy: 0.633
    Weighted Test F1: 0.643
    Params: {'C': 1, 'max_iter': 1000, 'penalty': 'l2'}
    
<class 'sklearn.ensemble._forest.RandomForestClassifier'>

    bigram counts
    Train score: 0.593
    Eval score: 0.589
    Balanced Accuracy: 0.567
    Weighted Test F1: 0.576
    Params: {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 500}
    
<function AdaDTC at 0x7f784e1aeee0>

    unigram counts
    Train score: