## LACUNA MASAKHANE POS CLASSIFICATION
* Shoutout to Kenyor for his wonderful forked repo

In [None]:
# Fetch the forked masakhane-pos repository; the CSV paths in CFG point into this checkout.
!git clone https://github.com/NtemKenyor/masakhane-pos

In [None]:
# Install the Hugging Face libraries and Weights & Biases used by the cells below.
!pip install transformers
!pip install datasets
# -U upgrades accelerate if an older version is already present.
!pip install accelerate -U
!pip install wandb

### Import Libraries

In [None]:
import os
import random
import wandb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
from datasets import DatasetDict, Dataset

import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import wandb


### Config setup

In [None]:
class CFG:
  """Notebook-wide settings.

  `num_classes` and `label_mappings` start as None and are assigned by
  later cells once the training data has been inspected.
  """

  # Experiment identity and reproducibility
  project_name = "Baseline_lacuna_trial_one"
  seed = 42

  # Data locations; the listed languages are held out for validation
  train_path = "/content/masakhane-pos/data/africa_lan.csv"
  test_path = "/content/masakhane-pos/data/Test.csv"
  valid_languages = ['wol', 'sna']

  # Model and tokenization
  model_name = "Davlan/afro-xlmr-mini"
  max_length = 38
  device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  # Training schedule
  num_epochs = 2
  batch_size = 64

  # Placeholders populated from the training data
  num_classes = None
  label_mappings = None


### Setup Wandb

In [None]:
# Authenticate with Weights & Biases, then open a run under the configured project name.
wandb.login()
wandb.init(project=CFG.project_name)

In [None]:
# Set WANDB_LOG_MODEL for this session.
# NOTE(review): presumably read by the Trainer's wandb integration to upload the model — confirm the accepted values for the installed transformers version.
%env WANDB_LOG_MODEL = true

### For reproducibility

In [None]:
def set_random_seed(random_seed):
  """Seed the Python, NumPy and PyTorch random number generators.

  Args:
    random_seed: Integer seed applied to every generator.

  Returns:
    None. Mutates global RNG state, `os.environ` and cuDNN backend flags.
  """
  random.seed(random_seed)
  np.random.seed(random_seed)
  # The variable the interpreter reads is PYTHONHASHSEED (the previous
  # "PYTHONHASHEDSEED" spelling was silently ignored). It is consulted at
  # interpreter start-up, so this affects processes launched afterwards.
  os.environ["PYTHONHASHSEED"] = str(random_seed)

  torch.manual_seed(random_seed)
  # Covers every CUDA device; a no-op when CUDA is unavailable.
  torch.cuda.manual_seed_all(random_seed)

  # Ask cuDNN for deterministic kernels and disable its auto-tuner, which
  # may otherwise pick different algorithms from run to run.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

# Seed every RNG with the configured seed, then let transformers apply its own seeding too.
set_random_seed(CFG.seed)
transformers.set_seed(CFG.seed)

### Load the datasets

In [None]:
# Read the training and test CSVs from the paths configured in CFG.
train = pd.read_csv(CFG.train_path)
test = pd.read_csv(CFG.test_path)

# Preview the first rows of each frame.
display(train.head(),
        test.head())

In [None]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

In [None]:
# Number of distinct tags; later passed to the model as num_labels.
CFG.num_classes = train.tag.nunique()

In [None]:
# Show how many rows each language and each tag contributes to the training data.
display( 'Language valuecounts',
    train.lang.value_counts(),
    'tag valuecounts',
    train.tag.value_counts(),  )


### Find label mappings

In [None]:
# Rename the target column from 'tag' to 'label' and drop rows containing any missing value.
train = train.rename(columns = {'tag': 'label'})
train = train.dropna()

# Replace the string tags with integer ids.
train['label'] = le.fit_transform(train['label'])
# Keep the tag -> id mapping so predicted ids can be turned back into tag names for the submission.
CFG.label_mappings = dict(zip(le.classes_, le.transform(le.classes_)))
print(CFG.label_mappings)


### Setup The Model

In [None]:
# Load the pretrained tokenizer and a sequence-classification model with one output per tag,
# then move the model to the configured device.
tokz = AutoTokenizer.from_pretrained(CFG.model_name)
model = AutoModelForSequenceClassification.from_pretrained(CFG.model_name, num_labels = CFG.num_classes).to(CFG.device)


### Setup our Dataset

In [None]:
# Rows whose language is in CFG.valid_languages form the validation split;
# every other row is used for training. The 'lang' column is dropped from both.
is_valid_lang = train['lang'].isin(CFG.valid_languages)
df_valid = train[is_valid_lang].drop(columns = ['lang'])
df_train = train[~is_valid_lang].drop(columns = ['lang'])

# The test file names its text column 'Word'; rename it to the 'word' column
# that the tokenization step reads.
test = test.rename(columns = {'Word': 'word'})
df_test = test[['word']]

df_train.shape, df_valid.shape, df_test.shape

In [None]:
# Mean of the integer-encoded labels in each split; only a rough comparison, since the ids are categorical codes.
df_train.label.mean(), df_valid.label.mean()

In [None]:
# Wrap each pandas split in a Hugging Face Dataset, keyed by split name.
split_frames = {"train": df_train, "valid": df_valid, "test": df_test}
masakhane = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in split_frames.items()
})

masakhane

### Tokenization:


In [None]:
def tokenize(x):
  """Run the tokenizer over the 'word' entries of a batch.

  Truncation and padding are both enabled, with CFG.max_length as the cap.
  """
  encoded = tokz(
      x['word'],
      max_length = CFG.max_length,
      truncation = True,
      padding = True,
  )
  return encoded

# Tokenize every split, feeding the function CFG.batch_size rows at a time.
masakhane_encoded = masakhane.map(tokenize, batched = True, batch_size = CFG.batch_size)

### Training arguments

In [None]:
# Output directory is derived from the pretrained model name.
model_name = f"{CFG.model_name}-masakhane-challenge"
# Evaluate, checkpoint and log every 10 steps; the checkpoint with the best
# "accuracy" (a key returned by compute_metrics) is reloaded when training ends,
# and only one checkpoint is kept on disk at a time.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir = model_name,
                                  num_train_epochs = CFG.num_epochs,
                                  learning_rate = 2e-05,
                                  evaluation_strategy ='steps',
                                  save_strategy = 'steps',
                                  save_steps = 10,
                                  eval_steps = 10,
                                  warmup_steps = 10,
                                  report_to="wandb",
                                  metric_for_best_model = "accuracy",
                                  load_best_model_at_end = True,
                                  save_total_limit=1,
                                  logging_strategy="steps",
                                  logging_steps= 10,
                                  per_device_train_batch_size=CFG.batch_size,
                                  per_device_eval_batch_size=CFG.batch_size,
                                  )

In [None]:
def compute_metrics(eval_pred):
  """Score one evaluation pass.

  Args:
    eval_pred: Pair of (predictions, labels). Predictions are reduced to
      class ids with an argmax over axis 1.

  Returns:
    Dict with the keys "accuracy", "f1_micro" and "f1_macro".
  """
  scores, gold = eval_pred
  predicted = np.argmax(scores, axis = 1)

  metrics = {"accuracy": accuracy_score(gold, predicted)}
  for averaging in ("micro", "macro"):
    metrics[f"f1_{averaging}"] = f1_score(gold, predicted, average = averaging)
  return metrics

### Training

In [None]:
# Early stopping with a patience of 50 evaluations.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience = 50)

# Train on the 'train' split and evaluate on the held-out 'valid' split.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = masakhane_encoded['train'],
    eval_dataset = masakhane_encoded['valid'],
    tokenizer = tokz,
    compute_metrics = compute_metrics,
    callbacks = [early_stopping_callback],
)

trainer.train()

0.227133835 (public LB) - 0.2159 (local)

In [None]:
# Evaluate on the validation split that was given to the Trainer as eval_dataset.
trainer.evaluate()

### 0.334691383 - 0.32816


In [None]:
# Close the Weights & Biases run opened during setup.
wandb.finish()

#### Inference pipeline

In [None]:
# Write the trained model to the "baseline_model" directory (relative to the working directory).
# NOTE(review): the tokenizer passed to the Trainer is presumably saved alongside it — confirm.
trainer.save_model("baseline_model")

In [None]:
import os
import random
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback



In [None]:
# Reload from the same relative directory that trainer.save_model("baseline_model")
# wrote to. The previous hard-coded "/content/baseline_model" only matched when the
# working directory happened to be /content.
saved_model_dir = "baseline_model"

loaded_model = AutoModelForSequenceClassification.from_pretrained(
    saved_model_dir,
    num_labels = CFG.num_classes,
)

loaded_tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

In [None]:
# Prediction-only configuration: no training, and keep the final partial batch.
test_args = TrainingArguments(
    output_dir= '/content/',
    do_train =False,
    do_predict = True,
    dataloader_drop_last = False
)

# Hand the reloaded tokenizer to the Trainer so batches are padded at collation
# time. Without it the Trainer uses a collator that cannot pad, and batching
# only succeeded because the tokenization chunk size was a multiple of the
# prediction batch size.
trainer = Trainer(
    model = loaded_model,
    args = test_args,
    tokenizer = loaded_tokenizer,
)

# `predictions` on the result holds the raw model outputs for each test row.
test_results = trainer.predict(masakhane_encoded['test'])

### Prepare Submission

In [None]:
# Invert the tag -> id mapping so predicted ids can be decoded to tag names.
name_le_mappings = {code: tag for tag, code in CFG.label_mappings.items()}

# Highest-scoring class id per test row.
predicted_codes = test_results.predictions.argmax(axis = 1)

result = pd.DataFrame({'pred': predicted_codes})
result['Id'] = test['Id']
result['pred'] = result['pred'].map(name_le_mappings)

# Submission layout: Id first, then the predicted tag.
result = result[['Id', 'pred']]
result.to_csv("baseline.csv", index = False)
result.head()


### To-dos
* Improve this approach
* Try a token classification approach