<a href="https://colab.research.google.com/github/koleshjr/ALL_MY_TEMPLATES/blob/main/NLP_Pipeline_MultiClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## NLP MULTICLASSIFICATION PIPELINE - ITERATIVE EXPERIMENTS

In [None]:
# Install the third-party packages used by later cells (notebook shell magics).
!pip install -q datasets
!pip install transformers
!pip install optuna

In [None]:
# Mount Google Drive at /content/drive; the data path used below lives under it.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
from pathlib import Path
# Directory on the mounted drive that train.csv and test.csv are read from.
path = Path('/content/drive/MyDrive/Swahili_nlp')

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import transformers
import torch
from torch.utils.data import Dataset, DataLoader 
from transformers import DistilBertTokenizer, DistilBertModel

# `logging` must be imported here: this cell calls logging.disable() and would
# otherwise raise NameError, because the module is only imported further down.
import logging
import warnings

# Hide Python warnings and every log record at WARNING severity or below so
# that training output stays readable.
warnings.filterwarnings('ignore')
logging.disable(logging.WARNING)

In [None]:


# Load the training data; the bare `df` renders the frame as the cell output.
df = pd.read_csv(path/'train.csv')
df



In [None]:
# Load the test data and show how many rows it has.
eval_df = pd.read_csv(path/'test.csv')
len(eval_df)

In [None]:
# Preview the first rows of the test data.
eval_df.head()

In [None]:
# Count how many training rows carry each distinct value of the `target` column.
df.target.value_counts()


### Training

In [None]:
from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer

In [None]:
import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
# Checkpoint name passed to `from_pretrained` when models are loaded below.
model_nm = 'microsoft/deberta-v3-small'

Convert the pandas DataFrames to Hugging Face Datasets

In [None]:
# Training frame -> Dataset, with the `score` column renamed to `label`.
ds = Dataset.from_pandas(df)
ds = ds.rename_column('score', 'label')

# Evaluation frame -> Dataset, columns left as they are.
eval_ds = Dataset.from_pandas(eval_df)

Tokenizing function

In [None]:
# `tokz` is referenced by tok_func and by every Trainer in this notebook but
# was never created; load the tokenizer belonging to the `model_nm` checkpoint.
tokz = AutoTokenizer.from_pretrained(model_nm)


def tok_func(x):
    """Tokenize the "inputs" field of a dataset row (or batch) with `tokz`.

    Args:
        x: Mapping that has an "inputs" entry holding the text(s) to tokenize.

    Returns:
        Whatever `tokz` returns for that text.
    """
    return tokz(x["inputs"])

In [None]:
# Tokenize the first row as a quick sanity check (closing parenthesis restored).
tok_func(ds[0])

In [None]:
# Tokenize every row in batches, then drop the raw columns that are no longer
# needed once the token ids exist.
inps = ("anchor", "target", "context")
tok_ds = ds.map(
    tok_func,
    batched=True,
    remove_columns=inps + ('inputs', 'id', 'section'),
)

Look at the first item

In [None]:
# Show the first tokenized row.
tok_ds[0]

### Creating a validation set

In [None]:
from sklearn.model_selection import train_test_split

# Split row *positions* rather than the frame itself: the cells below pass
# these to `df.iloc[...]` and `Dataset.select(...)`, both of which need integer
# positions. (Splitting `df` and calling list() on the parts would produce
# column names, not row indices.)
# 'target_column' is the template's placeholder for the label column name.
trn_idxs, val_idxs = train_test_split(
    np.arange(len(df)), test_size=0.2, stratify=df['target_column'])
trn_idxs = trn_idxs.tolist()
val_idxs = val_idxs.tolist()
# Keep the name this cell used before, so anything reading it still works.
train_idxs = trn_idxs
len(val_idxs), len(trn_idxs)

Check that the train and validation splits have similar mean scores, i.e. that both come from the same distribution

In [None]:
# Mean of the `score` column over the training rows and over the validation rows.
df.iloc[trn_idxs].score.mean(),df.iloc[val_idxs].score.mean()

In [None]:

# Hugging Face DatasetDict: tokenized training rows under "train", tokenized
# validation rows under "test".
dds = DatasetDict({
    "train": tok_ds.select(trn_idxs),
    "test": tok_ds.select(val_idxs),
})



### Initial Model

In [None]:
def _to_class_ids(predictions):
    """Return class ids for `predictions`.

    Arrays with more than one dimension are treated as per-class scores and
    reduced with argmax over the last axis; 1-D input is assumed to already
    hold class ids and is returned unchanged.
    """
    predictions = np.asarray(predictions)
    if predictions.ndim > 1:
        return predictions.argmax(axis=-1)
    return predictions


def multiclass_accuracy(eval_pred):
    """Fraction of rows whose predicted class equals the label.

    Args:
        eval_pred: Pair-like object; element 0 holds predictions (class ids or
            per-class scores), element 1 holds the integer labels.

    Returns:
        ``{'multiclass_accuracy': float}``.
    """
    preds = _to_class_ids(eval_pred[0])
    labels = np.asarray(eval_pred[1])
    return {'multiclass_accuracy': float(np.mean(preds == labels))}


def multiclass_f1(eval_pred, positive_label=1):
    """F1 score of one class, treating every other class as negative.

    Args:
        eval_pred: Pair-like object; element 0 holds predictions (class ids or
            per-class scores), element 1 holds the integer labels.
        positive_label: Class id scored as "positive". Defaults to 1, the
            class this metric has always been computed for.

    Returns:
        ``{'multi_class_f1_score': float}``; 0.0 when the class is neither
        predicted nor present, instead of dividing by zero.
    """
    preds = _to_class_ids(eval_pred[0])
    labels = np.asarray(eval_pred[1])
    pred_pos = preds == positive_label
    true_pos = labels == positive_label
    tp = int(np.sum(pred_pos & true_pos))
    fp = int(np.sum(pred_pos & ~true_pos))
    fn = int(np.sum(~pred_pos & true_pos))
    # 2*P*R / (P + R) simplifies to 2*tp / (2*tp + fp + fn), which needs only
    # one zero check.
    denom = 2 * tp + fp + fn
    f1 = 2 * tp / denom if denom else 0.0
    return {'multi_class_f1_score': f1}

Model hyperparameters

In [None]:
# Hyperparameters read by the TrainingArguments built below.
lr = 8e-5     # learning rate
bs = 128      # per-device training batch size
wd = 0.01     # weight decay
epochs = 4    # number of training epochs

Transformers uses the TrainingArguments class to set up arguments. We'll use a cosine scheduler with warmup, since at fast.ai we've found that's pretty reliable. We'll use fp16 since it's much faster on modern GPUs, and saves some memory. We evaluate using double-sized batches, since no gradients are stored so we can do twice as many rows at a time.

In [None]:
def get_trainer(dds, num_labels=5):
    """Build a `Trainer` around a freshly loaded classification model.

    Reads the module-level `lr`, `bs`, `wd`, `epochs`, `model_nm` and `tokz`.

    Args:
        dds: Mapping with "train" and "test" splits.
        num_labels: Size of the classification head. Defaults to 5, matching
            `get_model`; a head of size 1 would be trained as regression.

    Returns:
        A `Trainer` ready for `.train()`.
    """
    def compute_metrics(eval_pred):
        # Trainer needs a single callable here, so merge both metric dicts.
        return {**multiclass_accuracy(eval_pred), **multiclass_f1(eval_pred)}

    args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
        evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
        num_train_epochs=epochs, weight_decay=wd, report_to='none')
    model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=num_labels)
    return Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                   tokenizer=tokz, compute_metrics=compute_metrics)

In [None]:
# Training configuration: cosine schedule with 10% warmup, fp16, evaluation
# after every epoch, and an eval batch twice the size of the training batch.
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=wd,
    report_to='none',
)

In [None]:
# Five-way classification head, matching `get_model`; a head of size 1 would
# be trained as a regression model.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=5)
# `corr` does not exist in this notebook; report the two metrics that do, merged
# into the single callable that Trainer expects.
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
               tokenizer=tokz,
               compute_metrics=lambda p: {**multiclass_accuracy(p), **multiclass_f1(p)})

In [None]:
# Run training; the trailing semicolon suppresses the cell's return-value output.
trainer.train();

### Improving the Model

We now want to start iterating to improve this. To do that, we need to know whether the model gives stable results. I tried training it 3 times from scratch, and got a range of outcomes from 0.808-0.810. This is stable enough to make a start - if we're not finding improvements that are visible within this range, then they're not very significant! Later on, if and when we feel confident that we've got the basics right, we can use cross validation and more epochs of training.

Get dataset function

In [None]:
def get_dds(df):
    """Turn `df` into a tokenized train/test `DatasetDict`.

    The frame is converted to a Dataset, `score` is renamed to `label`, every
    row is tokenized with `tok_func`, and the rows are split using the
    module-level `trn_idxs` / `val_idxs`.
    """
    drop_cols = inps + ('inputs', 'id', 'section')
    labelled = Dataset.from_pandas(df).rename_column('score', 'label')
    tokenized = labelled.map(tok_func, batched=True, remove_columns=drop_cols)
    splits = {
        "train": tokenized.select(trn_idxs),
        "test": tokenized.select(val_idxs),
    }
    return DatasetDict(splits)

Trainer Function

In [None]:
def get_model():
    """Load a fresh `model_nm` checkpoint with a 5-label classification head."""
    return AutoModelForSequenceClassification.from_pretrained(
        model_nm,
        num_labels=5,
    )

def get_trainer(dds, model=None):
    """Build a `Trainer` for `dds`, creating a model when none is supplied.

    Reads the module-level `lr`, `bs`, `wd`, `epochs` and `tokz`.

    Args:
        dds: Mapping with "train" and "test" splits.
        model: Model to train; when None, `get_model()` supplies a new one.

    Returns:
        A `Trainer` ready for `.train()`.
    """
    if model is None: model = get_model()

    def compute_metrics(eval_pred):
        # Trainer needs a single callable here, so merge both metric dicts.
        return {**multiclass_accuracy(eval_pred), **multiclass_f1(eval_pred)}

    args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
        evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
        num_train_epochs=epochs, weight_decay=wd, report_to='none')
    return Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                   tokenizer=tokz, compute_metrics=compute_metrics)

### Feature Engineering + Preprocessing if necessary

In [None]:


# Build the model input text: context, anchor and target joined by a separator.
sep = " [s] "
df['inputs'] = df['context'] + sep + df['anchor'] + sep + df['target']

# Re-tokenize and re-split now that `inputs` has been (re)built.
dds = get_dds(df)



In [None]:
# Train a fresh model on the current `dds`.
get_trainer(dds).train()

In [None]:
# Experiment: lowercase the input text, rebuild the datasets, and retrain.
df['inputs'] = df['inputs'].str.lower()
dds = get_dds(df)
get_trainer(dds).train()

### Cross Validation

In [None]:
# Number of cross-validation folds.
n_folds = 4


In [None]:
from sklearn.model_selection import StratifiedGroupKFold
# Splitter producing `n_folds` folds; the split call takes both labels and groups.
cv = StratifiedGroupKFold(n_splits=n_folds)

In [None]:
# Shuffle the rows reproducibly before building folds.
df = df.sample(frac=1, random_state=42)
# Row positions to split; `idxs` was used below without ever being defined.
idxs = np.arange(len(df))
# The stratification labels are score*100 cast to int.
scores = (df.score*100).astype(int)
# Stratify on `scores`, using `anchor` as the group key.
folds = list(cv.split(idxs, scores, df.anchor))
folds

Function to create folds

In [None]:
def get_fold(folds, fold_num):
    """Return the train/test `DatasetDict` for fold number `fold_num`.

    `folds[fold_num]` must unpack into (train positions, validation positions);
    both are applied to the module-level `tok_ds`.
    """
    train_rows, valid_rows = folds[fold_num]
    return DatasetDict({
        "train": tok_ds.select(train_rows),
        "test": tok_ds.select(valid_rows),
    })

In [None]:
# Build the datasets for the first fold and display them.
dds = get_fold(folds, 0)
dds



We can now pass this into get_trainer as we did before. If we have, say, 4 folds, then doing that for each fold will give us 4 models, and 4 sets of predictions and metrics. You could ensemble the 4 models to get a stronger model, and can also average the 4 metrics to get a more accurate assessment of your model. Here's how to get the final epoch metrics from a trainer:


In [None]:
# Collect the per-evaluation accuracy from the training log and show the last one.
# The key is the name returned by `multiclass_accuracy` with the `eval_` prefix
# that Trainer adds; 'eval_pearson' is not a metric this notebook computes, so
# the list stayed empty and `metrics[-1]` raised IndexError.
metrics = [o['eval_multiclass_accuracy'] for o in trainer.state.log_history
           if 'eval_multiclass_accuracy' in o]
metrics[-1]

### Inference

In [None]:
# Run the trainer's model over the evaluation set and keep the predictions as floats.
# NOTE(review): `eval_ds` is created from `eval_df` with no visible tokenizing
# `map` step (unlike `ds`) — confirm it is tokenized before this cell runs.
preds = trainer.predict(eval_ds).predictions.astype(float)
preds


In [None]:
# Clamp every prediction into the closed interval [0, 1].
# NOTE(review): this presumes the predictions are scores in that range; for
# per-class logits an argmax is presumably what is wanted — confirm.
preds = preds.clip(min=0, max=1)

In [None]:
# Display the clipped predictions.
preds

In [None]:
import datasets

# Pair each evaluation id with its prediction and write the result to CSV
# without an index column.
submission = datasets.Dataset.from_dict({'id': eval_ds['id'], 'score': preds})
submission.to_csv('submission.csv', index=False)

### Hyper Parameter Tuning

In [None]:
# Define the search space
import optuna


def objective(trial):
    """Train one model with trial-sampled hyperparameters; return its eval loss.

    Args:
        trial: Optuna trial used to sample `lr`, `bs`, `epochs` and `wd`.

    Returns:
        The `eval_loss` reported by `trainer.evaluate()`. Optuna needs a
        number, not the whole metrics dict, and a loss fits the default
        "minimize" direction of `optuna.create_study()`.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
    bs = trial.suggest_int('bs', 32, 512)
    epochs = trial.suggest_int('epochs', 10, 100)
    wd = trial.suggest_float('wd', 1e-10, 1e-2, log=True)

    def compute_metrics(eval_pred):
        # Trainer needs a single callable here, so merge both metric dicts.
        return {**multiclass_accuracy(eval_pred), **multiclass_f1(eval_pred)}

    # Define the model
    model = get_model()
    # Define the training arguments
    args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
        evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
        num_train_epochs=epochs, weight_decay=wd, report_to='none')
    trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                   tokenizer=tokz, compute_metrics=compute_metrics)
    # Train the model
    trainer.train()
    # Evaluate the model
    score = trainer.evaluate()
    return score['eval_loss']

#Run the optimization loop
# create_study() is called with no arguments, so Optuna's default direction applies.
study = optuna.create_study()
# Evaluate `objective` for 20 trials.
study.optimize(objective, n_trials=20)
#Print the best parameters
print(study.best_params)