## Sequence Classification

### Install The Necessary Libraries

In [None]:
# Install third-party packages into the runtime environment.
!pip install transformers datasets optuna sentencepiece evaluate wandb huggingface-hub
# Upgrade accelerate to the newest available release (-U).
!pip install accelerate -U

### Import The Necessary Libraries

In [None]:
import os
import random

import numpy as np
import pandas as pd
import torch
import transformers
import wandb
from datasets import Dataset, DatasetDict
from scipy.special import softmax
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

### Config

In [None]:
class Cfg:
    """Central configuration for the sequence-classification experiment.

    Every setting is a plain class attribute read as ``Cfg.<name>`` (or through
    the ``CFG`` alias bound right after the class). Empty strings and ``None``
    are placeholders to be filled in before the notebook is run.
    """

    trial = 1                 # trial counter, embedded in project_name
    path = ""                 # prefix prepended to train_file / test_file
    model_name = ""           # checkpoint identifier handed to from_pretrained
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    seed = 42
    train_file = ""
    test_file = ""
    label_mappings = None     # label name -> integer id, set after label encoding
    batch_size = 128          # per-device batch size for training and evaluation
    max_length = 512          # tokenizer truncation length
    num_classes = None        # number of target classes (model head size)
    # Passed to the model as hidden_dropout_prob.
    # NOTE(review): 0.1 is assumed to be a sensible default - confirm for the chosen checkpoint.
    dropout = 0.1

    # training params
    num_epochs = 4
    learning_rate = 1e-06
    warmup_steps = 500
    early_stopping_patience = 30
    lr_scheduler = "cosine"
    eval_strategy = "steps"   # shared by evaluation, logging and checkpointing
    steps = 250               # interval used for logging, saving and evaluating
    accum_steps = 8           # gradient accumulation steps

    # trial attributes
    # Evaluated once, when the class body runs: edit model_name / num_epochs /
    # trial above rather than re-assigning them on the class afterwards.
    project_name = f"{model_name}-{num_epochs}_trial_{trial}"

    # submission
    predict_num = 1           # 1 -> write hard labels, anything else -> per-class scores


# Several cells of this notebook refer to the configuration as `CFG`;
# bind that spelling to the same class so both names resolve.
CFG = Cfg




### Wandb: Track your Trial Runs

In [None]:
# Authenticate with Weights & Biases and open a run for this trial.
wandb.login()
wandb.init(project=Cfg.project_name)

# Log every trained model.
# Set through os.environ instead of `%env VAR=value  # comment`: the magic
# stores everything after `=` as the value, so a trailing comment would end
# up inside the environment variable.
os.environ["WANDB_LOG_MODEL"] = "true"

### Reproducibility

In [None]:
def set_random_seed(random_seed):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        random_seed: Integer seed applied to every generator.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    # Read by Python interpreters started after this point (e.g. worker
    # processes); hashing in the already-running process is not re-seeded.
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    # Seeds all GPUs, which includes the current one.
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner can select different kernels from run to run,
    # so it has to be off for repeatable results.
    torch.backends.cudnn.benchmark = False
# Seed everything once, before any data splitting or model initialisation.
set_random_seed(Cfg.seed)
# Hugging Face helper that seeds random, numpy and torch for reproducible runs.
transformers.set_seed(Cfg.seed)

### Load The Datasets

In [None]:
# Read the raw train / test CSV files located under Cfg.path.
# os.path.join copes with a path that has no trailing separator.
train = pd.read_csv(os.path.join(Cfg.path, Cfg.train_file))
test = pd.read_csv(os.path.join(Cfg.path, Cfg.test_file))
display(train.head(), test.head())



In [None]:
def preprocess(df, text_col, target_col=None, id_col=None):
    """Rename raw columns to the notebook's canonical names and add a length column.

    The canonical names are ``inputs`` (text), ``label`` (target) and ``id``.
    The frame is modified in place and also returned.

    Args:
        df: DataFrame to normalise.
        text_col: Current name of the text column.
        target_col: Current name of the target column, or None to skip it.
        id_col: Current name of the identifier column, or None to skip it.

    Returns:
        The same DataFrame, with renamed columns and a ``Character Count``
        column holding ``len(str(value))`` of every ``inputs`` entry.
    """
    renames = {}
    if text_col != 'inputs':
        renames[text_col] = 'inputs'
    # Compare against the name we rename *to*; the target always ends up as 'label'.
    if target_col and target_col != 'label':
        renames[target_col] = 'label'
    if id_col and id_col != 'id':
        renames[id_col] = 'id'
    if renames:
        # Names absent from `df` are skipped by rename (errors='ignore' default).
        df.rename(columns=renames, inplace=True)

    df['Character Count'] = df['inputs'].apply(lambda x: len(str(x)))
    print("The longest input has a length of", df['Character Count'].max())

    return df




# Map the raw CSV headers (Word / Pos / Id) onto the canonical column names.
train = preprocess(train, 'Word', target_col='Pos', id_col='Id')
test = preprocess(test, 'Word', target_col='Pos', id_col='Id')


In [None]:
# Encode the string labels as consecutive integer ids.
le = LabelEncoder()
train['label'] = le.fit_transform(train['label'])

# label name -> integer id; the id of a class is its position in le.classes_.
Cfg.label_mappings = {name: int(idx) for idx, name in enumerate(le.classes_)}
# Size of the classification head, read when the model is created.
Cfg.num_classes = len(le.classes_)
print(Cfg.label_mappings)


### Setup The Tokenizer

In [None]:
tokz = AutoTokenizer.from_pretrained(Cfg.model_name)
sep = tokz.sep_token


def tokenize(x):
    """Tokenize the ``inputs`` field of ``x``, truncating to ``Cfg.max_length`` tokens."""
    return tokz(x["inputs"], truncation=True, padding=True, max_length=Cfg.max_length)

### Cross Validation

In [None]:
# Tag every training row with the fold in which it serves as validation data.
folds = StratifiedKFold(n_splits=5)
train['fold'] = -1
fold_col = train.columns.get_loc('fold')
for i, (train_index, test_index) in enumerate(folds.split(train, train['label'])):
    # split() yields positional indices, so write through iloc; loc would
    # interpret them as index labels and only works for a default RangeIndex.
    train.iloc[test_index, fold_col] = i

### Metric

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class of each
            row is the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with ``accuracy`` and macro-averaged ``f1_macro``.
    """
    scores, y_true = eval_pred
    y_pred = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average='macro'),
    }



### Build The Train Function

In [None]:

# `def train` below rebinds the module-level name `train`, which until this
# cell refers to the training DataFrame. Keep the frame reachable under its
# own name first; the isinstance guard makes re-running this cell harmless.
if isinstance(globals().get("train"), pd.DataFrame):
    train_data = train


def train(data=None, n_folds=5):
    """Fine-tune one model per cross-validation fold and report the mean log loss.

    For every fold the model is trained on the remaining folds, the softmaxed
    validation predictions are written to ``outputs/fold{fold}/valid_preds.npy``
    and the model is saved to ``{fold}_{Cfg.project_name}``.

    Args:
        data: DataFrame with ``inputs``, ``label`` and ``fold`` columns.
            Defaults to the training frame captured just above this function.
        n_folds: Number of folds to iterate over (fold ids ``0 .. n_folds - 1``).

    Returns:
        Mean validation log loss over all folds.
    """
    if data is None:
        data = train_data

    # Fall back to the label column when the config entry was never filled in.
    num_labels = Cfg.num_classes or int(data['label'].nunique())
    class_ids = list(range(num_labels))
    use_cuda = torch.cuda.is_available()

    all_valid_losses = []
    for fold in range(n_folds):
        print(f"Fold {fold} ----------------------------------- TRAINING -----------------------------------")
        train_df = data[data['fold'] != fold]
        valid_df = data[data['fold'] == fold]

        train_dataset = Dataset.from_pandas(train_df)
        valid_dataset = Dataset.from_pandas(valid_df)

        train_tokenized_df = train_dataset.map(tokenize, batched=True)
        valid_tokenized_df = valid_dataset.map(tokenize, batched=True)

        model_kwargs = {"num_labels": num_labels}
        dropout = getattr(Cfg, "dropout", None)
        if dropout is not None:
            model_kwargs["hidden_dropout_prob"] = dropout
        model = AutoModelForSequenceClassification.from_pretrained(
            Cfg.model_name, **model_kwargs
        ).to(Cfg.device)

        # NOTE(review): recent transformers releases rename `evaluation_strategy`
        # to `eval_strategy` and Trainer's `tokenizer` to `processing_class` -
        # confirm against the installed version.
        training_args = TrainingArguments(
            # One directory per fold so checkpoints of different folds never mix.
            output_dir = os.path.join(Cfg.project_name, f"fold{fold}"),
            num_train_epochs = Cfg.num_epochs,
            learning_rate = Cfg.learning_rate,
            lr_scheduler_type = Cfg.lr_scheduler,
            warmup_steps = Cfg.warmup_steps,
            per_device_train_batch_size = Cfg.batch_size,
            per_device_eval_batch_size = Cfg.batch_size,
            weight_decay = 0.01,
            evaluation_strategy= Cfg.eval_strategy,
            logging_strategy = Cfg.eval_strategy,
            logging_steps = Cfg.steps,
            save_strategy = Cfg.eval_strategy,
            save_steps = Cfg.steps,
            eval_steps = Cfg.steps,
            log_level = "warning",
            # Mixed precision is only available on GPU.
            fp16 = use_cuda,
            gradient_accumulation_steps = Cfg.accum_steps,
            save_total_limit = 1,
            seed = Cfg.seed,
            # Early stopping needs the best checkpoint to be tracked and restored.
            load_best_model_at_end = True,
            metric_for_best_model = "accuracy"
            )

        trainer = Trainer(
            model = model,
            args = training_args,
            compute_metrics = compute_metrics,
            train_dataset = train_tokenized_df,
            eval_dataset = valid_tokenized_df,
            tokenizer = tokz,
            callbacks = [EarlyStoppingCallback(early_stopping_patience=Cfg.early_stopping_patience)]
            )

        trainer.train()
        print(f"Fold {fold} ----------------------------------- VALIDATING -----------------------------------")
        valid_output = trainer.predict(valid_tokenized_df)
        # Row-wise softmax; scipy's default (axis=None) would normalise over the
        # whole matrix. float64 keeps every row summing to one for log_loss.
        valid_preds = softmax(valid_output.predictions.astype(np.float64), axis=1)
        os.makedirs(f"outputs/fold{fold}", exist_ok=True)
        np.save(f"outputs/fold{fold}/valid_preds.npy", valid_preds)
        trainer.save_model(f"{fold}_{Cfg.project_name}")

        # Calculate log loss for the current fold. Passing the full label set
        # keeps the columns aligned even if a class is missing from this fold.
        valid_labels = valid_df['label'].to_numpy()
        valid_loss = log_loss(valid_labels, valid_preds, labels=class_ids)
        all_valid_losses.append(valid_loss)

        del trainer, model
        if use_cuda:
            # Release cached GPU memory before the next fold's model is loaded.
            torch.cuda.empty_cache()

    # Calculate overall valid_loss
    overall_valid_loss = sum(all_valid_losses) / len(all_valid_losses)
    print(f"Overall Valid Loss: {overall_valid_loss}")
    return overall_valid_loss


### Inference

In [None]:
def inference(n_folds=5):
    """Predict the test set with every fold model and write per-fold and ensembled CSVs.

    With ``Cfg.predict_num == 1`` each fold writes hard labels to
    ``{fold}_{project}.csv`` and the majority vote over folds goes to
    ``mode_{project}.csv``. Otherwise each fold writes softmaxed class
    probabilities to ``{fold}_{project}_probs.csv`` and their mean goes to
    ``average_{project}.csv``. All files are written without the index column.

    Args:
        n_folds: Number of fold models to load (``{fold}_{Cfg.project_name}``).

    Returns:
        DataFrame holding the ensembled predictions that were written to disk.
    """
    test_dataset = Dataset.from_pandas(test)
    test_tokenized_df = test_dataset.map(tokenize, batched=True)
    test_ids = test['id'].to_numpy()

    project_name = Cfg.project_name
    hard_labels = Cfg.predict_num == 1
    class_ids = list(Cfg.label_mappings.values())
    # Reverse the dictionary to map integer ids back to string labels
    name_le_mapping = {v: k for k, v in Cfg.label_mappings.items()}

    test_preds = []
    for fold in range(n_folds):
        model_dir = f"{fold}_{project_name}"
        # The number of labels is restored from the config saved with the model.
        loaded_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)

        test_args = TrainingArguments(
            output_dir = Cfg.path or ".",
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = Cfg.batch_size,
            dataloader_drop_last = False
            )

        trainer = Trainer(
            model = loaded_model,
            args = test_args,
            # Supplying the tokenizer lets the Trainer pad rows of unequal length.
            tokenizer = loaded_tokenizer,
            compute_metrics = compute_metrics
            )

        test_results = trainer.predict(test_tokenized_df)

        if hard_labels:
            fold_pred = test_results.predictions.argmax(axis=1)
            test_preds.append(fold_pred)

            result = pd.DataFrame({'Id': test_ids, 'pred': fold_pred})
            # Map the 'pred' values to their string labels
            result['pred'] = result['pred'].map(name_le_mapping)
            display(result.head())

            result.to_csv(f"{fold}_{project_name}.csv", index=False)
        else:
            # Turn logits into row-wise probabilities before saving / averaging.
            fold_probs = softmax(test_results.predictions.astype(np.float64), axis=1)
            test_preds.append(fold_probs)

            result = pd.DataFrame(fold_probs, columns=class_ids)
            result['Id'] = test_ids
            display(result.head())

            result.to_csv(f"{fold}_{project_name}_probs.csv", index=False)

    if not hard_labels:
        # Save the average of the fold probabilities
        avg_preds = sum(test_preds) / len(test_preds)
        final = pd.DataFrame(avg_preds, columns=class_ids)
        final['Id'] = test_ids
        display(final.head())

        final.to_csv(f"average_{project_name}.csv", index=False)
    else:
        # Majority vote: count, per sample, how many folds chose each class.
        votes = np.vstack(test_preds)                                   # (n_folds, n_samples)
        counts = np.stack([(votes == c).sum(axis=0) for c in class_ids], axis=1)
        # argmax returns the first maximum, so ties go to the earliest class id.
        most_occurred_label = np.asarray(class_ids)[counts.argmax(axis=1)]

        final = pd.DataFrame({'pred': most_occurred_label})
        final['Id'] = test_ids
        # Map the 'pred' values to their string labels
        final['pred'] = final['pred'].map(name_le_mapping)
        display(final.head())

        final.to_csv(f"mode_{project_name}.csv", index=False)

    return final


### Run It

In [None]:
# Compare the variable __name__, not the string literal "__name__" (which can
# never equal "__main__"), so the pipeline actually runs.
if __name__ == "__main__":
    train()
    inference()