# Classification by the nb-BERT-large Model Prior to Fine-tuning
In this notebook we: 

1. Let the model classify the gender of artist in the test subset Thomas Treo concert reviews (30%) without having undergone any fine-tuning.

2. Extraxt a classification report for this model.

Following this, the Intergrated Gradient (IG) notebook for this model (`IG_pretrained.ipynb)`) can be ranned. 


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Initialize GPU
!nvidia-smi

Wed Dec 20 08:17:33 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
#### Install packages:
!pip install -q transformers transformers-interpret datasets evaluate tensorflow spacy spacy_langdetect numpy

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/45.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.8/45.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m998.1/998.1 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py

In [4]:
### Importing packages:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer, AutoModelForPreTraining, Trainer, TrainingArguments, AutoModelForSequenceClassification
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import EarlyStoppingCallback
import numpy as np

#### Data setup and loading:

In [5]:
### loading data:
train_path = "/content/drive/MyDrive/Colab_Notebooks/NLP_EX_COLAB/nb_large_model/train.csv"
df_train = pd.read_csv(train_path)

test_path = "/content/drive/MyDrive/Colab_Notebooks/NLP_EX_COLAB/nb_large_model/test.csv"
df_test = pd.read_csv(test_path)

val_path = "/content/drive/MyDrive/Colab_Notebooks/NLP_EX_COLAB/nb_large_model/valid.csv"
df_val = pd.read_csv(val_path)

train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_val)
test_dataset = Dataset.from_pandas(df_test)

data_set = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
    })

### Loading model and tokenizer
The model and corresponding tokenizer is loaded from `AutoTokenizer.from_pretrained`. Then we use the tokenizer in a function in which padding and truncation is set to True. This means that we pad to the maximum token length of each text which is 512 here. Although truncation is set to True no actual truncation takes place, as texts with a token length above 512 has been splittet already.

In [7]:
# Loading model and tokenizer
model_ckpt = "NbAiLab/nb-bert-large"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
num_labels=2
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels) 

tokenizer_config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/395k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [9]:
### tokenizing data:
dat_tokenized = data_set.map(tokenize, batched=True)

Map:   0%|          | 0/919 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Map:   0%|          | 0/461 [00:00<?, ? examples/s]

### Evaluating the model without any fine-tuning on the classification task
The model is prior any fine-tuning now asked to classify the gender of the artist in the test data of the Thomas Treo reviews.

In [31]:
import numpy as np
import torch
from torch.utils.data import DataLoader

# Function to compute metrics and collect individual predictions
def compute_metrics_and_predictions(logits, labels):
    predictions = np.argmax(logits, axis=-1)

    # Collect individual predictions
    individual_results = []
    for i in range(len(predictions)):
        softmax_logits = torch.softmax(torch.tensor(logits[i]), dim=0)
        probabilities = softmax_logits.numpy()
        individual_result = {
            "text_inx": i,
            "Predicted Labels": predictions[i],
            "True Labels": labels[i],
            "Logits Values": logits[i],
            "Probabilities": probabilities,
            "Missclassification": predictions[i] == labels[i]
        }
        individual_results.append(individual_result)

# Prepare test data and DataLoader
test_features = dat_tokenized["test"].remove_columns(['Unnamed: 0'])
test_features.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_loader = DataLoader(test_features, batch_size=16)

# Evaluation
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

all_individual_results = []

for batch in test_loader:
    b_input_ids = batch["input_ids"].to(device)
    b_input_mask = batch["attention_mask"].to(device)
    b_labels = batch["label"].to(device)

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    metrics = compute_metrics_and_predictions(logits, label_ids)
    all_individual_results.extend(metrics["individual_results"])

results_df = pd.DataFrame(all_individual_results)
print(metrics)



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

{'accuracy': 0.5384615384615384, 'precision': 0.28994082840236685, 'recall': 0.5384615384615384, 'f1': 0.37692307692307697, 'individual_results': [{'text_inx': 0, 'Predicted Labels': 1, 'True Labels': 0, 'Logits Values': array([-0.9167207 ,  0.32106003], dtype=float32), 'Probabilities': array([0.22482254, 0.7751775 ], dtype=float32), 'Missclassification': False}, {'text_inx': 1, 'Predicted Labels': 1, 'True Labels': 1, 'Logits Values': array([-1.0696636 ,  0.32849875], dtype=float32), 'Probabilities': array([0.19810787, 0.8018921 ], dtype=float32), 'Missclassification': True}, {'text_inx': 2, 'Predicted Labels': 1, 'True Labels': 1, 'Logits Values': array([-1.090285  ,  0.29638726], dtype=float32), 'Probabilities': array([0.19993953, 0.80006045], dtype=float32), 'Missclassification': True}, {'text_inx': 3, 'Predicted Labels': 1, 'True Labels': 1, 'Logits Values': array([-1.020367 ,  0.3142469], dtype=float32), 'Probabilities': array([0.20839721, 0.79160285], dtype=float32), 'Missclassi

  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
results_df['Predicted Labels'] = results_df['Predicted Labels'].replace({0: 'Female', 1: 'Male'})
results_df['True Labels'] = results_df['True Labels'].replace({0: 'Female', 1: 'Male'})
results_df['Missclassification'] = results_df['Missclassification'].replace({True: 'TRUE', False: 'MISS'})

Unnamed: 0,text_inx,Predicted Labels,True Labels,Logits Values,Probabilities,Missclassification
0,0,Male,Male,"[-0.9680173, 0.34483105]","[0.2120106, 0.7879894]",TRUE
1,1,Male,Male,"[-0.908005, 0.3307311]","[0.22465606, 0.77534395]",TRUE
2,2,Male,Male,"[-1.0503737, 0.34982973]","[0.19778384, 0.80221623]",TRUE
3,3,Male,Female,"[-0.9380069, 0.27351972]","[0.22943105, 0.77056897]",MISS
4,4,Male,Male,"[-0.9296245, 0.5269984]","[0.1889844, 0.8110156]",TRUE
...,...,...,...,...,...,...
456,8,Male,Male,"[-0.69235486, 0.07388845]","[0.3172923, 0.68270767]",TRUE
457,9,Male,Female,"[-0.9792756, 0.28949726]","[0.21946739, 0.7805326]",MISS
458,10,Male,Male,"[-1.0313607, 0.32327595]","[0.20511337, 0.79488665]",TRUE
459,11,Male,Female,"[-0.9310722, 0.37079084]","[0.21385165, 0.78614837]",MISS


#### Classification report:

In [24]:
from sklearn.metrics import classification_report

In [36]:
# Extracting true labels and predictions
true_labels = [result['True Labels'] for result in all_individual_results]
predicted_labels = [result['Predicted Labels'] for result in all_individual_results]

# Generating classification report
report = classification_report(true_labels, predicted_labels, target_names=['female', 'male'])

print(report)


              precision    recall  f1-score   support

      female       0.00      0.00      0.00       133
        male       0.71      0.99      0.83       328

    accuracy                           0.71       461
   macro avg       0.36      0.50      0.41       461
weighted avg       0.51      0.71      0.59       461



#### Saving model performance

In [34]:
results_df.to_csv("/content/drive/MyDrive/Colab_Notebooks/NLP_EX_COLAB/eval_outputs/PRETRAIN_NBL_df_classification_report.csv")
metrics_df.to_csv("/content/drive/MyDrive/Colab_Notebooks/NLP_EX_COLAB/eval_outputs/PRETRAIN_NBL_df_classification_metrics_test.csv")