<a href="https://colab.research.google.com/github/marzinouri/AzeriPipeline/blob/main/Notebooks/Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites

In [None]:
%%capture
!pip install transformers

In [None]:
%%capture
!pip install tqdm

In [None]:
import pandas as pd
import json
import os
import numpy as np
import collections

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

from transformers import BertConfig, BertTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.notebook import tqdm

# Preparing Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Preparing Data
# Each file under src_dir holds one JSON object per line with (at least) the
# keys 'main category' and 'sents'.  The table below maps the category value
# stored in the data to its English class name and integer label id.
CATEGORY_MAP = {
  "ادبیات": ("literature", 0),
  "ایدمان": ("sports", 1),
  "تاریخ": ("history", 2),
  "جوغرافیا": ("geography", 3),
}

texts = []
cats = []
label_ids = []
src_dir = "/content/drive/MyDrive/Azari/Datasets/ClassficationData"
for (root, dirs, files) in os.walk(src_dir):
  for file in files:
    print(file)
    src_f = os.path.join(root, file)
    with open(src_f, "r", encoding='utf-8') as f:
      for line in f:
        # Tolerate blank lines instead of failing inside json.loads.
        if not line.strip():
          continue
        article_dict = json.loads(line)
        mapped = CATEGORY_MAP.get(article_dict['main category'])
        # Skip articles of any other category so the three lists always stay
        # the same length (otherwise the DataFrame construction would fail).
        if mapped is None:
          continue
        cat, label_id = mapped
        texts.append(" ".join(article_dict['sents']))
        cats.append(cat)
        label_ids.append(label_id)

# Build the frame in one go from the aligned lists.
df = pd.DataFrame({'text': texts, 'target': cats, 'label_id': label_ids})

history.txt
literature.txt
sports.txt
geography.txt


In [None]:
len(df)

400

In [None]:
# Hold out 20% of the articles as the test split; stratifying on label_id keeps
# the class proportions equal in both splits, and random_state fixes the split.
train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label_id'])

In [None]:
train.head()

Unnamed: 0,text,target,label_id
278,کونگ‌فو بیر سؤز دیر کی تمام چینی دؤیوش هۆنرلری...,sports,1
77,حسن صباح سلجوقلار چاغیندا قاریشیق سالدی . اونو...,history,2
116,آلاو تخلصی و گئنللیکله احمد شایا آدیلا تانینان...,literature,0
369,دنیز سطحی ؛ آتموسفر ایله دنیز سطحی‌نین بیرلشدی...,geography,3
87,خالخال خانلیغی مرکزی خالخال شهری اولماقلا آذرب...,history,2


# Baseline TF-IDF

In [None]:
# Learn the vocabulary and IDF weights from the training texts only, then
# reuse the fitted vectorizer to transform the test texts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_train_vectors = tfidf_vectorizer.fit_transform(train['text'])
tfidf_test_vectors = tfidf_vectorizer.transform(test['text'])

In [None]:
# SVC with default hyperparameters, trained on the TF-IDF features with the
# string class names (train['target']) as labels.
svc_classifier = SVC(random_state=42)
svc_classifier.fit(tfidf_train_vectors,train['target'])

SVC(random_state=42)

In [None]:
tfidf_y_pred = svc_classifier.predict(tfidf_test_vectors)

In [None]:
print(classification_report(test['target'],tfidf_y_pred))

              precision    recall  f1-score   support

   geography       1.00      0.50      0.67        20
     history       0.77      0.85      0.81        20
  literature       0.69      0.90      0.78        20
      sports       0.82      0.90      0.86        20

    accuracy                           0.79        80
   macro avg       0.82      0.79      0.78        80
weighted avg       0.82      0.79      0.78        80



# Baseline Fasttext

In [None]:
%%capture

!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip v0.9.2.zip
%cd fastText-0.9.2
!make
!pip install .

In [None]:
import fasttext

ft = fasttext.load_model('/content/drive/MyDrive/Azari/Models/Fasttext/v4_model_300.bin')



In [None]:
def mean_vector(vectors):
  """Return the element-wise arithmetic mean of a sequence of vectors.

  Args:
    vectors: Non-empty sized sequence (e.g. a list) of equally shaped
      vectors or numbers that support ``+`` and division by an int.

  Returns:
    ``sum(vectors) / len(vectors)``.

  Raises:
    ValueError: If ``vectors`` is empty (for example a document that
      contains no tokens), since the mean is undefined in that case.
  """
  num_vectors = len(vectors)
  if num_vectors == 0:
    raise ValueError("mean_vector() requires at least one vector")
  # Built-in sum() starts from 0 and relies on the elements' own `+`.
  return sum(vectors) / num_vectors

In [None]:
# Represent every training document by the average of the fastText vectors of
# its whitespace-separated tokens.
doc_embeddings = [
  mean_vector([ft.get_word_vector(token) for token in document.split()])
  for document in train['text']
]

In [None]:
# Same averaging of fastText token vectors, applied to the test documents.
doc_embeddings_test = [
  mean_vector([ft.get_word_vector(token) for token in document.split()])
  for document in test['text']
]

In [None]:
# Convert the lists of per-document mean vectors into NumPy arrays
# (one entry per document) so they can be fed to scikit-learn.
fasttext_train_vectors = np.array(doc_embeddings)
fasttext_test_vectors = np.array(doc_embeddings_test)

In [None]:
# from sklearn.linear_model import LogisticRegression

# lr_clf = LogisticRegression(random_state=42)
# lr_clf.fit(fasttext_train_vectors,train['label_id'])

# lr_tfidf_y_pred = lr_clf.predict(fasttext_test_vectors)
# print(classification_report(test['label_id'],lr_tfidf_y_pred))

In [None]:
# NOTE(review): this rebinds `svc_classifier`, replacing the TF-IDF classifier
# trained earlier; from here on the name refers to the fastText-feature model.
svc_classifier = SVC()
svc_classifier.fit(fasttext_train_vectors,train['target'])

SVC()

In [None]:
y_pred = svc_classifier.predict(fasttext_test_vectors)

In [None]:
print(classification_report(test['target'],y_pred))

              precision    recall  f1-score   support

   geography       1.00      0.75      0.86        20
     history       0.84      0.80      0.82        20
  literature       0.76      0.95      0.84        20
      sports       0.90      0.95      0.93        20

    accuracy                           0.86        80
   macro avg       0.88      0.86      0.86        80
weighted avg       0.88      0.86      0.86        80



In [None]:
cnf_matrix = confusion_matrix(test['target'],y_pred)
cnf_matrix

array([[15,  2,  1,  2],
       [ 0, 16,  4,  0],
       [ 0,  1, 19,  0],
       [ 0,  0,  1, 19]])

# BERT

Acknowledgment: This portion of the code is based on the work available at [ParsBERT](https://github.com/hooshvare/parsbert).

In [None]:
# Query CUDA availability once and derive the torch device from the answer.
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if train_on_gpu else "cpu")
print(f'device: {device}')

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

device: cuda:0
CUDA is available!  Training on GPU ...


In [None]:
# general config
# Maximum number of tokens per article: WikiDataset truncates longer inputs
# and pads shorter ones to exactly this length.
MAX_LEN = 64
# Batch sizes for the data loaders.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

# Number of passes over the training data.
EPOCHS = 10
# Passed to train_op as `print_every_step`: validation runs every this many
# optimizer steps (despite the name, the unit is steps, not epochs).
EEVERY_EPOCH = 20
# Initial learning rate for the optimizer, i.e. 2.75e-5.
LEARNING_RATE = 275e-7

# Maximum gradient norm; train_op only clips when this is greater than 0.
CLIP = 0.0

# Directory the tokenizer and BertConfig are loaded from.
model_path = "/content/drive/MyDrive/Azari/Models/AzerBert_v2"
# Checkpoint directory ClassificationModel loads the encoder weights from.
MODEL_NAME_OR_PATH = os.path.join(model_path, "checkpoint-11630")
# File the training callback writes the best model state_dict to.
OUTPUT_PATH = '/content/drive/MyDrive/Azari/Models/Classificaion/final.bin'

# Create the output directory up front so saving cannot fail on a missing folder.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# Build the two lookup tables between class names and integer class ids;
# a label's position in `labels` is its id.
labels = ["literature", "sports", "history", "geography"]
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'literature': 0, 'sports': 1, 'history': 2, 'geography': 3}
id2label: {0: 'literature', 1: 'sports', 2: 'history', 3: 'geography'}


In [None]:
# Tokenizer and model configuration are both read from the base model
# directory; the label maps are attached to the config as overrides.
tokenizer = BertTokenizer.from_pretrained(model_path)
config = BertConfig.from_pretrained(
    model_path,
    label2id=label2id,
    id2label=id2label,
)
print(config.to_json_string())

{
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "literature",
    "1": "sports",
    "2": "history",
    "3": "geography"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "geography": 3,
    "history": 2,
    "literature": 0,
    "sports": 1
  },
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 64,
  "max_position_embeddings": 512,
  "model_max_length": 64,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token": "[PAD]",
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "sep_token": "[SEP]",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "unk_token": "[UNK]",
  "use_cache": true,
  "vocab_size": 30522
}



In [None]:
class WikiDataset(torch.utils.data.Dataset):
    """PyTorch dataset that tokenizes Wiki articles for a BERT-style model.

    Args:
        tokenizer: Object exposing ``encode_plus`` (a Hugging Face tokenizer).
        articles: Indexable collection of article texts.
        targets: Optional ``list`` or ``numpy.ndarray`` of labels aligned with
            ``articles``.  Each label is either a name contained in
            ``label_list`` or an integer class id.  Any other value
            (including ``None``) means the dataset has no targets.
        label_list: Optional ``list`` of label names; a name's position in the
            list is its class id.
        max_len: Length every encoded article is truncated/padded to.
    """

    def __init__(self, tokenizer, articles, targets=None, label_list=None, max_len=64):
        self.articles = articles
        self.targets = targets
        self.has_target = isinstance(targets, list) or isinstance(targets, np.ndarray)

        self.tokenizer = tokenizer
        self.max_len = max_len

        self.label_map = {label: i for i, label in enumerate(label_list)} if isinstance(label_list, list) else {}

    def __len__(self):
        return len(self.articles)

    def _encode_target(self, raw_target):
        """Translate one raw target into an integer class id.

        Label names are looked up in ``label_map``; integer targets are taken
        as already-encoded class ids.

        Raises:
            ValueError: If the target is neither a known label name nor an
                integer.
        """
        key = str(raw_target)
        if key in self.label_map:
            return self.label_map[key]
        # bool is a subclass of int but is not a meaningful class id.
        if isinstance(raw_target, (int, np.integer)) and not isinstance(raw_target, bool):
            return int(raw_target)
        raise ValueError(
            'Unknown target {!r}; expected an integer class id or one of {}'.format(
                raw_target, list(self.label_map)))

    def __getitem__(self, item):
        """Return the encoded tensors (and the target, if any) for one article."""
        article = str(self.articles[item])

        encoding = self.tokenizer.encode_plus(
            article,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

        # flatten() drops the leading batch dimension added by return_tensors='pt'.
        inputs = {
            'article': article,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

        if self.has_target:
            target = self._encode_target(self.targets[item])
            inputs['targets'] = torch.tensor(target, dtype=torch.long)

        return inputs


def create_data_loader(x, y, tokenizer, max_len, batch_size, label_list, shuffle=False):
    """Wrap articles (and optional targets) in a ``WikiDataset`` and a ``DataLoader``.

    Args:
        x: Article texts.
        y: Targets aligned with ``x``, or ``None`` for inference.
        tokenizer: Tokenizer handed to ``WikiDataset``.
        max_len: Length every encoded article is truncated/padded to.
        batch_size: Number of samples per batch.
        label_list: Label names handed to ``WikiDataset`` (may be ``None``).
        shuffle: Whether the loader reshuffles the samples every epoch.
            Defaults to ``False``, which keeps the input order.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    dataset = WikiDataset(
        articles=x,
        targets=y,
        tokenizer=tokenizer,
        max_len=max_len,
        label_list=label_list)

    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
labels = ["literature", "sports", "history", "geography"]
# Each loader uses the batch-size constant that matches its split.
train_data_loader = create_data_loader(train['text'].to_numpy(), train['target'].to_numpy(), tokenizer, MAX_LEN, TRAIN_BATCH_SIZE, labels)
test_data_loader = create_data_loader(test['text'].to_numpy(), test['target'].to_numpy(), tokenizer, MAX_LEN, TEST_BATCH_SIZE, labels)

In [None]:
def setup_seed(seed):
    """Seed every random number generator in use so that runs are repeatable.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed shared by all generators.
    """
    import random  # local import keeps this cell self-contained

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels from run to run, so turn it off.
    torch.backends.cudnn.benchmark = False


setup_seed(42)

In [None]:
class ClassificationModel(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    The encoder weights are loaded from ``MODEL_NAME_OR_PATH``; the dropout
    rate, hidden size and number of labels are read from ``config``.
    """

    def __init__(self, config):
        super().__init__()

        # return_dict=False makes the encoder return a plain tuple.
        self.bert = BertModel.from_pretrained(MODEL_NAME_OR_PATH, return_dict=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the unnormalized class scores (logits) for a batch."""
        _sequence_output, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)
        return self.classifier(self.dropout(pooled_output))

In [None]:
# Build the classifier and place it on the selected device in one step.
pt_model = ClassificationModel(config=config).to(device)

print('pt_model', type(pt_model))

Some weights of the model checkpoint at /content/drive/MyDrive/Azari/Models/AzerBert_v2/checkpoint-11630 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/Azari/Models/AzerBert_v2/checkpoint-1

pt_model <class '__main__.ClassificationModel'>


In [None]:
def simple_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``."""
    hits = (y_true == y_pred)
    return hits.mean()

def acc_and_f1(y_true, y_pred, average='weighted'):
    """Return accuracy and F1 for the given labels as ``{'acc': ..., 'f1': ...}``.

    ``average`` is forwarded to scikit-learn's ``f1_score``.
    """
    return dict(
        acc=simple_accuracy(y_true, y_pred),
        f1=f1_score(y_true=y_true, y_pred=y_pred, average=average),
    )

def y_loss(y_true, y_pred, losses):
    """Convert collected labels to NumPy arrays and average the losses.

    Args:
        y_true: List of tensors holding the expected labels.
        y_pred: List of tensors holding the predicted labels.
        losses: Per-batch loss values.

    Returns:
        ``([y_true_array, y_pred_array], mean_loss)``.
    """
    def _as_numpy(tensors):
        return torch.stack(tensors).cpu().detach().numpy()

    y = [_as_numpy(y_true), _as_numpy(y_pred)]
    return y, np.mean(losses)


def eval_op(model, data_loader, loss_fn):
    """Evaluate ``model`` on every batch of ``data_loader`` without gradients.

    Puts the model in eval mode, collects the predicted and expected labels
    plus the per-batch loss, and returns ``([y_true, y_pred], mean_loss)`` as
    produced by ``y_loss``.
    """
    model.eval()

    batch_losses, predicted, expected = [], [], []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader), desc="Evaluation... "):
            # move the batch tensors to the selected device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            targets = batch['targets'].to(device)

            # forward pass: one row of class scores per sample
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            # index of the highest score is the predicted class
            preds = torch.max(logits, dim=1)[1]

            # record the batch loss
            batch_losses.append(loss_fn(logits, targets).item())

            predicted.extend(preds)
            expected.extend(targets)

    return y_loss(expected, predicted, batch_losses)


def train_op(model,
             data_loader,
             loss_fn,
             optimizer,
             scheduler,
             step=0,
             print_every_step=5,
             eval=False,
             eval_cb=None,
             eval_loss_min=np.inf,
             eval_data_loader=None,
             clip=0.0):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Model to optimize; called with ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        data_loader: Yields batches holding those tensors plus ``targets``.
        loss_fn: Loss applied to ``(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        step: Global step count before this pass; incremented per batch.
        print_every_step: When ``eval`` is true, run validation every this
            many global steps.
        eval: Whether to run periodic validation during the pass.
        eval_cb: Optional callable invoked after each validation as
            ``eval_cb(model, step, train_score, train_loss, eval_score,
            eval_loss, eval_loss_min)``; its return value becomes the new
            ``eval_loss_min``.
        eval_loss_min: Lowest validation loss seen so far.
        eval_data_loader: Loader used for validation.
        clip: Maximum gradient norm; clipping is skipped when ``<= 0``.

    Returns:
        ``(train_y, train_loss, step, eval_loss_min)`` where ``train_y`` is
        ``[y_true, y_pred]`` for the whole pass and ``train_loss`` the mean
        batch loss.
    """
    model.train()

    losses = []
    y_pred = []
    y_true = []

    for dl in tqdm(data_loader, total=len(data_loader), desc="Training... "):
        step += 1

        # move the batch tensors to the selected device
        input_ids = dl['input_ids'].to(device)
        attention_mask = dl['attention_mask'].to(device)
        token_type_ids = dl['token_type_ids'].to(device)
        targets = dl['targets'].to(device)

        # clear the gradients of all optimized variables
        optimizer.zero_grad()

        # compute predicted outputs by passing inputs to the model
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        # index of the highest score is the predicted class
        _, preds = torch.max(outputs, dim=1)

        # calculate and record the batch loss
        loss = loss_fn(outputs, targets)
        losses.append(loss.item())

        # compute gradient of the loss with respect to model parameters
        loss.backward()

        # optionally bound the gradient norm before the update
        if clip > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

        # update the weights, then the learning rate
        optimizer.step()
        scheduler.step()

        y_pred.extend(preds)
        y_true.extend(targets)

        if eval and step % print_every_step == 0:
            # Running train metrics are only needed on evaluation steps, so
            # they are computed here rather than after every batch.
            train_y, train_loss = y_loss(y_true, y_pred, losses)
            train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

            eval_y, eval_loss = eval_op(model, eval_data_loader, loss_fn)
            eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

            if callable(eval_cb):
                eval_loss_min = eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min)

            # eval_op() switches the model to eval mode; switch back so the
            # remaining batches of this pass still train with dropout enabled.
            model.train()

    train_y, train_loss = y_loss(y_true, y_pred, losses)

    return train_y, train_loss, step, eval_loss_min

In [None]:
# AdamW here is the implementation imported from `transformers`
# (it is the one that accepts `correct_bias`).
optimizer = AdamW(pt_model.parameters(), lr=LEARNING_RATE, correct_bias=False)
# One scheduler step per batch: the rate decays linearly over all batches of all epochs.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()

# Global step counter, best validation loss so far and per-epoch metric history.
step = 0
# `np.inf` is the spelling that exists in every NumPy version (the `np.Inf`
# alias was removed in NumPy 2.0).
eval_loss_min = np.inf
history = collections.defaultdict(list)


def eval_callback(epoch, epochs, output_path):
    """Create the validation callback used by ``train_op`` for one epoch.

    The returned ``eval_cb`` prints a one-line progress report and, when the
    validation loss is not higher than the best seen so far, saves the model's
    ``state_dict`` to ``output_path``.  It returns the (possibly updated)
    minimum validation loss.
    """
    def eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min):
        report_parts = [
            'Epoch: {}/{}...'.format(epoch, epochs),
            'Step: {}...'.format(step),
            'Train Loss: {:.6f}...'.format(train_loss),
            'Train Acc: {:.3f}...'.format(train_score['acc']),
            'Valid Loss: {:.6f}...'.format(eval_loss),
            'Valid Acc: {:.3f}...'.format(eval_score['acc']),
        ]
        print(''.join(report_parts))

        best_loss = eval_loss_min
        if eval_loss <= eval_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                eval_loss_min,
                eval_loss))

            # checkpoint the improved weights
            torch.save(model.state_dict(), output_path)
            best_loss = eval_loss

        return best_loss

    return eval_cb


for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs... "):
    # One training pass.  Inside train_op, validation runs every EEVERY_EPOCH
    # global steps and the callback saves the model when the loss improves.
    # NOTE(review): test_data_loader doubles as the validation set here.
    epoch_cb = eval_callback(epoch, EPOCHS, OUTPUT_PATH)
    train_y, train_loss, step, eval_loss_min = train_op(
        pt_model,
        train_data_loader,
        loss_fn,
        optimizer,
        scheduler,
        step=step,
        print_every_step=EEVERY_EPOCH,
        eval=True,
        eval_cb=epoch_cb,
        eval_loss_min=eval_loss_min,
        eval_data_loader=test_data_loader,
        clip=CLIP)

    train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

    # End-of-epoch evaluation on the same loader.
    eval_y, eval_loss = eval_op(pt_model, test_data_loader, loss_fn)
    eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

    # Record this epoch's metrics.
    epoch_metrics = {
        'train_acc': train_score['acc'],
        'train_loss': train_loss,
        'val_acc': eval_score['acc'],
        'val_loss': eval_loss,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)



Epochs... :   0%|          | 0/10 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Epoch: 2/10...Step: 20...Train Loss: 0.804267...Train Acc: 0.616...Valid Loss: 0.751244...Valid Acc: 0.625...
Validation loss decreased (inf --> 0.751244).  Saving model ...


Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Epoch: 4/10...Step: 40...Train Loss: 0.405486...Train Acc: 0.772...Valid Loss: 0.739999...Valid Acc: 0.650...
Validation loss decreased (0.751244 --> 0.739999).  Saving model ...


Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Epoch: 6/10...Step: 60...Train Loss: 0.185445...Train Acc: 0.931...Valid Loss: 0.369946...Valid Acc: 0.863...
Validation loss decreased (0.739999 --> 0.369946).  Saving model ...


Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Epoch: 8/10...Step: 80...Train Loss: 0.061912...Train Acc: 0.981...Valid Loss: 0.376538...Valid Acc: 0.900...


Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Training... :   0%|          | 0/10 [00:00<?, ?it/s]

Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

Epoch: 10/10...Step: 100...Train Loss: 0.066977...Train Acc: 0.978...Valid Loss: 0.449041...Valid Acc: 0.887...


Evaluation... :   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def predict(model, articles, tokenizer, max_len=64, batch_size=32):
    """Classify ``articles`` with ``model``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``; returns one row of class scores per article.
        articles: Article texts to classify.
        tokenizer: Tokenizer handed to ``create_data_loader``.
        max_len: Length every encoded article is truncated/padded to.
        batch_size: Number of articles per forward pass.

    Returns:
        ``(predictions, prediction_probs)`` as NumPy arrays: the predicted
        class index per article and the softmax of the scores per article.
    """
    loader = create_data_loader(articles, None, tokenizer, max_len, batch_size, None)

    collected_preds = []
    collected_probs = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader, position=0):
            # move the batch tensors to the selected device and run the model
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device))

            # index of the highest score is the predicted class
            collected_preds.extend(torch.max(logits, dim=1)[1])
            collected_probs.extend(F.softmax(logits, dim=1))

    predictions = torch.stack(collected_preds).cpu().detach().numpy()
    prediction_probs = torch.stack(collected_probs).cpu().detach().numpy()

    return predictions, prediction_probs

In [None]:
# Score the fine-tuned model on the held-out test split, using the same
# sequence length and batch size constants as the rest of the notebook.
test_articles = test['text'].to_numpy()
preds, probs = predict(pt_model, test_articles, tokenizer, max_len=MAX_LEN, batch_size=TEST_BATCH_SIZE)

print(preds.shape, probs.shape)

y_test, y_pred = test['label_id'].to_numpy(), preds

print(f'F1: {f1_score(y_test, y_pred, average="weighted")}')
print()
# Passing `labels` pins report row i to class id i, so the names stay aligned
# even if some class never shows up in y_test or y_pred.
print(classification_report(y_test, y_pred, labels=list(range(len(labels))), target_names=labels))

  0%|          | 0/3 [00:00<?, ?it/s]

(80,) (80, 4)
F1: 0.8864367614367612

              precision    recall  f1-score   support

  literature       0.91      1.00      0.95        20
      sports       0.79      0.95      0.86        20
     history       0.94      0.80      0.86        20
   geography       0.94      0.80      0.86        20

    accuracy                           0.89        80
   macro avg       0.90      0.89      0.89        80
weighted avg       0.90      0.89      0.89        80



# Load Model
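Before running this section in a fresh session, first run the cells that define `ClassificationModel`, `predict`, and `create_data_loader` (together with the imports, configuration, and data-preparation cells they depend on).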

In [None]:
model = ClassificationModel(config=config)
model = model.to(device)

# Load the weights saved during training.  OUTPUT_PATH is the file the
# training callback wrote; map_location lets a checkpoint saved on a GPU
# load on a CPU-only runtime as well.
model.load_state_dict(torch.load(OUTPUT_PATH, map_location=device))
model.eval()

Some weights of the model checkpoint at /content/drive/MyDrive/Azari/Models/AzerBert_v2/checkpoint-11630 were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at /content/drive/MyDrive/Azari/Models/AzerBert_v2/checkpoint-1

93377280


ClassificationModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(10000, 768, padding_idx=0)
      (position_embeddings): Embedding(64, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [None]:
import torch

# Total number of elements across all parameter tensors of the loaded model.
num_params = sum(p.numel() for p in model.parameters())
print(num_params)


93380356


In [None]:
# Score the reloaded model on the held-out test split, using the same
# sequence length and batch size constants as the rest of the notebook.
test_articles = test['text'].to_numpy()
preds, probs = predict(model, test_articles, tokenizer, max_len=MAX_LEN, batch_size=TEST_BATCH_SIZE)

print(preds.shape, probs.shape)

y_test, y_pred = test['label_id'].to_numpy(), preds

print(f'F1: {f1_score(y_test, y_pred, average="weighted")}')
print()
# Passing `labels` pins report row i to class id i, so the names stay aligned
# even if some class never shows up in y_test or y_pred.
print(classification_report(y_test, y_pred, labels=list(range(len(labels))), target_names=labels))

  0%|          | 0/3 [00:00<?, ?it/s]

(80,) (80, 4)
F1: 0.8864367614367612

              precision    recall  f1-score   support

  literature       0.91      1.00      0.95        20
      sports       0.79      0.95      0.86        20
     history       0.94      0.80      0.86        20
   geography       0.94      0.80      0.86        20

    accuracy                           0.89        80
   macro avg       0.90      0.89      0.89        80
weighted avg       0.90      0.89      0.89        80

