# Imports and data loading

In [41]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline, BertModel

from datasets import Dataset

import evaluate

In [46]:
MODEL_DIR = './bert_clf'
MODEL_NAME = 'bert-base-cased'

TRAIN_PATH = r"train_for_student.json"
TEST_PATH = r"test_for_student.json"
RANDOM_STATE = 6969

TRAIN_PATH = os.path.join(os.path.pardir, "data", TRAIN_PATH)
TEST_PATH = os.path.join(os.path.pardir, "data", TEST_PATH)

LABEL_LIST = ['CE','ENV','BME','PE','METAL','ME','EE','CPE','OPTIC','NANO','CHE','MATENG','AGRI','EDU','IE','SAFETY','MATH','MATSCI']

In [26]:
def load_df(path, is_train = True):
    """Load a JSON dataset from ``path`` into a DataFrame with a ``Text`` column.

    The file is read with ``pd.read_json`` and transposed, so after the
    transpose the frame must expose ``Title`` and ``Abstract`` columns (and
    ``Classes`` when ``is_train`` is true).

    Parameters
    ----------
    path : str
        Location of the JSON file to read. (The previous implementation
        ignored this argument and always read ``TRAIN_PATH``, so the test
        split was never actually loaded.)
    is_train : bool, default True
        When true, ``Classes`` is expanded into one 0/1 integer column per
        entry of ``LABEL_LIST`` and then dropped.

    Returns
    -------
    pandas.DataFrame
        A frame with ``Text`` plus, for training data, one indicator column
        per label.
    """
    df = pd.read_json(path).transpose()

    # Only the title is used as model input; appending the abstract is
    # deliberately left disabled:  + " " + df['Abstract']
    df['Text'] = df['Title']
    df = df.drop(columns=['Title', 'Abstract'])

    if is_train:
        # One binary indicator column per label.
        # NOTE(review): assumes each `Classes` value is a collection of label
        # strings. If it were a plain string, `in` would be a substring test
        # (e.g. 'ME' would match 'METAL') - TODO confirm against the data.
        for label in LABEL_LIST:
            df[label] = df['Classes'].apply(lambda x : label in x).astype(int)
        df = df.drop(columns=['Classes'])

    return df

# Process data

In [58]:
def _oversample_minority(df, label_col, random_state):
    """Randomly duplicate rows of under-represented classes up to the majority count.

    The original rows are kept in order; the extra rows, drawn with
    replacement, are appended after them and the index is reset.
    """
    class_counts = df[label_col].value_counts()
    majority_count = class_counts.max()

    parts = [df]
    for cls, count in class_counts.items():
        if count < majority_count:
            extra_rows = df[df[label_col] == cls].sample(
                n=majority_count - count, replace=True, random_state=random_state
            )
            parts.append(extra_rows)

    return pd.concat(parts, ignore_index=True)


def get_train_dataset(label):
    """Build tokenized train/validation/test datasets for one binary label.

    Parameters
    ----------
    label : str
        Name of the indicator column (one of ``LABEL_LIST``) to use as the
        target; it is renamed to ``labels``.

    Returns
    -------
    tuple
        ``(train_dataset, val_dataset, test_dataset)`` as ``datasets.Dataset``
        objects carrying the tokenizer outputs next to ``Text`` and ``labels``.

    Notes
    -----
    The data is split 70 / 15 / 15 with stratification on the label. Only the
    training portion is oversampled, so validation and test keep the original
    class balance. Oversampling is done with pandas because
    ``RandomOverSampler`` was referenced here without ever being imported,
    which raised ``NameError`` on a fresh kernel.
    """
    # load and filter label
    df = load_df(TRAIN_PATH)[['Text', label]].rename(columns={label: 'labels'})

    # split: 70% train, then the remaining 30% is halved into test / validation
    train_df, test_df = train_test_split(df, test_size=0.3, stratify=df['labels'], random_state=RANDOM_STATE)
    test_df, val_df = train_test_split(test_df, test_size=0.5, stratify=test_df['labels'], random_state=RANDOM_STATE)

    # balance the classes in the training split only
    train_df = _oversample_minority(train_df, 'labels', RANDOM_STATE)

    # create dataset
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    test_dataset = Dataset.from_pandas(test_df)

    # tokenize: every example is padded/truncated to exactly 128 tokens
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer_fn = lambda x : tokenizer(x['Text'], max_length=128, truncation=True, padding='max_length')
    train_dataset = train_dataset.map(tokenizer_fn)
    val_dataset = val_dataset.map(tokenizer_fn)
    test_dataset = test_dataset.map(tokenizer_fn)

    return train_dataset, val_dataset, test_dataset

def get_test_dataset():
    """Return the tokenized, unlabeled dataset used for the final predictions.

    Calls ``load_df(TEST_PATH, False)``, keeps only the ``Text`` column and
    tokenizes every example with the ``MODEL_NAME`` tokenizer, padded and
    truncated to 128 tokens.
    """
    text_only = load_df(TEST_PATH, False)[['Text']]

    bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def encode(example):
        # Same tokenization settings as the training datasets.
        return bert_tokenizer(example['Text'], max_length=128, truncation=True, padding='max_length')

    return Dataset.from_pandas(text_only).map(encode)

# Train the models

In [52]:
def compute_metrics(pred):
    """Score predictions with the ``f1`` metric from ``evaluate``.

    ``pred`` must expose ``predictions`` (reduced with ``argmax`` over the
    last axis to obtain class ids) and ``label_ids`` (the references).
    Returns whatever the metric's ``compute`` call produces.
    """
    predicted_classes = pred.predictions.argmax(-1)
    f1_metric = evaluate.load('f1')
    return f1_metric.compute(predictions=predicted_classes, references=pred.label_ids)

In [57]:
model_dict = {}

# Fine-tune one independent binary classifier per label.
for label in LABEL_LIST:
    print(f"Training for {label}")

    # Checkpoints and the final model for this label share one directory.
    label_dir = f"{MODEL_DIR}/{label}"

    # get dataset
    train_dataset, val_dataset, test_dataset = get_train_dataset(label)

    # create model: a fresh pretrained encoder with a 2-class head
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    train_args = TrainingArguments(
        output_dir=label_dir,
        logging_dir=f"{MODEL_DIR}/{label}/logs",
        seed=RANDOM_STATE,
        # optimisation
        num_train_epochs=5,
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # evaluate and checkpoint once per epoch, keeping the best F1 model
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    early_stopping = EarlyStoppingCallback(early_stopping_patience=1)
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    # train
    trainer.train()

    # evaluate on the held-out split returned by get_train_dataset
    eval_result = trainer.evaluate(test_dataset)
    print(eval_result)

    # save model
    model.save_pretrained(label_dir)

Training for CE


Map:   0%|          | 0/317 [00:00<?, ? examples/s]

Map:   0%|          | 0/69 [00:00<?, ? examples/s]

Map:   0%|          | 0/68 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

{'eval_loss': 0.359801322221756, 'eval_f1': 0.0, 'eval_runtime': 6.5466, 'eval_samples_per_second': 10.54, 'eval_steps_per_second': 0.764, 'epoch': 1.0}


KeyboardInterrupt: 

In [53]:
# Training configuration for a short two-epoch run that writes directly
# into MODEL_DIR; evaluation, logging and checkpointing all happen per epoch.
train_arg = TrainingArguments(
    output_dir=MODEL_DIR,
    logging_dir=os.path.join(MODEL_DIR, 'logs'),
    seed=RANDOM_STATE,
    num_train_epochs=2,
    learning_rate=5e-6,
    logging_strategy='epoch',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
)

In [56]:
# NOTE: `model`, `train_dataset` and `val_dataset` are not defined in this
# cell; they must already exist in the kernel (the per-label training loop
# assigns them).
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

trainer = Trainer(
    model=model,
    args=train_arg,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()

  0%|          | 0/80 [00:00<?, ?it/s]

{'loss': 0.2058, 'grad_norm': 3.6584346294403076, 'learning_rate': 2.5e-06, 'epoch': 1.0}


  0%|          | 0/9 [00:00<?, ?it/s]

Checkpoint destination directory ./bert_clf/checkpoint-40 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.20032957196235657, 'eval_f1': 0.0, 'eval_runtime': 6.0812, 'eval_samples_per_second': 11.346, 'eval_steps_per_second': 1.48, 'epoch': 1.0}
{'loss': 0.1954, 'grad_norm': 3.46293306350708, 'learning_rate': 0.0, 'epoch': 2.0}


  0%|          | 0/9 [00:00<?, ?it/s]

Checkpoint destination directory ./bert_clf/checkpoint-80 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.19756199419498444, 'eval_f1': 0.0, 'eval_runtime': 5.6135, 'eval_samples_per_second': 12.292, 'eval_steps_per_second': 1.603, 'epoch': 2.0}
{'train_runtime': 101.6396, 'train_samples_per_second': 6.238, 'train_steps_per_second': 0.787, 'train_loss': 0.2006228268146515, 'epoch': 2.0}


TrainOutput(global_step=80, training_loss=0.2006228268146515, metrics={'train_runtime': 101.6396, 'train_samples_per_second': 6.238, 'train_steps_per_second': 0.787, 'train_loss': 0.2006228268146515, 'epoch': 2.0})

In [55]:
# Score the current trainer on `test_dataset`; the returned metrics are
# rendered as the cell output. Both names come from earlier cells.
trainer.evaluate(test_dataset)

  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.20747779309749603,
 'eval_f1': 0.0,
 'eval_runtime': 2.9828,
 'eval_samples_per_second': 22.797,
 'eval_steps_per_second': 3.017,
 'epoch': 2.0}

In [45]:
# Bare expression: renders the repr of whatever `model` currently holds in the kernel.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
test_dataset = get_test_dataset()
# Materialise the texts once as a plain list for the pipelines below.
test_texts = list(test_dataset['Text'])

# Row ids are "001eval", "002eval", ... - one per test example. The count is
# taken from the data instead of being hard-coded, so a column assignment can
# never fail on a length mismatch.
submission_df = pd.DataFrame(columns=LABEL_LIST,
                             index=[f"{k + 1:03d}eval" for k in range(len(test_texts))])

# Every per-label model was fine-tuned from MODEL_NAME, so a single tokenizer
# instance serves all of them.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

for label in LABEL_LIST:
    model = AutoModelForSequenceClassification.from_pretrained(f"{MODEL_DIR}/{label}")
    pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer)

    # Truncate exactly as during training (max_length=128).
    pred = pipeline(test_texts, truncation=True, max_length=128)

    # The pipeline yields dicts such as {'label': 'LABEL_1', 'score': ...};
    # the submission needs the integer class id, recovered via the model's
    # own label2id mapping.
    submission_df[label] = [model.config.label2id[p['label']] for p in pred]

submission_df.index.name = "id"
submission_df