# Include and load data

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

import tensorflow as tf
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline

from datasets import Dataset

import evaluate

In [120]:
MODEL_DIR = './bert_clf'
MODEL_NAME = 'michellejieli/NSFW_text_classifier'


TRAIN_PATH = r"train_for_student.json"
TEST_PATH = r"test_for_student.json"
RANDOM_STATE = 6969

TRAIN_PATH = os.path.join(os.path.pardir, "data", TRAIN_PATH)
TEST_PATH = os.path.join(os.path.pardir, "data", TEST_PATH)

LABEL_LIST = ['CE','ENV','BME','PE','METAL','ME','EE','CPE','OPTIC','NANO','CHE','MATENG','AGRI','EDU','IE','SAFETY','MATH','MATSCI']

In [121]:
def load_df(path, is_train = True):
    df = pd.read_json(path).transpose()

    df['Text'] = df['Title'] + " " + df['Abstract']
    df.drop(columns=['Title', 'Abstract'], inplace=True)

    if is_train:
        # add label
        df['labels'] = df['Classes'].apply(lambda x : [1.0 if LABEL_LIST[i] in x else 0.0 for i in range(len(LABEL_LIST))])

        df.drop(columns=['Classes'], inplace=True)
    return df

# Process data

In [144]:
def get_train_dataset():
    # load and filter label
    df = load_df(TRAIN_PATH)
    df = pd.concat([df])

    df['labels_idx'] = df['labels'].apply(lambda x : np.array([x[i]*2**i for i in range(len(LABEL_LIST))]).sum())

    # split 
    train_df, test_df = train_test_split(df, 
                                         test_size=0.2, 
                                        #  stratify=df['labels_idx'], 
                                         random_state=RANDOM_STATE)
    test_df, val_df = train_test_split(test_df, 
                                       test_size=0.5, 
                                    #    stratify=test_df['labels_idx'], 
                                       random_state=RANDOM_STATE)
    train_df.drop(columns=['labels_idx'], inplace=True)
    val_df.drop(columns=['labels_idx'], inplace=True)
    test_df.drop(columns=['labels_idx'], inplace=True)

    # create dataset
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    test_dataset = Dataset.from_pandas(test_df)

    # tokenize
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, problem_type="multi_label_classification")
    tokenizer_fn = lambda x : tokenizer(x['Text'], truncation=True, padding='max_length', max_length=512)
    train_dataset = train_dataset.map(tokenizer_fn)
    val_dataset = val_dataset.map(tokenizer_fn)
    test_dataset = test_dataset.map(tokenizer_fn)

    return train_dataset, val_dataset, test_dataset

def get_test_dataset():
    df = load_df(TEST_PATH, is_train=False)
    test_dataset = Dataset.from_pandas(df)

    # tokenize
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, problem_type="multi_label_classification")
    tokenizer_fn = lambda x : tokenizer(x['Text'], truncation=True, padding='max_length', max_length=512)
    test_dataset = test_dataset.map(tokenizer_fn)

    return test_dataset

In [145]:
train_dataset, val_dataset, test_dataset = get_train_dataset()

Map:   0%|          | 0/363 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

# Make model

In [146]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, 
                                                           num_labels = 18, 
                                                           problem_type="multi_label_classification",
                                                           ignore_mismatched_sizes=True)

model.classifier = torch.nn.Sequential(
    torch.nn.Linear(in_features=768, out_features=18, bias=True),
    torch.nn.Sigmoid()
)

# freeze bert layers
for param in model.base_model.parameters():
    param.requires_grad = False


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at michellejieli/NSFW_text_classifier and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([18, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([18]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [147]:
def compute_metrics(pred):
    metric = evaluate.load('f1', 'multilabel')
    preds = (pred.predictions > 0.5).astype(int)
    labels = pred.label_ids.astype(int)
    return metric.compute(predictions=preds, references=labels, average='macro')

In [148]:
train_arg = TrainingArguments(output_dir=os.path.join(MODEL_DIR),
                              logging_dir=os.path.join(MODEL_DIR, 'logs'),
                              logging_strategy='epoch',
                              logging_steps=100,
                              num_train_epochs=5,
                              learning_rate=5e-4,
                              seed=RANDOM_STATE,
                              save_strategy='epoch',
                              save_steps=100,
                              evaluation_strategy='epoch',
                              eval_steps=100,
                              load_best_model_at_end=True)

In [149]:
trainer = Trainer(model=model,
                  args=train_arg,
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset,
                  compute_metrics=compute_metrics,
                  callbacks=[EarlyStoppingCallback(early_stopping_patience=5)])

trainer.train()

  0%|          | 0/230 [00:00<?, ?it/s]

{'loss': 0.4327, 'grad_norm': 1.4759794473648071, 'learning_rate': 0.0004, 'epoch': 1.0}


  0%|          | 0/6 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Checkpoint destination directory ./bert_clf/checkpoint-46 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.3959414064884186, 'eval_f1': 0.043771043771043766, 'eval_runtime': 6.844, 'eval_samples_per_second': 6.721, 'eval_steps_per_second': 0.877, 'epoch': 1.0}
{'loss': 0.4046, 'grad_norm': 4.521640777587891, 'learning_rate': 0.0003, 'epoch': 2.0}


  0%|          | 0/6 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Checkpoint destination directory ./bert_clf/checkpoint-92 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.44109755754470825, 'eval_f1': 0.03888888888888889, 'eval_runtime': 6.7303, 'eval_samples_per_second': 6.835, 'eval_steps_per_second': 0.891, 'epoch': 2.0}
{'loss': 0.3968, 'grad_norm': 0.9684645533561707, 'learning_rate': 0.0002, 'epoch': 3.0}


  0%|          | 0/6 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Checkpoint destination directory ./bert_clf/checkpoint-138 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.41140833497047424, 'eval_f1': 0.03381642512077295, 'eval_runtime': 6.5779, 'eval_samples_per_second': 6.993, 'eval_steps_per_second': 0.912, 'epoch': 3.0}
{'loss': 0.3892, 'grad_norm': 0.7104480862617493, 'learning_rate': 0.0001, 'epoch': 4.0}


  0%|          | 0/6 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Checkpoint destination directory ./bert_clf/checkpoint-184 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.41655856370925903, 'eval_f1': 0.024691358024691357, 'eval_runtime': 7.1587, 'eval_samples_per_second': 6.426, 'eval_steps_per_second': 0.838, 'epoch': 4.0}
{'loss': 0.391, 'grad_norm': 0.937039852142334, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/6 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Checkpoint destination directory ./bert_clf/checkpoint-230 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.4114076495170593, 'eval_f1': 0.023391812865497075, 'eval_runtime': 6.8335, 'eval_samples_per_second': 6.731, 'eval_steps_per_second': 0.878, 'epoch': 5.0}
{'train_runtime': 560.8577, 'train_samples_per_second': 3.236, 'train_steps_per_second': 0.41, 'train_loss': 0.4028931078703507, 'epoch': 5.0}


TrainOutput(global_step=230, training_loss=0.4028931078703507, metrics={'train_runtime': 560.8577, 'train_samples_per_second': 3.236, 'train_steps_per_second': 0.41, 'train_loss': 0.4028931078703507, 'epoch': 5.0})

In [150]:
eva = trainer.evaluate(test_dataset)

f1_score = eva['eval_f1']

for k in eva:
    print(f"{k} : {eva[k]}")


  0%|          | 0/6 [00:00<?, ?it/s]

eval_loss : 0.36381930112838745
eval_f1 : 0.04861111111111111
eval_runtime : 7.7266
eval_samples_per_second : 5.824
eval_steps_per_second : 0.777
epoch : 5.0


In [151]:
# save trained model
model.save_pretrained(os.path.join(MODEL_DIR, f'trained_model(f1={f1_score:.4f})'))

# Send test

In [152]:
answer_dataset = get_test_dataset()
answer = trainer.predict(answer_dataset)
answer = (answer.predictions > 0.5).astype(int)

answer_df = pd.DataFrame(answer, columns=LABEL_LIST)
answer_df.index = [f"{'0'*(3-len(str(i)))}{str(i)}eval" for i in range(1, 152)]

answer_df

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

  0%|          | 0/19 [00:00<?, ?it/s]

Unnamed: 0,CE,ENV,BME,PE,METAL,ME,EE,CPE,OPTIC,NANO,CHE,MATENG,AGRI,EDU,IE,SAFETY,MATH,MATSCI
001eval,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
002eval,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
003eval,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
004eval,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
005eval,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147eval,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
148eval,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
149eval,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
150eval,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [153]:
answer_df.index.name = 'id'
answer_df.to_csv('submission.csv')

In [156]:
!kaggle competitions submit -c 2110446-data-science-2023-02 -f submission.csv -m "(michellejieli/NSFW_text_classifier) transfer learning"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████████████████████████████████| 6.57k/6.57k [00:01<00:00, 4.10kB/s]
Successfully submitted to 2110446 Data Science and Data Engineering Tools