<a href="https://colab.research.google.com/github/irisawa/fy21compe/blob/main/DistilBERT_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DistilBERT

HuggingFaceのtransformersを使って、Multiclass text classificationに対するDistilBertのファインチューニングを行う。

https://www.sunnyville.ai/fine-tuning-distilbert-multi-class-text-classification-using-transformers-and-tensorflow/

# Importing Libraries

In [None]:
!pip install transformers --q

[K     |████████████████████████████████| 3.1 MB 4.0 MB/s 
[K     |████████████████████████████████| 895 kB 48.0 MB/s 
[K     |████████████████████████████████| 596 kB 40.8 MB/s 
[K     |████████████████████████████████| 56 kB 4.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 34.3 MB/s 
[?25h

In [None]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
import tensorflow as tf
import pandas as pd
import json
import gc

In [None]:
import pandas as pd
import numpy as np

# Loading the Data


In [None]:
# Read the tab-separated training file and preview the first rows.
# NOTE(review): no cell in this notebook mounts Drive — presumably it is
# mounted at /content/drive before running; confirm.
df = pd.read_csv("/content/drive/MyDrive/data/training.tsv", sep="\t")
df.head()

Unnamed: 0,sid,sentence,html_id,label
0,tr-01-0000,PART I,Form10k_01,0
1,tr-01-0001,The “Business” section and other parts of this...,Form10k_01,0
2,tr-01-0002,"Statements that are not historical facts, inc...",Form10k_01,0
3,tr-01-0003,Our actual results may differ materially from...,Form10k_01,0
4,tr-01-0004,Factors that could cause such differences inc...,Form10k_01,0


In [None]:
# Balance the classes by random undersampling: every label keeps exactly as
# many rows as the rarest label has.
# Selecting the four source columns first keeps this cell re-runnable even
# after `df` has gained the one-hot columns added at the bottom.
source_df = df[['sid', 'sentence', 'html_id', 'label']]
minimum_num = source_df['label'].value_counts().min()
dfs = [d.sample(minimum_num, random_state=0) for _, d in source_df.groupby('label')]

# Shuffle the balanced rows. reset_index gives a clean 0..n-1 index without a
# NumPy round-trip, which would turn every column (including `label`) into
# object dtype.
under_resampled_df = (
    pd.concat(dfs)
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

# One-hot encode the label. get_dummies orders its columns by label value, so
# the names are assigned to labels 0, 1, 2, 3 in that order.
# set_axis raises if the data does not contain exactly four distinct labels.
label_dummies = pd.get_dummies(under_resampled_df['label']).set_axis(
    ["Other", "Green", "Environmental", "Social"], axis=1
)
df = pd.concat([under_resampled_df, label_dummies], axis=1)
df.head()

Unnamed: 0,sid,sentence,html_id,label,Other,Green,Environmental,Social
0,tr-60-0187,The CCPA provides for civil penalties for viol...,Form10k_60,3,0,0,0,1
1,tr-55-0457,"Further, regulation of GHG emissions may limit...",Form10k_55,1,0,1,0,0
2,tr-63-0077,Consumers increasingly demand computing device...,Form10k_63,1,0,1,0,0
3,tr-30-0815,Despite our agreement to obtain an opinion fro...,Form10k_30,0,1,0,0,0
4,tr-15-0700,"Effective January 1, 2018, the Federal Departm...",Form10k_15,1,0,1,0,0


In [None]:
# Model inputs (sentences) and their class labels, as plain Python lists.
data_texts = df["sentence"].tolist()   # features
data_labels = df["label"].tolist()     # labels

# Splitting data

In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all samples for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts,
    data_labels,
    test_size=0.2,
    random_state=0,
)

# Step 2: carve a further 1% out of the remaining training samples as a
# small test split used for spot-checking predictions.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.01,
    random_state=0,
)

# Tokenizing the text
## encode

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode the training and validation sentences with identical settings
# (truncation and padding enabled); each split is tokenized in its own call.
train_encodings, val_encodings = [
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts)
]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

# Creating a Dataset object

In [None]:
# Pair each split's encodings (converted to a plain dict of features) with its
# labels and wrap the pair in a tf.data.Dataset.
train_dataset, val_dataset = [
    tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    for encodings, labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
]

# Fine-tuning

In [None]:
# Pretrained DistilBERT checkpoint with a sequence-classification head for 4 labels.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=4,
)

# Adam with learning rate 5e-5; the loss is the one exposed by the model itself.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(
    optimizer=optimizer,
    loss=model.compute_loss,
    metrics=['accuracy'],
)

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

# Training

①

In [None]:
# Fine-tune for 3 epochs on shuffled mini-batches of 16.
# The batch size is set on the dataset itself via .batch(); Keras's
# `batch_size` argument must not be given for tf.data.Dataset inputs, so it
# is omitted here.
# The validation data is batched but not shuffled: sample order does not
# matter for evaluation.
model.fit(
    train_dataset.shuffle(1000).batch(16),
    epochs=3,
    validation_data=val_dataset.batch(16),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fd157a73a10>

②

TFTrainerクラスを使って、構成設定を定義し、モデルを構築する


In [None]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

# Configuration for the TFTrainer run, grouped by concern.
training_args = TFTrainingArguments(
    # where results and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # schedule: number of epochs, learning-rate warmup steps, evaluation interval
    num_train_epochs=3,
    warmup_steps=500,
    eval_steps=1000,
    # per-device batch sizes for training and evaluation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # strength of weight decay
    weight_decay=0.01,
)

# A second, freshly initialised model (separate from `model`) is created
# inside the strategy scope provided by the training arguments.
with training_args.strategy.scope():
    trainer_model = TFDistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=4,
    )

# Wire the model, the arguments and the train/validation datasets together.
trainer = TFTrainer(
    model=trainer_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'vocab_projector', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'pre_classifier', 'dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use i

# Training

In [None]:
# Run the TFTrainer training loop on `trainer_model` with the arguments defined above.
trainer.train()

# Evaluation metrics

In [None]:
# Evaluate the trainer's model on the dataset that was passed as `eval_dataset`.
trainer.evaluate()

{'eval_loss': 1.0915880997975667}

# Saving & Loading the model

## Saving the model

In [None]:
# NOTE(review): the path is absolute at the filesystem root, so writing there
# needs the corresponding permissions — adjust for other environments.
save_directory = "/saved_models" # change this to your preferred location

# NOTE(review): this saves `model` (the one trained with `model.fit`), not
# `trainer_model` from the TFTrainer run — confirm that is the intended model.
model.save_pretrained(save_directory)
# The tokenizer goes into the same directory so both can be reloaded from one place.
tokenizer.save_pretrained(save_directory)

('/saved_models/tokenizer_config.json',
 '/saved_models/special_tokens_map.json',
 '/saved_models/vocab.txt',
 '/saved_models/added_tokens.json')

## Loading the model

In [None]:
# Reload the tokenizer and the classification model from `save_directory`;
# all later predictions use these reloaded objects.
loaded_tokenizer = DistilBertTokenizer.from_pretrained(save_directory)
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

Some layers from the model checkpoint at /saved_models were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /saved_models and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Prediction test

In [None]:
# Pick one sentence from the small held-out test split to try a prediction on.
# The index 6 is hard-coded, so the split must contain at least 7 sentences.
test_text = test_texts[6]
test_text

'Commercial whole mortgage loans are also subject to special hazard risk and to bankruptcy risk.'

In [None]:
# Tokenize the single sentence into a TensorFlow tensor.
predict_input = loaded_tokenizer.encode(
    test_text,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

# The first element of the model output holds the per-class scores.
output = loaded_model(predict_input)[0]

# Index of the highest-scoring class for the only row in this batch.
prediction_value = tf.argmax(output, axis=1).numpy()[0]
prediction_value

3

# Prediction on the test data

In [None]:
# Read the tab-separated test file (no label column) and preview the first rows.
testdf = pd.read_csv("/content/drive/MyDrive/data/test.tsv", sep="\t")
testdf.head()

Unnamed: 0,sid,sentence,html_id
0,pb-09-0000,PART I,Form10k_09
1,pb-09-0001,"In this Annual Report on Form 10-K, unless oth...",Form10k_09
2,pb-09-0002,We use the term “Holding Company” to refer sol...,Form10k_09
3,pb-09-0003,Item 1.,Form10k_09
4,pb-09-0004,Business,Form10k_09


In [None]:
# Collect the sentences to classify as a plain Python list.
# Reading the column directly replaces a row-by-row iterrows() loop: it yields
# the same values in the same order, without building a Series per row and
# without leaving loop variables behind in the notebook namespace.
testdataset = testdf['sentence'].to_list()

# Spot-check one entry.
print(testdataset[2])

We use the term “Holding Company” to refer solely to Dime Community Bancshares, Inc. and not to our consolidated subsidiary.


In [None]:
# Classify the test sentences in mini-batches instead of running one forward
# pass per sentence. Calling the tokenizer on a list of sentences pads each
# batch to a common length and returns the matching attention mask; both are
# passed to the model so that padding tokens are masked out.
PREDICT_BATCH_SIZE = 32

predlist = []
for start in range(0, len(testdataset), PREDICT_BATCH_SIZE):
    batch_texts = testdataset[start:start + PREDICT_BATCH_SIZE]
    batch_inputs = loaded_tokenizer(batch_texts,
                                    truncation=True,
                                    padding=True,
                                    return_tensors="tf")
    # The first element of the model output holds the per-class scores,
    # one row per sentence in the batch.
    batch_logits = loaded_model(dict(batch_inputs))[0]
    # Keep `predlist` a flat list with one predicted class index per sentence.
    predlist.extend(tf.argmax(batch_logits, axis=1).numpy().tolist())

In [None]:
# Turn the list of predicted class indices into a 1-D NumPy array.
pred_x = np.asarray(predlist)

In [None]:
# Attach the predictions as an extra right-most column to the test table
# (converted to a NumPy array).
ans = np.concatenate((np.asanyarray(testdf), pred_x[:, np.newaxis]), axis=1)

# Show column 0 (sid) next to column 3 (the appended prediction).
ans[:, [0, 3]]

array([['pb-09-0000', 0],
       ['pb-09-0001', 0],
       ['pb-09-0002', 0],
       ...,
       ['pb-71-0430', 0],
       ['pb-71-0431', 0],
       ['pb-71-0432', 2]], dtype=object)

In [None]:
# Keep only the sentence id (column 0) and the prediction (column 3) for the output file.
df_f = pd.DataFrame(np.take(ans, [0, 3], axis=1))
df_f.head()

Unnamed: 0,0,1
0,pb-09-0000,0
1,pb-09-0001,0
2,pb-09-0002,0
3,pb-09-0003,0
4,pb-09-0004,0


In [None]:
# Write the (sid, prediction) pairs as a tab-separated file without header or index.
df_f.to_csv(
    '/content/drive/MyDrive/data/output_distil.tsv',
    sep='\t',
    header=False,
    index=False,
)

0.2416598