In [1]:
import os
import subprocess
import sys

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, TrainingArguments, Trainer

In [2]:
# Run configuration, overridable through environment variables.
#
#   DOG_HF_MODEL_ID  - Hugging Face model id to fine-tune; only honoured when it
#                      is longer than 4 characters, otherwise "roberta-base" is used.
#   SLURM_JOB_ID     - when it parses as an integer it is appended to the output
#                      name, so runs from different jobs get different names.
model_id = "roberta-base"
env_model_id = os.environ.get('DOG_HF_MODEL_ID', '')
if len(env_model_id) > 4:
    model_id = env_model_id

output_model_id = model_id+'-hc6model'
# Fall back instead of raising KeyError when USER is not exported
# (the value is only used to build the scratch directory path).
user_id = os.environ.get('USER') or os.environ.get('LOGNAME') or 'unknown'
slurm_job = -1

slurm_job_env = os.environ.get('SLURM_JOB_ID')
if slurm_job_env is not None:
    try:
        slurm_job = int(slurm_job_env)
    except ValueError:
        # Keep the defaults (slurm_job == -1, no suffix) but say so, instead of
        # hiding the problem behind a bare `except`.
        print(f"Ignoring non-integer SLURM_JOB_ID {slurm_job_env!r}", file=sys.stderr)
    else:
        output_model_id = model_id+'-hc6model-'+str(slurm_job)

In [3]:
# Show the resolved input model id and the derived output name, one after the other.
for resolved_name in (model_id, output_model_id):
    display(resolved_name)

'bert-base-multilingual-uncased'

'bert-base-multilingual-uncased-hc6model-23861808'

In [4]:
# Announce the run on both stdout and stderr.
# print() terminates each message with a newline (the raw write() calls ran the
# two messages together on one line) and returns None, so no stray character
# count is echoed as the cell's output.
for stream in (sys.stdout, sys.stderr):
    print(f"Running model {model_id}", file=stream)
    print(f"Output id {output_model_id}", file=stream)

Running model bert-base-multilingual-uncasedOutput id bert-base-multilingual-uncased-hc6model-23861808

Running model bert-base-multilingual-uncasedOutput id bert-base-multilingual-uncased-hc6model-23861808

58

In [5]:
# Move huggingface models to local scratch space.
# The directory is namespaced by user id and SLURM job id; slurm_job is -1 when
# no usable SLURM_JOB_ID was found in the configuration cell.
scratch_filedir = f"/state/partition1/user/{user_id}/{slurm_job}/huggingface-models"

def make_dir(dir_name):
    """Create ``dir_name`` (and any missing parents) if it does not exist.

    Calling it on an existing directory is a no-op.

    Raises:
        FileExistsError: if ``dir_name`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check: there is no window in
    # which another process can create the directory between check and creation,
    # and a regular file at that path is reported instead of silently accepted.
    os.makedirs(dir_name, exist_ok=True)

print (f"Create directory {scratch_filedir}")
make_dir(scratch_filedir)

# Copy the Hugging Face hub cache into the scratch directory.
# The command is passed as an argument list (no shell), so the interpolated path
# cannot be split or reinterpreted, and "~" is expanded explicitly. check=True
# raises CalledProcessError if rsync fails, rather than letting the later
# local_files_only model loads fail on a missing cache.
hub_cache_dir = os.path.expanduser("~/.cache/huggingface/hub/")
subprocess.run(["rsync", "-av", hub_cache_dir, scratch_filedir], check=True);

Create directory /state/partition1/user/tkoch/23861808/huggingface-models
sending incremental file list
./
version.txt
models--bert-base-cased/
models--bert-base-multilingual-uncased/
models--bert-base-multilingual-uncased/.no_exist/
models--bert-base-multilingual-uncased/.no_exist/3da6b6aad5111664db74322f2158b7f93e09a717/
models--bert-base-multilingual-uncased/.no_exist/3da6b6aad5111664db74322f2158b7f93e09a717/added_tokens.json
models--bert-base-multilingual-uncased/.no_exist/3da6b6aad5111664db74322f2158b7f93e09a717/special_tokens_map.json
models--bert-base-multilingual-uncased/blobs/
models--bert-base-multilingual-uncased/blobs/03c53303f0ef6535e93372a93be2db71ec46a1e3
models--bert-base-multilingual-uncased/blobs/23bc0b1c246323483af59827fab707d26b831456
models--bert-base-multilingual-uncased/blobs/28dbff2d6ef29e813e012041b13a9a9eb618ad21
models--bert-base-multilingual-uncased/blobs/a661b1a138dac6dc5590367402d100765010ffd6
models--bert-base-multilingual-uncased/blobs/b33adb2b700b7029a6

0

In [6]:
#Run in offline mode for SuperCloud.
# Both offline switches are set to "1", and both cache locations point at the
# scratch copy of the hub cache.
# NOTE(review): transformers and datasets are imported in the first cell, before
# these variables are set - confirm they are still honoured when set after import.
offline_settings = {
    "TRANSFORMERS_OFFLINE": "1",
    "HF_DATASETS_OFFLINE": "1",
    "TRANSFORMERS_CACHE": scratch_filedir,
    "HF_HOME": scratch_filedir,
}
os.environ.update(offline_settings)

In [7]:
# Load dataset
TEXT_COL = 'PRODUCT DESCRIPTION'
dtypes = {'PRODUCT DESCRIPTION' : 'str', 'PRODUCT DESCRIPTION_ASCII' : 'str'}
label_col = 'HS CODE6'


def load_split(csv_path):
    """Read one split CSV and return its usable rows with an integer ``label`` column.

    Only the description and HS-code columns are kept. Rows whose description is
    2 characters or shorter are dropped, and ``label`` is the HS code cast to int.
    """
    frame = pd.read_csv(csv_path, dtype=dtypes)[[TEXT_COL, label_col]]
    # Rows whose length is NaN also fail the `> 2` comparison and are dropped.
    # .copy() makes the filtered frame independent, so adding a column below does
    # not write into a slice of the raw frame.
    frame = frame[frame[TEXT_COL].str.len() > 2].copy()
    frame['label'] = frame[label_col].astype(int)
    return frame


def encode_labels(frame):
    """Return ``frame`` with its raw HS codes in ``label`` replaced by class ids."""
    # Series.map is an exact per-value lookup; DataFrame.replace with a dict of
    # thousands of entries is far slower and can misbehave if a class id ever
    # coincides with a raw code.
    encoded = frame['label'].map(label2id)
    assert encoded.notna().all(), "found a label without a class id"
    return frame.assign(label=encoded.astype(int))


#Load to pandas
train_df = load_split('input/hc_codes_train_IND.csv.gz')
test_df = load_split('input/hc_codes_test_IND.csv.gz')
val_df = load_split('input/hc_codes_valid_IND.csv.gz')
max_desc_length = train_df[TEXT_COL].str.len().max()

# Collect the codes of all three splits so that every code gets a class id.
labels = set()
for split_df in (train_df, test_df, val_df):
    labels.update(split_df['label'].unique())

# We will need this to directly output the class names when using the pipeline without mapping the labels later.
# Sorting makes the id assignment reproducible; iterating the set directly gave
# an order that depended on how the set happened to be built.
class_names = sorted(labels)
num_labels = len(class_names)
print(f"number of labels: {num_labels}")

# Create the id2label mapping and its inverse
id2label = {i: int(label) for i, label in enumerate(class_names)}
label2id = {label : idx for idx, label in id2label.items()}
print ('Map labels to ids')
train_df = encode_labels(train_df)
test_df = encode_labels(test_df)
val_df = encode_labels(val_df)

#Load into HF
print ('Load to HF')
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

number of labels: 5612
Map labels to ids
Load to HF


In [8]:
# Tokenizer length limit, checked against the longest training description.
# NOTE(review): max_desc_length is measured in characters while max_text_length
# is passed to the tokenizer as max_length - confirm this is the intended check.
max_text_length = 256
print (f"Max length of text {max_desc_length}")
assert 0 < max_desc_length < max_text_length - 1

Max length of text 254


In [9]:
# Preprocessing
# Load the tokenizer from the scratch copy of the hub cache only (no downloads).
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    local_files_only=True,
    cache_dir=scratch_filedir,
)
if tokenizer.pad_token is None:
    if model_id == 'gpt2':
        # NOTE(review): this adds a new token to the tokenizer; nothing visible in
        # this notebook resizes the model's embeddings to match - confirm before
        # relying on the gpt2 path.
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize(batch):
    """Tokenize the ``PRODUCT DESCRIPTION`` entries of ``batch``.

    Padding and truncation are enabled, with ``max_text_length`` as the maximum
    length, and the tokenizer's output for the batch is returned unchanged.
    """
    descriptions = batch["PRODUCT DESCRIPTION"]
    encoding_options = dict(padding=True, truncation=True, max_length=max_text_length)
    return tokenizer(descriptions, **encoding_options)

def tokenize_whole_split(split):
    """Run ``tokenize`` over ``split`` with the whole split passed as one batch."""
    return split.map(tokenize, batched=True, batch_size=len(split))


print ('Tokenize training')
train_dataset = tokenize_whole_split(train_dataset)
print ('Tokenize validation')
val_dataset = tokenize_whole_split(val_dataset)
print ('Tokenize test')
test_dataset = tokenize_whole_split(test_dataset)

Tokenize training


Map:   0%|          | 0/844939 [00:00<?, ? examples/s]

Tokenize validation


Map:   0%|          | 0/279776 [00:00<?, ? examples/s]

Tokenize test


Map:   0%|          | 0/279776 [00:00<?, ? examples/s]

In [10]:
# Set dataset format
# Each split is switched to torch output and restricted to the same three columns.
TORCH_COLUMNS = ("input_ids", "attention_mask", "label")
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format("torch", columns=list(TORCH_COLUMNS))

In [11]:
# Load the model configuration and attach the label mappings.
# The config is loaded the same way as the tokenizer and the model: from the
# scratch cache only, without any network access.
config = AutoConfig.from_pretrained(model_id, local_files_only=True, cache_dir=scratch_filedir)
# Set both directions together, so that label2id does not keep stale default
# entries that no longer match id2label.
config.update({"id2label": id2label, "label2id": label2id})

In [12]:
# Model
# Loaded from the scratch cache only, with the config that carries the label mapping.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    config=config,
    local_files_only=True,
    cache_dir=scratch_filedir,
)

# Hyperparameters
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
# Model ids containing one of these substrings train for 10 epochs, all others for 5.
TEN_EPOCH_MODEL_TAGS = ('roberta', 'bert-base-multilingual')
N_EPOCHS = 10 if any(tag in model_id for tag in TEN_EPOCH_MODEL_TAGS) else 5

# TrainingArguments
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir=output_model_id,
    logging_dir=f"{output_model_id}/logs",
    report_to="tensorboard",
    # schedule and batch sizes
    num_train_epochs=N_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    # optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    # log every 10 steps; evaluate and save once per epoch
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Fine-tune the model; the training summary is kept and shown as the cell's output.
train_output = trainer.train()
train_output



Epoch,Training Loss,Validation Loss
1,1.3153,1.23546
2,1.1628,0.958714
3,0.5926,0.836108
4,0.6114,0.761451
5,0.5055,0.721299
6,0.3455,0.700863
7,0.2144,0.696945
8,0.2466,0.699807
9,0.2556,0.709376
10,0.2032,0.714095




TrainOutput(global_step=264050, training_loss=0.6588043315347183, metrics={'train_runtime': 88326.5064, 'train_samples_per_second': 95.661, 'train_steps_per_second': 2.989, 'total_flos': 5.837766774111744e+17, 'train_loss': 0.6588043315347183, 'epoch': 10.0})

In [14]:
# Evaluate the model on the Trainer's eval_dataset; the metrics are kept and
# shown as the cell's output.
eval_metrics = trainer.evaluate()
eval_metrics



{'eval_loss': 0.6969451308250427,
 'eval_runtime': 1365.8312,
 'eval_samples_per_second': 204.839,
 'eval_steps_per_second': 12.802,
 'epoch': 10.0}

In [15]:
#Save model
final_model_dir = output_model_id+'-model'
trainer.save_model(final_model_dir)
# The Trainer was not given the tokenizer, so save it explicitly next to the
# model; the directory can then be loaded without a separate tokenizer source.
tokenizer.save_pretrained(final_model_dir);

In [16]:
# Score the fine-tuned model on the held-out test split.
from sklearn.metrics import f1_score, accuracy_score

test_predictions = trainer.predict(test_dataset)

# Predicted class id = index of the largest logit for each example.
predictions = test_predictions.predictions.argmax(axis=-1)
# Labels returned by predict() alongside the predictions.
true_labels = test_predictions.label_ids

# Results get their own names, so the imported f1_score function is not
# overwritten by a float.
test_f1 = f1_score(true_labels, predictions, average="weighted")
accuracy = accuracy_score(true_labels, predictions)

print(f"F1 score: {test_f1:.3f}")
print(f"Accuracy: {accuracy:.3f}")

F1 score: 0.867
Accuracy: 0.871
