In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory holding the raw corpus and the derived train/test CSV files.
data_dir = "./data"
# exist_ok=True makes this a no-op when the directory already exists, so no
# separate (and race-prone) os.path.exists check is needed.
os.makedirs(data_dir, exist_ok=True)

# messages.csv must provide a "Message" column (text) and an "Aggressive"
# column (label); both are read just below.
speech_df = pd.read_csv(os.path.join(data_dir, "messages.csv"))

messages = speech_df["Message"].tolist()
labels = speech_df["Aggressive"].tolist()

# Stage 1: split the corpus 50/50 into a fine-tuning half and a held-out half,
# stratified so both halves keep the same label proportions.
train_messages, test_messages, train_labels, test_labels = train_test_split(
    messages,
    labels,
    test_size=0.5,
    random_state=256,
    stratify=labels,
)

train_df = pd.DataFrame({"Message": train_messages, "Aggressive": train_labels})
train_df.to_csv(os.path.join(data_dir, "train.csv"), index=False)

# Stage 2: keep only 10% of the held-out half as the final test set; the other
# 90% (the "test" side of this second split) is discarded.
# NOTE(review): presumably done to keep evaluation time manageable -- confirm.
test_messages, _, test_labels, _ = train_test_split(
    test_messages,
    test_labels,
    test_size=0.9,
    random_state=256,
    stratify=test_labels,
)

test_df = pd.DataFrame({"Message": test_messages, "Aggressive": test_labels})
test_df.to_csv(os.path.join(data_dir, "test.csv"), index=False)

In [11]:
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import accuracy_score

# One seed shared by every random number generator the notebook relies on
# (Python's `random`, NumPy's legacy global RNG, and PyTorch).
seed = 256
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(seed)

# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this
# process (and anything it spawns).
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [6]:
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification 
from transformers import TrainingArguments, Trainer
import evaluate

# Identifier of the base checkpoint; used to load the tokenizer and the
# untuned baseline model elsewhere in this notebook.
MODEL_NAME = "distilbert/distilbert-base-uncased"

In [5]:
# Wrap the fine-tuning half in a Hugging Face Dataset with "text" and "label"
# columns.
dataset = Dataset.from_dict({"text": train_messages, "label": train_labels})

# Hold out 20% of the fine-tuning data for evaluation during training.
# The result is deliberately NOT called `train_test_split`: that name belongs
# to the scikit-learn function imported at the top of the notebook, and
# rebinding it makes the data-split cell fail when re-run in the same kernel.
# The notebook-wide seed is passed explicitly so the shuffle is reproducible
# regardless of how much global RNG state has been consumed.
dataset_splits = dataset.train_test_split(test_size=0.2, seed=seed)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: Batch mapping passed by `Dataset.map(..., batched=True)`;
            only its "text" entry is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length. No padding is applied here.
    """
    return tokenizer(examples["text"], truncation=True)


tokenized_data = dataset_splits.map(preprocess_function, batched=True)
# Expose only the model inputs and the label, as torch tensors.
tokenized_data.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Padding is deferred to batch-collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Accuracy metric from the `evaluate` library, loaded once and reused by
# compute_metrics.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy.

    Args:
        eval_pred: Pair that unpacks into (model scores, reference labels).

    Returns:
        Whatever `accuracy.compute` returns for the argmax-over-axis-1
        predictions against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

Map:   0%|          | 0/51375 [00:00<?, ? examples/s]

Map:   0%|          | 0/12844 [00:00<?, ? examples/s]

In [50]:
# Directory where the Trainer writes checkpoints.
model_dir = "./model"
# Create the *model* directory. The previous guard tested and created
# `data_dir` by mistake, so `model_dir` was never prepared here.
os.makedirs(model_dir, exist_ok=True)

# Class-index <-> human-readable name mappings stored in the model config.
id2label = {0: "NON_AGGRESSIVE", 1: "AGGRESSIVE"}
# Derived from id2label so the two mappings cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

# Load the base checkpoint with a 2-class sequence-classification head.
# MODEL_NAME is reused so the model always matches the tokenizer's checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

training_args = TrainingArguments(
    output_dir=model_dir,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule (once per epoch), which is
    # what load_best_model_at_end needs in order to pick a checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


loading configuration file https://huggingface.co/distilbert/distilbert-base-uncased/resolve/main/config.json from cache at /Users/kevkev/.cache/huggingface/transformers/9156cd487ebc07b22755262799b39fcdc0d5ae65bb62a1c8dc21ebe3f74bbf58.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert/distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "NON_AGGRESSIVE",
    "1": "AGGRESSIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "AGGRESSIVE": 1,
    "NON_AGGRESSIVE": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

  0%|          | 0/3211 [00:00<?, ?it/s]

{'loss': 0.3551, 'learning_rate': 8.44285269386484e-06, 'epoch': 0.16}
{'loss': 0.2868, 'learning_rate': 6.88570538772968e-06, 'epoch': 0.31}
{'loss': 0.2744, 'learning_rate': 5.328558081594519e-06, 'epoch': 0.47}
{'loss': 0.2633, 'learning_rate': 3.771410775459359e-06, 'epoch': 0.62}
{'loss': 0.2511, 'learning_rate': 2.214263469324198e-06, 'epoch': 0.78}
{'loss': 0.2453, 'learning_rate': 6.571161631890377e-07, 'epoch': 0.93}


***** Running Evaluation *****
  Num examples = 12844
  Batch size = 16


  0%|          | 0/803 [00:00<?, ?it/s]

Saving model checkpoint to ./model/checkpoint-3211
Configuration saved in ./model/checkpoint-3211/config.json


{'eval_loss': 0.24636918306350708, 'eval_accuracy': 0.8992525692930551, 'eval_runtime': 534.9396, 'eval_samples_per_second': 24.01, 'eval_steps_per_second': 1.501, 'epoch': 1.0}


Model weights saved in ./model/checkpoint-3211/pytorch_model.bin
tokenizer config file saved in ./model/checkpoint-3211/tokenizer_config.json
Special tokens file saved in ./model/checkpoint-3211/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./model/checkpoint-3211 (score: 0.24636918306350708).


{'train_runtime': 13143.645, 'train_samples_per_second': 3.909, 'train_steps_per_second': 0.244, 'train_loss': 0.27652568811198625, 'epoch': 1.0}


  optimizer_kwargs.update({"scale_parameter": False, "relative_step": False})


TrainOutput(global_step=3211, training_loss=0.27652568811198625, metrics={'train_runtime': 13143.645, 'train_samples_per_second': 3.909, 'train_steps_per_second': 0.244, 'train_loss': 0.27652568811198625, 'epoch': 1.0})

In [18]:
import json

# Checkpoint written by the training run recorded in this notebook.
FINE_TUNED_CHECKPOINT = "./model/checkpoint-3211"
EVAL_BATCH_SIZE = 16


def predict_labels(model, model_tokenizer, texts, batch_size=EVAL_BATCH_SIZE):
    """Predict a class index for every text using `model`.

    Texts are tokenized one batch at a time, so each batch is padded only to
    its own longest sequence instead of the longest sequence in the whole set.
    Padded positions are excluded through the attention mask, so this is
    intended to save compute without changing predictions (beyond
    floating-point noise).

    Args:
        model: Sequence-classification model whose output exposes `.logits`.
        model_tokenizer: Tokenizer matching `model`.
        texts: List of input strings.
        batch_size: Number of texts per forward pass.

    Returns:
        1-D torch tensor of predicted class indices, one per input text
        (empty long tensor when `texts` is empty).
    """
    # Make sure dropout is disabled even if a training-mode model is passed in.
    model.eval()
    batch_logits = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            batch_inputs = model_tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            batch_logits.append(model(**batch_inputs).logits)
    if not batch_logits:
        # torch.cat rejects an empty list; return an empty prediction vector.
        return torch.empty(0, dtype=torch.long)
    return torch.argmax(torch.cat(batch_logits, dim=0), dim=-1)


# Fine-tuned model. Its tokenizer is loaded into a local argument rather than
# the global `tokenizer`, so the training cell's state is not overwritten.
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(FINE_TUNED_CHECKPOINT)
fine_tuned_pred = predict_labels(
    fine_tuned_model,
    AutoTokenizer.from_pretrained(FINE_TUNED_CHECKPOINT),
    test_messages,
)

print(f'Fine-tuned Accuracy: {accuracy_score(test_labels, fine_tuned_pred)}')

# Baseline: the base checkpoint with a classification head that has NOT been
# trained (transformers warns that the classifier weights are newly
# initialized), so its accuracy reflects a random head, not learned behavior.
pre_trained_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
pre_trained_pred = predict_labels(
    pre_trained_model,
    AutoTokenizer.from_pretrained(MODEL_NAME),
    test_messages,
)

print(f'Pre-trained Accuracy: {accuracy_score(test_labels, pre_trained_pred)}')

# Persist both prediction vectors as JSON lists.
res_dir = "./results"
os.makedirs(res_dir, exist_ok=True)

for file_name, predictions in (
    ("fine_tuned_pred.json", fine_tuned_pred),
    ("pre_trained_pred.json", pre_trained_pred),
):
    with open(os.path.join(res_dir, file_name), "w") as f:
        json.dump(predictions.tolist(), f)


100%|██████████| 402/402 [06:10<00:00,  1.09it/s]


Fine-tuned Accuracy: 0.8993926179722784


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 402/402 [06:10<00:00,  1.08it/s]

Pre-trained Accuracy: 0.43186419560816075



