In [None]:
!pip install transformers[sentencepiece]

In [None]:
!pip install datasets

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, precision_score, recall_score

import torch
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

from datasets import Dataset
from transformers import AdamW, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, get_scheduler, TrainingArguments, Trainer

In [None]:
# Load the training split and keep only the comment text and its `toxic` label,
# the two columns the rest of the notebook uses.
data_train = pd.read_csv('hw4_train.csv')
data_train = data_train[['comment_text', 'toxic']]

# Load the held-out split. The column selection must be taken from `data_test`
# itself: slicing `data_train` here would silently replace the test frame with
# the training frame, so evaluation and prediction would run on training rows.
# Assumes hw4_test.csv carries the same two columns -- TODO confirm.
data_test = pd.read_csv('hw4_test.csv')
data_test = data_test[['comment_text', 'toxic']]

In [None]:
# Preview the training frame; the notebook renders it as this cell's output.
data_train

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


In [None]:
# Wrap both pandas frames as `datasets.Dataset` objects (train first, then dev).
trainset, devset = (
    Dataset.from_pandas(frame) for frame in (data_train, data_test)
)

In [None]:
# Inspect the Dataset built from the training frame (its features and row count).
trainset

Dataset({
    features: ['comment_text', 'toxic'],
    num_rows: 159571
})

In [None]:
# This checkpoint ships without a maximum-length setting, so `model_max_length`
# is passed explicitly when the tokenizer is created.
checkpoint = "microsoft/MiniLM-L12-H384-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)


def tokenize_function(example):
    """Tokenize the ``comment_text`` entries of ``example`` with truncation enabled.

    Called by ``Dataset.map(..., batched=True)`` below, so ``example`` is a batch
    whose ``"comment_text"`` entry holds the raw comments.
    """
    comments = example["comment_text"]
    return tokenizer(comments, truncation=True)


def _tokenize_split(split):
    """Tokenize one split, drop the raw text column and rename ``toxic`` to ``labels``."""
    encoded = split.map(tokenize_function, batched=True)
    return encoded.remove_columns(["comment_text"]).rename_column("toxic", "labels")


tokenized_trainset = _tokenize_split(trainset)
tokenized_devset = _tokenize_split(devset)


Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/160 [00:00<?, ?ba/s]

In [None]:
# Check the tokenized training split's columns after the remove/rename steps.
tokenized_trainset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 159571
})

In [None]:
# Load the pretrained checkpoint with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Downloading:   0%|          | 0.00/127M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: 3 epochs, evaluating and saving a checkpoint at the
# end of every epoch, checkpoints written under `trainer_baseline`, and no
# external reporting integration.
args = TrainingArguments(
    output_dir='trainer_baseline',
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to="none",
)

# Bundle the model, the configuration, both tokenized splits and the tokenizer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_trainset,
    eval_dataset=tokenized_devset,
    tokenizer=tokenizer,
)

In [None]:
# Fine-tune the model as configured in `args`; the returned training summary is
# shown as this cell's output.
trainer.train()

***** Running training *****
  Num examples = 159571
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 59841


Epoch,Training Loss,Validation Loss
1,0.1696,0.199738
2,0.315,0.323234


***** Running Evaluation *****
  Num examples = 159571
  Batch size = 8
Saving model checkpoint to trainer_baseline/checkpoint-19947
Configuration saved in trainer_baseline/checkpoint-19947/config.json
Model weights saved in trainer_baseline/checkpoint-19947/pytorch_model.bin
tokenizer config file saved in trainer_baseline/checkpoint-19947/tokenizer_config.json
Special tokens file saved in trainer_baseline/checkpoint-19947/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 159571
  Batch size = 8
Saving model checkpoint to trainer_baseline/checkpoint-39894
Configuration saved in trainer_baseline/checkpoint-39894/config.json
Model weights saved in trainer_baseline/checkpoint-39894/pytorch_model.bin
tokenizer config file saved in trainer_baseline/checkpoint-39894/tokenizer_config.json
Special tokens file saved in trainer_baseline/checkpoint-39894/special_tokens_map.json


Epoch,Training Loss,Validation Loss
1,0.1696,0.199738
2,0.315,0.323234
3,0.182,0.173545


***** Running Evaluation *****
  Num examples = 159571
  Batch size = 8
Saving model checkpoint to trainer_baseline/checkpoint-59841
Configuration saved in trainer_baseline/checkpoint-59841/config.json
Model weights saved in trainer_baseline/checkpoint-59841/pytorch_model.bin
tokenizer config file saved in trainer_baseline/checkpoint-59841/tokenizer_config.json
Special tokens file saved in trainer_baseline/checkpoint-59841/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=59841, training_loss=0.2514570900483939, metrics={'train_runtime': 9613.8613, 'train_samples_per_second': 49.794, 'train_steps_per_second': 6.224, 'total_flos': 1.6597119519814596e+16, 'train_loss': 0.2514570900483939, 'epoch': 3.0})

In [None]:
# Run the fine-tuned model over `tokenized_devset` and keep the full prediction output.
predictions = trainer.predict(test_dataset=tokenized_devset)


***** Running Prediction *****
  Num examples = 159571
  Batch size = 8


In [None]:
# Predicted class per example: index of the largest score along axis 1.
y_pred = predictions.predictions.argmax(axis=1)

In [None]:
# Reference labels carried on the prediction output, used to score `y_pred`.
y_true = predictions.label_ids

In [None]:
# The sklearn metric functions are imported in the notebook's top import cell.
# Score the predictions twice: F1 with sklearn's default averaging, and the
# macro average across classes.
f1_default = f1_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')

print("f1_score", f1_default)
print("f1_score_macro", f1_macro)

f1_score 0.7501264371691561
f1_score_macro 0.8622628123431062
