In [1]:
!pip install torch pandas datasets scikit-learn
!pip install transformers[torch] -U



In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import pandas as pd
from datasets import load_dataset
from torch.utils.data.dataset import Dataset
from sklearn.model_selection import train_test_split
import torch

In [5]:
class TextClassifierDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

In [3]:
# https://medium.com/@abdurhmanfayad_73788/fine-tuning-bert-for-a-multi-label-classification-problem-on-colab-5ca5b8759f3f
df = pd.read_csv('./processed_data.csv', delimiter='|')
train_df, test_df = train_test_split(df, test_size=0.2)
print("Train dataset head:")
print(train_df)

Train dataset head:
                                                content            county  \
1469  new york coronavirus deaths surging new york g...           Bristol   
505   steamboat springs county officials echoed gov ...             Routt   
863   hanna city danley believes swimmingpool sales ...            Peoria   
1571  coronavirus stories provided free public servi...         Middlesex   
1126  ambitious billion paycheck protection program ...  East Baton Rouge   
...                                                 ...               ...   
1261  washington president trump friday abruptly fir...      Androscoggin   
1821  one county centrals knowledge bowl teams quali...           Jackson   
351   boulder quality biomedical inc ramped producti...        Broomfield   
880   chris cuomo says hes lost significant amount w...        Stephenson   
561   marcy shortuse move surprised many florida gov...               Lee   

              state                source  total_popula

In [6]:

columns = ["avg_white_pop_pct","avg_median_hh_inc","avg_non_college_pct"]
df_labels_train = train_df[columns]
df_labels_test = test_df[columns]

#convert to label lists
labels_list_train = df_labels_train.values.tolist()
labels_list_test = df_labels_test.values.tolist()

# set up our text inputs
train_texts = train_df['content'].tolist()
train_labels = labels_list_train

eval_texts = test_df['content'].tolist()
eval_labels = labels_list_test

print(train_labels)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

#TODO: increase max length when we use article content
train_encodings = tokenizer(train_texts, padding="max_length", truncation=True, max_length=64)
eval_encodings = tokenizer(eval_texts, padding="max_length", truncation=True, max_length=64)

print(train_encodings[0])

train_dataset = TextClassifierDataset(train_encodings, train_labels)
eval_dataset = TextClassifierDataset(eval_encodings, eval_labels)

[[1, 1, 1], [1, 1, 0], [0, 1, 0], [1, 1, 0], [0, 0, 0], [1, 1, 0], [1, 1, 1], [0, 0, 0], [1, 0, 1], [1, 1, 0], [1, 0, 1], [0, 0, 1], [1, 0, 1], [1, 1, 1], [0, 0, 1], [0, 0, 1], [1, 1, 0], [1, 0, 1], [1, 0, 1], [0, 0, 0], [0, 1, 1], [0, 0, 1], [0, 1, 0], [0, 1, 0], [1, 1, 1], [1, 1, 1], [1, 0, 1], [1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 1, 0], [0, 0, 1], [1, 0, 1], [1, 0, 1], [1, 1, 1], [1, 0, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 0], [0, 0, 1], [1, 1, 0], [1, 1, 0], [0, 0, 0], [0, 1, 1], [1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0], [0, 0, 1], [0, 1, 0], [1, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1], [1, 1, 1], [1, 0, 1], [1, 1, 0], [1, 0, 1], [0, 0, 1], [1, 1, 1], [1, 1, 0], [1, 1, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1], [1, 0, 1], [1, 0, 1], [1, 1, 0], [1, 1, 0], [1, 0, 1], [0, 1, 1], [0, 1, 1], [0, 1, 0], [0, 1, 0], [1, 1, 0], [1, 0, 1], [1, 0, 1], [0, 1, 0], [0, 1, 0], [0, 0, 1], [1, 1, 0], [0, 0, 1], [0, 0, 1], [0, 1, 0], [1, 1, 0], [0, 0, 1]

Collecting transformers[torch]
  Downloading transformers-4.39.2-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed transformers-4.39.2


In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=3
)

training_arguments = TrainingArguments(
    output_dir="./output",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4
)

trainer = Trainer(
    model = model,
    args = training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()
trainer.save_model(output_dir='./trained_bert')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,No log,0.546294
2,No log,0.513811
3,No log,0.533316
4,No log,0.552115


In [8]:
## Evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.5521154999732971, 'eval_runtime': 1.4365, 'eval_samples_per_second': 278.454, 'eval_steps_per_second': 17.403, 'epoch': 4.0}
