In [2]:
import pandas as pd
import numpy as np
data = pd.read_json('data/dataset_1.json')

data["label_bin"] = np.where(data["label"] > 1, 1, 0)
data["label_n"] = data["label"] - 1
data["display_text"] = [d[1]['text'][d[1]['displayTextRangeStart']: d[1]['getDisplayTextRangeEnd']] for d in data[["text","displayTextRangeStart", "getDisplayTextRangeEnd"]].iterrows()]
print("max text length", len(data.iloc[np.argmax(data['text'].to_numpy())]['text']))
max_display_text_length = len(data.iloc[np.argmax(data['display_text'].to_numpy())]['display_text'])
print("max display text length", max_display_text_length)
data
X = data.display_text.to_list()
y = data.label_n.to_list()

max text length 286
max display text length 270


In [14]:
#select model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import EarlyStoppingCallback

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])
    
# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
    
def train_model(X, y, validation_split_ratio=0.2, batch_size=16, model_name="bert-base-uncased", random_seed=1337):
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=4)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=validation_split_ratio, random_state=random_seed, shuffle=True)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_split_ratio * len(X) / len(X_train), random_state=random_seed, shuffle=True)

    X_train_trans = tokenizer(X, truncation=True, padding=True, max_length=512)
    X_test_trans = tokenizer(X_test, truncation=True, padding=True, max_length=512)
    X_val_trans = tokenizer(X_val, truncation=True, padding=True, max_length=512)

    # X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
    X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)



    # Define Trainer
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        seed=0,
        load_best_model_at_end=True,
    )
        
    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # Train pre-trained model
    trainer.train()

    # ----- 3. Predict -----#
    # Load test data
    # test_data = pd.read_csv("test.csv")
    # X_test = list(test_data["review"])
    X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

    # Create torch dataset
    test_dataset = Dataset(X_test_tokenized)

    # # Load trained model
    # model_path = "./output/checkpoint-50000"
    # model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

    # Make prediction
    raw_pred, _, _ = trainer.predict(test_dataset)

    # Preprocess raw predictions
    y_pred = np.argmax(raw_pred, axis=1)
    return trainer

In [15]:
train_model(X,y)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.
You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['bert.encoder.layer.6.intermediate.dense.weight', 'bert.encoder.layer.2.attention.self.value.weight', 'bert.embeddings.word_embeddings.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.6.output.LayerNorm.bias', 'bert.encoder.layer.5.attention.self.key.weight', 'bert.encoder.layer.8.attention.self.query.bias', 'bert.encoder.layer.4.attention.self.value.weight', 'bert.encoder.layer.1.attention.self

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['embeddings.word_embeddings.weight', 'transformer.layer.11.attention.k_lin.bias', 'transformer.layer.6.ffn.lin1.weight', 'transformer.layer.11.ffn.lin2.weight', 'transformer.layer.1.attention.q_lin.weight', 'transformer.layer.6.attention.k_lin.weight', 'pre_classifier.weight', 'transformer.layer.8.attention.k_lin.bias', 'transformer.layer.3.attention.v_lin.bias', 'transformer.layer.8.attention.out_lin.weight', 'transformer.layer.3.attention.out_lin.weight', 'transformer.layer.9.attention.q_lin.weight', 'transformer.layer.11.ffn.lin1.bias', 'transformer.layer.3.attention.q_lin.bias', 'transformer.layer.5.output_layer_norm.bias', 'transformer.layer.5.attention.k_lin.bias', 'transformer.layer.6.ffn.lin2.weight', 'transformer.layer.9.output_layer_norm.weight', 'transformer.layer.10.attention.k_lin.bias', 'transformer.layer.6.attention.q_lin.wei

***** Running training *****
  Num examples = 3915
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 245


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 1305
  Batch size = 16


<transformers.trainer.Trainer at 0x7f6e7bd87490>

In [42]:
def get_sbert_centroid_args(sentences, num_labels:int, bert_model_name='all-distilroberta-v1', verbose=False):
    from sentence_transformers import SentenceTransformer, util
    import torch
    model = SentenceTransformer(bert_model_name)
    model.max_seq_length = np.argmax(sentences)
    
    embedding_list = model.encode(sentences, show_progress_bar=verbose)
    from sklearn.cluster import KMeans
    clustering_model = KMeans(n_clusters=num_labels, random_state=1337) 
    clustering_model.fit(embedding_list)
    cluster_assignment = clustering_model.labels_
    clustered_sentences = {}
    for sentence_id, cluster_id in enumerate(cluster_assignment):
        if cluster_id not in clustered_sentences:
            clustered_sentences[cluster_id] = []

        clustered_sentences[cluster_id].append(sentence_id)

    centroids = []
    for i in range(len(clustering_model.cluster_centers_)):
        center = clustering_model.cluster_centers_[i]
        # get centroid arg for cluster by min euclidian distance from cluster center
        centroid_arg = clustered_sentences[i][np.argmin([np.linalg.norm(embedding_list[cluster_item_arg]-center) for cluster_item_arg in clustered_sentences[i]])]
        centroids.append(centroid_arg)
    return centroids

In [43]:
centroid_args = get_sbert_centroid_args(sentences=X, num_labels=30)
centroid_args.sort()
centroid_args

[90,
 184,
 355,
 384,
 425,
 503,
 1197,
 1557,
 2011,
 2539,
 2732,
 2902,
 3012,
 3050,
 3237,
 3253,
 3328,
 3411,
 3523,
 3703,
 4064,
 4117,
 4249,
 4325,
 4629,
 5030,
 5397,
 5749,
 5850,
 6173]

In [None]:
def get_