In [13]:
import pandas as pd

# This creates a very simple dataset where a description maps to a price.
GENIUS_MARKER = "Genius discount"  # when present, the first line is treated as discount info
MAX_DESCRIPTION_WORDS = 50  # keep at most this many space-separated tokens per description


def _clean_description(description):
    """Flatten a raw description onto one line and cap its length.

    Args:
        description: Raw description text; may span several lines.

    Returns:
        The description with newlines replaced by single spaces, with its
        first line removed when the text contains GENIUS_MARKER, truncated to
        the first MAX_DESCRIPTION_WORDS space-separated tokens.
    """
    lines = description.split("\n")
    if GENIUS_MARKER in description:
        # Remove the Genius discount information by dropping the first line.
        lines = lines[1:]
    flattened = " ".join(lines)
    # Split on a single space (not arbitrary whitespace) so that consecutive
    # spaces produce empty tokens which count toward the limit.
    return " ".join(flattened.split(" ")[:MAX_DESCRIPTION_WORDS])


df = pd.read_csv("data.csv")
# .copy() makes the two-column frame independent of the full CSV frame, so the
# column assignment below cannot raise SettingWithCopyWarning.
df = df[["description", "price"]].copy()

# A single Series.map pass replaces the two label-based row loops; it also no
# longer requires the index labels to be exactly 0..n-1.
df["description"] = df["description"].map(_clean_description)

df = df.rename(columns={"description": "text", "price": "labels"})  # necessary for fine-tuning

df

Unnamed: 0,text,labels
0,"The State Hotel features a fitness center, ter...",6.212606
1,Hotel Theodore is a nonsmoking hotel located i...,5.826000
2,"Located in Central Seattle, Kimpton Hotel Mona...",6.084499
3,Less than 10 minutes’ drive from the Space Nee...,6.061457
4,"Attractively set in Seattle, citizenM Seattle ...",5.937536
...,...,...
3054,"Located in Apopka, Florida, this hotel offers ...",5.062595
3055,"Featuring free WiFi, Hyatt Place Orlando Lake ...",5.446737
3056,"Only 1 mi from Universal Studios Florida™, thi...",5.187386
3057,"Experience all of the comforts of home, includ...",5.327876


In [14]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from datasets import Dataset

# Fine-tuning recipe adapted from:
# https://predictivehacks.com/how-to-fine-tune-an-nlp-regression-model-with-transformers-and-huggingface/

# Checkpoint name shared by the tokenizer and the model so the two stay in sync.
MODEL_CHECKPOINT = "distilbert-base-uncased"

# Wrap the pandas dataframe in a Dataset (dropping the index), then split it
# 75% / 25% into "train" and "test" with a fixed seed.
full_dataset = Dataset.from_pandas(df, preserve_index=False)
dataset = full_dataset.train_test_split(test_size=0.25, seed=0)

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: A batch from the dataset; only its "text" entry is read.

    Returns:
        The tokenizer output for the batch, produced with
        padding="max_length" and truncation=True.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


# Apply the tokenizer to both splits, one batch at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Pre-trained model from Hugging Face; num_labels=1 is used here for regression.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=1,
)

Map:   0%|          | 0/2294 [00:00<?, ? examples/s]

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

In [15]:
from sklearn.metrics import mean_squared_error
from transformers import TrainingArguments, Trainer

def compute_metrics(eval_pred):
    """Compute the root mean squared error for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into (predictions, labels).

    Returns:
        A dict with a single key, "rmse", holding the root mean squared error
        of the predictions against the labels.
    """
    predictions, labels = eval_pred
    # Take the square root of the plain MSE instead of passing squared=False:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form works on every version.
    rmse = mean_squared_error(labels, predictions) ** 0.5
    return {"rmse": rmse}

# The same batch size is used for training and evaluation.
BATCH_SIZE = 16
NUM_EPOCHS = 3

training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Log and evaluate once per epoch.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    # Checkpoint saving is switched off.
    # NOTE(review): save_total_limit presumably has no effect while
    # save_strategy is "no" -- confirm before relying on it.
    save_strategy="no",
    save_total_limit=2,
    load_best_model_at_end=False,
)

# Train on the "train" split and report compute_metrics on the "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 