In [55]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline
from datasets import Dataset

# Load the preprocessed dataset into a Pandas DataFrame
df_preproc = pd.read_parquet('sentiment_data_preprocessed.parquet')
# df.rename(columns={'text_cleaned': 'text'}, inplace=True)

df_preproc['labels'] = df_preproc['labels'].apply(lambda n: n-1)
df_preproc = df_preproc.dropna()
# df = df.sample(frac=.1, random_state=42)

df_preproc.head()




# yelp_ds = load_dataset('csv', data_files= 'sentiment_data_preprocessed.csv')
# yelp_ds = yelp_ds['train'].train_test_split(test_size=0.2, seed=42)
# yelp_ds


Unnamed: 0,text,labels
0,love going happy hour dinner great patio fans ...,3
1,love brewpub variety dishes nachos pizza sandw...,3
2,brother came visit wanted pick six pack good b...,4
3,bit weary trying shellfish company wharf often...,4
4,could give zero would order plain hamburger re...,0


In [2]:
# df_sentiment = df[['text_cleaned', 'stars']]
# df_sentiment.rename(columns={'text_cleaned': 'text', 'stars':'labels'}, inplace=True)
# df_sentiment

In [3]:
dataset = Dataset.from_pandas(df)
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 415052
})

In [4]:
train_test = dataset.train_test_split(test_size=0.4, seed=42)
test_valid = dataset.train_test_split(test_size=0.5, seed=42)

In [5]:
from datasets import DatasetDict
dataset = DatasetDict({
    'train': train_test['train'],
    'validation': test_valid['train'],
    'test' : test_valid['test'],
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 249031
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 207526
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 207526
    })
})

In [6]:
# from datasets import load_dataset

# dataset = train_test_split(X, y, test_size=0.3, random_state=42)

# # Split the dataset into training, validation, and test sets
# X_train, X_test, y_train, y_test = 

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

# X_train, X_val, y_train, y_val = X_train.reset_index(drop=True), X_val.reset_index(drop=True), y_train.reset_index(drop=True), y_val.reset_index(drop=True)

In [7]:
dataset['train'][0]

{'text': 'coffee shop literally want go everyday staff super chill signature drinks unique',
 'labels': 4}

In [8]:
features = dataset['train'].features
features

{'text': Value(dtype='string', id=None),
 'labels': Value(dtype='int64', id=None)}

In [9]:
df = dataset['train'].to_pandas()

df['labels'].value_counts(normalize=True).sort_index()

0    0.119017
1    0.078609
2    0.106774
3    0.228233
4    0.467368
Name: labels, dtype: float64

In [10]:
df.dropna()


Unnamed: 0,text,labels
0,coffee shop literally want go everyday staff s...,4
1,place new owners fantastic flavored mini donut...,4
2,wow pretty good food worth wait calamari fresh...,3
3,great small italian restaurant gnocchi ai game...,4
4,come 10 torr regularly right around corner off...,3
...,...,...
249026,stopped new place monday evening maybe best ni...,1
249027,excellent pho spring rolls nice pho broth real...,4
249028,terrible service incorrect orders late deliver...,0
249029,havent bfast spot since kid far best breakfast...,4


In [11]:
from transformers import AutoTokenizer
model_ckpt = 'microsoft/MiniLM-L12-H384-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, device='cpu')

In [12]:
tokenizer(dataset['train']['text'][:1])

{'input_ids': [[101, 4157, 4497, 6719, 2215, 2175, 10126, 3095, 3565, 10720, 8085, 8974, 4310, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [13]:
def tokenize_text(examples):
    return tokenizer(examples['text'], truncation=True, max_length=512)

In [14]:
dataset = dataset.map(tokenize_text, batched=True)
dataset

Map:   0%|          | 0/249031 [00:00<?, ? examples/s]

Map:   0%|          | 0/207526 [00:00<?, ? examples/s]

Map:   0%|          | 0/207526 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 249031
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 207526
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 207526
    })
})

In [15]:
class_weights = (1-(df['labels'].value_counts(normalize=True).sort_index() / len(df))).values
class_weights

array([0.99999952, 0.99999968, 0.99999957, 0.99999908, 0.99999812])

In [16]:
import torch 
class_weights = torch.from_numpy(class_weights).float().to('cuda')
class_weights

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000], device='cuda:0')

In [17]:
from torch import nn
import torch
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get('logits')

        labels = inputs.get('labels')

        loss_func = nn.CrossEntropyLoss(weight=class_weights)

        loss=loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [18]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from sklearn.metrics import f1_score
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    return {'f1':f1}

In [20]:
from transformers import TrainingArguments

batch_size = 16

logging_steps = len(dataset['train']) // batch_size
output_dir = 'minilm-finetuned-yelp'
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=15,
                                  learning_rate=5e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  logging_steps=logging_steps,
                                  fp16=False,
                                  push_to_hub=True,
                                  save_steps=10000
                                  )



In [21]:
trainer= WeightedLossTrainer(model=model,
                             args=training_args,
                             compute_metrics=compute_metrics,
                             train_dataset=dataset['train'],
                             eval_dataset=dataset['validation'],
                             tokenizer=tokenizer,
                             )

Cloning https://huggingface.co/kaitou/minilm-finetuned-yelp into local empty directory.


In [29]:
# from huggingface_hub import notebook_login
# notebook_login()
!huggingface-cli login --token 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

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to C:\Users\}{\.cache\huggingface\token
Login successful


In [23]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 249031
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 233475
  Number of trainable parameters = 33361925


  0%|          | 0/233475 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Saving model checkpoint to minilm-finetuned-yelp\checkpoint-10000
Configuration saved in minilm-finetuned-yelp\checkpoint-10000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-10000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-10000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-10000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,

{'loss': 0.858, 'learning_rate': 4.666688082235786e-05, 'epoch': 1.0}


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.7553852200508118, 'eval_f1': 0.6473943599058882, 'eval_runtime': 427.6047, 'eval_samples_per_second': 485.322, 'eval_steps_per_second': 30.334, 'epoch': 1.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-20000
Configuration saved in minilm-finetuned-yelp\checkpoint-20000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-20000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-20000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-20000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (2) will be pushed upstream.
Saving model checkpoint to minilm-finetuned-yelp\checkpoint-30000
Configuration saved in minilm-finetuned-yelp\checkpoint-30000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-30000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-30000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-30000\special_tokens_map.js

{'loss': 0.7528, 'learning_rate': 4.333376164471571e-05, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.6906121969223022, 'eval_f1': 0.6960419654441062, 'eval_runtime': 426.8115, 'eval_samples_per_second': 486.224, 'eval_steps_per_second': 30.39, 'epoch': 2.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-40000
Configuration saved in minilm-finetuned-yelp\checkpoint-40000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-40000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-40000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-40000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (4) will be pushed upstream.


{'loss': 0.7071, 'learning_rate': 4.0000642467073563e-05, 'epoch': 3.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.6578182578086853, 'eval_f1': 0.7272650752141089, 'eval_runtime': 427.0929, 'eval_samples_per_second': 485.904, 'eval_steps_per_second': 30.37, 'epoch': 3.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-50000
Configuration saved in minilm-finetuned-yelp\checkpoint-50000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-50000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-50000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-50000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (5) will be pushed upstream.
Saving model checkpoint to minilm-finetuned-yelp\checkpoint-60000
Configuration saved in minilm-finetuned-yelp\checkpoint-60000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-60000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-60000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-60000\special_tokens_map.js

{'loss': 0.6654, 'learning_rate': 3.666752328943141e-05, 'epoch': 4.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.5850256085395813, 'eval_f1': 0.7584655738726847, 'eval_runtime': 431.8394, 'eval_samples_per_second': 480.563, 'eval_steps_per_second': 30.037, 'epoch': 4.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-70000
Configuration saved in minilm-finetuned-yelp\checkpoint-70000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-70000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-70000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-70000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (7) will be pushed upstream.


{'loss': 0.6229, 'learning_rate': 3.3334404111789275e-05, 'epoch': 5.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.5385793447494507, 'eval_f1': 0.7825056863901135, 'eval_runtime': 431.4143, 'eval_samples_per_second': 481.036, 'eval_steps_per_second': 30.066, 'epoch': 5.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-80000
Configuration saved in minilm-finetuned-yelp\checkpoint-80000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-80000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-80000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-80000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (8) will be pushed upstream.
Saving model checkpoint to minilm-finetuned-yelp\checkpoint-90000
Configuration saved in minilm-finetuned-yelp\checkpoint-90000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-90000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-90000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-90000\special_tokens_map.js

{'loss': 0.5796, 'learning_rate': 3.0001284934147128e-05, 'epoch': 6.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.4821244776248932, 'eval_f1': 0.8187590772661449, 'eval_runtime': 431.8405, 'eval_samples_per_second': 480.562, 'eval_steps_per_second': 30.037, 'epoch': 6.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-100000
Configuration saved in minilm-finetuned-yelp\checkpoint-100000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-100000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-100000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-100000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (10) will be pushed upstream.


{'loss': 0.5367, 'learning_rate': 2.666816575650498e-05, 'epoch': 7.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.42541003227233887, 'eval_f1': 0.8423243370295005, 'eval_runtime': 436.3945, 'eval_samples_per_second': 475.547, 'eval_steps_per_second': 29.723, 'epoch': 7.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-110000
Configuration saved in minilm-finetuned-yelp\checkpoint-110000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-110000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-110000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-110000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (11) will be pushed upstream.
Saving model checkpoint to minilm-finetuned-yelp\checkpoint-120000
Configuration saved in minilm-finetuned-yelp\checkpoint-120000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-120000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-120000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-120000\special_to

{'loss': 0.4962, 'learning_rate': 2.3335046578862836e-05, 'epoch': 8.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.37922847270965576, 'eval_f1': 0.8696387994007095, 'eval_runtime': 428.7435, 'eval_samples_per_second': 484.033, 'eval_steps_per_second': 30.254, 'epoch': 8.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-130000
Configuration saved in minilm-finetuned-yelp\checkpoint-130000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-130000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-130000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-130000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (13) will be pushed upstream.
Saving model checkpoint to minilm-finetuned-yelp\checkpoint-140000
Configuration saved in minilm-finetuned-yelp\checkpoint-140000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-140000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-140000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-140000\special_to

{'loss': 0.4574, 'learning_rate': 2.000192740122069e-05, 'epoch': 9.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.33641642332077026, 'eval_f1': 0.8843217228164648, 'eval_runtime': 428.2047, 'eval_samples_per_second': 484.642, 'eval_steps_per_second': 30.292, 'epoch': 9.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-150000
Configuration saved in minilm-finetuned-yelp\checkpoint-150000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-150000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-150000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-150000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (15) will be pushed upstream.


{'loss': 0.4222, 'learning_rate': 1.666880822357854e-05, 'epoch': 10.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.28931283950805664, 'eval_f1': 0.9042034093047879, 'eval_runtime': 4346.4884, 'eval_samples_per_second': 47.746, 'eval_steps_per_second': 2.984, 'epoch': 10.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-160000
Configuration saved in minilm-finetuned-yelp\checkpoint-160000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-160000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-160000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-160000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (16) will be pushed upstream.
Saving model checkpoint to minilm-finetuned-yelp\checkpoint-170000
Configuration saved in minilm-finetuned-yelp\checkpoint-170000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-170000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-170000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-170000\special_to

{'loss': 0.3903, 'learning_rate': 1.3335689045936397e-05, 'epoch': 11.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.25822898745536804, 'eval_f1': 0.9167015190778822, 'eval_runtime': 446.6116, 'eval_samples_per_second': 464.668, 'eval_steps_per_second': 29.043, 'epoch': 11.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-180000
Configuration saved in minilm-finetuned-yelp\checkpoint-180000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-180000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-180000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-180000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (18) will be pushed upstream.


{'loss': 0.3626, 'learning_rate': 1.0002569868294252e-05, 'epoch': 12.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.22856532037258148, 'eval_f1': 0.9282644202736262, 'eval_runtime': 428.3814, 'eval_samples_per_second': 484.442, 'eval_steps_per_second': 30.279, 'epoch': 12.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-190000
Configuration saved in minilm-finetuned-yelp\checkpoint-190000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-190000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-190000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-190000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (19) will be pushed upstream.
Saving model checkpoint to minilm-finetuned-yelp\checkpoint-200000
Configuration saved in minilm-finetuned-yelp\checkpoint-200000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-200000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-200000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-200000\special_to

{'loss': 0.3386, 'learning_rate': 6.669450690652104e-06, 'epoch': 13.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.20290784537792206, 'eval_f1': 0.9391609036077688, 'eval_runtime': 428.3125, 'eval_samples_per_second': 484.52, 'eval_steps_per_second': 30.284, 'epoch': 13.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-210000
Configuration saved in minilm-finetuned-yelp\checkpoint-210000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-210000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-210000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-210000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (21) will be pushed upstream.


{'loss': 0.3213, 'learning_rate': 3.336331513009958e-06, 'epoch': 14.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]

{'eval_loss': 0.1911805421113968, 'eval_f1': 0.9442512598340941, 'eval_runtime': 428.9762, 'eval_samples_per_second': 483.77, 'eval_steps_per_second': 30.237, 'epoch': 14.0}


Saving model checkpoint to minilm-finetuned-yelp\checkpoint-220000
Configuration saved in minilm-finetuned-yelp\checkpoint-220000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-220000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-220000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-220000\special_tokens_map.json
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Several commits (22) will be pushed upstream.
Saving model checkpoint to minilm-finetuned-yelp\checkpoint-230000
Configuration saved in minilm-finetuned-yelp\checkpoint-230000\config.json
Model weights saved in minilm-finetuned-yelp\checkpoint-230000\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\checkpoint-230000\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\checkpoint-230000\special_to

{'loss': 0.3063, 'learning_rate': 3.2123353678123996e-09, 'epoch': 15.0}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 207526
  Batch size = 16


  0%|          | 0/12971 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.18377083539962769, 'eval_f1': 0.9467422575650517, 'eval_runtime': 427.8337, 'eval_samples_per_second': 485.062, 'eval_steps_per_second': 30.318, 'epoch': 15.0}
{'train_runtime': 40029.2517, 'train_samples_per_second': 93.318, 'train_steps_per_second': 5.833, 'train_loss': 0.52113987051505, 'epoch': 15.0}


TrainOutput(global_step=233475, training_loss=0.52113987051505, metrics={'train_runtime': 40029.2517, 'train_samples_per_second': 93.318, 'train_steps_per_second': 5.833, 'train_loss': 0.52113987051505, 'epoch': 15.0})

In [31]:
# model.save_pretrained(output_dir)
trainer.push_to_hub()

Saving model checkpoint to minilm-finetuned-yelp
Configuration saved in minilm-finetuned-yelp\config.json
Model weights saved in minilm-finetuned-yelp\pytorch_model.bin
tokenizer config file saved in minilm-finetuned-yelp\tokenizer_config.json
Special tokens file saved in minilm-finetuned-yelp\special_tokens_map.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}, 'metrics': [{'name': 'F1', 'type': 'f1', 'value': 0.9467422575650517}]}


In [34]:
my_model = 'minilm-finetuned-yelp'
pipe = pipeline('sentiment-analysis', my_model)

loading configuration file minilm-finetuned-yelp\config.json
Model config BertConfig {
  "_name_or_path": "minilm-finetuned-yelp",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size

In [57]:
df_test = dataset['test'].to_pandas()[['text','labels']]

test_result = df_test['text'].apply(lambda text: pipe(text[:512]))

In [58]:
this = df_preproc['text'].apply(lambda text: pipe(text[:512]))

len(df_preproc['text']),this

(415052,
 0         [{'label': 'LABEL_3', 'score': 0.9796379208564...
 1         [{'label': 'LABEL_3', 'score': 0.5744537711143...
 2         [{'label': 'LABEL_4', 'score': 0.9511278271675...
 3         [{'label': 'LABEL_2', 'score': 0.9871237277984...
 4         [{'label': 'LABEL_0', 'score': 0.9835019707679...
                                 ...                        
 415061    [{'label': 'LABEL_3', 'score': 0.9805882573127...
 415062    [{'label': 'LABEL_4', 'score': 0.9469411373138...
 415063    [{'label': 'LABEL_4', 'score': 0.9731036424636...
 415064    [{'label': 'LABEL_3', 'score': 0.9731329083442...
 415065    [{'label': 'LABEL_0', 'score': 0.9701607227325...
 Name: text, Length: 415052, dtype: object)

In [90]:
df_sent = pd.DataFrame(df_preproc['text'])
df_sent['stars'], df_sent['label'], df_sent['score'] = df_preproc['labels'], this.apply(lambda result: int(result[0]['label'][-1])+1), this.apply(lambda result: result[0]['score'])
df_sent

Unnamed: 0,text,stars,label,score
0,love going happy hour dinner great patio fans ...,3,4,0.979638
1,love brewpub variety dishes nachos pizza sandw...,3,4,0.574454
2,brother came visit wanted pick six pack good b...,4,5,0.951128
3,bit weary trying shellfish company wharf often...,4,3,0.987124
4,could give zero would order plain hamburger re...,0,1,0.983502
...,...,...,...,...
415061,first time today happened mothers day around 1...,3,4,0.980588
415062,incredible amazing always come philly get fish...,4,5,0.946941
415063,hesitant try la margarita thinking would super...,4,5,0.973104
415064,place times lunch buffet dinner would definite...,3,4,0.973133


In [None]:
# from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
# analyzer = pipeline("sentiment-analysis", model=model_ckpt)


In [63]:
emotion_model = 'lewtun/minilm-finetuned-emotion'
emo_pipe = pipeline('text-classification', emotion_model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/954 [00:00<?, ?B/s]

loading configuration file config.json from cache at C:\Users\}{/.cache\huggingface\hub\models--lewtun--minilm-finetuned-emotion\snapshots\2e1ecc37e5edd7eb71dec436923ad199f57825c6\config.json
Model config BertConfig {
  "_name_or_path": "lewtun/minilm-finetuned-emotion",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "sadness",
    "1": "joy",
    "2": "love",
    "3": "anger",
    "4": "fear",
    "5": "surprise"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "label2id": {
    "anger": 3,
    "fear": 4,
    "joy": 1,
    "love": 2,
    "sadness": 0,
    "surprise": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_typ

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at C:\Users\}{/.cache\huggingface\hub\models--lewtun--minilm-finetuned-emotion\snapshots\2e1ecc37e5edd7eb71dec436923ad199f57825c6\pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at lewtun/minilm-finetuned-emotion.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

loading file vocab.txt from cache at C:\Users\}{/.cache\huggingface\hub\models--lewtun--minilm-finetuned-emotion\snapshots\2e1ecc37e5edd7eb71dec436923ad199f57825c6\vocab.txt
loading file tokenizer.json from cache at C:\Users\}{/.cache\huggingface\hub\models--lewtun--minilm-finetuned-emotion\snapshots\2e1ecc37e5edd7eb71dec436923ad199f57825c6\tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\}{/.cache\huggingface\hub\models--lewtun--minilm-finetuned-emotion\snapshots\2e1ecc37e5edd7eb71dec436923ad199f57825c6\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\}{/.cache\huggingface\hub\models--lewtun--minilm-finetuned-emotion\snapshots\2e1ecc37e5edd7eb71dec436923ad199f57825c6\tokenizer_config.json


In [64]:
ex = df_preproc['text'][0][:512]
ex, emo_pipe(ex)


('love going happy hour dinner great patio fans beat stl heat alsovery accomodating location like veal milanese mixed greens instead pasta theyll modify menu suit taste',
 [{'label': 'joy', 'score': 0.8993651270866394}])

In [68]:
for i in range(50,60):
    ex = df_preproc['text'][i][:512]
    print (ex, '\n', emo_pipe(ex))


12step program help manage reanimator coffee problem please share 
 [{'label': 'sadness', 'score': 0.35848113894462585}]
beautifully sweet ending sweet trip nola girls stumbled upon delicous savory lunch nearby cochon review follow sweet confections magazine st around corner art gallery strip julia st sampling cupcake chocolate mocha filling truffles raspberry creme brulee orange cupcake good mocha filling best part rasberry truffle fav love dark chocolate cloyingly sweet coffee wonderful chaser overly burnt smoky acidic usually coffee milk n sugar forgot time didnt even need seating nice cafe tables inside enough 6girl troop 
 [{'label': 'love', 'score': 0.5278953909873962}]
great place bring kids juice slice cake homemade popsicles think tea party tried everything favorites homemade doughnuts filled read bean popsicles patbingsu even good ol coffee cream sugar 
 [{'label': 'love', 'score': 0.6177704930305481}]
much care given healthy delicious food ex japanese squash rather white pot

In [71]:
emo_result = df_preproc['text'].apply(lambda text: emo_pipe(text[:512]))
emo_result

0           [{'label': 'joy', 'score': 0.8993651270866394}]
1           [{'label': 'joy', 'score': 0.8529354929924011}]
2           [{'label': 'joy', 'score': 0.9102669358253479}]
3         [{'label': 'sadness', 'score': 0.8094623684883...
4          [{'label': 'anger', 'score': 0.757133960723877}]
                                ...                        
415061      [{'label': 'joy', 'score': 0.5986286401748657}]
415062      [{'label': 'love', 'score': 0.459367036819458}]
415063      [{'label': 'joy', 'score': 0.5384106636047363}]
415064      [{'label': 'joy', 'score': 0.9112372994422913}]
415065      [{'label': 'joy', 'score': 0.7565946578979492}]
Name: text, Length: 415052, dtype: object

In [82]:
df_emo = pd.DataFrame(df_preproc['text'])
df_emo['stars'],df_emo['emotion'],df_emo['score'] = df_preproc['labels'], emo_result.apply(lambda result: result[0]['label']), emo_result.apply(lambda result: result[0]['score'])
df_emo

Unnamed: 0,text,stars,emotion,score
0,love going happy hour dinner great patio fans ...,3,joy,0.899365
1,love brewpub variety dishes nachos pizza sandw...,3,joy,0.852935
2,brother came visit wanted pick six pack good b...,4,joy,0.910267
3,bit weary trying shellfish company wharf often...,4,sadness,0.809462
4,could give zero would order plain hamburger re...,0,anger,0.757134
...,...,...,...,...
415061,first time today happened mothers day around 1...,3,joy,0.598629
415062,incredible amazing always come philly get fish...,4,love,0.459367
415063,hesitant try la margarita thinking would super...,4,joy,0.538411
415064,place times lunch buffet dinner would definite...,3,joy,0.911237


In [91]:
df_sent.to_parquet('yelp_sentiments.parquet')
df_emo.to_parquet('yelp_emotions.parquet')