<a href="https://colab.research.google.com/github/joserafaelrebelo/pdm2023/blob/dev/kaggle_experiments/bert_based_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and setting data

In [1]:
!pip install -q kaggle

In [2]:
! pip install transformers datasets accelerate torchinfo evaluate

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134

In [3]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [4]:
!mkdir ~/.kaggle
!cp /gdrive/MyDrive/kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle competitions download -c 'anlise-de-sentimento-do-google-play' -p google-play-data \
 && cd google-play-data \
 && unzip  anlise-de-sentimento-do-google-play.zip

Downloading anlise-de-sentimento-do-google-play.zip to google-play-data
 92% 140M/152M [00:00<00:00, 244MB/s]
100% 152M/152M [00:00<00:00, 236MB/s]
Archive:  anlise-de-sentimento-do-google-play.zip
  inflating: reviews_sample_submission_kaggle.csv  
  inflating: reviews_test_kaggle.csv  
  inflating: reviews_train.csv       


In [6]:
reviews_csv = "/content/google-play-data/reviews_train.csv"

In [7]:
import pandas as pd
df = pd.read_csv(reviews_csv)
df.head(1)

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sortOrder,appId,sentiment
0,2e00b81c-d20a-45e6-96e7-319a4b9ad7ad,Hillary,https://play-lh.googleusercontent.com/a-/ALV-U...,"Eu realmente amo muito a shein, amo comprar po...",1,363,9.6.2,2023-09-18 07:56:47,"Olá, para qualquer problema, você pode entrar ...",2023-09-18 22:54:47,9.6.2,most_relevant,com.zzkko,0


In [8]:
# get score to 0-4
df["score"] = df["score"] - 1

In [9]:
nan_count_per_column = df.isna().sum()
print(nan_count_per_column)

reviewId                     0
userName                     2
userImage                    0
content                     24
score                        0
thumbsUpCount                0
reviewCreatedVersion    147434
at                           0
replyContent            582576
repliedAt               582576
appVersion              147434
sortOrder                    0
appId                        0
sentiment                    0
dtype: int64


In [10]:
df = df.dropna()

In [11]:
# set only content and score for dataset.csv
df[["content", "score"]].to_csv("dataset.csv", index=None)

In [12]:
# turn into a huggingface dataset
from datasets import load_dataset, VerificationMode
dataset = load_dataset("csv", data_files="dataset.csv")

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
dataset = dataset.rename_column("score", "labels")

In [14]:
dataset_split = dataset["train"].train_test_split(test_size=0.25, seed=99)

In [15]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoConfig,
    TrainingArguments,
    Trainer
)

### Model configurations

In [32]:
# getting model and configurations

# model_id = "neuralmind/bert-base-portuguese-cased"
model_id = "/gdrive/MyDrive/pdm-letxa_joserafaelrebelo/google-play-sentiment-analysis/checkpoint-69874"
config = AutoConfig.from_pretrained(model_id)

In [33]:
len(dataset['train']['content'][0].split(' '))

55

In [34]:
config.id2label = {
    0: "Muito Negativo",
    1: "Negativo",
    2: "Neutro",
    3: "Positivo",
    4: "Muito Positivo"
}

In [35]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)


def tokenize_function(examples):
  return tokenizer(examples["content"], truncation=True, padding="max_length", max_length=60)


tokenized_datasets = dataset_split.map(tokenize_function, batched=True)

Map:   0%|          | 0/279492 [00:00<?, ? examples/s]

Map:   0%|          | 0/93165 [00:00<?, ? examples/s]

In [36]:
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

In [37]:
len(train_dataset)

279492

In [38]:
len(eval_dataset)

93165

In [39]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_id, config=config,
                                                           ignore_mismatched_sizes=True)

In [44]:
from transformers import TrainingArguments

batch_size = 16
number_of_epochs = 1
logging_steps = len(eval_dataset) // batch_size

steps = (len(eval_dataset) / batch_size) * number_of_epochs
warmup_steps = int(0.2 * steps)

training_args = TrainingArguments(
    num_train_epochs=number_of_epochs,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    logging_steps=logging_steps,
    warmup_steps= warmup_steps,
    save_steps=1000,
    eval_steps=500,
    output_dir = "/content/gdrive/MyDrive/pdm-letxa_joserafaelrebelo/google-play-sentiment-analysis",
)

In [41]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [None]:
# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(output_dir="/content/gdrive/MyDrive/pdm-letxa/google-play-sentiment-analysis", evaluation_strategy="epoch")

In [42]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [45]:
model.resize_token_embeddings(len(tokenizer))
tokenizer.add_special_tokens({'pad_token': '<pad>'})

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


 [34937/34937 1:25:31, Epoch 1/1]
Epoch 	Training Loss 	Validation Loss 	Accuracy
1 	1.014600 	1.011038 	0.566060

TrainOutput(global_step=34937, training_loss=1.0540985242732965, metrics={'train_runtime': 5131.1697, 'train_samples_per_second': 54.469, 'train_steps_per_second': 6.809, 'total_flos': 1.005421701396168e+16, 'train_loss': 1.0540985242732965, 'epoch': 1.0})

In [None]:
!pip install --upgrade huggingface_hub

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [None]:
trainer.push_to_hub("google-play-sentiment-analysis")

In [None]:
tokenizer.push_to_hub(repo_id="lemorim/google-play-sentiment-analysis")

In [None]:
model.push_to_hub(repo_id="lemorim/google-play-sentiment-analysis", create_pr=True)