In [None]:
pip install transformers==4.30

In [None]:
pip install transformers[torch] accelerate -U

In [None]:
pip install tensorboard

In [None]:
pip install torch torchvision

In [None]:
from google.colab import drive
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator
import matplotlib.pyplot as plt
import os

In [None]:
# Mount Google Drive so files stored there are reachable from the notebook.
drive.mount('/content/drive')

# Read the dataset CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/dataset.csv')

In [None]:
def plot_score_distribution(scores, title):
    """Draw a bar chart of how many rows carry each distinct value in ``scores``.

    Bar labels and bar heights are both taken from a single ``value_counts()``
    result, so every bar is guaranteed to sit under its own label. (Pairing
    ``unique()`` with ``value_counts()`` does not guarantee that: the former is in
    order of appearance, the latter in order of frequency.)

    Args:
        scores: pandas Series of labels.
        title: title shown above the chart.
    """
    counts = scores.value_counts().sort_index()
    plt.bar(counts.index, counts.values)
    plt.xlabel('Review Score')
    plt.ylabel('Count')
    plt.title(title)
    plt.show()


plot_score_distribution(df['score'], 'Distribution of Review Scores')

positive_df = df[df['score'] == 1]
negative_df = df[df['score'] == 0]

print(len(positive_df['score']))
print(len(negative_df['score']))

# Undersample the larger class so both classes end up with the same number of rows.
min_count = min(len(positive_df), len(negative_df))

positive_df = positive_df.sample(n=min_count, random_state=42)
negative_df = negative_df.sample(n=min_count, random_state=42)

# Optional cap on the number of samples per class; remove these two lines to use
# every balanced sample.
positive_df = positive_df[:40000]
negative_df = negative_df[:40000]

# Split each class separately (80% train / 10% validation / 10% test) so every
# split stays balanced.
p_train_ds, p_val_and_test_ds = train_test_split(positive_df, test_size=0.2, random_state=42)
p_valid_ds, p_test_ds = train_test_split(p_val_and_test_ds, test_size=0.5, random_state=42)

n_train_ds, n_val_and_test_ds = train_test_split(negative_df, test_size=0.2, random_state=42)
n_valid_ds, n_test_ds = train_test_split(n_val_and_test_ds, test_size=0.5, random_state=42)

# Concatenate the per-class parts, then shuffle so the classes are interleaved.
train_ds = pd.concat([p_train_ds, n_train_ds]).sample(frac=1, random_state=42).reset_index(drop=True)
valid_ds = pd.concat([p_valid_ds, n_valid_ds]).sample(frac=1, random_state=42).reset_index(drop=True)
test_ds = pd.concat([p_test_ds, n_test_ds]).sample(frac=1, random_state=42).reset_index(drop=True)

print(len(train_ds['score']))

# Labels and counts now both come from the training split itself.
plot_score_distribution(train_ds['score'], 'Distribution of Review Scores after Cleaning')

In [None]:
# Pretrained checkpoint name; the tokenizer is loaded from it here.
model_name = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize the three splits with identical settings: pad each batch to its longest
# sequence and truncate anything beyond 512 tokens.
train_ds_tokenized, val_ds_tokenized, test_ds_tokenized = (
    tokenizer(list(split_frame['text']), padding=True, truncation=True, max_length=512)
    for split_frame in (train_ds, valid_ds, test_ds)
)

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output, with optional labels.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset``: with a bare ``Dataset`` base, re-running
    the cell would make the class inherit from its own previous definition.

    Args:
        encodings: mapping from field name (it must contain ``"input_ids"``) to a
            per-sample sequence of values, as returned by the tokenizer.
        labels: optional per-sample labels (pandas Series, list or array).
            Omit it for inference-only data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        # np.asarray accepts a Series (same values as `.values`) as well as plain
        # lists/arrays, and gives positional indexing regardless of a Series index.
        self.labels = np.asarray(labels) if labels is not None else None

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors (plus ``"labels"`` if present)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])

# Wrap each tokenized split together with its 'score' column as labels.
train_dataset, val_dataset, test_dataset = (
    Dataset(encodings, frame['score'])
    for encodings, frame in (
        (train_ds_tokenized, train_ds),
        (val_ds_tokenized, valid_ds),
        (test_ds_tokenized, test_ds),
    )
)

In [None]:
# Adding TensorBoard to monitor training in progress.
# The --logdir path is the same directory the Trainer is configured to log to.
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/bert_model/logs

In [None]:
# Metric callback passed to the Trainer
def compute_metrics(predictions):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        predictions: object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores). The index of the highest score
            along the last axis is used as the predicted label.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    y_true = predictions.label_ids
    y_pred = np.argmax(predictions.predictions, axis=-1)

    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    return {metric_name: scorer(y_true, y_pred) for metric_name, scorer in scorers.items()}

In [None]:
import math

num_training_samples = len(train_dataset)
per_device_train_batch_size = 16
steps_per_epoch = num_training_samples // per_device_train_batch_size
# Aim for three evaluations per epoch. Use true division before rounding up:
# `math.ceil(a // 3)` is a no-op, because `//` has already floored the value.
# Clamp to at least 1 so a tiny dataset cannot yield an interval of 0 steps.
eval_steps = max(1, math.ceil(steps_per_epoch / 3))

In [None]:
# Show the computed evaluation interval (in training steps).
print(eval_steps)

In [None]:
# Two output labels: the binary 'score' column (0 = negative, 1 = positive).
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

args = TrainingArguments(
    output_dir='/content/drive/MyDrive/bert_model',
    evaluation_strategy="steps",
    eval_steps=500,
    # Checkpoints are saved on the same cadence as evaluation, which
    # load_best_model_at_end requires.
    save_strategy="steps",
    save_steps=500,
    # Without a limit every 500-step checkpoint is kept on Drive for the whole run.
    # With load_best_model_at_end the best checkpoint is retained in addition to
    # the most recent one.
    save_total_limit=2,
    # Reuse the batch size defined with the step computation above, so the two
    # values cannot drift apart.
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    seed=42,
    load_best_model_at_end=True,
    logging_dir='/content/drive/MyDrive/bert_model/logs',
    logging_strategy="steps",
    logging_steps=50)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop once 8 consecutive evaluations bring no improvement.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=8)])

In [None]:
# Fine-tune the pre-trained model using the Trainer configured above.
trainer.train()

In [None]:
# Evaluate the best model on the test set.
# Use a dedicated "test" prefix: with the default "eval" prefix these metrics would
# be logged under the same `eval/*` TensorBoard tags as the validation metrics and
# would show up as an extra point on the validation curves read back from the logs.
test_results = trainer.evaluate(test_dataset, metric_key_prefix="test")
print(f"Test set results: {test_results}")

In [None]:
# Point an EventAccumulator at the directory the Trainer logs to.
logdir = '/content/drive/MyDrive/bert_model/logs'
ea = event_accumulator.EventAccumulator(logdir)
# Load the logged events from disk into the accumulator.
ea.Reload()

In [None]:
# List the scalar tags available in the log; the cells below look tags up by name.
print(ea.Tags()['scalars'])

In [None]:
# Fetch the logged loss series; each item exposes `.step` and `.value`.
training_loss = ea.scalars.Items('train/loss')  # Check that this tag exists
validation_loss = ea.scalars.Items('eval/loss')

In [None]:
# The 'eval/loss' series holds one entry per logged evaluation, not per epoch, so
# report how many evaluations were logged instead of dumping the raw event list
# under an "epochs" label.
print(f"Number of evaluation points: {len(ea.scalars.Items('eval/loss'))}")

In [None]:
# Loss plot: training loss against the training step
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    [event.step for event in training_loss],
    [event.value for event in training_loss],
    label='Training Loss',
)
ax.set_xlabel('Steps')
ax.set_ylabel('Loss')
ax.set_title('Training Loss')
ax.grid()
ax.legend()
plt.show()

In [None]:
# Loss plot: training and validation loss on shared axes
fig, ax = plt.subplots(figsize=(10, 5))
for series, series_label in ((training_loss, 'Training Loss'), (validation_loss, 'Validation Loss')):
    ax.plot(
        [event.step for event in series],
        [event.value for event in series],
        label=series_label,
    )
ax.set_xlabel('Steps')
ax.set_ylabel('Loss')
ax.set_title('Training vs Validation Loss')
ax.legend()
plt.show()

In [None]:
# Extract the accuracy series logged during validation/evaluation
validation_accuracy = ea.scalars.Items('eval/accuracy')

# Accuracy plot
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    [event.step for event in validation_accuracy],
    [event.value for event in validation_accuracy],
    label='Validation Accuracy',
)
ax.set_xlabel('Steps')
ax.set_ylabel('Accuracy')
ax.set_title('Validation Accuracy Over Time')
ax.legend()
plt.show()

In [None]:
print(f"Number of epochs: {trainer.state.epoch}")
# The trainer.state.epoch variable holds the current number of completed epochs.

# Print model summary
print(model)

# Print training arguments
print(args)

In [None]:
# Making predictions on a few hand-written sentences

x_predict = [
    "i love this",
    "I hate this",
    'i like this, but i am afraid of it will not suit me',
    "cherries are sweet but my fiancee is sweeter",
    'it was a lovely dinner, but what the hell happened to you two?',
]
x_predict_tokenized = tokenizer(x_predict, padding=True, truncation=True, max_length=512)
# No labels: the dataset wrapper then yields model inputs only.
x_dataset = Dataset(x_predict_tokenized)
# Reuse the configured trainer (it wraps the same `model`) instead of building a
# throwaway Trainer with default TrainingArguments just to run inference.
raw_pred, _, _ = trainer.predict(x_dataset)
# The index of the larger logit is the predicted class.
predictions = np.argmax(raw_pred, axis=1)
print(predictions)

In [None]:
# Predict on the test split. `test_dataset` already holds the tokenized test texts
# in the same row order as `test_ds`, so there is no need to tokenize them again,
# and the configured trainer is reused rather than creating a new default one.
raw_pred, _, _ = trainer.predict(test_dataset)
# The index of the larger logit is the predicted class.
predictions = np.argmax(raw_pred, axis=1)
print(predictions)

In [None]:
# Evaluate the best model
# Pin the label order so the matrix is always 2x2 with class 0 (negative) first,
# matching the hard-coded row/column names below even if one class is missing
# from the labels or the predictions.
best_cm = confusion_matrix(list(test_ds['score']), predictions, labels=[0, 1])
best_df_cm = pd.DataFrame(best_cm, index=["Negative", "Positive"], columns=["Negative", "Positive"])

# Plot confusion matrix for the best model (rows: actual class, columns: predicted)
hmap_best = sns.heatmap(best_df_cm, annot=True, fmt="d", cmap="PuBu")
hmap_best.yaxis.set_ticklabels(hmap_best.yaxis.get_ticklabels(), ha='right')
hmap_best.xaxis.set_ticklabels(hmap_best.xaxis.get_ticklabels(), ha='right')
plt.ylabel('Actual')
plt.xlabel('Prediction')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# #loading saved model to see if it works:

# from transformers import BertForSequenceClassification, BertTokenizer
# from transformers import BertTokenizer, BertForSequenceClassification

# # Model name or path to the directory containing the saved model
# model_path = '/content/drive/MyDrive/bert_model/checkpoint-500'
# model_name = "bert-base-cased"  # must match the tokenizer checkpoint the model was fine-tuned with
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_path)
# from transformers import TrainingArguments, Trainer


# # Making predictions
# class Dataset(Dataset):
#     def __init__(self, encodings, labels=None):
#         self.encodings = encodings
#         self.labels = labels.values if labels is not None else None

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         if self.labels is not None:
#             item["labels"] = torch.tensor(self.labels[idx])
#         return item

#     def __len__(self):
#         return len(self.encodings["input_ids"])

# x_predict = ["i love this", "I hate this", 'i like this, but i am afraid of it will not suit me', "cherries are sweet but my fiancee is sweeter", 'it was a lovely dinner, but what the hell happened to you two?']
# x_predict_tokenized = tokenizer(x_predict, padding=True, truncation=True, max_length=512)
# x_dataset = Dataset(x_predict_tokenized)
# raw_pred, _, _ = Trainer(model).predict(x_dataset)
# predictions = np.argmax(raw_pred, axis=1)
# print(predictions)