In [11]:
import numpy as np
import pandas as pd
from transformers import (BertTokenizerFast,TFBertTokenizer,BertTokenizer,RobertaTokenizerFast,
                          DataCollatorWithPadding,TFRobertaForSequenceClassification,TFBertForSequenceClassification,
                          TFBertModel,create_optimizer)
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import mlflow
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
FINE_TUNED_DIR = '../../pretained_or_finetune-models'
REVIEWS_DATASET_DIR = '../../dataset'
UTILS_DIR = '../../utils'
NLTK_DATA_PATH = f"{FINE_TUNED_DIR}/nltk_data"

# nltk.data.path.append(NLTK_DATA_PATH)

In [4]:
tourism_reviews_en = pd.read_csv(f"{REVIEWS_DATASET_DIR}/eng_reviews_emotion_classify.csv", encoding='utf-8')

columns_to_use = ['helpful_votes', 'location_id', 'review_id', 'review',
                  'review_subject', 'trip_type', 'rating',
                  'location_name', 'province', 'place_id', 'emotion', 'cleaned_review']
columns_to_train = ['location_id', 'review', 'rating']
review_df = tourism_reviews_en[columns_to_train]

In [5]:
review_df.head(1)

Unnamed: 0,location_id,review,rating
0,2209612,Besides elegant grand palace and wat pra kaew ...,5


In [12]:
# Parameters
max_len = 128
batch_size = 16
epochs = 3
model_name = 'bert-base-uncased'

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=1)
model.summary()


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  769       
                                                                 
Total params: 109483009 (417.64 MB)
Trainable params: 109483009 (417.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [6]:

# Tokenization
def tokenize(batch):
    return tokenizer(batch['review'], padding=True, truncation=True, max_length=max_len)

# Encode the labels
labels = review_df['rating'].values  # Assuming binary labels


In [29]:
def map_rating_to_sentiment(rating):
    if rating in [1, 2]:
        return 'negative'
    elif rating in [3]:
        return 'neutral'
    else:  # 4 or 5
        return 'positive'
    
label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}

In [31]:
review_df['label'] = review_df['sentiment'].map(label_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['label'] = review_df['sentiment'].map(label_mapping)


In [32]:
review_df['sentiment'] = review_df['rating'].apply(map_rating_to_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_df['sentiment'] = review_df['rating'].apply(map_rating_to_sentiment)


In [39]:
train_texts, val_texts, train_labels, val_labels = train_test_split(review_df, labels, test_size=0.3, random_state=42)

# Tokenize datasets
train_encodings = tokenizer(train_texts['review'].tolist(), truncation=True, padding=True, max_length=max_len)
val_encodings = tokenizer(val_texts['review'].tolist(), truncation=True, padding=True, max_length=max_len)

In [34]:
# Convert to PyTorch dataset
import torch

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

In [70]:
class ReviewDataset(SentimentDataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [40]:
train_texts.head(5)

Unnamed: 0,location_id,review,rating,sentiment,label
3908,12594062,History of Siam can be quite confusing; but th...,5,positive,2
2232,10077750,Beautiful side trip. There are few view points...,5,positive,2
4006,13326328,One more nature that must come Before entering...,4,positive,2
4517,1441352,Here for the Songkran festival or Thai New Yea...,5,positive,2
585,9606514,There are a few shops attached to a hotel. No ...,1,negative,0


In [41]:
train_dataset = ReviewDataset(
    texts=train_texts['review'],
    labels=train_texts['sentiment'],
    tokenizer=tokenizer
)

test_dataset = ReviewDataset(
    texts=val_texts['review'],
    labels=val_texts['sentiment'],
    tokenizer=tokenizer
)

In [44]:
train_texts['label']

3908    2
2232    2
4006    2
4517    2
585     0
       ..
3772    2
5191    2
5226    2
5390    2
860     2
Name: label, Length: 4136, dtype: int64

In [45]:
# Compute class weights
classes = np.array([0, 1, 2])
class_weights = compute_class_weight('balanced', classes=classes, y=train_texts['label'])
class_weights = torch.tensor(class_weights, dtype=torch.float)

In [48]:
class_weights

tensor([4.2949, 2.6564, 0.4183])

In [71]:
# Load BERT model with class weights for loss function
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.config.class_weights = class_weights

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
import torch

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)  # Extracting the predicted class

    # Compute precision, recall, f1-score, and accuracy
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    # Ensure all values are Python-native types (float) for JSON serialization
    return {
        'accuracy': float(acc),
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall)
    }

In [72]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # compute_metrics=compute_metrics
)

In [75]:
trainer.train()

TypeError: Object of type Tensor is not JSON serializable

In [74]:
# Ensure no active run exists before starting a new one
if mlflow.active_run():
    mlflow.end_run()

# Start a new MLflow run
mlflow.start_run()

# Train the model
trainer.train()

# Evaluate the model (optional, but ensure we handle non-serializable objects)
eval_metrics = trainer.evaluate()

# Convert evaluation metrics from tensors to native types if needed
eval_metrics = {
    k: float(v.cpu().numpy()) if isinstance(v, torch.Tensor) else v 
    for k, v in eval_metrics.items()
}

# Log metrics to MLflow
mlflow.log_metrics(eval_metrics)

# End the MLflow run
mlflow.end_run()

# Save the best model
trainer.save_model("./best_model")


TypeError: Object of type Tensor is not JSON serializable

In [None]:
# Prediction and evaluation
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Confusion matrix
cm = confusion_matrix(train_labels['sentiment'], preds)
sns.heatmap(cm, annot=True, fmt="d")
plt.show()

In [None]:
# If you just want to use a pre-trained model without training
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

# Example usage
example_texts = ["I love this!", "I hate it."]
predictions = classifier(example_texts)
print(predictions)

# IMDs Datasets


In [22]:
import tensorflow as tf### models
import numpy as np### math computations
import matplotlib.pyplot as plt### plotting bar chart
import sklearn### machine learning library
# import cv z2## image processing
from sklearn.metrics import confusion_matrix, roc_curve### metrics
import seaborn as sns### visualizations
import datetime # For Datetime Functions
import pathlib # handling files and paths on your operating system
import io # dealing with various types of I/O
import os 
import re # for Regular Expressions
import string
import time
from numpy import random
import gensim.downloader as api # to download pre-trained model datasets and word embeddings from Gensim's repository
from PIL import Image # manipulating images, resizing, cropping, adding text
import tensorflow_datasets as tfds # Tf Datasets
import tensorflow_probability as tfp


ModuleNotFoundError: No module named 'tensorflow.python.training.tracking'

In [23]:
import tensorflow as tf
import tensorflow_probability as tfp
print("TensorFlow version:", tf.__version__)
print("TensorFlow Probability version:", tfp.__version__)


ModuleNotFoundError: No module named 'tensorflow.python.training.tracking'

In [21]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import (Dense,Flatten,InputLayer,BatchNormalization,
                                     Dropout,Input,LayerNormalization)
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy,  SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam

from datasets import load_dataset
from transformers import (BertTokenizerFast,TFBertTokenizer,BertTokenizer,RobertaTokenizerFast,
                          DataCollatorWithPadding,TFRobertaForSequenceClassification,TFBertForSequenceClassification,
                          TFBertModel,create_optimizer)