In [None]:
!pip install SentencePiece
!pip install datasets
!pip install imbalanced-learn

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, TFXLMRobertaForSequenceClassification

# Location of the labelled review CSV inside the Kaggle input directory.
file_path = '/kaggle/input/xllm-data/dataset (2).csv'

# Raw dataset; later cells read its 'Review' and 'Emotion' columns.
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Discard rows with a missing review or a missing label before anything else:
# astype(str) below would otherwise turn a missing review into the literal
# text 'nan', which would then be trained and evaluated as a real sample.
data = data.dropna(subset=['Review', 'Emotion']).reset_index(drop=True)

# Map every emotion name to an integer class id; the fitted encoder is kept so
# the ids can be translated back to names later on.
label_encoder = LabelEncoder()
data['encoded_emotion'] = label_encoder.fit_transform(data['Emotion'])

# The tokenizer needs plain strings, so coerce any non-string review values.
data['Review'] = data['Review'].astype(str)

# Reproducible 80/20 split, stratified so both parts keep the class proportions.
train_data, test_data = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data['encoded_emotion']
)
# Note for Aritro: the data is deliberately split into train and test only,
# so please do not change this.

In [None]:
# Plain Python lists of review texts and integer class ids for each split.
train_text = train_data['Review'].tolist()
train_labels = train_data['encoded_emotion'].tolist()
test_text = test_data['Review'].tolist()
test_labels = test_data['encoded_emotion'].tolist()

# Tokenizer for XLM-RoBERTa; the class-balancing step later in the notebook
# relies on it as well.
tokenizer = AutoTokenizer.from_pretrained(
    'xlm-roberta-base'
)

def tokenize_function(text_list):
    """Tokenize a list of texts with the notebook-level ``tokenizer``.

    Every text is truncated and padded to exactly 128 tokens and the result
    is requested as TensorFlow tensors.

    Args:
        text_list: the texts to encode.

    Returns:
        Whatever the tokenizer call returns for these settings.
    """
    encoded = tokenizer(
        text_list,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )
    return encoded
# Encode both splits with identical settings (train first, then test).
train_encodings, test_encodings = (
    tokenize_function(split_texts) for split_texts in (train_text, test_text)
)


In [None]:
# Training split before any class balancing: one (features, label) pair per review.
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

# Held-out split, used for validation during training and for the final metrics.
test_features = dict(test_encodings)
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

# Balance the training classes by randomly duplicating whole reviews of the
# under-represented classes.
#
# SMOTE must not be used on the token-id matrix: it creates new rows by
# interpolating between existing feature vectors, and an interpolated
# vocabulary id is simply some unrelated token. Decoding such rows produced
# noise text (special tokens included) that carried a minority-class label.
y_train = np.array(train_labels)

# The sampler only needs something to resample, so it is given the row numbers
# (as a 2-D column, which fit_resample requires) rather than the features.
train_row_ids = np.arange(len(train_text)).reshape(-1, 1)

oversampler = RandomOverSampler(random_state=42)
resampled_row_ids, y_res = oversampler.fit_resample(train_row_ids, y_train)

# Look up the real review text behind every selected row and encode it with
# the same settings as the other splits.
train_text_resampled = [train_text[row_id] for row_id in resampled_row_ids.ravel()]
train_labels_resampled = y_res
train_encodings_resampled = tokenize_function(train_text_resampled)
train_dataset_resampled = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings_resampled),
    train_labels_resampled
))
# Classification head sized to the number of emotion classes seen by the encoder.
num_emotion_classes = len(label_encoder.classes_)
model = TFXLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=num_emotion_classes,
)

# Labels are integer class ids and the loss is told to expect raw logits,
# hence the sparse cross-entropy with from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=4e-5)
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])


In [None]:
# Shuffle over the whole training set. A buffer smaller than the dataset only
# shuffles locally, so batch composition would still follow the row order left
# behind by the resampling step.
train_batches = (
    train_dataset_resampled
    .shuffle(train_dataset_resampled.cardinality())
    .batch(16)
)
test_batches = test_dataset.batch(16)

# batch_size is intentionally not passed to fit(): the datasets are already
# batched, and Keras documents that it should not be given for tf.data inputs.
history = model.fit(train_batches, epochs=20, validation_data=test_batches)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
eval_results = model.evaluate(test_batches)
print(f"Test Loss: {eval_results[0]}, Test Accuracy: {eval_results[1]}")

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Predicted class id for every test review: arg-max over the classifier logits.
test_logits = model.predict(test_dataset.batch(16)).logits
y_pred = tf.argmax(test_logits, axis=1).numpy()

# Class ids that occur in the test labels, and the emotion name of each one.
unique_classes = np.unique(test_labels)
target_names = list(label_encoder.inverse_transform(unique_classes))

print("Confusion Matrix:\n", confusion_matrix(test_labels, y_pred))

# Macro averaging: each class contributes equally, regardless of its size.
precision = precision_score(test_labels, y_pred, average='macro')
recall = recall_score(test_labels, y_pred, average='macro')
f1 = f1_score(test_labels, y_pred, average='macro')
print(f"Precision: {precision}\nRecall: {recall}\nF1 Score: {f1}")

accuracy = accuracy_score(test_labels, y_pred)
print(f"Accuracy: {accuracy}")