In [3]:
# XGBoost classifier trained on mBERT [CLS] embeddings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
import torch
import re
import logging
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTETomek
from collections import Counter

# Configure logging
# Records at INFO level and above are emitted, each prefixed with its
# timestamp and level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# getLogger() is called without a name, so every function in this file logs
# through the root logger.
logger = logging.getLogger()

# Function to preprocess Tamil text
def preprocess_text(text):
    """Clean a raw text sample before it is tokenized.

    URLs (``http...`` / ``www...``) and ``@mentions`` are removed first; only
    then is every run of whitespace collapsed to a single space and the ends
    trimmed. Running the whitespace pass last guarantees that the removals do
    not leave double, leading or trailing spaces behind.

    Args:
        text: The raw sample. Any value that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string (a warning
        is logged in that case).
    """
    if not isinstance(text, str):
        logger.warning("Non-string input detected; replacing with empty string.")
        return ''
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    # Normalize whitespace last so the gaps left by the removals collapse too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Function to load and preprocess datasets
def load_data(train_path, test_path):
    """Read the train and test Excel sheets and return cleaned text and labels.

    Both workbooks are read with the ``openpyxl`` engine. Missing values in the
    ``content`` column become ``''`` and missing values in the ``labels``
    column become ``'unknown'``; a ``text`` column is then derived from
    ``content`` with ``preprocess_text``.

    Args:
        train_path: Path of the training workbook.
        test_path: Path of the testing workbook.

    Returns:
        A 4-tuple ``(train_text, train_labels, test_text, test_labels)`` of
        pandas Series.

    Raises:
        Exception: Any error raised while reading or processing is logged and
            re-raised unchanged.
    """
    try:
        logger.info(f"Loading training dataset from {train_path}...")
        train_df = pd.read_excel(train_path, engine="openpyxl")
        logger.info(f"Training dataset loaded successfully with {len(train_df)} rows.")

        logger.info(f"Loading testing dataset from {test_path}...")
        test_df = pd.read_excel(test_path, engine="openpyxl")
        logger.info(f"Testing dataset loaded successfully with {len(test_df)} rows.")

        # Placeholder used for missing cells, per column.
        placeholders = (('content', ''), ('labels', 'unknown'))
        for frame in (train_df, test_df):
            for column, placeholder in placeholders:
                frame[column] = frame[column].fillna(placeholder)

        for frame in (train_df, test_df):
            frame['text'] = frame['content'].apply(preprocess_text)

        logger.info("Text preprocessing completed.")
        return train_df['text'], train_df['labels'], test_df['text'], test_df['labels']
    except Exception as err:
        logger.error(f"Error loading datasets: {err}")
        raise

# Function to compute mBERT embeddings
def compute_mbert_embeddings(texts, batch_size=32):
    """Encode texts as mBERT embeddings taken from the first ([CLS]) token.

    The ``bert-base-multilingual-cased`` tokenizer and model are loaded on
    every call. Texts are encoded in batches under ``torch.no_grad()``; inputs
    are truncated to at most 512 tokens and padded per batch. The model runs on
    a CUDA device when one is available and on the CPU otherwise.

    Args:
        texts: The texts to encode. Any iterable of strings is accepted (it is
            copied into a list so that slicing always yields a plain list); a
            single string is treated as one text.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text, built from
        ``last_hidden_state[:, 0, :]``. For empty input the result has shape
        ``(0, hidden_size)`` so downstream code still receives a 2-D array.
    """
    logger.info("Computing mBERT embeddings...")
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = BertModel.from_pretrained('bert-base-multilingual-cased')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    # The tokenizer needs a plain list of str; e.g. a pandas Series slice is
    # not accepted, so normalise the input once up front.
    texts = [texts] if isinstance(texts, str) else list(texts)
    total = len(texts)
    if total == 0:
        logger.info("mBERT embeddings computed successfully.")
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    batches = []
    with torch.no_grad():
        for start in range(0, total, batch_size):
            batch_texts = texts[start:start + batch_size]
            inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # Using CLS token; move back to the CPU before converting to NumPy.
            batches.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
            if start % (batch_size * 10) == 0:
                logger.info(f"Processed {start}/{total} texts...")

    embeddings = np.concatenate(batches, axis=0)
    logger.info("mBERT embeddings computed successfully.")
    return embeddings

# Function to balance classes using SMOTE + Tomek Links
def balance_classes(X, y):
    """Resample a labelled feature set with ``SMOTETomek``.

    Args:
        X: Feature matrix passed to ``fit_resample``.
        y: Class labels aligned with ``X``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by
        ``SMOTETomek(random_state=42).fit_resample``. The resulting class
        counts are logged at INFO level.
    """
    logger.info("Applying SMOTE + Tomek Links to balance classes...")
    resampler = SMOTETomek(random_state=42)
    balanced_X, balanced_y = resampler.fit_resample(X, y)
    class_counts = Counter(balanced_y)
    logger.info(f"Class distribution after balancing: {class_counts}")
    return balanced_X, balanced_y

# Main function to train and evaluate the model
def train_and_evaluate(train_path, test_path):
    """Train an XGBoost classifier on mBERT embeddings and report test metrics.

    Steps, in order: load both datasets, integer-encode the labels (the
    encoder is fitted on the training labels only), embed the training texts,
    balance the training set with ``balance_classes``, embed the test texts,
    fit ``XGBClassifier`` and evaluate on the test set. The classification
    report and the accuracy / macro precision / macro recall / macro F1 values
    are printed; the four scalar metrics are also logged.

    Args:
        train_path: Path of the training workbook.
        test_path: Path of the testing workbook.

    Returns:
        None.

    Raises:
        Exception: Any error raised along the way is logged and re-raised
            unchanged (for example, ``LabelEncoder.transform`` rejects test
            labels that never occurred in the training data).
    """
    try:
        train_texts, train_labels, test_texts, test_labels = load_data(train_path, test_path)

        label_encoder = LabelEncoder()
        train_labels_encoded = label_encoder.fit_transform(train_labels)
        test_labels_encoded = label_encoder.transform(test_labels)
        # Encoded id i corresponds to label_encoder.classes_[i]. Names are
        # coerced to str because the report formats them as text.
        class_names = [str(name) for name in label_encoder.classes_]
        class_ids = list(range(len(class_names)))

        logger.info(f"Class distribution before balancing: {Counter(train_labels_encoded)}")

        train_embeddings = compute_mbert_embeddings(train_texts.tolist())
        X_resampled, y_resampled = balance_classes(train_embeddings, train_labels_encoded)

        test_embeddings = compute_mbert_embeddings(test_texts.tolist())

        logger.info("Training the model...")
        classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
        classifier.fit(X_resampled, y_resampled)

        predictions = classifier.predict(test_embeddings)

        logger.info("Classification Report:")
        # Passing `labels` keeps the report aligned with `target_names` even
        # when a training class is absent from both the test labels and the
        # predictions; without it the name/label counts can disagree.
        print(classification_report(
            test_labels_encoded,
            predictions,
            labels=class_ids,
            target_names=class_names,
            zero_division=0,
        ))

        accuracy = accuracy_score(test_labels_encoded, predictions)
        # zero_division=0 yields the same values as the default but without
        # the undefined-metric warning for classes that are never predicted.
        macro_f1 = f1_score(test_labels_encoded, predictions, average='macro', zero_division=0)
        macro_precision = precision_score(test_labels_encoded, predictions, average='macro', zero_division=0)
        macro_recall = recall_score(test_labels_encoded, predictions, average='macro', zero_division=0)

        logger.info(f"Accuracy: {accuracy:.4f}")
        logger.info(f"Macro Precision: {macro_precision:.4f}")
        logger.info(f"Macro Recall: {macro_recall:.4f}")
        logger.info(f"Macro F1-score: {macro_f1:.4f}")

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Macro Precision: {macro_precision:.4f}")
        print(f"Macro Recall: {macro_recall:.4f}")
        print(f"Macro F1-score: {macro_f1:.4f}")

    except Exception as e:
        logger.error(f"Error during training and evaluation: {e}")
        raise

# File paths to datasets
# NOTE: these are machine-specific absolute paths; change them to point at
# the local copies of the workbooks before running.
train_file_path = r"C:\Users\haris\OneDrive\Desktop\Sentiment Analysis\PS_train(3).xlsx"
test_file_path = r"C:\Users\haris\OneDrive\Desktop\Sentiment Analysis\PS_test.xlsx"

# Run the full pipeline only when this file is executed directly, so that
# importing it for its functions does not start model downloads and training.
if __name__ == "__main__":
    train_and_evaluate(train_file_path, test_file_path)


2025-03-09 22:33:08,818 - INFO - Loading training dataset from C:\Users\haris\OneDrive\Desktop\Sentiment Analysis\PS_train(3).xlsx...
2025-03-09 22:33:08,994 - INFO - Training dataset loaded successfully with 4352 rows.
2025-03-09 22:33:08,994 - INFO - Loading testing dataset from C:\Users\haris\OneDrive\Desktop\Sentiment Analysis\PS_test.xlsx...
2025-03-09 22:33:09,042 - INFO - Testing dataset loaded successfully with 544 rows.
2025-03-09 22:33:09,074 - INFO - Text preprocessing completed.
2025-03-09 22:33:09,074 - INFO - Class distribution before balancing: Counter({3: 1361, 5: 790, 1: 637, 4: 575, 6: 412, 0: 406, 2: 171})
2025-03-09 22:33:09,074 - INFO - Computing mBERT embeddings...
2025-03-09 22:33:16,323 - INFO - Processed 0/4352 texts...
2025-03-09 22:34:05,840 - INFO - Processed 320/4352 texts...
2025-03-09 22:34:31,060 - INFO - Processed 640/4352 texts...
2025-03-09 22:35:06,710 - INFO - Processed 960/4352 texts...
2025-03-09 22:35:34,050 - INFO - Processed 1280/4352 texts...


                   precision    recall  f1-score   support

         Negative       0.13      0.13      0.13        46
          Neutral       0.16      0.13      0.14        70
None of the above       0.95      0.84      0.89        25
      Opinionated       0.36      0.45      0.40       171
         Positive       0.20      0.19      0.19        75
        Sarcastic       0.27      0.24      0.25       106
    Substantiated       0.13      0.12      0.12        51

         accuracy                           0.29       544
        macro avg       0.32      0.30      0.31       544
     weighted avg       0.28      0.29      0.28       544

Accuracy: 0.2904
Macro Precision: 0.3160
Macro Recall: 0.2985
Macro F1-score: 0.3055
