In [None]:
# RANDOM FOREST CLASSIFIER

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
import torch
import re
import logging
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTETomek
from collections import Counter

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

# Function to preprocess Tamil text
def preprocess_text(text):
    """Clean one raw text sample before it is embedded.

    URLs (``http...`` / ``www...``) and ``@mentions`` are removed first, then
    every run of whitespace is collapsed to a single space and the ends are
    trimmed. Doing the whitespace pass last matters: removing a URL or a
    mention from the middle of a sentence leaves a gap that would otherwise
    survive as a double, leading, or trailing space.

    Args:
        text: The raw sample. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        logger.warning("Non-string input detected; replacing with empty string.")
        return ''
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces left behind by the removals
    return text.strip()

# Function to load and preprocess the dataset
def load_data(file_path):
    """Read the Excel dataset and return its cleaned text and label columns.

    Missing ``content`` cells become ``''`` and missing ``labels`` cells
    become ``'unknown'``; the cleaned text is stored in a new ``text`` column.

    Args:
        file_path: Location of the ``.xlsx`` file (read with openpyxl).

    Returns:
        A ``(text, labels)`` pair of pandas Series.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        logger.info(f"Loading dataset from {file_path}...")
        frame = pd.read_excel(file_path, engine="openpyxl")
        row_count = len(frame)
        logger.info(f"Dataset loaded successfully with {row_count} rows.")

        # Fill gaps column by column, in the same order as before.
        for column, placeholder in (('content', ''), ('labels', 'unknown')):
            frame[column] = frame[column].fillna(placeholder)

        cleaned = frame['content'].apply(preprocess_text)
        frame['text'] = cleaned
        logger.info("Text preprocessing completed.")

        return frame['text'], frame['labels']
    except Exception as err:
        logger.error(f"Error loading dataset: {err}")
        raise

# Function to compute mBERT embeddings
def compute_mbert_embeddings(texts, batch_size=32):
    """Encode texts as mBERT ``[CLS]`` embeddings.

    The ``bert-base-multilingual-cased`` tokenizer and model are loaded on
    every call. The model runs on CUDA when ``torch.cuda.is_available()`` and
    on the CPU otherwise; results are always returned as a NumPy array.

    Args:
        texts: The strings to embed. Any iterable is accepted and converted
            to a list, because the tokenizer is called on list slices.
        batch_size: Number of texts per forward pass.

    Returns:
        A 2-D array with one row per input text, holding the hidden state at
        sequence position 0. For empty input the shape is
        ``(0, model.config.hidden_size)`` so downstream code still sees a
        2-D feature matrix.
    """
    logger.info("Computing mBERT embeddings...")
    if not isinstance(texts, list):
        texts = list(texts)

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = BertModel.from_pretrained('bert-base-multilingual-cased')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    total = len(texts)
    batches = []
    with torch.no_grad():
        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
            inputs = inputs.to(device)  # tensors must live on the same device as the model
            outputs = model(**inputs)
            # Using CLS token; move back to the CPU before converting to NumPy.
            batches.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
            if i % (batch_size * 10) == 0:
                logger.info(f"Processed {i}/{total} texts...")

    if batches:
        embeddings = np.concatenate(batches, axis=0)
    else:
        embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)
    logger.info("mBERT embeddings computed successfully.")
    return embeddings

# Function to balance classes using SMOTE + Tomek Links
def balance_classes(X, y):
    """Resample ``X``/``y`` with SMOTETomek (seeded with 42) and log the result.

    Args:
        X: Feature matrix handed to ``SMOTETomek.fit_resample``.
        y: Class labels aligned with ``X``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by the resampler.
    """
    logger.info("Applying SMOTE + Tomek Links to balance classes...")
    balanced_X, balanced_y = SMOTETomek(random_state=42).fit_resample(X, y)
    distribution = Counter(balanced_y)
    logger.info(f"Class distribution after balancing: {distribution}")
    return balanced_X, balanced_y

# Main function to train and evaluate the model
def train_and_evaluate(file_path):
    """Train a random forest on mBERT embeddings and report held-out metrics.

    The data is split into train and test sets BEFORE resampling, and
    SMOTE + Tomek Links is applied to the training portion only. Resampling
    the whole dataset first leaks information: synthetic points interpolated
    from test rows end up in training, and the test set itself contains
    synthetic rows, which inflates every reported score.

    The classification report is printed; accuracy and macro
    precision/recall/F1 are both logged and printed.

    Args:
        file_path: Path of the Excel file passed to ``load_data``.

    Raises:
        Exception: Any error from loading, embedding, resampling, training,
            or scoring is logged and re-raised unchanged.
    """
    try:
        texts, labels = load_data(file_path)

        label_encoder = LabelEncoder()
        labels_encoded = label_encoder.fit_transform(labels)
        class_ids = list(range(len(label_encoder.classes_)))
        # str() keeps the report printable even if the sheet stores numeric labels.
        class_names = [str(name) for name in label_encoder.classes_]
        logger.info(f"Class distribution before balancing: {Counter(labels_encoded)}")

        mbert_embeddings = compute_mbert_embeddings(texts.tolist())

        # Train-test split (80% training, 20% testing) on the ORIGINAL samples,
        # so the test set contains only real, untouched rows.
        X_train, X_test, y_train, y_test = train_test_split(
            mbert_embeddings,
            labels_encoded,
            test_size=0.2,
            random_state=42,
            stratify=labels_encoded,
        )

        # Balance the training portion only; the test set keeps its natural distribution.
        X_train, y_train = balance_classes(X_train, y_train)

        logger.info("Training the model...")
        classifier = RandomForestClassifier(random_state=42)
        classifier.fit(X_train, y_train)

        predictions = classifier.predict(X_test)

        logger.info("Classification Report:")
        # labels= pins the row order so target_names stays aligned even if a
        # class is absent from the test split or from the predictions.
        print(classification_report(y_test, predictions, labels=class_ids, target_names=class_names))

        accuracy = accuracy_score(y_test, predictions)
        macro_f1 = f1_score(y_test, predictions, average='macro')
        macro_precision = precision_score(y_test, predictions, average='macro')
        macro_recall = recall_score(y_test, predictions, average='macro')

        logger.info(f"Accuracy: {accuracy:.4f}")
        logger.info(f"Macro Precision: {macro_precision:.4f}")
        logger.info(f"Macro Recall: {macro_recall:.4f}")
        logger.info(f"Macro F1-score: {macro_f1:.4f}")

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Macro Precision: {macro_precision:.4f}")
        print(f"Macro Recall: {macro_recall:.4f}")
        print(f"Macro F1-score: {macro_f1:.4f}")

    except Exception as e:
        logger.error(f"Error during training and evaluation: {e}")
        raise

# File path to dataset. In a raw string each backslash is literal, so the
# separators are written once; doubling them puts "\\" into the actual path.
file_path = r"C:\Users\haris\OneDrive\Desktop\Sentiment Analysis\PS_train(3).xlsx"
train_and_evaluate(file_path)


2025-03-09 22:08:23,885 - INFO - Loading dataset from C:\\Users\\haris\\OneDrive\\Desktop\\Sentiment Analysis\\PS_train(3).xlsx...
2025-03-09 22:08:24,186 - INFO - Dataset loaded successfully with 4352 rows.
2025-03-09 22:08:24,233 - INFO - Text preprocessing completed.
2025-03-09 22:08:24,233 - INFO - Class distribution before balancing: Counter({3: 1361, 5: 790, 1: 637, 4: 575, 6: 412, 0: 406, 2: 171})
2025-03-09 22:08:24,233 - INFO - Computing mBERT embeddings...
2025-03-09 22:08:27,148 - INFO - Processed 0/4352 texts...
2025-03-09 22:09:10,736 - INFO - Processed 320/4352 texts...
2025-03-09 22:09:34,567 - INFO - Processed 640/4352 texts...


In [None]:
# SVM CLASSIFIER

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
import torch
import re
import logging
from sklearn.preprocessing import LabelEncoder
from imblearn.combine import SMOTETomek
from collections import Counter

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

# Function to preprocess Tamil text
def preprocess_text(text):
    """Clean one raw text sample before it is embedded.

    URLs and ``@mentions`` are stripped first; whitespace is collapsed and
    trimmed afterwards so that the gaps left by the removals do not remain
    as double, leading, or trailing spaces.

    Args:
        text: The raw sample. Non-``str`` values are treated as missing.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        logger.warning("Non-string input detected; replacing with empty string.")
        return ''
    without_urls = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    without_mentions = re.sub(r'@\w+', '', without_urls)  # Remove mentions
    # Normalize spaces last, then trim the ends.
    return re.sub(r'\s+', ' ', without_mentions).strip()

# Function to load and preprocess the dataset
def load_data(file_path):
    """Load the Excel sheet and hand back cleaned text plus labels.

    Empty ``content`` cells are replaced by ``''`` and empty ``labels`` cells
    by ``'unknown'``. The cleaned text is written to a ``text`` column.

    Args:
        file_path: Path of the ``.xlsx`` file, read through openpyxl.

    Returns:
        Tuple ``(text, labels)`` of pandas Series.

    Raises:
        Exception: Logged, then re-raised as-is.
    """
    try:
        logger.info(f"Loading dataset from {file_path}...")
        sheet = pd.read_excel(file_path, engine="openpyxl")
        logger.info(f"Dataset loaded successfully with {len(sheet)} rows.")

        content_filled = sheet['content'].fillna('')
        sheet['content'] = content_filled
        labels_filled = sheet['labels'].fillna('unknown')
        sheet['labels'] = labels_filled

        sheet['text'] = sheet['content'].apply(preprocess_text)
        logger.info("Text preprocessing completed.")

        text_column, label_column = sheet['text'], sheet['labels']
        return text_column, label_column
    except Exception as exc:
        logger.error(f"Error loading dataset: {exc}")
        raise

# Function to compute mBERT embeddings
def compute_mbert_embeddings(texts, batch_size=32):
    """Encode texts as mBERT ``[CLS]`` embeddings.

    Loads ``bert-base-multilingual-cased`` on each call, runs it on CUDA when
    ``torch.cuda.is_available()`` (CPU otherwise), and returns NumPy data.

    Args:
        texts: The strings to embed. Non-list iterables are converted to a
            list because the tokenizer is fed list slices.
        batch_size: Number of texts per forward pass.

    Returns:
        A 2-D array with one row per text (hidden state at sequence position
        0). Empty input yields shape ``(0, model.config.hidden_size)`` rather
        than a 1-D empty array.
    """
    logger.info("Computing mBERT embeddings...")
    if not isinstance(texts, list):
        texts = list(texts)

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = BertModel.from_pretrained('bert-base-multilingual-cased')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    total = len(texts)
    batches = []
    with torch.no_grad():
        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
            inputs = inputs.to(device)  # keep inputs on the model's device
            outputs = model(**inputs)
            # Using CLS token; .cpu() is required before .numpy() when on GPU.
            batches.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
            if i % (batch_size * 10) == 0:
                logger.info(f"Processed {i}/{total} texts...")

    if batches:
        embeddings = np.concatenate(batches, axis=0)
    else:
        embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)
    logger.info("mBERT embeddings computed successfully.")
    return embeddings

# Function to balance classes using SMOTE + Tomek Links
def balance_classes(X, y):
    """Apply SMOTETomek (``random_state=42``) and log the new class counts.

    Args:
        X: Feature matrix passed to ``fit_resample``.
        y: Labels aligned with ``X``.

    Returns:
        ``(X_resampled, y_resampled)`` as returned by the resampler.
    """
    logger.info("Applying SMOTE + Tomek Links to balance classes...")
    resampler = SMOTETomek(random_state=42)
    resampled = resampler.fit_resample(X, y)
    features, targets = resampled
    logger.info(f"Class distribution after balancing: {Counter(targets)}")
    return features, targets

# Main function to train and evaluate the model
def train_and_evaluate(file_path):
    """Train a linear-kernel SVM on mBERT embeddings and report held-out metrics.

    The train/test split happens BEFORE resampling, and SMOTE + Tomek Links
    is applied to the training portion only. Resampling the whole dataset
    first puts synthetic neighbours of test rows into training and synthetic
    rows into the test set, which makes the reported scores optimistic.

    The classification report is printed; accuracy and macro
    precision/recall/F1 are both logged and printed.

    Args:
        file_path: Path of the Excel file passed to ``load_data``.

    Raises:
        Exception: Any error from loading, embedding, resampling, training,
            or scoring is logged and re-raised unchanged.
    """
    try:
        texts, labels = load_data(file_path)

        label_encoder = LabelEncoder()
        labels_encoded = label_encoder.fit_transform(labels)
        class_ids = list(range(len(label_encoder.classes_)))
        # str() keeps the report printable even if the sheet stores numeric labels.
        class_names = [str(name) for name in label_encoder.classes_]
        logger.info(f"Class distribution before balancing: {Counter(labels_encoded)}")

        mbert_embeddings = compute_mbert_embeddings(texts.tolist())

        # Train-test split (80% training, 20% testing) on the ORIGINAL samples,
        # so the test set contains only real, untouched rows.
        X_train, X_test, y_train, y_test = train_test_split(
            mbert_embeddings,
            labels_encoded,
            test_size=0.2,
            random_state=42,
            stratify=labels_encoded,
        )

        # Balance the training portion only; the test set keeps its natural distribution.
        X_train, y_train = balance_classes(X_train, y_train)

        logger.info("Training the model...")
        classifier = SVC(kernel='linear', random_state=42)
        classifier.fit(X_train, y_train)

        predictions = classifier.predict(X_test)

        logger.info("Classification Report:")
        # labels= pins the row order so target_names stays aligned even if a
        # class is absent from the test split or from the predictions.
        print(classification_report(y_test, predictions, labels=class_ids, target_names=class_names))

        accuracy = accuracy_score(y_test, predictions)
        macro_f1 = f1_score(y_test, predictions, average='macro')
        macro_precision = precision_score(y_test, predictions, average='macro')
        macro_recall = recall_score(y_test, predictions, average='macro')

        logger.info(f"Accuracy: {accuracy:.4f}")
        logger.info(f"Macro Precision: {macro_precision:.4f}")
        logger.info(f"Macro Recall: {macro_recall:.4f}")
        logger.info(f"Macro F1-score: {macro_f1:.4f}")

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Macro Precision: {macro_precision:.4f}")
        print(f"Macro Recall: {macro_recall:.4f}")
        print(f"Macro F1-score: {macro_f1:.4f}")

    except Exception as e:
        logger.error(f"Error during training and evaluation: {e}")
        raise

# File path to dataset. In a raw string each backslash is literal, so the
# separators are written once; doubling them puts "\\" into the actual path.
file_path = r"C:\Users\haris\OneDrive\Desktop\Sentiment Analysis\PS_train(3).xlsx"
train_and_evaluate(file_path)


In [None]:
# XG-Boost Classifier

import pandas as pd  # Data manipulation
import numpy as np  # Numerical operations
import matplotlib.pyplot as plt  # Visualizations
from sklearn.model_selection import train_test_split  # Train/test split
from transformers import BertTokenizer, BertModel  # BERT models
from xgboost import XGBClassifier  # XGBoost classifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score  # Evaluation metrics
import torch  # PyTorch
import re  # Regular expressions
import logging  # Logging
from sklearn.preprocessing import LabelEncoder  # Label encoding
from imblearn.combine import SMOTETomek  # SMOTE-Tomek balancing
from collections import Counter  # Counting elements

# Configure logging for better debugging and monitoring
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

# Function to preprocess Tamil text
def preprocess_text(text):
    """
    Cleans and preprocesses the input Tamil text by removing URLs and mentions,
    then collapsing whitespace and trimming the ends.

    The whitespace pass runs last on purpose: deleting a URL or mention leaves
    a gap, and normalizing before the deletions would let that gap survive as
    a double, leading, or trailing space.

    Returns the cleaned string, or '' when the input is not a string.
    """
    if not isinstance(text, str):
        logger.warning("Non-string input detected; replacing with empty string.")
        return ''  # Handling non-string input
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces (including gaps left above)
    return text.strip()  # Remove leading and trailing whitespace

# Function to load and preprocess the dataset
def load_data(file_path):
    """
    Reads the Excel dataset, fills missing cells, and cleans the text column.

    Missing 'content' values become '' and missing 'labels' values become
    'unknown'. Returns the cleaned text Series and the labels Series. Any
    error is logged and re-raised.
    """
    try:
        logger.info(f"Loading dataset from {file_path}...")
        table = pd.read_excel(file_path, engine="openpyxl")  # Read the workbook
        n_rows = len(table)
        logger.info(f"Dataset loaded successfully with {n_rows} rows.")

        # Replace missing cells, content first and labels second
        defaults = (('content', ''), ('labels', 'unknown'))
        for column_name, default_value in defaults:
            table[column_name] = table[column_name].fillna(default_value)

        # Clean every row of the content column
        table['text'] = table['content'].apply(preprocess_text)
        logger.info("Text preprocessing completed.")

        return table['text'], table['labels']
    except Exception as error:
        logger.error(f"Error loading dataset: {error}")
        raise

# Function to compute mBERT embeddings
def compute_mbert_embeddings(texts, batch_size=32):
    """
    Converts Tamil text into numerical embeddings using mBERT (multilingual BERT).

    The model runs on CUDA when torch.cuda.is_available() and on the CPU
    otherwise. `texts` may be any iterable of strings (it is converted to a
    list because the tokenizer is fed list slices). Returns a 2-D NumPy array
    with one row per text; empty input yields shape
    (0, model.config.hidden_size) instead of a 1-D empty array.
    """
    logger.info("Computing mBERT embeddings...")
    if not isinstance(texts, list):
        texts = list(texts)

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')  # Load tokenizer
    model = BertModel.from_pretrained('bert-base-multilingual-cased')  # Load mBERT model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Use the GPU when one is present
    model.eval()  # Set model to evaluation mode

    total = len(texts)
    batches = []
    with torch.no_grad():  # Disable gradient calculation for efficiency
        for i in range(0, total, batch_size):
            batch_texts = texts[i:i + batch_size]  # Process batch of texts
            inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
            inputs = inputs.to(device)  # Inputs must be on the model's device
            outputs = model(**inputs)  # Get model outputs
            # Extract CLS token representation; .cpu() is needed before .numpy() on GPU
            batches.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

            if i % (batch_size * 10) == 0:
                logger.info(f"Processed {i}/{total} texts...")

    if batches:
        embeddings = np.concatenate(batches, axis=0)
    else:
        embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)
    logger.info("mBERT embeddings computed successfully.")
    return embeddings

# Function to plot class distribution
def plot_class_distribution(labels, title):
    """
    Draws a bar chart with one bar per distinct label, sized by how often
    that label occurs in `labels`, and shows it under the given title.
    """
    frequency = Counter(labels)  # label -> number of occurrences
    bar_positions, bar_heights = frequency.keys(), frequency.values()

    plt.figure(figsize=(8, 5))
    plt.bar(bar_positions, bar_heights, color='skyblue')
    plt.title(title)
    plt.xlabel('Class Labels')
    plt.ylabel('Count')
    plt.xticks(rotation=0)  # Horizontal tick labels
    plt.show()

# Function to balance classes using SMOTE + Tomek Links
def balance_classes(X, y):
    """
    Resamples X and y with SMOTETomek (random_state=42), logs the resulting
    class counts, and returns the resampled (X, y) pair.
    """
    logger.info("Applying SMOTE + Tomek Links to balance classes...")
    X_balanced, y_balanced = SMOTETomek(random_state=42).fit_resample(X, y)  # Resample in one step
    class_counts = Counter(y_balanced)
    logger.info(f"Class distribution after balancing: {class_counts}")
    return X_balanced, y_balanced

# Main function to train and evaluate the model
def train_and_evaluate(file_path):
    """
    Trains and evaluates an XGBoost classifier using mBERT embeddings.

    The data is split into train and test sets BEFORE resampling, and
    SMOTE + Tomek Links is applied to the training portion only. Resampling
    the whole dataset first leaks information: synthetic points interpolated
    from test rows land in training, and the test set itself contains
    synthetic rows, so every reported score is inflated.

    Two bar charts are shown (original distribution, and the training set
    after balancing). The classification report is printed; accuracy and
    macro precision/recall/F1 are logged and printed. Any exception is logged
    and re-raised.
    """
    try:
        # Load and preprocess data
        texts, labels = load_data(file_path)

        # Encode categorical labels into numerical format
        label_encoder = LabelEncoder()
        labels_encoded = label_encoder.fit_transform(labels)
        class_ids = list(range(len(label_encoder.classes_)))
        # str() keeps the report printable even if the sheet stores numeric labels
        class_names = [str(name) for name in label_encoder.classes_]

        logger.info(f"Class distribution before balancing: {Counter(labels_encoded)}")
        plot_class_distribution(labels_encoded, "Class Distribution Before Balancing")

        # Compute embeddings from text
        mbert_embeddings = compute_mbert_embeddings(texts.tolist())

        # Train-test split (80% training, 20% testing) on the ORIGINAL samples,
        # so the test set contains only real, untouched rows
        X_train, X_test, y_train, y_test = train_test_split(
            mbert_embeddings,
            labels_encoded,
            test_size=0.2,
            random_state=42,
            stratify=labels_encoded,
        )

        # Balance the training portion only using SMOTE + Tomek Links
        X_train, y_train = balance_classes(X_train, y_train)
        plot_class_distribution(y_train, "Class Distribution After Balancing")

        logger.info("Training the model...")
        classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
        classifier.fit(X_train, y_train)  # Train classifier

        # Make predictions on test data
        predictions = classifier.predict(X_test)

        # Print classification report with metrics; labels= pins the row order
        # so target_names stays aligned even if a class is missing from the
        # test split or from the predictions
        logger.info("Classification Report:")
        print(classification_report(y_test, predictions, labels=class_ids, target_names=class_names))

        # Compute evaluation metrics
        accuracy = accuracy_score(y_test, predictions)
        macro_f1 = f1_score(y_test, predictions, average='macro')
        macro_precision = precision_score(y_test, predictions, average='macro')
        macro_recall = recall_score(y_test, predictions, average='macro')

        # Log and print evaluation results
        logger.info(f"Accuracy: {accuracy:.4f}")
        logger.info(f"Macro Precision: {macro_precision:.4f}")
        logger.info(f"Macro Recall: {macro_recall:.4f}")
        logger.info(f"Macro F1-score: {macro_f1:.4f}")

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Macro Precision: {macro_precision:.4f}")
        print(f"Macro Recall: {macro_recall:.4f}")
        print(f"Macro F1-score: {macro_f1:.4f}")

    except Exception as e:
        logger.error(f"Error during training and evaluation: {e}")
        raise

# File path to dataset. In a raw string each backslash is literal, so the
# separators are written once; doubling them puts "\\" into the actual path.
file_path = r"C:\Users\haris\OneDrive\Desktop\Sentiment Analysis\PS_train(3).xlsx"

# Execute training and evaluation
train_and_evaluate(file_path)
