In [None]:
# SVM Classifier

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.kernel_approximation import RBFSampler
from transformers import BertTokenizer, BertModel
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import torch
import re
import nltk
from nltk.corpus import stopwords
import emoji
from sklearn.preprocessing import LabelEncoder
import logging

# Configure the root logger: INFO level, "timestamp - level - message" format.
# `logging.getLogger()` called without a name returns that same root logger,
# which every helper in this cell logs through.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

# Fetch the NLTK stop-word corpus (only needs to succeed once per environment)
# and keep the English list as a set; preprocess_text does membership tests on it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    """Normalise one raw text value into a space-separated string of kept words.

    For string input: surrounding whitespace is stripped, emoji are removed,
    every character that is neither an ASCII letter nor whitespace is deleted,
    and words whose lowercase form is in the module-level ``stop_words`` set
    are dropped. The original casing of the remaining words is preserved.

    Non-string input is reported with a warning and yields an empty string.
    """
    if isinstance(text, str):
        without_emoji = emoji.replace_emoji(text.strip(), replace='')
        letters_only = re.sub(r'[^a-zA-Z\s]', '', without_emoji)
        kept_tokens = (
            token for token in letters_only.split()
            if token.lower() not in stop_words
        )
        return ' '.join(kept_tokens)

    logger.warning("Non-string input detected; replacing with empty string.")
    return ''

# Function to load and preprocess the dataset
def load_data(file_path):
    """Read the Excel dataset and return ``(texts, labels)`` as two pandas Series.

    Missing values are replaced first ('' for 'content_translated', 'unknown'
    for 'labels'); the cleaned text is stored in a new 'text' column produced
    by ``preprocess_text``.

    Any exception raised while reading or preparing the data is logged and
    then re-raised unchanged.
    """
    try:
        logger.info(f"Loading dataset from {file_path}...")
        frame = pd.read_excel(file_path, engine="openpyxl")
        logger.info(f"Dataset loaded successfully with {len(frame)} rows.")

        # Column name -> value used in place of missing cells.
        for column, filler in (('content_translated', ''), ('labels', 'unknown')):
            frame[column] = frame[column].fillna(filler)

        frame['text'] = frame['content_translated'].apply(preprocess_text)
        logger.info("Text preprocessing completed.")

        return frame['text'], frame['labels']
    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        raise

# Function to compute TF-IDF embeddings
def compute_tfidf_embeddings(texts, n_features=512, max_features=5000, gamma=1.0, random_state=42):
    """Turn texts into fixed-width features: TF-IDF followed by an RBF feature map.

    Args:
        texts: Iterable of strings (one document per item).
        n_features: Number of RBFSampler components, i.e. the width of the
            returned matrix.
        max_features: Vocabulary size cap passed to TfidfVectorizer.
        gamma: RBF kernel parameter passed to RBFSampler.
        random_state: Seed for RBFSampler's random projection.

    Returns:
        Dense array with one row per text and ``n_features`` columns.

    NOTE(review): both transformers are fitted on whatever ``texts`` is passed.
    ``train_and_evaluate`` passes the full dataset before splitting, so the IDF
    statistics include the test rows.
    """
    logger.info("Computing TF-IDF embeddings...")
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    # Keep the TF-IDF matrix sparse: RBFSampler accepts sparse input, so the
    # (n_texts x max_features) dense copy made by .toarray() is unnecessary.
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    rbf_sampler = RBFSampler(gamma=gamma, n_components=n_features, random_state=random_state)
    reduced_tfidf_embeddings = np.asarray(rbf_sampler.fit_transform(tfidf_matrix))
    logger.info("TF-IDF embeddings computed successfully.")
    return reduced_tfidf_embeddings

# Function to compute BERT embeddings
def compute_bert_embeddings(texts, batch_size=32):
    """Embed each text with ``bert-base-uncased`` by mean-pooling the last hidden layer.

    Args:
        texts: Sequence of strings.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        Array with one row per text; the width is the model's hidden size.
        An empty input yields an array of shape ``(0, hidden_size)``.

    Padding positions are excluded from the mean through the tokenizer's
    attention mask. A plain ``mean(dim=1)`` would average the padding vectors
    too, making a text's embedding depend on the longest text in its batch.
    """
    logger.info("Computing BERT embeddings...")
    # The tokenizer needs a plain list; this also lets callers pass other sequences.
    texts = list(texts)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()  # inference only: make sure dropout is disabled

    embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
            outputs = model(**inputs)

            hidden_states = outputs.last_hidden_state
            # 1 for real tokens, 0 for padding; trailing axis added for broadcasting.
            token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
            token_sums = (hidden_states * token_mask).sum(dim=1)
            token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            batch_embeddings = (token_sums / token_counts).numpy()
            embeddings.extend(batch_embeddings)

            if i % (batch_size * 10) == 0:
                logger.info(f"Processed {i}/{len(texts)} texts...")
    logger.info("BERT embeddings computed successfully.")
    if not embeddings:
        # Keep the result 2-D so it can still be stacked with other feature blocks.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embeddings)

# Function to fuse TF-IDF and BERT embeddings
def fuse_embeddings(tfidf_embeddings, bert_embeddings):
    """Join the two feature blocks with ``np.hstack``, TF-IDF features first.

    For 2-D inputs this places the BERT columns to the right of the TF-IDF
    columns; both inputs must therefore have the same number of rows.
    """
    logger.info("Fusing TF-IDF and BERT embeddings...")
    feature_blocks = [tfidf_embeddings, bert_embeddings]
    combined = np.hstack(feature_blocks)
    logger.info("Embeddings fused successfully.")
    return combined

# Function to train and evaluate model
def train_and_evaluate(file_path):
    """Train a linear SVM on fused TF-IDF + BERT features and print a report.

    Steps: load and clean the dataset, label-encode the targets, build both
    embedding blocks, fuse them, hold out 20% of the rows for testing, fit the
    classifier and print per-class, macro and weighted metrics plus accuracy.

    Args:
        file_path: Path of the Excel workbook handed to ``load_data``.

    Returns:
        None. All results are printed.

    Raises:
        Any exception from the pipeline is logged and re-raised unchanged.

    NOTE(review): the embeddings are computed on the full dataset before the
    split, so the TF-IDF statistics also see the test rows.
    """
    try:
        texts, labels = load_data(file_path)
        logger.info("Encoding labels...")
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(labels)

        tfidf_embeddings = compute_tfidf_embeddings(texts)
        bert_embeddings = compute_bert_embeddings(texts.tolist())
        fused_embeddings = fuse_embeddings(tfidf_embeddings, bert_embeddings)

        X_train, X_test, y_train, y_test = train_test_split(
            fused_embeddings, labels, test_size=0.2, random_state=42
        )

        # probability=True is dropped: predict_proba is never used, and enabling
        # it makes fit() run an extra internal cross-validation. predict() is
        # based on the decision function either way.
        classifier = SVC(kernel='linear', random_state=42)
        classifier.fit(X_train, y_train)

        predictions = classifier.predict(X_test)

        # Pass the full label set explicitly. Without `labels`, the report
        # raises ValueError when a class is absent from the test split, because
        # the inferred labels no longer match `target_names` in length.
        class_names = [str(name) for name in label_encoder.classes_]
        report = classification_report(
            y_test,
            predictions,
            labels=np.arange(len(class_names)),
            target_names=class_names,
            digits=2,
            output_dict=True,
            zero_division=0,
        )

        row_format = "{:<20} {:<10.2f} {:<10.2f} {:<10.2f} {:<10}"
        print("{:<20} {:<10} {:<10} {:<10} {:<10}".format("Category", "Precision", "Recall", "F1-score", "Support"))

        # Per-class rows first, then the two averaged rows.
        table_rows = [(name, report[name]) for name in class_names]
        table_rows.append(("Macro avg", report['macro avg']))
        table_rows.append(("Weighted avg", report['weighted avg']))
        for title, metrics in table_rows:
            print(row_format.format(
                title, metrics['precision'], metrics['recall'], metrics['f1-score'], int(metrics['support'])
            ))

        print(f"Accuracy: {report['accuracy']:.4f}")
        print(f"Macro Precision: {report['macro avg']['precision']:.4f}")
        print(f"Macro Recall: {report['macro avg']['recall']:.4f}")
        print(f"Macro F1-score: {report['macro avg']['f1-score']:.4f}")

    except Exception as e:
        logger.error(f"Error during training and evaluation: {e}")
        raise

# Path of the Excel dataset (relative, so it is resolved against the current
# working directory); load_data reads its 'content_translated' and 'labels' columns.
file_path = "Tamil_sentiment_analysis_translated.xlsx"
# Run the whole pipeline for this cell's classifier; the metrics are printed.
train_and_evaluate(file_path)


In [None]:
# Random Forest Classifier

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.kernel_approximation import RBFSampler
from transformers import BertTokenizer, BertModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import torch
import re
import nltk
from nltk.corpus import stopwords
import emoji
from sklearn.preprocessing import LabelEncoder
import logging

# Configure the root logger: INFO level, "timestamp - level - message" format.
# `logging.getLogger()` called without a name returns that same root logger,
# which every helper in this cell logs through.
# NOTE(review): this setup repeats the one in the SVM cell, apparently so the
# cell can be run on its own.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

# Fetch the NLTK stop-word corpus (only needs to succeed once per environment)
# and keep the English list as a set; preprocess_text does membership tests on it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    """Normalise one raw text value into a space-separated string of kept words.

    For string input: surrounding whitespace is stripped, emoji are removed,
    every character that is neither an ASCII letter nor whitespace is deleted,
    and words whose lowercase form is in the module-level ``stop_words`` set
    are dropped. The original casing of the remaining words is preserved.

    Non-string input is reported with a warning and yields an empty string.
    """
    if isinstance(text, str):
        without_emoji = emoji.replace_emoji(text.strip(), replace='')
        letters_only = re.sub(r'[^a-zA-Z\s]', '', without_emoji)
        kept_tokens = (
            token for token in letters_only.split()
            if token.lower() not in stop_words
        )
        return ' '.join(kept_tokens)

    logger.warning("Non-string input detected; replacing with empty string.")
    return ''

# Function to load and preprocess the dataset
def load_data(file_path):
    """Read the Excel dataset and return ``(texts, labels)`` as two pandas Series.

    Missing values are replaced first ('' for 'content_translated', 'unknown'
    for 'labels'); the cleaned text is stored in a new 'text' column produced
    by ``preprocess_text``.

    Any exception raised while reading or preparing the data is logged and
    then re-raised unchanged.
    """
    try:
        logger.info(f"Loading dataset from {file_path}...")
        frame = pd.read_excel(file_path, engine="openpyxl")
        logger.info(f"Dataset loaded successfully with {len(frame)} rows.")

        # Column name -> value used in place of missing cells.
        for column, filler in (('content_translated', ''), ('labels', 'unknown')):
            frame[column] = frame[column].fillna(filler)

        frame['text'] = frame['content_translated'].apply(preprocess_text)
        logger.info("Text preprocessing completed.")

        return frame['text'], frame['labels']
    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        raise

# Function to compute TF-IDF embeddings
def compute_tfidf_embeddings(texts, n_features=512, max_features=5000, gamma=1.0, random_state=42):
    """Turn texts into fixed-width features: TF-IDF followed by an RBF feature map.

    Args:
        texts: Iterable of strings (one document per item).
        n_features: Number of RBFSampler components, i.e. the width of the
            returned matrix.
        max_features: Vocabulary size cap passed to TfidfVectorizer.
        gamma: RBF kernel parameter passed to RBFSampler.
        random_state: Seed for RBFSampler's random projection.

    Returns:
        Dense array with one row per text and ``n_features`` columns.

    NOTE(review): both transformers are fitted on whatever ``texts`` is passed.
    ``train_and_evaluate`` passes the full dataset before splitting, so the IDF
    statistics include the test rows.
    """
    logger.info("Computing TF-IDF embeddings...")
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    # Keep the TF-IDF matrix sparse: RBFSampler accepts sparse input, so the
    # (n_texts x max_features) dense copy made by .toarray() is unnecessary.
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    rbf_sampler = RBFSampler(gamma=gamma, n_components=n_features, random_state=random_state)
    reduced_tfidf_embeddings = np.asarray(rbf_sampler.fit_transform(tfidf_matrix))
    logger.info("TF-IDF embeddings computed successfully.")
    return reduced_tfidf_embeddings

# Function to compute BERT embeddings
def compute_bert_embeddings(texts, batch_size=32):
    """Embed each text with ``bert-base-uncased`` by mean-pooling the last hidden layer.

    Args:
        texts: Sequence of strings.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        Array with one row per text; the width is the model's hidden size.
        An empty input yields an array of shape ``(0, hidden_size)``.

    Padding positions are excluded from the mean through the tokenizer's
    attention mask. A plain ``mean(dim=1)`` would average the padding vectors
    too, making a text's embedding depend on the longest text in its batch.
    """
    logger.info("Computing BERT embeddings...")
    # The tokenizer needs a plain list; this also lets callers pass other sequences.
    texts = list(texts)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()  # inference only: make sure dropout is disabled

    embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
            outputs = model(**inputs)

            hidden_states = outputs.last_hidden_state
            # 1 for real tokens, 0 for padding; trailing axis added for broadcasting.
            token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
            token_sums = (hidden_states * token_mask).sum(dim=1)
            token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            batch_embeddings = (token_sums / token_counts).numpy()
            embeddings.extend(batch_embeddings)

            if i % (batch_size * 10) == 0:
                logger.info(f"Processed {i}/{len(texts)} texts...")
    logger.info("BERT embeddings computed successfully.")
    if not embeddings:
        # Keep the result 2-D so it can still be stacked with other feature blocks.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embeddings)

# Function to fuse TF-IDF and BERT embeddings
def fuse_embeddings(tfidf_embeddings, bert_embeddings):
    """Join the two feature blocks with ``np.hstack``, TF-IDF features first.

    For 2-D inputs this places the BERT columns to the right of the TF-IDF
    columns; both inputs must therefore have the same number of rows.
    """
    logger.info("Fusing TF-IDF and BERT embeddings...")
    feature_blocks = [tfidf_embeddings, bert_embeddings]
    combined = np.hstack(feature_blocks)
    logger.info("Embeddings fused successfully.")
    return combined

# Function to train and evaluate model
def train_and_evaluate(file_path):
    """Train a random forest on fused TF-IDF + BERT features and print a report.

    Steps: load and clean the dataset, label-encode the targets, build both
    embedding blocks, fuse them, hold out 20% of the rows for testing, fit the
    classifier and print per-class, macro and weighted metrics plus accuracy.

    Args:
        file_path: Path of the Excel workbook handed to ``load_data``.

    Returns:
        None. All results are printed.

    Raises:
        Any exception from the pipeline is logged and re-raised unchanged.

    NOTE(review): the embeddings are computed on the full dataset before the
    split, so the TF-IDF statistics also see the test rows.
    """
    try:
        texts, labels = load_data(file_path)
        logger.info("Encoding labels...")
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(labels)

        tfidf_embeddings = compute_tfidf_embeddings(texts)
        bert_embeddings = compute_bert_embeddings(texts.tolist())
        fused_embeddings = fuse_embeddings(tfidf_embeddings, bert_embeddings)

        X_train, X_test, y_train, y_test = train_test_split(
            fused_embeddings, labels, test_size=0.2, random_state=42
        )

        classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        classifier.fit(X_train, y_train)

        predictions = classifier.predict(X_test)

        # Pass the full label set explicitly. Without `labels`, the report
        # raises ValueError when a class is absent from the test split, because
        # the inferred labels no longer match `target_names` in length.
        class_names = [str(name) for name in label_encoder.classes_]
        report = classification_report(
            y_test,
            predictions,
            labels=np.arange(len(class_names)),
            target_names=class_names,
            digits=2,
            output_dict=True,
            zero_division=0,
        )

        row_format = "{:<20} {:<10.2f} {:<10.2f} {:<10.2f} {:<10}"
        print("{:<20} {:<10} {:<10} {:<10} {:<10}".format("Category", "Precision", "Recall", "F1-score", "Support"))

        # Per-class rows first, then the two averaged rows.
        table_rows = [(name, report[name]) for name in class_names]
        table_rows.append(("Macro avg", report['macro avg']))
        table_rows.append(("Weighted avg", report['weighted avg']))
        for title, metrics in table_rows:
            print(row_format.format(
                title, metrics['precision'], metrics['recall'], metrics['f1-score'], int(metrics['support'])
            ))

        print(f"Accuracy: {report['accuracy']:.4f}")
        print(f"Macro Precision: {report['macro avg']['precision']:.4f}")
        print(f"Macro Recall: {report['macro avg']['recall']:.4f}")
        print(f"Macro F1-score: {report['macro avg']['f1-score']:.4f}")

    except Exception as e:
        logger.error(f"Error during training and evaluation: {e}")
        raise

# Path of the Excel dataset (relative, so it is resolved against the current
# working directory); load_data reads its 'content_translated' and 'labels' columns.
file_path = "Tamil_sentiment_analysis_translated.xlsx"
# Run the whole pipeline for this cell's classifier; the metrics are printed.
train_and_evaluate(file_path)


In [None]:
# XGBoost Classifier

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.kernel_approximation import RBFSampler
from transformers import BertTokenizer, BertModel
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import torch
import re
import nltk
from nltk.corpus import stopwords
import emoji
from sklearn.preprocessing import LabelEncoder
import logging

# Configure the root logger: INFO level, "timestamp - level - message" format.
# `logging.getLogger()` called without a name returns that same root logger,
# which every helper in this cell logs through.
# NOTE(review): this setup repeats the one in the earlier cells, apparently so
# the cell can be run on its own.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger()

# Fetch the NLTK stop-word corpus (only needs to succeed once per environment)
# and keep the English list as a set; preprocess_text does membership tests on it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    """Normalise one raw text value into a space-separated string of kept words.

    For string input: surrounding whitespace is stripped, emoji are removed,
    every character that is neither an ASCII letter nor whitespace is deleted,
    and words whose lowercase form is in the module-level ``stop_words`` set
    are dropped. The original casing of the remaining words is preserved.

    Non-string input is reported with a warning and yields an empty string.
    """
    if isinstance(text, str):
        without_emoji = emoji.replace_emoji(text.strip(), replace='')
        letters_only = re.sub(r'[^a-zA-Z\s]', '', without_emoji)
        kept_tokens = (
            token for token in letters_only.split()
            if token.lower() not in stop_words
        )
        return ' '.join(kept_tokens)

    logger.warning("Non-string input detected; replacing with empty string.")
    return ''

# Function to load and preprocess the dataset
def load_data(file_path):
    """Read the Excel dataset and return ``(texts, labels)`` as two pandas Series.

    Missing values are replaced first ('' for 'content_translated', 'unknown'
    for 'labels'); the cleaned text is stored in a new 'text' column produced
    by ``preprocess_text``.

    Any exception raised while reading or preparing the data is logged and
    then re-raised unchanged.
    """
    try:
        logger.info(f"Loading dataset from {file_path}...")
        frame = pd.read_excel(file_path, engine="openpyxl")
        logger.info(f"Dataset loaded successfully with {len(frame)} rows.")

        # Column name -> value used in place of missing cells.
        for column, filler in (('content_translated', ''), ('labels', 'unknown')):
            frame[column] = frame[column].fillna(filler)

        frame['text'] = frame['content_translated'].apply(preprocess_text)
        logger.info("Text preprocessing completed.")

        return frame['text'], frame['labels']
    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        raise

# Function to compute TF-IDF embeddings
def compute_tfidf_embeddings(texts, n_features=512, max_features=5000, gamma=1.0, random_state=42):
    """Turn texts into fixed-width features: TF-IDF followed by an RBF feature map.

    Args:
        texts: Iterable of strings (one document per item).
        n_features: Number of RBFSampler components, i.e. the width of the
            returned matrix.
        max_features: Vocabulary size cap passed to TfidfVectorizer.
        gamma: RBF kernel parameter passed to RBFSampler.
        random_state: Seed for RBFSampler's random projection.

    Returns:
        Dense array with one row per text and ``n_features`` columns.

    NOTE(review): both transformers are fitted on whatever ``texts`` is passed.
    ``train_and_evaluate`` passes the full dataset before splitting, so the IDF
    statistics include the test rows.
    """
    logger.info("Computing TF-IDF embeddings...")
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    # Keep the TF-IDF matrix sparse: RBFSampler accepts sparse input, so the
    # (n_texts x max_features) dense copy made by .toarray() is unnecessary.
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    rbf_sampler = RBFSampler(gamma=gamma, n_components=n_features, random_state=random_state)
    reduced_tfidf_embeddings = np.asarray(rbf_sampler.fit_transform(tfidf_matrix))
    logger.info("TF-IDF embeddings computed successfully.")
    return reduced_tfidf_embeddings

# Function to compute BERT embeddings
def compute_bert_embeddings(texts, batch_size=32):
    """Embed each text with ``bert-base-uncased`` by mean-pooling the last hidden layer.

    Args:
        texts: Sequence of strings.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        Array with one row per text; the width is the model's hidden size.
        An empty input yields an array of shape ``(0, hidden_size)``.

    Padding positions are excluded from the mean through the tokenizer's
    attention mask. A plain ``mean(dim=1)`` would average the padding vectors
    too, making a text's embedding depend on the longest text in its batch.
    """
    logger.info("Computing BERT embeddings...")
    # The tokenizer needs a plain list; this also lets callers pass other sequences.
    texts = list(texts)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()  # inference only: make sure dropout is disabled

    embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=512)
            outputs = model(**inputs)

            hidden_states = outputs.last_hidden_state
            # 1 for real tokens, 0 for padding; trailing axis added for broadcasting.
            token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
            token_sums = (hidden_states * token_mask).sum(dim=1)
            token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
            batch_embeddings = (token_sums / token_counts).numpy()
            embeddings.extend(batch_embeddings)

            if i % (batch_size * 10) == 0:
                logger.info(f"Processed {i}/{len(texts)} texts...")
    logger.info("BERT embeddings computed successfully.")
    if not embeddings:
        # Keep the result 2-D so it can still be stacked with other feature blocks.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.array(embeddings)

# Function to fuse TF-IDF and BERT embeddings
def fuse_embeddings(tfidf_embeddings, bert_embeddings):
    """Join the two feature blocks with ``np.hstack``, TF-IDF features first.

    For 2-D inputs this places the BERT columns to the right of the TF-IDF
    columns; both inputs must therefore have the same number of rows.
    """
    logger.info("Fusing TF-IDF and BERT embeddings...")
    feature_blocks = [tfidf_embeddings, bert_embeddings]
    combined = np.hstack(feature_blocks)
    logger.info("Embeddings fused successfully.")
    return combined

# Function to train and evaluate model
def train_and_evaluate(file_path):
    """Train an XGBoost classifier on fused TF-IDF + BERT features and print a report.

    Steps: load and clean the dataset, label-encode the targets, build both
    embedding blocks, fuse them, hold out 20% of the rows for testing, fit the
    classifier and print per-class, macro and weighted metrics plus accuracy.

    Args:
        file_path: Path of the Excel workbook handed to ``load_data``.

    Returns:
        None. All results are printed.

    Raises:
        Any exception from the pipeline is logged and re-raised unchanged.

    NOTE(review): the embeddings are computed on the full dataset before the
    split, so the TF-IDF statistics also see the test rows.
    """
    try:
        texts, labels = load_data(file_path)
        logger.info("Encoding labels...")
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(labels)

        tfidf_embeddings = compute_tfidf_embeddings(texts)
        bert_embeddings = compute_bert_embeddings(texts.tolist())
        fused_embeddings = fuse_embeddings(tfidf_embeddings, bert_embeddings)

        X_train, X_test, y_train, y_test = train_test_split(
            fused_embeddings, labels, test_size=0.2, random_state=42
        )

        # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
        # releases - confirm against the installed version before removing it.
        classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
        classifier.fit(X_train, y_train)

        predictions = classifier.predict(X_test)

        # Pass the full label set explicitly. Without `labels`, the report
        # raises ValueError when a class is absent from the test split, because
        # the inferred labels no longer match `target_names` in length.
        class_names = [str(name) for name in label_encoder.classes_]
        report = classification_report(
            y_test,
            predictions,
            labels=np.arange(len(class_names)),
            target_names=class_names,
            digits=2,
            output_dict=True,
            zero_division=0,
        )

        row_format = "{:<20} {:<10.2f} {:<10.2f} {:<10.2f} {:<10}"
        print("{:<20} {:<10} {:<10} {:<10} {:<10}".format("Category", "Precision", "Recall", "F1-score", "Support"))

        # Per-class rows first, then the two averaged rows.
        table_rows = [(name, report[name]) for name in class_names]
        table_rows.append(("Macro avg", report['macro avg']))
        table_rows.append(("Weighted avg", report['weighted avg']))
        for title, metrics in table_rows:
            print(row_format.format(
                title, metrics['precision'], metrics['recall'], metrics['f1-score'], int(metrics['support'])
            ))

        print(f"Accuracy: {report['accuracy']:.4f}")
        print(f"Macro Precision: {report['macro avg']['precision']:.4f}")
        print(f"Macro Recall: {report['macro avg']['recall']:.4f}")
        print(f"Macro F1-score: {report['macro avg']['f1-score']:.4f}")

    except Exception as e:
        logger.error(f"Error during training and evaluation: {e}")
        raise

# Path of the Excel dataset (relative, so it is resolved against the current
# working directory); load_data reads its 'content_translated' and 'labels' columns.
file_path = "Tamil_sentiment_analysis_translated.xlsx"
# Run the whole pipeline for this cell's classifier; the metrics are printed.
train_and_evaluate(file_path)
