In [1]:
!pip install mlflow

In [1]:
import os
from google.colab import userdata
from transformers import pipeline
import os
import sqlite3
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import mlflow.transformers
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report, roc_curve, auc
)
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import Dataset
import torch
import warnings
warnings.filterwarnings('ignore')



In [None]:
"""
Complete Sentiment Model Evaluation on SQLite Database
Evaluates a HuggingFace sentiment model on 1000 random reviews
"""

# Notebook-wide plotting defaults: seaborn's "whitegrid" theme and a
# 10x6 inch default figure size for any figure that does not set its own.
sns.set_style("whitegrid")
plt.rc('figure', figsize=(10, 6))


def balance_dataset(df, method='undersample', sentiment_col='voted_up'):
    """Balance a DataFrame so every sentiment class has the same row count.

    Args:
        df: DataFrame containing a ``sentiment_col`` column.
        method: ``'undersample'`` samples every class down to the size of the
            smallest class (without replacement); ``'oversample'`` samples
            every class up to the size of the largest class (with
            replacement).
        sentiment_col: Name of the column holding the class label.

    Returns:
        A new, shuffled DataFrame with a fresh ``0..n-1`` index. When
        ``method`` is not recognised, or when ``df`` has no labelled rows,
        ``df`` itself is returned unchanged.
    """
    print("\n" + "=" * 80)
    print(f"DATASET BALANCING (Method: {method.upper()})")
    print("=" * 80)

    # value_counts() ignores NaN labels, so only labelled rows are counted
    class_counts = df[sentiment_col].value_counts()
    print("\nBefore balancing:")
    for label, count in class_counts.items():
        print(f"  {label}: {count:,}")

    if method not in ('undersample', 'oversample'):
        print("Invalid method! Using original dataset.")
        return df

    # Without this guard an empty frame reaches pd.concat([]) and raises
    # "No objects to concatenate".
    if class_counts.empty:
        print("No labelled rows to balance. Using original dataset.")
        return df

    oversampling = method == 'oversample'
    target_size = int(class_counts.max() if oversampling else class_counts.min())

    # Draw the same number of rows from every class; a fixed seed keeps the
    # result reproducible between runs.
    per_class_samples = [
        df[df[sentiment_col] == label].sample(
            n=target_size, replace=oversampling, random_state=42
        )
        for label in class_counts.index
    ]
    df_balanced = pd.concat(per_class_samples, ignore_index=True)

    # Shuffle so the classes are interleaved instead of stacked in blocks
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    new_counts = df_balanced[sentiment_col].value_counts()
    print("\nAfter balancing:")
    for label, count in new_counts.items():
        print(f"  {label}: {count:,}")

    return df_balanced


def load_data_from_sqlite(db_path: str, text_column: str = 'review', label_column: str = 'voted_up', table_name: str = 'reviews', limit: int = 1000):
    """Load a random sample of labelled reviews from a SQLite database.

    Rows where either the text or the label is NULL are excluded.

    Args:
        db_path: Path to the SQLite database file.
        text_column: Name of the column holding the review text.
        label_column: Name of the column holding the label.
        table_name: Name of the table to read from.
        limit: Maximum number of rows to return.

    Returns:
        A DataFrame with the two columns ``text_column`` and ``label_column``
        and at most ``limit`` rows, in random order.
    """
    print("Loading data from SQLite...")

    # NOTE(security): SQLite cannot bind identifiers, so the table and column
    # names are interpolated into the statement. Only pass trusted,
    # developer-supplied names here, never user input. The row limit is a
    # value and is therefore bound as a parameter.
    query = f"""
        SELECT {text_column}, {label_column}
        FROM {table_name}
        WHERE {text_column} IS NOT NULL
        AND {label_column} IS NOT NULL
        ORDER BY RANDOM()
        LIMIT ?"""

    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(query, conn, params=(int(limit),))
    finally:
        # Release the database handle even when the query fails
        conn.close()

    positive_count = df[label_column].sum()
    print(f"✓ Loaded {df.shape[0]} reviews")
    print(f"  - Positive samples: {positive_count}")
    print(f"  - Negative samples: {len(df) - positive_count}")

    return df


def predict_sentiments(reviews, model_name="distilbert-base-uncased-finetuned-sst-2-english",
                       batch_size=32):
    """
    Predict sentiments using HuggingFace pipeline

    Args:
        reviews: Review texts, as a list or any sequence exposing ``tolist()``
            (for example a pandas Series or a NumPy array)
        model_name: HuggingFace model identifier
        batch_size: Batch size for inference

    Returns:
        predictions: Binary predictions (0=negative, 1=positive)
        probabilities: Confidence scores for positive class
    """
    # Accept both plain lists and array-likes; the pipeline is fed a list.
    texts = reviews.tolist() if hasattr(reviews, "tolist") else list(reviews)

    # Nothing to classify: skip loading the model altogether
    if not texts:
        return np.array([]), np.array([])

    print(f"\nLoading model: {model_name}")
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=model_name,
        device=0 if torch.cuda.is_available() else -1  # Use GPU if available
    )

    print(f"Running inference on {len(texts)} reviews...")
    results = sentiment_pipeline(texts, batch_size=batch_size,
                                 truncation=True, max_length=512, padding=True)

    # Convert to binary format (0=negative, 1=positive)
    converted = [_to_binary_sentiment(result) for result in results]
    predictions = [prediction for prediction, _ in converted]
    probabilities = [probability for _, probability in converted]

    print("✓ Predictions complete")
    return np.array(predictions), np.array(probabilities)


def _to_binary_sentiment(result):
    """Map one pipeline result dict to ``(prediction, positive_probability)``.

    The label is compared as an upper-cased string, so integer labels are
    handled as well. Labels containing ``POSITIVE``, or equal to ``LABEL_1``
    or ``1``, count as positive; every other label counts as negative and its
    score is flipped to express the probability of the positive class.

    NOTE(review): this mapping assumes a two-class model. For models with
    more than two labels, ``LABEL_1`` may not mean "positive" -- verify the
    label scheme of the chosen model before trusting the metrics.
    """
    label = str(result['label']).upper()
    score = result['score']

    if 'POSITIVE' in label or label in ('LABEL_1', '1'):
        return 1, score
    return 0, 1 - score  # Flip probability for negative


def plot_confusion_matrix(y_true, y_pred, class_names=['Negative', 'Positive']):
    """Draw the confusion matrix as an annotated heatmap.

    The figure is written to ``confusion_matrix.png`` in the current working
    directory and then closed; nothing is returned.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        class_names: Tick labels used on both axes.
    """
    counts = confusion_matrix(y_true, y_pred)

    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        cbar_kws={'label': 'Count'},
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(counts, **heatmap_options)

    # Rows are the true labels, columns the predicted ones
    plt.xlabel('Predicted Label', fontsize=12)
    plt.ylabel('True Label', fontsize=12)
    plt.title('Confusion Matrix', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: confusion_matrix.png")
    plt.close()


def plot_roc_curve(y_true, y_proba):
    """Draw the ROC curve together with its AUC.

    The figure is written to ``roc_curve.png`` in the current working
    directory and then closed; nothing is returned.

    Args:
        y_true: True binary labels.
        y_proba: Scores for the positive class.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_proba)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    curve_label = f'ROC curve (AUC = {area_under_curve:.3f})'

    plt.figure(figsize=(8, 6))

    # Model curve first, then the diagonal used as the chance-level reference
    plt.plot(false_pos_rate, true_pos_rate,
             color='darkorange', lw=2, label=curve_label)
    plt.plot([0, 1], [0, 1],
             color='navy', lw=2, linestyle='--', label='Random Classifier')

    # A little headroom above 1.0 keeps the curve off the top border
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.title('ROC Curve', fontsize=14, fontweight='bold')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.legend(loc="lower right")
    plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig('roc_curve.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: roc_curve.png")
    plt.close()


def plot_metrics_comparison(precision, recall, f1, class_names=['Negative', 'Positive']):
    """Draw grouped bars of precision, recall and F1 for every class.

    The figure is written to ``metrics_comparison.png`` in the current
    working directory and then closed; nothing is returned.

    Args:
        precision: Per-class precision values, one per entry in ``class_names``.
        recall: Per-class recall values, one per entry in ``class_names``.
        f1: Per-class F1 values, one per entry in ``class_names``.
        class_names: Labels for the groups on the x axis.
    """
    group_positions = np.arange(len(class_names))
    bar_width = 0.25

    # (horizontal offset within the group, values, legend label, colour)
    metric_series = (
        (-bar_width, precision, 'Precision', 'steelblue'),
        (0, recall, 'Recall', 'coral'),
        (bar_width, f1, 'F1-Score', 'mediumseagreen'),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    bar_groups = [
        ax.bar(group_positions + offset, values, bar_width, label=legend, color=colour)
        for offset, values, legend, colour in metric_series
    ]

    ax.set_title('Per-Class Metrics Comparison', fontsize=14, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12)
    ax.set_xticks(group_positions)
    ax.set_xticklabels(class_names)
    ax.legend()
    ax.set_ylim([0, 1.1])
    ax.grid(axis='y', alpha=0.3)

    # Write each bar's value just above its top edge, centred horizontally
    for group in bar_groups:
        for rect in group:
            top = rect.get_height()
            centre = rect.get_x() + rect.get_width()/2.
            ax.text(centre, top, f'{top:.3f}',
                    ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.savefig('metrics_comparison.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: metrics_comparison.png")
    plt.close()


def plot_prediction_distribution(y_proba, y_true):
    """Plot how the predicted positive-class probability is distributed.

    Two panels are drawn side by side, each split by the true class: an
    overlaid histogram on the left and a box plot on the right. The figure is
    written to ``prediction_distribution.png`` in the current working
    directory and then closed; nothing is returned.

    Args:
        y_proba: Predicted probability of the positive class, one per sample.
        y_true: True labels (0 = negative, 1 = positive), aligned with
            ``y_proba`` by position.
    """
    # Work on plain arrays so the boolean masks below select by position,
    # whatever container (list, Series, ndarray) the caller passed in.
    y_proba = np.asarray(y_proba)
    y_true = np.asarray(y_true)
    negative_scores = y_proba[y_true == 0]
    positive_scores = y_proba[y_true == 1]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    # Histogram
    ax1.hist(negative_scores, bins=30, alpha=0.6, label='Negative (True)', color='red')
    ax1.hist(positive_scores, bins=30, alpha=0.6, label='Positive (True)', color='green')
    ax1.set_xlabel('Predicted Probability (Positive Class)', fontsize=11)
    ax1.set_ylabel('Frequency', fontsize=11)
    ax1.set_title('Prediction Probability Distribution', fontsize=12, fontweight='bold')
    ax1.legend()
    ax1.grid(alpha=0.3)

    # Box plot
    # The tick labels are set on the axes rather than through boxplot's
    # `labels` keyword, which Matplotlib 3.9 deprecated in favour of
    # `tick_labels`; set_xticklabels works on old and new versions alike.
    box = ax2.boxplot([negative_scores, positive_scores], patch_artist=True)
    ax2.set_xticklabels(['Negative (True)', 'Positive (True)'])
    box['boxes'][0].set_facecolor('lightcoral')
    box['boxes'][1].set_facecolor('lightgreen')
    ax2.set_ylabel('Predicted Probability (Positive Class)', fontsize=11)
    ax2.set_title('Prediction Confidence by True Class', fontsize=12, fontweight='bold')
    ax2.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig('prediction_distribution.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: prediction_distribution.png")
    plt.close()


def evaluate_model(y_true, y_pred, y_proba, class_names=['Negative', 'Positive']):
    """
    Score predictions against the true labels, print a report and save plots.

    Computes accuracy, per-class / macro / weighted precision, recall and F1,
    and the ROC AUC; prints them; then calls the four plotting helpers, each
    of which writes a PNG to the current working directory.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_proba: Scores for the positive class, used for the ROC curve.
        class_names: Display names, indexed in step with the per-class
            metric arrays.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` and ``support``; precision, recall, f1 and support are
        the per-class arrays.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("SENTIMENT CLASSIFICATION EVALUATION RESULTS")
    print(rule)

    # Headline score plus per-class arrays (average=None keeps classes apart)
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, average=None
    )

    # Averaged scores; the support slot of the result is not used for these
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )
    weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )

    # Area under the ROC curve from the positive-class scores
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_proba)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    # Overall
    print(f"\n{'OVERALL METRICS':-^70}")
    print(f"Accuracy:           {accuracy*100:.2f}%")
    print(f"ROC AUC Score:      {roc_auc:.4f}")

    # Macro averages
    print(f"\n{'MACRO AVERAGES':-^70}")
    print(f"Precision (Macro):  {macro_precision*100:.2f}%")
    print(f"Recall (Macro):     {macro_recall*100:.2f}%")
    print(f"F1-Score (Macro):   {macro_f1*100:.2f}%")

    # Weighted averages
    print(f"\n{'WEIGHTED AVERAGES':-^70}")
    print(f"Precision (Weighted): {weighted_precision*100:.2f}%")
    print(f"Recall (Weighted):    {weighted_recall*100:.2f}%")
    print(f"F1-Score (Weighted):  {weighted_f1*100:.2f}%")

    # One sub-section per class
    print(f"\n{'PER-CLASS METRICS':-^70}")
    for idx, name in enumerate(class_names):
        print(f"\n{name} (n={support[idx]}):")
        print(f"  Precision:  {precision[idx]*100:.2f}%")
        print(f"  Recall:     {recall[idx]*100:.2f}%")
        print(f"  F1-Score:   {f1[idx]*100:.2f}%")

    # scikit-learn's own tabular report
    print(f"\n{'DETAILED CLASSIFICATION REPORT':-^70}")
    report_text = classification_report(y_true, y_pred, target_names=class_names, digits=4)
    print(report_text)

    # Each helper saves one PNG and closes its figure
    print(f"\n{'GENERATING VISUALIZATIONS':-^70}")
    plot_confusion_matrix(y_true, y_pred, class_names)
    plot_roc_curve(y_true, y_proba)
    plot_metrics_comparison(precision, recall, f1, class_names)
    plot_prediction_distribution(y_proba, y_true)

    print("\n" + rule)
    print("EVALUATION COMPLETE!")
    print(rule)

    summary = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'support': support
    }
    return summary


# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    # End-to-end run: sample reviews from SQLite, optionally balance the
    # classes, classify them with a HuggingFace model, then print metrics and
    # save the evaluation plots.

    # Models used
    # 'distilbert/distilbert-base-uncased'
    # 'bert-base-uncased'
    # 'roberta-base'
    # 'cardiffnlp/twitter-roberta-base-sentiment'
    # 'siebert/sentiment-roberta-large-english' BEST
    # Configuration
    DB_PATH = '/content/gamesDB.db'              # Your database path
    TABLE_NAME = 'reviews'              # Your table name
    TEXT_COLUMN = 'review'         # Column with review text
    LABEL_COLUMN = 'voted_up'    # Column with labels (0/1)
    NUM_REVIEWS = 1000                  # Number of reviews to evaluate
    # NOTE(review): 'roberta-uncased' is not one of the models listed above;
    # confirm it resolves to a real model id or local directory before running.
    MODEL_NAME = 'roberta-uncased'
    BATCH_SIZE = 32
    BALANCE_TYPE = 'oversample'         # 'undersample' or 'oversample'
    USE_BALANCED = False                # Balance classes before inference?

    # Step 1: Load data from database
    df = load_data_from_sqlite(
        db_path=DB_PATH,
        table_name=TABLE_NAME,
        text_column=TEXT_COLUMN,
        label_column=LABEL_COLUMN,
        limit=NUM_REVIEWS
    )

    # Balance the dataset
    if USE_BALANCED:
        # Balance on the configured label column so that changing
        # LABEL_COLUMN above is honoured here as well.
        df_balanced = balance_dataset(df, BALANCE_TYPE, sentiment_col=LABEL_COLUMN)
        reviews, labels = df_balanced[TEXT_COLUMN], df_balanced[LABEL_COLUMN]
    else:
        reviews, labels = df[TEXT_COLUMN], df[LABEL_COLUMN]

    # Step 2: Make predictions
    predictions, probabilities = predict_sentiments(
        reviews=reviews,
        model_name=MODEL_NAME,
        batch_size=BATCH_SIZE
    )

    # Step 3: Evaluate model
    results = evaluate_model(
        y_true=labels,
        y_pred=predictions,
        y_proba=probabilities,
        class_names=['Negative', 'Positive']
    )

    print("\n✓ All results saved to current directory")
    print("  - confusion_matrix.png")
    print("  - roc_curve.png")
    print("  - metrics_comparison.png")
    print("  - prediction_distribution.png")