In [None]:
!pip install mlflow

In [None]:
from google.colab import userdata
userdata.get('HF_TOKEN')
import os
import sqlite3
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import mlflow.transformers
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report
)
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoModel, pipeline, Trainer, TrainingArguments
)
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

# Load environment variables
load_dotenv()

# MLflow Configuration
# The Colab secrets 'username' and 'password' are assumed to hold the MLflow
# credentials themselves, so they are exported directly. Passing a secret's
# value through os.getenv() would look up an environment variable *named
# after* that value, which normally yields None and makes the os.environ
# assignment fail with "TypeError: str expected, not NoneType".
os.environ['MLFLOW_TRACKING_USERNAME'] = userdata.get('username')
os.environ['MLFLOW_TRACKING_PASSWORD'] = userdata.get('password')
# An already-set MLFLOW_TRACKING_URI (e.g. loaded from .env above) wins over
# the hard-coded fallback.
# NOTE(review): the fallback host ends in "synology./", which looks
# truncated -- confirm the intended domain.
os.environ['MLFLOW_TRACKING_URI'] = os.getenv('MLFLOW_TRACKING_URI', 'https://mlflow.worldwidecheps.synology./')

# Set MLflow experiment
mlflow.set_experiment("steam-reviews-analysis")


# ============================================================================
# 1. DATA LOADING
# ============================================================================

def load_data_from_sqlite(db_path, table_name="reviews"):
    """Load labeled reviews from a SQLite database into a DataFrame.

    Args:
        db_path: Path of the SQLite database file to open.
        table_name: Table to read from. It is interpolated into the SQL
            text as-is (identifiers cannot be bound as query parameters),
            so it must come from a trusted source.

    Returns:
        A pandas DataFrame with the ``review`` and ``voted_up`` columns of
        every row in the table.
    """
    print("Loading data from SQLite...")

    # Adjust column names according to your database schema
    query = f"""
    SELECT
        review,
        voted_up
    FROM {table_name}
    """

    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()

    print(f"Loaded {len(df)} reviews")
    print(f"Columns: {df.columns.tolist()}")
    # `voted_up` is the only label column the query selects; summarising
    # columns the query never fetched would raise KeyError.
    print(f"\nSentiment distribution:\n{df['voted_up'].value_counts()}")

    return df


# ============================================================================
# 2. SENTIMENT CLASSIFICATION WITH TRANSFORMERS
# ============================================================================

class SentimentClassifier:
    """Fine-tune a Hugging Face sequence-classification model and predict with it.

    The tokenizer and model are only created by ``train()``; until then both
    attributes are ``None`` and ``predict()`` refuses to run.
    """

    def __init__(self, model_name="distilbert-base-uncased", num_labels=2):
        """
        Initialize sentiment classifier
        model_name options:
        - distilbert-base-uncased (lightweight, fast)
        - bert-base-uncased (strong general purpose)
        - roberta-base (robust to nuances)
        - cardiffnlp/twitter-roberta-base-sentiment (for short texts)

        Args:
            model_name: Checkpoint name passed to ``from_pretrained``.
            num_labels: Number of output classes for the classification head.
        """
        self.model_name = model_name
        self.num_labels = num_labels
        # Both are populated by train().
        self.tokenizer = None
        self.model = None
        # Use the GPU when torch can see one, otherwise fall back to the CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def prepare_data(self, texts, labels):
        """Wrap parallel texts and labels in a ``datasets.Dataset``.

        The resulting dataset has the columns ``text`` and ``label``.
        """
        dataset = Dataset.from_dict({
            'text': texts,
            'label': labels
        })
        return dataset

    def tokenize_function(self, examples):
        """Tokenize the ``text`` field of a batch of examples.

        Every sequence is truncated and padded to exactly 512 tokens.
        Requires ``self.tokenizer`` to be set (done by ``train()``).
        """
        return self.tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=512
        )

    def train(self, train_texts, train_labels, val_texts, val_labels,
              epochs=3, batch_size=16, learning_rate=2e-5):
        """Fine-tune transformer model.

        Loads the tokenizer and model named by ``self.model_name``, tokenizes
        both splits and runs a ``Trainer`` that evaluates and checkpoints once
        per epoch, reloading the best checkpoint when training finishes.
        Checkpoints are written under ``./results`` and logs under ``./logs``.

        Args:
            train_texts: Training texts.
            train_labels: Labels aligned with ``train_texts``.
            val_texts: Validation texts.
            val_labels: Labels aligned with ``val_texts``.
            epochs: Number of training epochs.
            batch_size: Per-device batch size for training and evaluation.
            learning_rate: Learning rate passed to ``TrainingArguments``.

        Returns:
            The ``Trainer`` instance after training has completed.
        """
        import inspect

        print(f"\nTraining {self.model_name} on {self.device}...")

        # Initialize tokenizer and model
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels
        ).to(self.device)

        # Prepare datasets
        train_dataset = self.prepare_data(train_texts, train_labels)
        val_dataset = self.prepare_data(val_texts, val_labels)

        # Tokenize
        train_dataset = train_dataset.map(self.tokenize_function, batched=True)
        val_dataset = val_dataset.map(self.tokenize_function, batched=True)

        # Newer transformers releases name this option `eval_strategy` (the
        # older `evaluation_strategy` spelling was deprecated), so use
        # whichever keyword the installed TrainingArguments accepts instead
        # of failing with an unexpected-keyword TypeError.
        accepted_args = inspect.signature(TrainingArguments.__init__).parameters
        if "eval_strategy" in accepted_args:
            eval_strategy_kwarg = "eval_strategy"
        else:
            eval_strategy_kwarg = "evaluation_strategy"

        # Training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            save_strategy="epoch",
            load_best_model_at_end=True,
            # Must match save_strategy for load_best_model_at_end to work.
            **{eval_strategy_kwarg: "epoch"},
        )

        # Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

        # Train
        trainer.train()

        return trainer

    def predict(self, texts, batch_size=32):
        """Predict a class index for every text.

        Texts are tokenized and pushed through the model in batches rather
        than one at a time; each batch is padded to its longest sequence and
        truncated to 512 tokens.

        Args:
            texts: Iterable of strings to classify.
            batch_size: Number of texts per forward pass.

        Returns:
            NumPy array with the argmax class index of each text, in input
            order.

        Raises:
            RuntimeError: If no model/tokenizer has been loaded yet.
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError(
                "SentimentClassifier has no model loaded; "
                "call train() before predict()."
            )
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Materialize so any iterable (list, pandas Series, generator) can be
        # sliced positionally and handed to the tokenizer as a list.
        texts = list(texts)
        self.model.eval()
        predictions = []

        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = self.tokenizer(
                    batch,
                    return_tensors="pt",
                    truncation=True,
                    max_length=512,
                    padding=True
                ).to(self.device)

                outputs = self.model(**inputs)
                batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                predictions.extend(batch_preds)

        return np.array(predictions)


def evaluate_sentiment_model(y_true, y_pred, class_names=None,
                             cm_path='confusion_matrix.png'):
    """Comprehensive model evaluation.

    Prints accuracy, per-class precision/recall/F1 and a classification
    report, and saves a confusion-matrix heatmap as an image.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        class_names: Display name of each class, in label order. Defaults to
            ``['Negative', 'Positive']``.
        cm_path: File path the confusion-matrix plot is written to.

    Returns:
        Dict with ``accuracy`` (float), ``precision``, ``recall`` and ``f1``
        (per-class arrays), ``confusion_matrix`` (array) and ``cm_plot_path``
        (the path the plot was saved to).
    """
    # None sentinel instead of a mutable list default.
    if class_names is None:
        class_names = ['Negative', 'Positive']
    else:
        class_names = list(class_names)
    n_classes = len(class_names)

    # When the labels are integer ids within 0..n_classes-1, pin the label set
    # so every metric has one entry per class name even if a class never
    # shows up in y_true/y_pred (otherwise the per-class arrays come out
    # shorter than class_names). For any other label type, keep sklearn's own
    # sorted-unique inference.
    combined = np.concatenate([np.ravel(y_true), np.ravel(y_pred)])
    if (combined.size and combined.dtype.kind in "iu"
            and combined.min() >= 0 and combined.max() < n_classes):
        labels = list(range(n_classes))
    else:
        labels = None

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Print results
    print("\n" + "="*60)
    print("SENTIMENT CLASSIFICATION RESULTS")
    print("="*60)
    print(f"\nOverall Accuracy: {accuracy*100:.2f}%\n")

    print("Per-Class Metrics:")
    print("-" * 60)
    for i, class_name in enumerate(class_names):
        print(f"{class_name}:")
        print(f"  Precision: {precision[i]*100:.2f}%")
        print(f"  Recall:    {recall[i]*100:.2f}%")
        print(f"  F1-Score:  {f1[i]*100:.2f}%")
        print()

    # Classification report
    print("\nDetailed Classification Report:")
    print(classification_report(
        y_true, y_pred, labels=labels, target_names=class_names,
        zero_division=0
    ))

    # Plot confusion matrix; the figure is closed even if plotting or saving
    # fails so it does not linger in pyplot's figure registry.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_title('Confusion Matrix')
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')
        fig.tight_layout()

        # Save plot
        fig.savefig(cm_path, dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'cm_plot_path': cm_path
    }