In [None]:
!pip install mlflow

In [None]:

from google.colab import userdata
import os
import sqlite3
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import mlflow.transformers
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report
)
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoModel, pipeline, Trainer, TrainingArguments
)
from datasets import Dataset
from peft import (
    LoraConfig, 
    get_peft_model, 
    TaskType,
    PeftModel,
    prepare_model_for_kbit_training
)
# Load variables from a local .env file (if any) before the MLflow settings
# below are written into the process environment.
load_dotenv()

import warnings
# Silence every Python warning for the rest of the session. This also hides
# deprecation and metric warnings, so remove it when debugging.
warnings.filterwarnings('ignore')

# MLflow configuration: credentials and tracking server are passed to MLflow
# through environment variables.
# NOTE(review): `userdata` is the google.colab helper, so this cell presumably
# only runs inside Colab; 'username' / 'password' are the secret names it looks up.
# os.environ only accepts str values, so a missing secret cannot be stored here.
os.environ['MLFLOW_TRACKING_USERNAME'] = userdata.get('username')
os.environ['MLFLOW_TRACKING_PASSWORD'] = userdata.get('password')
# NOTE(review): the tracking URI is an empty string here - presumably redacted;
# TODO confirm the intended tracking server before relying on remote logging.
os.environ['MLFLOW_TRACKING_URI'] = ''

# Select the experiment that runs are recorded under (the captured output shows
# it being created when it does not exist yet).
mlflow.set_experiment("steam-reviews-fine-tune")
# Turn on autologging for every supported library that is installed (the
# captured output lists keras, sklearn, tensorflow and transformers).
mlflow.autolog()


# ============================================================================
# 1. DATA LOADING
# ============================================================================

def load_data_from_sqlite(db_path, table_name="train"):
    """
    Load labeled reviews from a SQLite database.

    Reads the ``review_text_clean`` and ``voted_up`` columns of ``table_name``
    and prints the row count plus the positive/negative label counts.

    Args:
        db_path: Path to the SQLite database file.
        table_name: Name of the table to read (e.g. "train", "validation",
            "test"). It is interpolated into the SQL text, so it must come
            from trusted code, never from user input.

    Returns:
        DataFrame with the columns ``review_text_clean`` and ``voted_up``.

    Raises:
        Whatever ``sqlite3.connect`` / ``pandas.read_sql_query`` raise, e.g.
        when the table or one of the columns does not exist. The connection
        is closed in that case as well.
    """
    # Adjust column names according to your database schema.
    # NOTE: SQL identifiers cannot be bound as query parameters, hence the
    # f-string; keep table_name restricted to known, hard-coded values.
    query = f"""
    SELECT
        review_text_clean,
        voted_up
    FROM {table_name}
    """

    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Always release the file handle, even when the query fails.
        conn.close()

    print(f"âœ“ Loaded {len(df)} from {table_name}")

    print(f"Sentiment distribution: Positive={df['voted_up'].sum()}, Negative={(~df['voted_up'].astype(bool)).sum()}")

    return df


# ============================================================================
# 2. SENTIMENT CLASSIFICATION WITH PEFT/LoRA
# ============================================================================

class SentimentClassifierWithLoRA:
    """
    Fine-tune transformer models using PEFT/LoRA for efficient training

    Benefits of LoRA:
    - Train only 0.1-1% of parameters (much faster)
    - Lower memory requirements
    - Easy to save/share (adapter weights are tiny)
    - Can switch between multiple adapters on same base model

    Typical use: construct, call ``train(...)`` (which also saves the result),
    then ``predict(...)``; or construct, call ``load_model(...)`` and then
    ``predict(...)``.
    """

    # Attention projections that receive LoRA adapters, keyed by the
    # Hugging Face ``model_type``. DistilBERT names its projections
    # q_lin/v_lin, BERT and RoBERTa name them query/value.
    _LORA_TARGET_MODULES = {
        "distilbert": ("q_lin", "v_lin"),
        "bert": ("query", "value"),
        "roberta": ("query", "value"),
    }

    def __init__(self, model_name="distilbert-base-uncased", num_labels=2, use_lora=True):
        """
        Initialize sentiment classifier with optional LoRA

        Nothing is downloaded here; tokenizer and model are created by
        ``train`` or ``load_model``.

        Recommended models:
        - distilbert-base-uncased (lightweight, fast)
        - bert-base-uncased (strong general purpose)
        - roberta-base (robust to nuances)

        Args:
            model_name: Hugging Face model id or path of the base model.
            num_labels: Number of output classes.
            use_lora: Train/load a LoRA adapter (True) or the full model (False).
        """
        self.model_name = model_name
        self.num_labels = num_labels
        self.use_lora = use_lora
        self.tokenizer = None
        self.model = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

    def prepare_data(self, texts, labels):
        """Wrap parallel text/label sequences in a `datasets.Dataset` with columns 'text' and 'label'."""
        dataset = Dataset.from_dict({
            'text': texts,
            'label': labels
        })
        return dataset

    def tokenize_function(self, examples):
        """Tokenize the 'text' column; every example is truncated/padded to 512 tokens."""
        return self.tokenizer(
            examples['text'],
            padding='max_length',
            truncation=True,
            max_length=512
        )

    def compute_metrics(self, eval_pred):
        """
        Compute metrics during training

        Args:
            eval_pred: ``(logits, labels)`` pair as passed by the Trainer.

        Returns:
            Dict with accuracy and support-weighted precision, recall and f1.
        """
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)

        # zero_division=0 keeps the score at 0 (without a warning) for a class
        # that is never predicted, matching evaluate_sentiment_model.
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predictions, average='weighted', zero_division=0
        )
        acc = accuracy_score(labels, predictions)

        return {
            'accuracy': acc,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    def _lora_target_modules(self):
        """Return the attention module names LoRA should wrap for the current base model."""
        # Prefer the architecture reported by an already loaded model.
        model_type = getattr(getattr(self.model, "config", None), "model_type", None)
        if model_type is None:
            # No model loaded yet: infer from the name. "distilbert" must be
            # tested first because its name also contains "bert".
            lowered = self.model_name.lower()
            for candidate in ("distilbert", "roberta", "bert"):
                if candidate in lowered:
                    model_type = candidate
                    break
        # Unknown architectures keep the previous DistilBERT default.
        return list(self._LORA_TARGET_MODULES.get(
            model_type, self._LORA_TARGET_MODULES["distilbert"]
        ))

    def setup_lora_config(self):
        """
        Configure LoRA parameters

        Key parameters:
        - r: LoRA rank (4-16 typical, higher = more capacity but slower)
        - lora_alpha: Scaling factor (typically 2*r)
        - target_modules: Which layers to apply LoRA to (chosen per
          architecture: q_lin/v_lin for DistilBERT, query/value for
          BERT/RoBERTa)
        - lora_dropout: Dropout for LoRA layers

        Returns:
            LoraConfig for sequence classification.
        """
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,  # Sequence classification
            r=8,  # LoRA rank - balance between performance and efficiency
            lora_alpha=16,  # Scaling factor (2*r is common)
            lora_dropout=0.1,  # Dropout for regularization
            target_modules=self._lora_target_modules(),  # query and value projections
            bias="none",  # Don't train bias terms
            inference_mode=False,  # We're training, not inferencing
        )
        return lora_config

    def train(self, train_texts, train_labels, val_texts, val_labels,
              epochs=3, batch_size=16, learning_rate=3e-4,
              output_dir='./results', lora_output_dir='./lora_adapter'):
        """
        Fine-tune transformer model with LoRA

        Note: LoRA typically uses higher learning rates (1e-4 to 1e-3)
        compared to full fine-tuning (2e-5 to 5e-5)

        Args:
            train_texts, train_labels: Training reviews and their labels.
            val_texts, val_labels: Validation reviews and labels, evaluated
                once per epoch; the checkpoint with the best f1 is restored
                at the end.
            epochs: Number of training epochs.
            batch_size: Per-device batch size for training and evaluation.
            learning_rate: Optimizer learning rate.
            output_dir: Trainer checkpoint directory. When ``use_lora`` is
                False the final full model is ALSO saved here.
            lora_output_dir: Where adapter + tokenizer are saved when
                ``use_lora`` is True (ignored otherwise).

        Returns:
            The Trainer instance after training.
        """
        print(f"\n{'='*70}")
        print(f"FINE-TUNING {self.model_name.upper()}")
        print(f"Using LoRA: {self.use_lora}")
        print(f"{'='*70}")
        print(f"Training samples: {len(train_texts)}")
        print(f"Validation samples: {len(val_texts)}")
        print(f"Epochs: {epochs}")
        print(f"Batch size: {batch_size}")
        print(f"Learning rate: {learning_rate}")

        # Initialize tokenizer
        print(f"\nLoading tokenizer and model...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # Load base model
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels
        )

        # Apply LoRA if enabled
        if self.use_lora:
            print("\nðŸ”§ Applying LoRA configuration...")
            lora_config = self.setup_lora_config()
            self.model = get_peft_model(self.model, lora_config)

            # Print trainable parameters
            trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
            total_params = sum(p.numel() for p in self.model.parameters())
            print(f"âœ“ Trainable params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
            print(f"  Total params: {total_params:,}")
            self.model.print_trainable_parameters()

        self.model = self.model.to(self.device)

        # Prepare datasets
        print("\nPreparing datasets...")
        train_dataset = self.prepare_data(train_texts, train_labels)
        val_dataset = self.prepare_data(val_texts, val_labels)

        # Tokenize
        print("Tokenizing...")
        train_dataset = train_dataset.map(self.tokenize_function, batched=True)
        val_dataset = val_dataset.map(self.tokenize_function, batched=True)

        # Training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            learning_rate=learning_rate,
            warmup_steps=100,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=50,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            metric_for_best_model="f1",  # key produced by compute_metrics
            greater_is_better=True,
            report_to="none",
            fp16=torch.cuda.is_available(),  # mixed precision only on GPU
        )

        # Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=self.compute_metrics,
        )

        # Train
        print(f"\n{'='*70}")
        print("TRAINING STARTED")
        print(f"{'='*70}\n")

        trainer.train()

        print(f"\n{'='*70}")
        print("TRAINING COMPLETE")
        print(f"{'='*70}")

        # Save the model
        if self.use_lora:
            print(f"\nðŸ’¾ Saving LoRA adapter to {lora_output_dir}...")
            self.model.save_pretrained(lora_output_dir)
            self.tokenizer.save_pretrained(lora_output_dir)
            print(f"âœ“ LoRA adapter saved (only {self._get_dir_size(lora_output_dir):.2f} MB)")
        else:
            print(f"\nðŸ’¾ Saving full model to {output_dir}...")
            self.model.save_pretrained(output_dir)
            self.tokenizer.save_pretrained(output_dir)
            print(f"âœ“ Model saved")

        return trainer

    def _get_dir_size(self, path):
        """Return the total size in MB of all files under `path` (recursive)."""
        total_size = 0
        for dirpath, dirnames, filenames in os.walk(path):
            for f in filenames:
                fp = os.path.join(dirpath, f)
                total_size += os.path.getsize(fp)
        return total_size / (1024 * 1024)

    def load_model(self, adapter_path='./lora_adapter', base_model_name=None):
        """
        Load fine-tuned model

        For LoRA models:
        - Loads base model + adapter weights
        - Merges the adapter into the base weights for inference

        The tokenizer is always read from ``adapter_path``. The loaded model
        is moved to ``self.device`` and put in eval mode.

        Args:
            adapter_path: Path to LoRA adapter or full model
            base_model_name: Base model name (if different from init)
        """
        if base_model_name is None:
            base_model_name = self.model_name

        print(f"\nLoading model from {adapter_path}...")
        self.tokenizer = AutoTokenizer.from_pretrained(adapter_path)

        if self.use_lora:
            # Load base model first
            print(f"Loading base model: {base_model_name}")
            base_model = AutoModelForSequenceClassification.from_pretrained(
                base_model_name,
                num_labels=self.num_labels
            )

            # Load LoRA adapter
            print("Loading LoRA adapter...")
            self.model = PeftModel.from_pretrained(base_model, adapter_path)

            # Merge adapter with base model for faster inference (optional)
            print("Merging adapter with base model for inference...")
            self.model = self.model.merge_and_unload()
        else:
            # Load full fine-tuned model
            self.model = AutoModelForSequenceClassification.from_pretrained(adapter_path)

        self.model = self.model.to(self.device)
        self.model.eval()
        print("âœ“ Model loaded and ready for inference")

    def predict(self, texts, batch_size=32):
        """
        Make predictions with probabilities

        Args:
            texts: Iterable of review strings (list, tuple, Series, ...).
            batch_size: Number of texts per forward pass.

        Returns:
            Tuple ``(predictions, probabilities)`` of NumPy arrays:
            the argmax class id per text and the softmax probability of
            class index 1 (the positive class) per text.

        Raises:
            RuntimeError: If neither ``train`` nor ``load_model`` has been
                called yet.
        """
        if self.model is None or self.tokenizer is None:
            raise RuntimeError(
                "No model loaded: call train() or load_model() before predict()."
            )

        # Materialize once so slicing yields plain lists of strings, which is
        # what the tokenizer expects, regardless of the input container.
        texts = list(texts)

        self.model.eval()
        predictions = []
        probabilities = []

        print(f"Making predictions on {len(texts)} samples...")

        with torch.no_grad():
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i + batch_size]

                inputs = self.tokenizer(
                    batch_texts,
                    return_tensors="pt",
                    truncation=True,
                    max_length=512,
                    padding=True
                ).to(self.device)

                outputs = self.model(**inputs)

                # Get predictions
                preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                predictions.extend(preds)

                # Get probabilities for positive class
                probs = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()
                probabilities.extend(probs)

                # Progress line every 10 batches
                if (i // batch_size + 1) % 10 == 0:
                    print(f"  Processed {i + len(batch_texts)}/{len(texts)} samples")

        print("âœ“ Predictions complete")
        return np.array(predictions), np.array(probabilities)


# ============================================================================
# 3. EVALUATION FUNCTIONS
# ============================================================================

def evaluate_sentiment_model(y_true, y_pred, y_proba=None,
                            class_names=('Negative', 'Positive'),
                            model_name='model'):
    """
    Comprehensive model evaluation

    Prints accuracy, macro and per-class precision/recall/F1 and the
    scikit-learn classification report, then saves a confusion-matrix
    heatmap as ``<model_name>_confusion_matrix.png`` (with "/" in
    ``model_name`` replaced by "_") in the current directory.

    Args:
        y_true: Ground truth labels
        y_pred: Predicted labels
        y_proba: Prediction probabilities (optional). Accepted for
            interface compatibility; not used by any metric below.
        class_names: Names of classes, ordered by label id
        model_name: Name for saving plots

    Returns:
        Dict with 'accuracy' (float), per-class arrays 'precision',
        'recall' and 'f1', the 'confusion_matrix' array and
        'cm_plot_path' (path of the saved PNG).
    """
    class_names = list(class_names)

    # When the labels are the integer ids 0..n-1, pin the label set so that
    # every per-class array and the confusion matrix have exactly one entry
    # per class name, even if a class is absent from both y_true and y_pred
    # (otherwise the per-class loop and the report fail on the size mismatch).
    # For any other label encoding, let scikit-learn infer the labels.
    expected_labels = list(range(len(class_names)))
    observed = set(np.unique(np.asarray(y_true)).tolist())
    observed |= set(np.unique(np.asarray(y_pred)).tolist())
    labels = expected_labels if observed and observed <= set(expected_labels) else None

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=None, zero_division=0
    )

    # Macro averages
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average='macro', zero_division=0
    )

    # Confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Print results
    print("\n" + "="*60)
    print("SENTIMENT CLASSIFICATION RESULTS")
    print("="*60)
    print(f"\nOverall Accuracy: {accuracy*100:.2f}%")

    print(f"\nMacro Averages:")
    print(f"  Precision: {precision_macro*100:.2f}%")
    print(f"  Recall:    {recall_macro*100:.2f}%")
    print(f"  F1-Score:  {f1_macro*100:.2f}%")

    print("\nPer-Class Metrics:")
    print("-" * 60)
    for i, class_name in enumerate(class_names):
        print(f"{class_name} (n={support[i]}):")
        print(f"  Precision: {precision[i]*100:.2f}%")
        print(f"  Recall:    {recall[i]*100:.2f}%")
        print(f"  F1-Score:  {f1[i]*100:.2f}%")
        print()

    # Classification report
    print("\nDetailed Classification Report:")
    print(classification_report(y_true, y_pred, labels=labels,
                                target_names=class_names, zero_division=0))

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()

    # Save plot ("/" in hub-style model ids would otherwise be a directory separator)
    cm_path = f'{model_name.replace("/", "_")}_confusion_matrix.png'
    plt.savefig(cm_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"âœ“ Confusion matrix saved to {cm_path}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm,
        'cm_plot_path': cm_path
    }


# ============================================================================
# 4. MAIN EXECUTION EXAMPLE
# ============================================================================

# End-to-end pipeline: load the pre-split data, fine-tune, evaluate on the
# test set, then reload the saved artifact and check it predicts the same.
if __name__ == "__main__":

    # ========== CONFIGURATION ==========
    DB_PATH = r'/content/reviews_processed.db'

    # Model settings
    MODEL_NAME = 'distilbert/distilbert-base-uncased'
    USE_LORA = True  # Set to False for full fine-tuning

    # Training settings
    EPOCHS = 3
    BATCH_SIZE = 16
    LEARNING_RATE = 3e-4 if USE_LORA else 2e-5  # Higher LR for LoRA
    # NOTE: TEST_SIZE / VAL_SIZE are not referenced below; the database
    # already provides separate train / validation / test tables.
    TEST_SIZE = 0.2
    VAL_SIZE = 0.1

    # Paths
    LORA_ADAPTER_PATH = './lora_sentiment_adapter'
    FULL_MODEL_PATH = './full_sentiment_model'

    # ========== LOAD DATA ==========
    print("\n" + "="*70)
    print("STEP 1: LOADING DATA")
    print("="*70)

    # Load training data
    review_text_clean = 'review_text_clean'
    voted_up = 'voted_up'
    train_df = load_data_from_sqlite(DB_PATH, 'train')
    train_texts = train_df[review_text_clean].tolist()
    train_labels = train_df[voted_up].tolist()

    # Load validation data
    val_df = load_data_from_sqlite(DB_PATH, 'validation')
    val_texts = val_df[review_text_clean].tolist()
    val_labels = val_df[voted_up].tolist()

    # Load test data
    test_df = load_data_from_sqlite(DB_PATH, 'test')
    test_texts = test_df[review_text_clean].tolist()
    test_labels = test_df[voted_up].tolist()

    print(f"Training set:   {len(train_texts)} samples")
    print(f"Validation set: {len(val_texts)} samples")
    print(f"Test set:       {len(test_texts)} samples")

    # ========== TRAIN MODEL WITH LORA ==========
    print("\n" + "="*70)
    print("STEP 3: TRAINING MODEL")
    print("="*70)

    classifier = SentimentClassifierWithLoRA(
        model_name=MODEL_NAME,
        num_labels=2,
        use_lora=USE_LORA
    )

    # train() writes a LoRA adapter to `lora_output_dir`, but a full model to
    # `output_dir`. Point `output_dir` at FULL_MODEL_PATH for full
    # fine-tuning so that STEP 5 reloads from the directory that was
    # actually written.
    trainer = classifier.train(
        train_texts=train_texts,
        train_labels=train_labels,
        val_texts=val_texts,
        val_labels=val_labels,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        output_dir='./results' if USE_LORA else FULL_MODEL_PATH,
        lora_output_dir=LORA_ADAPTER_PATH
    )

    # ========== EVALUATE ON TEST SET ==========
    print("\n" + "="*70)
    print("STEP 4: EVALUATING MODEL (After Training)")
    print("="*70)

    test_predictions, test_probabilities = classifier.predict(test_texts)

    results = evaluate_sentiment_model(
        y_true=test_labels,
        y_pred=test_predictions,
        y_proba=test_probabilities,
        class_names=['Negative', 'Positive'],
        model_name=f"{MODEL_NAME}_lora" if USE_LORA else MODEL_NAME
    )

    # ========== LOAD AND EVALUATE SAVED MODEL ==========
    print("\n" + "="*70)
    print("STEP 5: TESTING MODEL LOADING")
    print("="*70)

    # Create new classifier instance
    classifier_loaded = SentimentClassifierWithLoRA(
        model_name=MODEL_NAME,
        num_labels=2,
        use_lora=USE_LORA
    )

    # Load the saved model
    model_path = LORA_ADAPTER_PATH if USE_LORA else FULL_MODEL_PATH
    classifier_loaded.load_model(
        adapter_path=model_path,
        base_model_name=MODEL_NAME
    )

    # Make predictions with loaded model
    print("\nMaking predictions with loaded model...")
    loaded_predictions, loaded_probabilities = classifier_loaded.predict(test_texts)

    # Evaluate loaded model
    loaded_results = evaluate_sentiment_model(
        y_true=test_labels,
        y_pred=loaded_predictions,
        y_proba=loaded_probabilities,
        class_names=['Negative', 'Positive'],
        model_name=f"{MODEL_NAME}_loaded"
    )

    # Verify predictions match
    predictions_match = np.array_equal(test_predictions, loaded_predictions)
    print(f"\nâœ“ Loaded model predictions match original: {predictions_match}")

    print("\n" + "="*70)
    print("PIPELINE COMPLETE!")
    print("="*70)
    print(f"\n{'LoRA Adapter' if USE_LORA else 'Full Model'} saved to: {model_path}")
    print(f"Test Accuracy: {results['accuracy']*100:.2f}%")
    # Mean of the per-class F1 array == macro F1
    print(f"Test F1 (Macro): {results['f1'].mean()*100:.2f}%")

2025/12/30 11:14:21 INFO mlflow.tracking.fluent: Experiment with name 'steam-reviews-fine-tune' does not exist. Creating a new experiment.
2025/12/30 11:14:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for keras.
2025/12/30 11:14:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/12/30 11:14:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.
2025/12/30 11:14:24 INFO mlflow.tracking.fluent: Autologging successfully enabled for transformers.



STEP 1: LOADING DATA
âœ“ Loaded 37902 from train
Sentiment distribution: Positive=18951, Negative=18951
âœ“ Loaded 8122 from validation
Sentiment distribution: Positive=4061, Negative=4061
âœ“ Loaded 8122 from test
Sentiment distribution: Positive=4061, Negative=4061
Training set:   37902 samples
Validation set: 8122 samples
Test set:       8122 samples

STEP 3: TRAINING MODEL
Using device: cuda

FINE-TUNING DISTILBERT/DISTILBERT-BASE-UNCASED
Using LoRA: True
Training samples: 37902
Validation samples: 8122
Epochs: 3
Batch size: 16
Learning rate: 0.0003

Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]