In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Auto-Labeling with Ensemble Models\n",
    "## Phase 3: Model Development - Step 1\n",
    "\n",
    "This notebook demonstrates the auto-labeling pipeline using an ensemble of pre-trained models:\n",
    "- VADER Sentiment\n",
    "- TextBlob\n",
    "- FinBERT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "# Make the project's `src` package importable. The project root is taken to\n",
    "# be the parent of the kernel's current working directory.\n",
    "project_root = Path.cwd().parent\n",
    "# Guard the insert so re-running this cell does not stack duplicate entries.\n",
    "if str(project_root) not in sys.path:\n",
    "    sys.path.insert(0, str(project_root))\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from src.models.labeling.auto_labeler import AutoLabeler\n",
    "from src.models.labeling.confidence_filter import ConfidenceFilter\n",
    "from src.models.labeling.label_validator import LabelValidator\n",
    "from src.data.preprocessor import TextPreprocessor\n",
    "from src.utils.config import Config\n",
    "\n",
    "# Set style. Matplotlib 3.6 renamed its bundled seaborn styles to\n",
    "# 'seaborn-v0_8-*'; fall back to the legacy name on older installs rather\n",
    "# than failing with an OSError.\n",
    "darkgrid_style = 'seaborn-v0_8-darkgrid'\n",
    "if darkgrid_style not in plt.style.available:\n",
    "    darkgrid_style = 'seaborn-darkgrid'\n",
    "plt.style.use(darkgrid_style)\n",
    "sns.set_palette('husl')\n",
    "\n",
    "print(\"✅ Imports successful!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load and Explore Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the raw Reddit posts CSV from the configured raw-data directory.\n",
    "raw_filename = \"reddit_posts_20251010_150745.csv\"\n",
    "data_path = Config.RAW_DATA_DIR / raw_filename\n",
    "df = pd.read_csv(data_path)\n",
    "\n",
    "# Quick overview: row count, column names, then a preview of the first rows.\n",
    "n_rows = len(df)\n",
    "print(f\"Loaded {n_rows} samples\")\n",
    "column_names = df.columns.tolist()\n",
    "print(f\"\\nColumns: {column_names}\")\n",
    "print(\"\\nFirst few rows:\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Preprocess Text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the project's text preprocessor and run it over the whole frame.\n",
    "preprocessor = TextPreprocessor()\n",
    "\n",
    "print(\"Preprocessing text...\")\n",
    "df = preprocessor.preprocess_dataframe(df)\n",
    "print(\"✅ Preprocessing complete!\")\n",
    "\n",
    "# Print the first row's preprocessed text as a quick sanity check.\n",
    "print(\"\\nSample preprocessed text:\")\n",
    "first_preprocessed = df['preprocessed_text'].iloc[0]\n",
    "print(first_preprocessed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Auto-Labeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instantiate the labeler that produces the automatic labels.\n",
    "labeler = AutoLabeler()\n",
    "\n",
    "# Label the preprocessed text column; expect this to run for a few minutes.\n",
    "print(\"Starting auto-labeling...\")\n",
    "text_column = 'preprocessed_text'\n",
    "df = labeler.label_dataframe(df, text_column=text_column)\n",
    "\n",
    "print(\"\\n✅ Auto-labeling complete!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Analyze Labeling Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ask the labeler for summary statistics of the labeled frame.\n",
    "stats = labeler.get_labeling_stats(df)\n",
    "\n",
    "print(\"Labeling Statistics:\")\n",
    "n_total = stats['total_samples']\n",
    "print(f\"  Total samples: {n_total}\")\n",
    "print(f\"  Average confidence: {stats['avg_confidence']:.4f}\")\n",
    "print(f\"  Average agreement: {stats['avg_agreement']:.4f}\")\n",
    "\n",
    "# One line per label: absolute count plus its share of all samples.\n",
    "print(\"\\nLabel distribution:\")\n",
    "for label_name, n_label in stats['label_distribution'].items():\n",
    "    share_pct = n_label / n_total * 100\n",
    "    print(f\"  {label_name}: {n_label} ({share_pct:.1f}%)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Three side-by-side panels: label counts, confidence histogram, agreement histogram.\n",
    "fig, axes = plt.subplots(1, 3, figsize=(15, 4))\n",
    "\n",
    "# Panel 1: number of samples per assigned label.\n",
    "label_counts = df['auto_label'].value_counts()\n",
    "label_counts.plot(kind='bar', ax=axes[0])\n",
    "axes[0].set_title('Label Distribution')\n",
    "axes[0].set_xlabel('Label')\n",
    "axes[0].set_ylabel('Count')\n",
    "\n",
    "# Panels 2 and 3 share one recipe:\n",
    "# (column, bin count, title, x-axis label, position of the dashed threshold line).\n",
    "histogram_panels = [\n",
    "    ('label_confidence', 50, 'Confidence Distribution', 'Confidence', 0.6),\n",
    "    ('label_agreement', 20, 'Agreement Distribution', 'Agreement Score', 0.67),\n",
    "]\n",
    "for ax, (column, n_bins, title, x_label, threshold) in zip(axes[1:], histogram_panels):\n",
    "    ax.hist(df[column], bins=n_bins, edgecolor='black')\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(x_label)\n",
    "    ax.set_ylabel('Count')\n",
    "    ax.axvline(threshold, color='r', linestyle='--', label='Threshold')\n",
    "    ax.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Validate Labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the validator and run its comprehensive validation on the labeled frame.\n",
    "validator = LabelValidator()\n",
    "validation_report = validator.comprehensive_validation(df)\n",
    "\n",
    "print(f\"\\nValidation Status: {validation_report['overall_status']}\")\n",
    "\n",
    "# The agreement figures sit under a nested key of the report.\n",
    "agreement_report = validation_report['inter_annotator_agreement']\n",
    "print(f\"\\nInter-annotator agreement: {agreement_report['average_kappa']:.3f}\")\n",
    "print(f\"Interpretation: {agreement_report['interpretation']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Filter by Confidence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure the filter with the confidence and agreement thresholds.\n",
    "conf_filter = ConfidenceFilter(min_confidence=0.6, min_agreement=0.67, use_both_criteria=True)\n",
    "\n",
    "# Split the labeled frame into a high-confidence and a low-confidence part.\n",
    "high_conf_df, low_conf_df = conf_filter.filter(df)\n",
    "\n",
    "n_all = len(df)\n",
    "n_high = len(high_conf_df)\n",
    "n_low = len(low_conf_df)\n",
    "print(f\"\\nHigh confidence samples: {n_high} ({n_high / n_all * 100:.1f}%)\")\n",
    "print(f\"Low confidence samples: {n_low} ({n_low / n_all * 100:.1f}%)\")\n",
    "\n",
    "# Per-label counts within the high-confidence part.\n",
    "filter_stats = conf_filter.get_filter_stats(high_conf_df, low_conf_df)\n",
    "print(\"\\nHigh confidence label distribution:\")\n",
    "for label_name, n_label in filter_stats['high_confidence_label_dist'].items():\n",
    "    print(f\"  {label_name}: {n_label}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Save Labeled Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame.to_csv does not create missing directories, so make sure the\n",
    "# output folder exists before writing (a no-op when it is already there).\n",
    "Path(Config.PROCESSED_DATA_DIR).mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Save high confidence data\n",
    "output_path = Config.PROCESSED_DATA_DIR / \"labeled_data.csv\"\n",
    "high_conf_df.to_csv(output_path, index=False)\n",
    "print(f\"✅ Saved {len(high_conf_df)} high confidence samples to {output_path}\")\n",
    "\n",
    "# Save low confidence for review\n",
    "low_conf_path = Config.PROCESSED_DATA_DIR / \"labeled_data_low_confidence.csv\"\n",
    "low_conf_df.to_csv(low_conf_path, index=False)\n",
    "print(f\"✅ Saved {len(low_conf_df)} low confidence samples to {low_conf_path}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. Sample Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print up to three high-confidence examples for each sentiment label.\n",
    "MAX_TITLE_CHARS = 100  # longer titles are cut and marked with an ellipsis\n",
    "\n",
    "print(\"Sample Predictions:\\n\")\n",
    "print(\"=\"*100)\n",
    "\n",
    "for label in ['positive', 'neutral', 'negative']:\n",
    "    print(f\"\\n{label.upper()} Examples:\")\n",
    "    print(\"-\"*100)\n",
    "\n",
    "    samples = high_conf_df[high_conf_df['auto_label'] == label].head(3)\n",
    "\n",
    "    for idx, row in samples.iterrows():\n",
    "        # pandas reads an empty CSV field as NaN (a float), which cannot be\n",
    "        # sliced; render a missing title as an empty string instead.\n",
    "        title = '' if pd.isna(row['title']) else str(row['title'])\n",
    "        # Append the ellipsis only when the title was actually shortened.\n",
    "        if len(title) > MAX_TITLE_CHARS:\n",
    "            title = title[:MAX_TITLE_CHARS] + '...'\n",
    "        print(f\"\\nText: {title}\")\n",
    "        print(f\"Confidence: {row['label_confidence']:.3f}\")\n",
    "        print(f\"Agreement: {row['label_agreement']:.3f}\")\n",
    "        print(f\"Models: TB={row['textblob_prediction']}, VADER={row['vader_prediction']}, FinBERT={row['finbert_prediction']}\")\n",
    "        print(\"-\"*100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary\n",
    "\n",
    "✅ **Auto-labeling Complete!**\n",
    "\n",
    "We have successfully:\n",
    "1. Loaded and preprocessed Reddit data\n",
    "2. Applied ensemble auto-labeling (VADER + TextBlob + FinBERT)\n",
    "3. Validated label quality\n",
    "4. Filtered high-confidence samples\n",
    "5. Saved labeled data for model training\n",
    "\n",
    "**Next Steps:**\n",
    "- Train baseline model (Logistic Regression)\n",
    "- Train LSTM model\n",
    "- Train BERT model\n",
    "- Train FinBERT model\n",
    "- Create ensemble model"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}