In [None]:
"""
Master Pipeline Script - DSA210 Real Estate Analysis Project
Coordinates the entire workflow from data loading to final analysis
"""

import os
import sys
import subprocess
from pathlib import Path

def get_python_executable():
    """Return the Python interpreter used to launch pipeline scripts.

    Looks for a virtual environment named ``venv`` relative to the current
    working directory. The Windows layout (``venv/Scripts/python.exe``) is
    checked first, then the POSIX layout (``venv/bin/python``). When neither
    exists, the interpreter running this script (``sys.executable``) is used.

    Returns:
        str: Path of the interpreter to hand to ``subprocess``.
    """
    candidates = (
        os.path.join('venv', 'Scripts', 'python.exe'),  # Windows venv layout
        os.path.join('venv', 'bin', 'python'),          # Linux/macOS venv layout
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    # No local venv found: fall back to whatever is running this script
    return sys.executable

def print_header(title):
    """Write a banner for a pipeline step to stdout.

    The banner is a blank line, a 70-character ``=`` rule, the title
    indented by two spaces, and a closing rule.
    """
    rule = "=" * 70
    banner_lines = ["", rule, f"  {title}", rule]
    print("\n".join(banner_lines))

def run_script(path, description):
    """
    Execute a Python script as a child process and report the outcome.

    Prints a header for the step, verifies the script exists, then runs it
    with the interpreter chosen by ``get_python_executable()``. The child
    inherits this process's stdout/stderr, so its output appears inline.

    Args:
        path: Relative path to the script
        description: Description of what this step does (used as the header)

    Returns:
        bool: True if the script exited with status 0, False if the script
        is missing, exited non-zero, or could not be launched.
    """
    print_header(description)
    # flush=True: the child writes straight to the inherited file descriptor,
    # so anything still sitting in our buffer (when stdout is a file or pipe)
    # would otherwise show up *after* the child's output.
    print(f"Executing: {path}\n", flush=True)

    # Check if file exists
    if not os.path.exists(path):
        print(f" ERROR: Script not found at '{path}'")
        return False

    try:
        # Run the script using the virtual environment if available
        python_exe = get_python_executable()
        subprocess.run([python_exe, path], check=True)
    except subprocess.CalledProcessError as e:
        # check=True turns a non-zero exit status into this exception
        print(f"\n ERROR: '{path}' failed with error code {e.returncode}")
        return False
    except Exception as e:
        # Deliberately broad: this is the pipeline's reporting boundary, and
        # a launch failure (bad interpreter path, permissions, ...) should be
        # reported as a failed step rather than crash the orchestrator.
        print(f"\n ERROR: An unexpected error occurred: {e}")
        return False

    print(f"\n '{path}' completed successfully")
    return True

def check_data_exists():
    """Verify that raw data files exist.

    Looks under ``data/raw`` (relative to the current working directory) for
    the per-source subdirectories ``emlakjet``, ``sahibinden`` and
    ``hepsiemlak`` and reports, for each one that exists, how many
    ``.xlsx``/``.csv`` files it contains. Extensions are matched
    case-insensitively. Subdirectories that are missing are skipped silently.

    Returns:
        bool: True if at least one subdirectory contains a data file,
        False otherwise (including when ``data/raw`` itself is missing).
    """
    raw_data_dir = 'data/raw'
    if not os.path.isdir(raw_data_dir):
        print(f" ERROR: Data directory '{raw_data_dir}' not found!")
        return False

    subdirs = ['emlakjet', 'sahibinden', 'hepsiemlak']
    data_suffixes = ('.xlsx', '.csv')
    found_data = False

    for subdir in subdirs:
        path = os.path.join(raw_data_dir, subdir)
        # isdir (not exists): a stray regular file with a source's name
        # would make os.listdir raise NotADirectoryError.
        if not os.path.isdir(path):
            continue

        data_files = [
            f for f in os.listdir(path)
            if f.lower().endswith(data_suffixes)
        ]
        if data_files:
            print(f" Found {len(data_files)} data file(s) in {path}")
            found_data = True
        else:
            print(f" No Excel/CSV files found in {path}")

    return found_data

def main():
    """Main pipeline orchestration.

    Checks for raw data (asking for confirmation to continue when none is
    detected), runs ``analysis/ml_analysis.py`` through ``run_script``, and
    prints a summary of the expected outputs on success.

    Returns:
        bool: True if the ML analysis step completed successfully; False if
        the user declined to continue without data or the step failed.
    """
    print_header("DSA210 REAL ESTATE ANALYSIS PIPELINE")
    print("Welcome! This script will run the complete analysis workflow.\n")
    
    # Check data availability
    print("Checking for raw data files...")
    if not check_data_exists():
        print("\n WARNING: No raw data files detected.")
        print("   Make sure you have data in: data/raw/emlakjet/, data/raw/sahibinden/, etc.")
        try:
            response = input("\nContinue anyway? (y/n): ").strip().lower()
        except EOFError:
            # stdin is closed / non-interactive (CI, piped run): nobody can
            # confirm, so treat it the same as answering "n".
            response = ''
        if response not in ('y', 'yes'):
            print("Exiting...")
            # Explicit False keeps the return type consistent with the other
            # exit paths; the caller maps any falsy result to exit status 1.
            return False

    # Step 1: Machine Learning & Deal Finder Analysis
    # (This reads raw data and produces ML results)
    success = run_script(
        "analysis/ml_analysis.py",
        "STEP 1: Running Machine Learning Analysis & Deal Finder"
    )
    
    if not success:
        print_header("PIPELINE FAILED")
        print("The ML analysis step failed. Please check the error messages above.")
        return False

    # Success message
    print_header("PIPELINE COMPLETED SUCCESSFULLY ")
    print("""
Your analysis results are ready:

 OUTPUTS GENERATED:
   ├── data/outputs/ml_analysis_results.xlsx
   │   ├── Sheet 1: Model Performance comparison
   │   ├── Sheet 2: Best Deals (undervalued properties)
   │   └── Sheet 3: Clustering Analysis (market segments)
   │
   ├── data/outputs/deal_finder_results.png
   │   ├── Model Performance visualization
   │   ├── Market Segmentation (PCA + K-Means)
   │   └── Deal Finder scatter plot
   │
   └── final_model.pkl
       (Trained ML model - can be used for predictions)

PROJECT STRUCTURE:
   ├── analysis/           (Analysis scripts)
   ├── data/
   │   ├── raw/           (Original scraped data)
   │   ├── processed/     (Cleaned data - for future use)
   │   └── outputs/       (Final results)
   ├── scrapers/          (Working scrapers only)
   │   └── archive/       (Old scraper variants)
   └── visualizations/    (Chart outputs)

NEXT STEPS:
   1. Open 'data/outputs/ml_analysis_results.xlsx' to see:
      - Best performing ML model
      - List of potentially undervalued properties
      - Market segmentation analysis
   
   2. View 'data/outputs/deal_finder_results.png' for visualizations
   
   3. Review individual sheets:
      - "2_Best_Deals_Finder" shows properties where Predicted Price > Actual Price
      - These are potential bargains!

 For more information, check README.md
    """)
    
    return True

if __name__ == "__main__":
    # Map the pipeline result to a process exit status:
    # truthy -> 0 (success), any falsy result -> 1 (failure).
    pipeline_ok = main()
    exit_status = 0 if pipeline_ok else 1
    sys.exit(exit_status)
