In [None]:
# %% [markdown]
# # 🏪 TechStore - ETL Pipeline Complet
# 
# **Projet**: Business Intelligence - TechStore Data Platform  
# **Équipe**:
# - **Sarah Djerrab & Khaoula Merah**: Extraction & Frontend Development  
# - **Hadjer Hanani**: Transformation & Feature Engineering  
# - **Tasnim Bagha**: Database Architecture  
# 
# **Université**: 8 Mai 1945 Guelma  
# **Département**: Intelligence Artificielle (4ème Année)  
# **Date**: Janvier 2026

# %% [markdown]
# ## 📋 Table des Matières
# 
# 1. [Configuration & Imports](#1-configuration)
# 2. [Extraction des Données](#2-extraction)
#    - MySQL (ERP)
#    - Web Scraping (Prix Concurrents)
#    - OCR (Factures Legacy)
# 3. [Transformation des Données](#3-transformation)
#    - Nettoyage
#    - Enrichissement
#    - Analyse de Sentiment
#    - Calcul Net Profit
# 4. [Chargement dans le Data Warehouse](#4-chargement)
# 5. [Validation & Tests](#5-validation)

# %% [markdown]
# ## 1️⃣ Configuration & Imports

# %%
# Imports système
import pandas as pd
import numpy as np
import os
import sys
import warnings
from pathlib import Path
from datetime import datetime
import json

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Project layout: every path below is derived from the current working directory.
PROJECT_ROOT = Path.cwd()
SCRIPTS_DIR = PROJECT_ROOT.joinpath('scripts')
DATA_DIR = PROJECT_ROOT.joinpath('Data')
EXTRACTED_DIR = DATA_DIR.joinpath('extracted')
TRANSFORMED_DIR = DATA_DIR.joinpath('transformed')
DATABASE_DIR = PROJECT_ROOT.joinpath('database')

# Create the output folders (and any missing parents); existing folders are left alone.
EXTRACTED_DIR.mkdir(parents=True, exist_ok=True)
TRANSFORMED_DIR.mkdir(parents=True, exist_ok=True)
DATABASE_DIR.mkdir(parents=True, exist_ok=True)

# Put the scripts folder first on the import path so its modules can be imported by name.
sys.path.insert(0, str(SCRIPTS_DIR))

# Start-up banner: title, project root and execution timestamp.
print(
    "=" * 70,
    "üè™ TECHSTORE - ETL PIPELINE",
    "=" * 70,
    f"üìÅ Project Root: {PROJECT_ROOT}",
    f"üìÖ Execution: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 70,
    sep="\n",
)

# %% [markdown]
# ## 2️⃣ Extraction des Données

# %% [markdown]
# ### 2.1 Extraction MySQL (ERP)

# %%
print("\n" + "=" * 70)
print("üìä EXTRACTION MYSQL (ERP)")
print("=" * 70)

# MySQL connection settings.
# SECURITY: credentials should not be committed in a notebook. Every value can
# be overridden through an environment variable; the literals are only the
# historical defaults, kept so that existing setups keep working unchanged.
MYSQL_CONFIG = {
    'host': os.environ.get('TECHSTORE_MYSQL_HOST', 'boughida.com'),
    'database': os.environ.get('TECHSTORE_MYSQL_DATABASE', 'techstore_erp'),
    'user': os.environ.get('TECHSTORE_MYSQL_USER', 'student_user_4ing'),
    'password': os.environ.get('TECHSTORE_MYSQL_PASSWORD', 'bi_guelma_2025')
}

try:
    from extract_mysql import MySQLExtractor

    # Build the extractor from the connection settings.
    extractor = MySQLExtractor(**MYSQL_CONFIG)

    if extractor.connect():
        print("‚úÖ Connexion MySQL √©tablie\n")

        try:
            # Extract every table; the returned summary is only used for its length here.
            extraction_summary = extractor.extract_all_tables()
        finally:
            # Always release the connection, even when the extraction raises.
            extractor.close()

        print(f"\n‚úÖ {len(extraction_summary)} tables extraites avec succ√®s")
    else:
        print("‚ùå √âchec de connexion MySQL")

except Exception as e:
    print(f"‚ùå Erreur extraction MySQL: {e}")
    # Only suggest checking the script file when that module itself could not
    # be imported; for any other failure (missing driver package, network
    # error, ...) the hint would point at the wrong cause.
    if isinstance(e, ImportError) and getattr(e, 'name', None) == 'extract_mysql':
        print("   Assurez-vous que scripts/extract_mysql.py existe")

# %% [markdown]
# ### 2.2 Web Scraping (Prix Concurrents)

# %%
print("\n" + "=" * 70)
print("üï∑Ô∏è WEB SCRAPING - PRIX CONCURRENTS")
print("=" * 70)

try:
    from scrape_competitors import scrape_with_fallback

    # Run the scraper; its fallback strategy lives inside the imported helper.
    df_competitor = scrape_with_fallback()

    # Treat both "no result object" and "empty result" as nothing retrieved.
    if df_competitor is None or len(df_competitor) == 0:
        print("‚ö†Ô∏è Aucune donn√©e concurrent r√©cup√©r√©e")
    else:
        competitor_csv = EXTRACTED_DIR / 'competitor_prices.csv'
        print(f"\n‚úÖ {len(df_competitor)} prix concurrents extraits")
        print(f"üíæ Fichier: {competitor_csv}")
        print("\nüìä Aper√ßu:")
        print(df_competitor.head())

except Exception as scrape_error:
    # Best-effort step: report the problem and let the notebook continue.
    print(f"‚ùå Erreur scraping: {scrape_error}")
    print("   Assurez-vous que scripts/scrape_competitors.py existe")

# %% [markdown]
# ### 2.3 OCR - Factures Legacy (BONUS)

# %%
print("\n" + "=" * 70)
print("üìÑ OCR - FACTURES LEGACY (BONUS)")
print("=" * 70)

# Optional step: it only runs when the invoice folder holds at least one *.jpg file.
invoice_dir = DATA_DIR.joinpath('legacy_invoices')
has_invoices = invoice_dir.exists() and any(invoice_dir.glob('*.jpg'))

if not has_invoices:
    print("‚ÑπÔ∏è Aucune facture trouv√©e (√©tape optionnelle)")
    print(f"   Pour activer: placez les images dans {invoice_dir}/")
else:
    invoice_count = sum(1 for _ in invoice_dir.glob('*.jpg'))
    print(f"üìÅ {invoice_count} factures d√©tect√©es\n")

    try:
        from extract_legacy_invoices import InvoiceOCRProcessor

        # Build the OCR processor on the invoice folder and run it.
        processor = InvoiceOCRProcessor(str(invoice_dir))
        df_legacy = processor.process_and_save()

        if df_legacy is None or len(df_legacy) == 0:
            print("‚ö†Ô∏è Aucune donn√©e extraite par OCR")
        else:
            legacy_csv = EXTRACTED_DIR / 'legacy_sales.csv'
            print(f"\n‚úÖ {len(df_legacy)} factures trait√©es")
            print(f"üíæ Fichier: {legacy_csv}")

    except Exception as ocr_error:
        # NOTE(review): the second message announces a manual-data fallback,
        # but nothing in this cell loads such data -- confirm where it happens.
        print(f"‚ö†Ô∏è Erreur OCR: {ocr_error}")
        print("   Utilisation de donn√©es manuelles...")

# %% [markdown]
# ## 3️⃣ Transformation des Données

# %% [markdown]
# ### 3.1 Chargement des Données Extraites

# %%
print("\n" + "=" * 70)
print("üì¶ CHARGEMENT DES DONN√âES EXTRAITES")
print("=" * 70)

# Each extracted dataset is stored as "<name>.csv" inside EXTRACTED_DIR.
csv_files = {
    dataset: f'{dataset}.csv'
    for dataset in (
        'sales',
        'products',
        'customers',
        'stores',
        'cities',
        'categories',
        'subcategories',
        'reviews',
        'competitor_prices',
    )
}

# Loaded frames keyed by dataset name; files that are missing are reported and skipped.
dataframes = {}

for dataset, csv_name in csv_files.items():
    csv_path = EXTRACTED_DIR / csv_name
    if not csv_path.exists():
        print(f"‚ö†Ô∏è {dataset:20} Fichier non trouv√©")
        continue
    dataframes[dataset] = pd.read_csv(csv_path)
    print(f"‚úÖ {dataset:20} {len(dataframes[dataset]):>7,} lignes")

print(f"\nüìä {len(dataframes)} fichiers charg√©s")

# %% [markdown]
# ### 3.2 Exécution du Pipeline de Transformation

# %%
print("\n" + "=" * 70)
print("üîÑ EX√âCUTION DU PIPELINE DE TRANSFORMATION")
print("=" * 70)

try:
    # Switch to the scripts directory for the import and the run.
    os.chdir(SCRIPTS_DIR)

    # Import the transformation module and run its entry point.
    import transform_data

    transform_data.main()

    print("\n‚úÖ Transformation termin√©e avec succ√®s")

except Exception as e:
    print(f"‚ùå Erreur transformation: {e}")
    import traceback
    traceback.print_exc()

finally:
    # Restore the working directory on every exit path, including exits that
    # `except Exception` does not catch (e.g. SystemExit raised by the script
    # or a KeyboardInterrupt), so later cells never run from scripts/.
    os.chdir(PROJECT_ROOT)

# %% [markdown]
# ### 3.3 Vérification des Fichiers Transformés

# %%
print("\n" + "=" * 70)
print("üìã V√âRIFICATION DES FICHIERS TRANSFORM√âS")
print("=" * 70)

# Expected outputs of the transformation step: table name -> CSV file name.
transformed_files = {
    'Dim_Customer': 'Dim_Customer.csv',
    'Dim_Product': 'Dim_Product.csv',
    'Dim_Store': 'Dim_Store.csv',
    'Dim_Date': 'Dim_Date.csv',
    'Fact_Sales': 'Fact_Sales.csv',
    'Marketing_ROI': 'marketing_roi.csv'
}

# Frames that were actually found on disk, keyed by table name.
transformed_data = {}

for name, filename in transformed_files.items():
    filepath = TRANSFORMED_DIR / filename
    if filepath.exists():
        df = pd.read_csv(filepath)
        transformed_data[name] = df
        print(f"‚úÖ {name:20} {len(df):>7,} lignes √ó {len(df.columns):>2} colonnes")
    else:
        print(f"‚ùå {name:20} Fichier non trouv√©")

# The denominator is derived from the expected-files mapping so the ratio stays
# correct when a table is added to or removed from `transformed_files`.
print(f"\nüìä {len(transformed_data)}/{len(transformed_files)} tables transform√©es")

# %% [markdown]
# ### 3.4 Aperçu du Schéma en Étoile

# %%
# The preview is only shown once at least five transformed tables were loaded.
if len(transformed_data) >= 5:
    print("\n" + "=" * 70)
    print("‚≠ê APER√áU DU SCH√âMA EN √âTOILE")
    print("=" * 70)

    # Dimension tables: row count, first five column names, two sample rows.
    print("\nüìä DIMENSIONS:")
    for dim_name in ('Dim_Customer', 'Dim_Product', 'Dim_Store', 'Dim_Date'):
        if dim_name not in transformed_data:
            continue
        dim_df = transformed_data[dim_name]
        leading_columns = ', '.join(dim_df.columns.tolist()[:5])
        print(f"\n  {dim_name}:")
        print(f"    Lignes: {len(dim_df):,}")
        print(f"    Colonnes: {leading_columns}...")
        print("    Aper√ßu:")
        print(dim_df.head(2).to_string(index=False))

    # Fact table: row count, every column name, three sample rows, then totals.
    print("\nüìä FAIT:")
    if 'Fact_Sales' in transformed_data:
        fact_df = transformed_data['Fact_Sales']
        all_columns = ', '.join(fact_df.columns.tolist())
        print("\n  Fact_Sales:")
        print(f"    Lignes: {len(fact_df):,}")
        print(f"    Colonnes: {all_columns}")
        print("    Aper√ßu:")
        print(fact_df.head(3).to_string(index=False))

        # Financial totals read from the 'total_revenue' and 'net_profit' columns.
        revenue_total = fact_df['total_revenue'].sum()
        profit_total = fact_df['net_profit'].sum()
        profit_margin = profit_total / revenue_total * 100
        print("\n  üìà Statistiques Financi√®res:")
        print(f"    Revenu Total: {revenue_total:,.2f} DZD")
        print(f"    Profit Net Total: {profit_total:,.2f} DZD")
        print(f"    Marge Profit: {profit_margin:.2f}%")

# %% [markdown]
# ## 4️⃣ Chargement dans le Data Warehouse

# %% [markdown]
# ### 4.1 Création de la Base de Données SQLite

# %%
print("\n" + "=" * 70)
print("üóÑÔ∏è CR√âATION DU DATA WAREHOUSE")
print("=" * 70)

try:
    # Switch to the scripts directory before running the script.
    os.chdir(SCRIPTS_DIR)

    # Read the script with an explicit UTF-8 decoding (Python's default source
    # encoding) and close the file deterministically instead of leaking the handle.
    with open('create_database.py', encoding='utf-8') as script_file:
        script_source = script_file.read()

    # NOTE(review): exec runs the script inside this notebook's namespace, so
    # its top-level names leak into later cells. Compiling with the file name
    # makes tracebacks point at create_database.py rather than "<string>".
    exec(compile(script_source, 'create_database.py', 'exec'))

    print("\n‚úÖ Data Warehouse cr√©√© avec succ√®s")

except Exception as e:
    print(f"‚ùå Erreur cr√©ation DB: {e}")
    import traceback
    traceback.print_exc()

finally:
    # Restore the working directory on every exit path, including SystemExit
    # or KeyboardInterrupt, which `except Exception` does not catch.
    os.chdir(PROJECT_ROOT)

# %% [markdown]
# ### 4.2 Test de Connexion au Data Warehouse

# %%
print("\n" + "=" * 70)
print("üîç TEST DE CONNEXION AU DATA WAREHOUSE")
print("=" * 70)

# Kept outside the try block so the finally clause can always close it.
conn = None

try:
    import sqlite3

    db_path = DATABASE_DIR / 'techstore_dw.db'

    if db_path.exists():
        conn = sqlite3.connect(str(db_path))

        # List the tables declared in the database.
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
        tables = cursor.fetchall()

        print(f"\n‚úÖ Connexion √©tablie: {db_path}")
        print(f"\nüìä Tables disponibles ({len(tables)}):")

        for table in tables:
            table_name = table[0]
            # Identifiers cannot be bound as SQL parameters, so quote the name
            # (doubling embedded double quotes) instead of interpolating it raw.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"SELECT COUNT(*) FROM {quoted_name}")
            count = cursor.fetchone()[0]
            print(f"  ‚Ä¢ {table_name:20} {count:>8,} lignes")

        # Star-schema smoke test: join the fact table to its four dimensions.
        print("\nüîó Test de jointure Star Schema:")
        test_query = """
        SELECT 
            fs.Sale_ID,
            dd.Full_Date,
            dp.Product_Name,
            ds.Store_Name,
            dc.Customer_Name,
            fs.Total_Revenue,
            fs.Net_Profit
        FROM Fact_Sales fs
        JOIN Dim_Date dd ON fs.Date_ID = dd.Date_ID
        JOIN Dim_Product dp ON fs.Product_ID = dp.Product_ID
        JOIN Dim_Store ds ON fs.Store_ID = ds.Store_ID
        JOIN Dim_Customer dc ON fs.Customer_ID = dc.Customer_ID
        LIMIT 5
        """

        test_df = pd.read_sql(test_query, conn)
        print(test_df.to_string(index=False))

        print("\n‚úÖ Jointures fonctionnelles!")
    else:
        print(f"‚ùå Base de donn√©es non trouv√©e: {db_path}")

except Exception as e:
    print(f"‚ùå Erreur test DB: {e}")

finally:
    # Close the connection on every path; previously a failing query skipped
    # the close and left the database handle open.
    if conn is not None:
        conn.close()

# %% [markdown]
# ## 5️⃣ Validation & Tests

# %% [markdown]
# ### 5.1 Tests de Qualité des Données

# %%
print("\n" + "=" * 70)
print("‚úÖ VALIDATION DE LA QUALIT√â DES DONN√âES")
print("=" * 70)

# Primary key of each star-schema table. Checking by table name matters for
# Fact_Sales: it also carries Customer_ID / Product_ID / Store_ID / Date_ID as
# foreign keys, which legitimately repeat and must not be counted as duplicate keys.
primary_keys = {
    'Dim_Customer': 'Customer_ID',
    'Dim_Product': 'Product_ID',
    'Dim_Store': 'Store_ID',
    'Dim_Date': 'Date_ID',
    'Fact_Sales': 'Sale_ID',
}

# For tables without a declared key, fall back to the first of these columns
# that is present (the previous behaviour of this cell).
fallback_key_order = ('Customer_ID', 'Product_ID', 'Store_ID', 'Date_ID', 'Sale_ID')

validation_results = []

for name, df in transformed_data.items():
    # Missing values: absolute count and share of all cells.
    missing = df.isnull().sum().sum()
    missing_pct = (missing / (df.shape[0] * df.shape[1])) * 100

    # Duplicate keys: use the declared primary key when the table is known;
    # if that column is absent the check is skipped rather than run on a
    # foreign key.
    if name in primary_keys:
        key_column = primary_keys[name] if primary_keys[name] in df.columns else None
    else:
        key_column = next((col for col in fallback_key_order if col in df.columns), None)

    if key_column is not None:
        duplicates = df[key_column].duplicated().sum()
    else:
        duplicates = 0

    validation_results.append({
        'Table': name,
        'Lignes': len(df),
        'Colonnes': len(df.columns),
        'Valeurs_Manquantes': missing,
        'Pct_Manquant': f"{missing_pct:.2f}%",
        'Doublons_Cl√©s': duplicates,
        # A table passes with under 5% missing cells and no duplicate key.
        'Statut': '‚úÖ' if missing_pct < 5 and duplicates == 0 else '‚ö†Ô∏è'
    })

df_validation = pd.DataFrame(validation_results)
print("\n" + df_validation.to_string(index=False))

# %% [markdown]
# ### 5.2 Tests des Requêtes SQL

# %%
print("\n" + "=" * 70)
print("üß™ TESTS DES REQU√äTES SQL")
print("=" * 70)

try:
    # Switch to the scripts directory before running the script.
    os.chdir(SCRIPTS_DIR)

    # Read the script with an explicit UTF-8 decoding (Python's default source
    # encoding) and close the file deterministically instead of leaking the handle.
    with open('test_queries.py', encoding='utf-8') as script_file:
        script_source = script_file.read()

    # NOTE(review): exec runs the script inside this notebook's namespace.
    # Compiling with the file name makes tracebacks point at test_queries.py.
    exec(compile(script_source, 'test_queries.py', 'exec'))

except Exception as e:
    print(f"‚ö†Ô∏è Erreur tests SQL: {e}")

finally:
    # Restore the working directory on every exit path, including SystemExit
    # or KeyboardInterrupt, which `except Exception` does not catch.
    os.chdir(PROJECT_ROOT)

# %% [markdown]
# ## 📊 Résumé Final du Pipeline ETL

# %%
print("\n" + "=" * 70)
print("üéâ PIPELINE ETL COMPL√âT√â AVEC SUCC√àS")
print("=" * 70)

# Counts feeding the summary below.
mysql_table_count = sum(1 for key in dataframes if key != 'competitor_prices')
dimension_count = sum(1 for key in transformed_data if key.startswith('Dim_'))

# `tables` only exists when the warehouse connection cell listed the tables.
try:
    warehouse_table_count = len(tables)
except NameError:
    warehouse_table_count = 0

# One entry per ETL stage; values are either counts or booleans.
summary = {
    'Extraction': {
        'Tables MySQL': mysql_table_count,
        'Prix Concurrents': 'competitor_prices' in dataframes,
        'Factures Legacy (OCR)': (DATA_DIR / 'extracted' / 'legacy_sales.csv').exists()
    },
    'Transformation': {
        'Dimensions': dimension_count,
        'Fait': 'Fact_Sales' in transformed_data,
        'Analyses': 'Marketing_ROI' in transformed_data
    },
    'Chargement': {
        'Database': (DATABASE_DIR / 'techstore_dw.db').exists(),
        'Tables': warehouse_table_count
    }
}

# Each stage prints its heading, then one line per item; a falsy value gets the
# stage's "missing" icon (the extraction stage uses a different one).
for heading, stage, missing_icon in (
    ("\n‚úÖ EXTRACTION:", 'Extraction', "‚è≠Ô∏è"),
    ("\n‚úÖ TRANSFORMATION:", 'Transformation', "‚ùå"),
    ("\n‚úÖ CHARGEMENT:", 'Chargement', "‚ùå"),
):
    print(heading)
    for label, outcome in summary[stage].items():
        icon = "‚úÖ" if outcome else missing_icon
        print(f"  {icon} {label}: {outcome}")

# Suggested follow-up actions.
print(
    "\nüöÄ PROCHAINES √âTAPES:",
    "  1. ‚úÖ Lancer le dashboard: streamlit run dashboard/dashboard_app.py",
    "  2. ‚úÖ Analyser les KPIs et m√©triques business",
    "  3. ‚úÖ G√©n√©rer le rapport final",
    sep="\n",
)

# Locations of the artefacts this notebook writes or reads.
print("\n" + "=" * 70)
print("üìÅ FICHIERS G√âN√âR√âS:")
print("=" * 70)
print(
    f"  ‚Ä¢ Data Warehouse: {DATABASE_DIR / 'techstore_dw.db'}",
    f"  ‚Ä¢ Donn√©es Extraites: {EXTRACTED_DIR}/",
    f"  ‚Ä¢ Donn√©es Transform√©es: {TRANSFORMED_DIR}/",
    sep="\n",
)
print("\n" + "=" * 70)

ModuleNotFoundError: No module named 'mysql'