In [None]:
# ====================================================================
# PROJET BUSINESS INTELLIGENCE - TECHSTORE
# ETL PIPELINE - PARTIE MEMBRE 1 : DATA EXTRACTION
# ====================================================================

# %% [markdown]
# # 📊 Extraction des Données (Membre 1)
# 
# Ce notebook contient la partie extraction du pipeline ETL.
# **Responsable :** Membre 1 - Data Extraction Engineer
# 
# ## Objectifs :
# 1. ✅ Extraire les données de MySQL (ERP)
# 2. ✅ Scraper les prix des concurrents
# 3. ✅ Valider la qualité des données extraites

# %% Imports pour tout le pipeline ETL
# Installer les d√©pendances manquantes (ex√©cuter uniquement dans le notebook)


# === Imports de base ===
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

# === Partie 1 : Extraction (Membre 1) ===
import mysql.connector
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# === Partie 2 : Transformations (Membre 2 - Toi) ===
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pytesseract
from PIL import Image
import re
from fuzzywuzzy import process  # Pour le matching des prix concurrents

print("‚úÖ Biblioth√®ques import√©es avec succ√®s")
print(f"üìÖ Date d'ex√©cution: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# %% [markdown]
# ## 1️⃣ Configuration de la Connexion MySQL

# %% Configuration
# Connection settings for the ERP database. Each value can be overridden with
# an environment variable so credentials do not have to live in the notebook.
# NOTE(review): the literal fallbacks keep the notebook runnable exactly as
# before; the password fallback should be removed (and the password rotated)
# once every user sets TECHSTORE_DB_PASSWORD.
MYSQL_CONFIG = {
    'host': os.environ.get('TECHSTORE_DB_HOST', 'boughida.com'),
    'database': os.environ.get('TECHSTORE_DB_NAME', 'techstore_erp'),
    'user': os.environ.get('TECHSTORE_DB_USER', 'student_user_4ing'),
    'password': os.environ.get('TECHSTORE_DB_PASSWORD', 'bi_guelma_2025'),
}

# Create the output directory used by the extraction cells.
os.makedirs('data/extracted', exist_ok=True)

# Only non-secret settings are echoed.
print("üìã Configuration charg√©e")
print(f"   Serveur: {MYSQL_CONFIG['host']}")
print(f"   Base de donn√©es: {MYSQL_CONFIG['database']}")

# %% [markdown]
# ## 2️⃣ Connexion et Test

# %% Test de connexion
def test_connection():
    """Check that the MySQL server described by MYSQL_CONFIG is reachable.

    Opens a connection, prints the active database name and the number of
    tables, and always closes the cursor and the connection afterwards.

    Returns:
        bool: True when the connection was established and both probe
        queries ran; False when connecting or querying raised, or when the
        connector reports that it is not connected.
    """
    conn = None
    try:
        conn = mysql.connector.connect(**MYSQL_CONFIG)
        if not conn.is_connected():
            # Previously this path fell through and returned None implicitly.
            return False

        print("‚úÖ Connexion MySQL r√©ussie!")

        cursor = conn.cursor()
        try:
            # Simple probe query.
            cursor.execute("SELECT DATABASE()")
            db_name = cursor.fetchone()[0]
            print(f"   Base de donn√©es active: {db_name}")

            # Count the available tables.
            cursor.execute("SHOW TABLES")
            tables = cursor.fetchall()
            print(f"   Nombre de tables: {len(tables)}")
        finally:
            cursor.close()

        return True
    except Exception as e:
        print(f"‚ùå Erreur de connexion: {e}")
        return False
    finally:
        # Release the connection even when a probe query failed.
        if conn is not None:
            conn.close()

# Run the connectivity check
test_connection()

# %% [markdown]
# ## 3️⃣ Extraction des Tables MySQL

# %% Fonction d'extraction
def extract_mysql_table(table_name, conn):
    """Read a whole table into a DataFrame and save it as CSV.

    The CSV is written to ``data/extracted/<name>.csv`` where ``<name>`` is
    ``table_name`` with every ``'table_'`` occurrence removed.

    Args:
        table_name: Table to read, optionally schema-qualified
            (``schema.table``). Each part may only contain word characters
            and ``$``.
        conn: Open DB-API connection passed to ``pandas.read_sql``.

    Returns:
        pandas.DataFrame with the table content, or None if the name is
        invalid or reading/writing raised (the error is printed).
    """
    try:
        print(f"üìä Extraction: {table_name}...", end=" ")

        # Identifiers cannot be bound as query parameters, so validate each
        # part and backtick-quote it instead of interpolating raw text.
        parts = [part.strip('`') for part in table_name.split('.')]
        if not all(re.fullmatch(r'[\w$]+', part) for part in parts):
            raise ValueError(f"invalid table name: {table_name!r}")
        quoted_name = '.'.join(f"`{part}`" for part in parts)

        query = f"SELECT * FROM {quoted_name}"
        df = pd.read_sql(query, conn)

        # Save as CSV, creating the target directory if it is missing.
        filename = f"data/extracted/{table_name.replace('table_', '')}.csv"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        df.to_csv(filename, index=False, encoding='utf-8')

        print(f"‚úÖ {len(df)} lignes | {len(df.columns)} colonnes")

        return df

    except Exception as e:
        # Best effort: report the failure and let the caller skip this table.
        print(f"‚ùå Erreur: {e}")
        return None

# %% Extraction de toutes les tables
print("\n" + "="*60)
print("üöÄ EXTRACTION DES TABLES MySQL")
print("="*60 + "\n")

# Tables to extract from the ERP database
tables_to_extract = [
    'table_sales',
    'table_products',
    'table_reviews',
    'table_customers',
    'table_stores',
    'table_cities',
    'table_categories',
    'table_subcategories'
]

# Extracted DataFrames keyed by table name without the "table_" prefix
dataframes = {}
# Tables for which extract_mysql_table returned None
failed_tables = []

conn = mysql.connector.connect(**MYSQL_CONFIG)
try:
    for table in tables_to_extract:
        df = extract_mysql_table(table, conn)
        if df is not None:
            clean_name = table.replace('table_', '')
            dataframes[clean_name] = df
        else:
            failed_tables.append(table)
finally:
    # Close the connection even if the loop is interrupted or raises.
    conn.close()

print("\n‚úÖ Extraction MySQL termin√©e!")
print(f"üì¶ {len(dataframes)} tables extraites")
if failed_tables:
    # extract_mysql_table swallows errors, so list the failures explicitly.
    print(f"Tables en echec: {', '.join(failed_tables)}")

# %% [markdown]
# ## 4️⃣ Aperçu des Données Extraites

# %% Afficher un aper√ßu
print("\n" + "="*60)
print("üìã APER√áU DES DONN√âES EXTRAITES")
print("="*60 + "\n")

# Show the shape, up to five column names and the first rows of each table.
for name, df in dataframes.items():
    column_names = [str(c) for c in df.columns]
    # Only add the ellipsis when some columns are actually left out.
    suffix = "..." if len(column_names) > 5 else ""
    print(f"\nüìä Table: {name}")
    print(f"   Dimensions: {df.shape[0]} lignes √ó {df.shape[1]} colonnes")
    print(f"   Colonnes: {', '.join(column_names[:5])}{suffix}")
    print(f"   Aper√ßu:")
    display(df.head(3))
    print("-" * 60)

# %% [markdown]
# ## 5️⃣ Statistiques des Ventes

# %% Analyse rapide des ventes
# Quick figures on the extracted sales table: row count, revenue sum and
# mean, and the first/last value of the Date column.
df_sales = dataframes['sales']
revenue = df_sales['Total_Revenue']
sale_dates = df_sales['Date']

print("\nüìä STATISTIQUES DES VENTES")
print("="*60)
for summary_line in (
    f"Nombre total de ventes: {len(df_sales):,}",
    f"Revenu total: {revenue.sum():,.2f} DZD",
    f"Revenu moyen par vente: {revenue.mean():,.2f} DZD",
    f"P√©riode: {sale_dates.min()} ‚Üí {sale_dates.max()}",
):
    print(summary_line)

# %% [markdown]
# ## 6️⃣ Web Scraping - Prix Concurrents

# %% Configuration du scraping
# URL passed to scrape_competitor_prices() when the scraping cell runs.
COMPETITOR_URL = "https://boughida.com/competitor/"

# Section banner for the scraping part of the notebook output.
print("\n" + "="*60)
print("üï∑Ô∏è  WEB SCRAPING - PRIX CONCURRENTS")
print("="*60 + "\n")

# %% Fonction de scraping
def _parse_competitor_price(price_text):
    """Return the number contained in ``price_text`` as a float, or None if unparsable."""
    # Keep digits and dots only. The previous pattern r'[^\\d.]' kept
    # backslashes and the letter 'd' instead of digits, so no price ever parsed.
    cleaned = re.sub(r'[^\d.]', '', price_text)
    try:
        return float(cleaned)
    except ValueError:
        return None


def scrape_competitor_prices(url):
    """Scrape competitor product names and prices from ``url``.

    Looks for ``<div class="product-item">`` elements and reads the name from
    the first ``<h3>`` and the price from the first ``<span class="price">``
    of each one. Items without a name or with a missing, unparsable or zero
    price are skipped.

    Args:
        url: Page to download.

    Returns:
        pandas.DataFrame with the columns ``Competitor_Product_Name`` and
        ``Competitor_Price`` (present even when no product was found). If the
        request or the parsing raises, the error is printed and the result of
        ``create_test_competitor_data()`` is returned instead.
    """
    try:
        print(f"üì° Connexion √†: {url}")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Selector to adapt if the page structure changes.
        product_items = soup.find_all('div', class_='product-item')

        print(f"üîç {len(product_items)} produits trouv√©s")

        products = []
        skipped = 0
        for item in product_items:
            name_elem = item.find('h3')
            price_elem = item.find('span', class_='price')

            name = name_elem.text.strip() if name_elem else None
            price = _parse_competitor_price(price_elem.text) if price_elem else None

            # Truthiness check on purpose: empty names and zero prices are
            # rejected, as in the original implementation.
            if name and price:
                products.append({
                    'Competitor_Product_Name': name,
                    'Competitor_Price': price
                })
            else:
                skipped += 1

        if skipped:
            print(f"   {skipped} produits ignores (nom ou prix illisible)")

        # Explicit columns keep the schema (and the CSV header) when empty.
        return pd.DataFrame(
            products,
            columns=['Competitor_Product_Name', 'Competitor_Price'],
        )

    except Exception as e:
        print(f"‚ùå Erreur de scraping: {e}")
        print("üìù Utilisation de donn√©es de test √† la place")
        return create_test_competitor_data()

# %% Fonction de donn√©es de test
def create_test_competitor_data():
    """Build the hard-coded fallback table of competitor prices.

    Returns:
        pandas.DataFrame with eight rows and the columns
        ``Competitor_Product_Name`` and ``Competitor_Price``.
    """
    # (product name, price) pairs, in the order they appear in the frame.
    names_and_prices = (
        ('Laptop HP ProBook 450', 95000),
        ('Dell Latitude 5420', 105000),
        ('iPhone 14 128GB', 165000),
        ('Samsung S23 Ultra', 185000),
        ('Sony WH-1000XM5', 42000),
        ('AirPods Pro 2', 35000),
        ('LG OLED55C3', 215000),
        ('Samsung QN65Q80C', 275000),
    )
    product_names, prices = zip(*names_and_prices)
    return pd.DataFrame({
        'Competitor_Product_Name': list(product_names),
        'Competitor_Price': list(prices),
    })

# %% Ex√©cuter le scraping
# Fetch competitor prices; scrape_competitor_prices() returns the hard-coded
# test data instead when the request or the parsing raises.
df_competitor = scrape_competitor_prices(COMPETITOR_URL)

# Save the result next to the extracted ERP tables. When no product was
# parsed the frame is empty and the CSV contains no data rows.
df_competitor.to_csv('data/extracted/competitor_prices.csv', index=False)

print(f"\n‚úÖ Scraping termin√©: {len(df_competitor)} produits extraits")
print(f"üíæ Fichier sauvegard√©: data/extracted/competitor_prices.csv")

# Show the first rows (at most ten)
print("\nüìã Aper√ßu des prix concurrents:")
display(df_competitor.head(10))

# %% [markdown]
# ## 7️⃣ Validation des Données Extraites

# %% V√©rifications de qualit√©
print("\n" + "="*60)
print("‚úÖ VALIDATION DES DONN√âES")
print("="*60 + "\n")

# One summary record per extracted table: size, missing cells, duplicate rows.
validation_results = []

for name, df in dataframes.items():
    n_rows, n_cols = df.shape

    # Missing cells, as a count and as a share of all cells
    missing = df.isnull().sum().sum()
    missing_pct = (missing / (n_rows * n_cols)) * 100

    # Fully duplicated rows
    duplicates = df.duplicated().sum()

    # A table passes with under 5% missing cells and under 10 duplicates.
    if missing_pct < 5 and duplicates < 10:
        status = '‚úÖ'
    else:
        status = '‚ö†Ô∏è'

    record = {
        'Table': name,
        'Lignes': n_rows,
        'Colonnes': n_cols,
        'Valeurs_Manquantes': missing,
        'Pct_Manquant': f"{missing_pct:.2f}%",
        'Doublons': duplicates,
        'Statut': status
    }
    validation_results.append(record)

df_validation = pd.DataFrame(validation_results)
display(df_validation)

# %% [markdown]
# ## 8️⃣ Résumé de l'Extraction

# %% R√©sum√© final
print("\n" + "="*70)
print("üìä R√âSUM√â DE L'EXTRACTION (MEMBRE 1)")
print("="*70 + "\n")

print("‚úÖ T√ÇCHES COMPL√âT√âES:")
print("   1. Connexion MySQL √©tablie et test√©e")
# Report the number of tables actually extracted instead of a fixed "8".
print(f"   2. {len(dataframes)} tables extraites de l'ERP")
print("   3. Web scraping des prix concurrents effectu√©")
print("   4. Toutes les donn√©es sauvegard√©es en CSV")
print("   5. Validation de la qualit√© des donn√©es effectu√©e")

# List every entry currently in data/extracted with its size. Sorted so the
# listing is the same on every run; os.listdir() order is arbitrary.
print("\nüì¶ FICHIERS CR√â√âS:")
for file in sorted(os.listdir('data/extracted')):
    file_path = os.path.join('data/extracted', file)
    file_size = os.path.getsize(file_path) / 1024  # KB
    print(f"   ‚Ä¢ {file} ({file_size:.1f} KB)")

print("\nüöÄ PROCHAINE √âTAPE:")
print("   ‚Üí Membre 2 peut maintenant transformer ces donn√©es")
print("   ‚Üí Fichiers disponibles dans: data/extracted/")

print("\n" + "="*70)
print("‚úÖ EXTRACTION TERMIN√âE AVEC SUCC√àS")
print("="*70)





# %% [markdown]
# ## PARTIE 2 : Transformations des Données (Hadjer)
#
# **Responsable** : Hadjer – Data Cleaning & Feature Engineering Specialist

# %% Configuration initiale
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import os

# Cr√©er le dossier de sortie
os.makedirs('data/transformed', exist_ok=True)

# %% Chargement des donn√©es
print("üöÄ Chargement des donn√©es extraites...\n")

# Required inputs: a missing file raises here.
sales_df = pd.read_csv('data/extracted/sales.csv')
products_df = pd.read_csv('data/extracted/products.csv')
customers_df = pd.read_csv('data/extracted/customers.csv')
cities_df = pd.read_csv('data/extracted/cities.csv')
categories_df = pd.read_csv('data/extracted/categories.csv')

# Optional inputs: None when the file is not in the extraction folder.
if 'reviews.csv' in os.listdir('data/extracted'):
    reviews_df = pd.read_csv('data/extracted/reviews.csv')
else:
    reviews_df = None

if 'stores.csv' in os.listdir('data/extracted'):
    stores_df = pd.read_csv('data/extracted/stores.csv')
else:
    stores_df = None

# Optional input, reported on screen
try:
    subcategories_df = pd.read_csv('data/extracted/subcategories.csv')
except FileNotFoundError:
    subcategories_df = None
    print("‚ö† subcategories.csv non trouv√©")
else:
    print("‚úì subcategories.csv charg√©")

print(f"\nDonn√©es charg√©es : {len(sales_df)} ventes, {len(products_df)} produits")
print("Colonnes dans sales.csv :", sales_df.columns.tolist())

# %% 1. Aper√ßu initial et valeurs manquantes
# Step 1: show the first rows of the sales table and the number of missing
# values in each of its columns, before any transformation.
print("\n" + "="*70)
print("1. Aper√ßu initial et valeurs manquantes")
print("="*70)

display(sales_df.head(5))
print("\nValeurs manquantes par colonne :")
print(sales_df.isnull().sum())

# %% 2. D√©tection et normalisation des colonnes critiques
print("\n" + "="*70)
print("2. D√©tection automatique des colonnes essentielles")
print("="*70)

# --- Date column: rename the first known alias to 'date', add year/month ---
possible_dates = ['Date', 'date', 'order_date', 'sale_date', 'Order_Date']
date_col = next((col for col in possible_dates if col in sales_df.columns), None)
if date_col:
    sales_df = sales_df.rename(columns={date_col: 'date'})
    sales_df['date'] = pd.to_datetime(sales_df['date'], errors='coerce')
    sales_df['year'] = sales_df['date'].dt.year
    sales_df['month'] = sales_df['date'].dt.month
    print(f"‚úì Colonne date d√©tect√©e et normalis√©e : '{date_col}' ‚Üí 'date'")
else:
    print("‚ö† Aucune colonne date trouv√©e ‚Üí cr√©ation de placeholders")
    sales_df['date'] = pd.NaT
    sales_df['year'] = pd.NA
    sales_df['month'] = pd.NA

# --- Sale value column: mandatory, renamed to 'order_value_EUR' ---
# NOTE(review): the extraction cells print this amount in DZD, so the "_EUR"
# suffix may be misleading - confirm the currency before reporting.
possible_values = ['order_value_EUR', 'Order_Value_EUR', 'Total_Revenue', 'total_revenue', 'revenue', 'price']
value_col = next((col for col in possible_values if col in sales_df.columns), None)
if value_col:
    sales_df = sales_df.rename(columns={value_col: 'order_value_EUR'})
    sales_df['order_value_EUR'] = pd.to_numeric(sales_df['order_value_EUR'], errors='coerce')
    print(f"‚úì Colonne valeur vente d√©tect√©e : '{value_col}' ‚Üí 'order_value_EUR'")
else:
    raise KeyError("Aucune colonne de valeur de vente trouv√©e !")

# --- Cost column ---
possible_costs = ['cost', 'Cost', 'unit_cost', 'Unit_Cost', 'cost_price', 'Cost_Price', 'total_cost', 'Total_Cost']
cost_col = next((col for col in possible_costs if col in sales_df.columns), None)
if cost_col:
    sales_df = sales_df.rename(columns={cost_col: 'cost'})
    sales_df['cost'] = pd.to_numeric(sales_df['cost'], errors='coerce')
    print(f"‚úì Colonne co√ªt d√©tect√©e : '{cost_col}' ‚Üí 'cost'")
elif ({'Product_ID', 'Quantity'} <= set(sales_df.columns)
      and {'Product_ID', 'Unit_Cost'} <= set(products_df.columns)):
    # The sales table has no cost column of its own. Derive the cost of each
    # sale from the product catalogue instead of filling the column with NaN,
    # which made the cleaning step drop every row.
    # Assumes the sale value is a line total (unit price x Quantity), so the
    # matching cost is Unit_Cost x Quantity - TODO confirm with the ERP owner.
    unit_cost_by_product = pd.to_numeric(
        products_df.drop_duplicates(subset='Product_ID')
        .set_index('Product_ID')['Unit_Cost'],
        errors='coerce',
    )
    quantity = pd.to_numeric(sales_df['Quantity'], errors='coerce')
    sales_df['cost'] = sales_df['Product_ID'].map(unit_cost_by_product) * quantity
    print("Colonne cout derivee : products.Unit_Cost x Quantity -> 'cost'")
else:
    print("‚ö† Aucune colonne co√ªt trouv√©e ‚Üí cr√©ation avec NaN")
    sales_df['cost'] = np.nan

# --- Optional columns: rename the first known alias to the standard name ---
for std, possibles in {'device_type': ['device_type', 'Device_Type', 'channel'],
                       'category': ['category', 'Category', 'product_category'],
                       'country': ['country', 'Country'],
                       'city': ['city', 'City']}.items():
    col = next((c for c in possibles if c in sales_df.columns), None)
    if col and col != std:
        sales_df = sales_df.rename(columns={col: std})

# %% 3. Nettoyage des valeurs non num√©riques (cost et order_value_EUR)
print("\n" + "="*70)
print("3. Nettoyage des colonnes num√©riques")
print("="*70)

# Report rows whose value is missing (including values that could not be
# converted to a number earlier) and show a sample of them.
for col in ['order_value_EUR', 'cost']:
    if col in sales_df.columns:
        invalid = sales_df[col].isna()
        print(f"{col} - Lignes invalides/apr√®s coerce : {invalid.sum()}")
        if invalid.sum() > 0:
            display(sales_df[invalid].head(3))

# A sale without a value is unusable: drop it.
sales_df = sales_df.dropna(subset=['order_value_EUR'])

# Drop rows without cost only when the column carries some data. When the
# whole column is NaN (no cost available at all), dropping on it would empty
# the dataset and make every later step run on zero rows.
if 'cost' in sales_df.columns:
    if sales_df['cost'].notna().any():
        sales_df = sales_df.dropna(subset=['cost'])
    else:
        print("cost entierement vide : lignes conservees (profit_margin restera NaN)")

print(f"‚úì Nettoyage termin√© ‚Üí {len(sales_df)} lignes restantes")

# %% 4. Imputation des valeurs manquantes restantes
print("\n" + "="*70)
print("4. Imputation")
print("="*70)

# Fill remaining gaps with the column mean. The result is assigned back:
# calling fillna(..., inplace=True) on a column selection acts on an
# intermediate object and does not reliably update the frame under pandas
# copy-on-write.
sales_df['order_value_EUR'] = sales_df['order_value_EUR'].fillna(sales_df['order_value_EUR'].mean())
if 'cost' in sales_df.columns:
    sales_df['cost'] = sales_df['cost'].fillna(sales_df['cost'].mean())

if 'device_type' in sales_df.columns:
    # Most frequent value, or 'unknown' when the column has no value at all.
    device_modes = sales_df['device_type'].mode()
    mode_device = device_modes.iloc[0] if not device_modes.empty else 'unknown'
    sales_df['device_type'] = sales_df['device_type'].fillna(mode_device)
    print(f"device_type imput√© avec : '{mode_device}'")

# %% 5. Suppression des doublons
print("\n" + "="*70)
print("5. Suppression des doublons")
print("="*70)

# Flag rows that repeat an earlier row in full, count them, keep the rest.
duplicate_mask = sales_df.duplicated()
duplicates = duplicate_mask.sum()
sales_df = sales_df.loc[~duplicate_mask].copy()
print(f"‚úì {duplicates} doublons supprim√©s ‚Üí {len(sales_df)} lignes")

# %% 6. Cr√©ation de profit_margin
print("\n" + "="*70)
print("6. Cr√©ation profit_margin")
print("="*70)

# profit_margin is the absolute difference: sale value minus cost.
if 'cost' not in sales_df.columns:
    sales_df['profit_margin'] = np.nan
    print("‚ö† profit_margin mis √† NaN (pas de colonne cost)")
else:
    sales_df['profit_margin'] = sales_df['order_value_EUR'].sub(sales_df['cost'])

preview_columns = ['order_value_EUR', 'cost', 'profit_margin']
display(sales_df[preview_columns].head())

# %% 7. Analyse des profits n√©gatifs
print("\n" + "="*70)
print("7. Transactions en perte")
print("="*70)

# Sales whose cost exceeds their value; show up to ten of them.
if 'profit_margin' in sales_df.columns:
    loss_mask = sales_df['profit_margin'].lt(0)
    negative = sales_df[loss_mask]
    print(f"Nombre de ventes en perte : {len(negative)}")
    if len(negative):
        display(negative.head(10))

# %% 8. Standardisation texte
print("\n" + "="*70)
print("8. Standardisation colonnes texte")
print("="*70)

# Trim, lower-case and replace hyphens with spaces in the text columns that
# exist in the frame.
text_cols = ['country', 'category', 'city', 'device_type']
for col in text_cols:
    if col in sales_df.columns:
        cleaned = (
            sales_df[col]
            .astype(str)
            .str.strip()
            .str.lower()
            # regex=False: '-' is a literal on every pandas version.
            .str.replace('-', ' ', regex=False)
        )
        # astype(str) turns missing values into the text 'nan'; restore them
        # as real missing values so they are not counted as a category.
        sales_df[col] = cleaned.where(sales_df[col].notna())

# %% 9. Colonnes d√©riv√©es
print("\n" + "="*70)
print("9. Colonnes d√©riv√©es + top cat√©gories")
print("="*70)

if 'profit_margin' in sales_df.columns and (sales_df['order_value_EUR'] != 0).any():
    # Rows with a zero sale value get NaN rather than +/-inf, so they do not
    # distort the per-category means below.
    non_zero_value = sales_df['order_value_EUR'].replace(0, np.nan)
    sales_df['margin_rate'] = sales_df['profit_margin'] / non_zero_value
    sales_df['margin_rate'] = sales_df['margin_rate'].round(4)
    # NOTE(review): this is the profit expressed in thousands, not a profit
    # per 1000 of sale value as the column name suggests - confirm intent.
    sales_df['profit_per_1000EUR'] = (sales_df['profit_margin'] / 1000).round(2)

    if 'category' in sales_df.columns:
        # Mean margin rate per category, as a percentage, five highest first.
        top_cat = sales_df.groupby('category')['margin_rate'].mean().sort_values(ascending=False).head(5) * 100
        print("Top 5 cat√©gories les plus rentables (%) :")
        display(top_cat)

# %% 10. Nettoyage incoh√©rences business
print("\n" + "="*70)
print("10. Suppression incoh√©rences")
print("="*70)

# A row is rejected when its sale value is not positive or, if a cost column
# exists, when the sale value is below the cost.
bad = sales_df['order_value_EUR'] <= 0
if 'cost' in sales_df.columns:
    bad = bad | (sales_df['order_value_EUR'] < sales_df['cost'])

bad_count = bad.sum()
sales_df = sales_df[~bad].copy()
print(f"‚úì {bad_count} lignes incoh√©rentes supprim√©es ‚Üí {len(sales_df)} lignes valides")

# Save the cleaned table as CSV and as Excel
for writer, target in (
    (sales_df.to_csv, 'data/transformed/sales_cleaned.csv'),
    (sales_df.to_excel, 'data/transformed/sales_cleaned.xlsx'),
):
    writer(target, index=False)
print("üíæ Fichiers cleaned sauvegard√©s dans data/transformed/")

# %% 11. Ajout 3 lignes NaN + backward fill
# Step 11: prepend three rows without values, then back-fill every missing
# cell from the next non-missing value below it in the same column.
print("\n" + "="*70)
print("11. Ajout 3 lignes NaN + backward fill")
print("="*70)

# Three rows with the same columns as sales_df and no data.
empty = pd.DataFrame(index=range(3), columns=sales_df.columns)
# The new rows come first; ignore_index renumbers the result from 0.
# NOTE(review): the dtypes produced when concatenating an all-NA frame depend
# on the pandas version - check that numeric columns stay numeric here.
sales_df = pd.concat([empty, sales_df], ignore_index=True)
sales_df = sales_df.bfill()

print("‚úì 3 lignes ajout√©es et backward fill appliqu√©")

# %% 12. Scaling
print("\n" + "="*70)
print("12. Scaling (MinMax & Standard)")
print("="*70)

# Numeric columns to scale, limited to those present in the frame.
num_cols = [col for col in ['order_value_EUR', 'cost', 'profit_margin'] if col in sales_df.columns]

if not num_cols:
    print("‚ö† Aucune colonne num√©rique pour scaling")
else:
    minmax = MinMaxScaler().fit_transform(sales_df[num_cols])
    std = StandardScaler().fit_transform(sales_df[num_cols])

    # First ten rows: original values first, then the min-max scaled values,
    # then the standardised values, one column per numeric column each time.
    comparison_data = {}
    for c in num_cols:
        comparison_data[f"{c}_orig"] = sales_df[c].head(10).round(2)
    for i, c in enumerate(num_cols):
        comparison_data[f"{c}_minmax"] = minmax[:10, i].round(4)
    for i, c in enumerate(num_cols):
        comparison_data[f"{c}_std"] = std[:10, i].round(4)

    comparison = pd.DataFrame(comparison_data)
    display(comparison)

print("\n" + "="*70)
print("üéâ PARTIE 2 TERMIN√âE AVEC SUCC√àS !")
print("="*70)

‚úÖ Biblioth√®ques import√©es avec succ√®s
üìÖ Date d'ex√©cution: 2025-12-25 14:47:44
üìã Configuration charg√©e
   Serveur: boughida.com
   Base de donn√©es: techstore_erp
‚úÖ Connexion MySQL r√©ussie!
   Base de donn√©es active: techstore_erp
   Nombre de tables: 8

üöÄ EXTRACTION DES TABLES MySQL

üìä Extraction: table_sales... ‚úÖ 25000 lignes | 7 colonnes
üìä Extraction: table_products... ‚úÖ 38 lignes | 5 colonnes
üìä Extraction: table_reviews... ‚úÖ 3000 lignes | 5 colonnes
üìä Extraction: table_customers... ‚úÖ 1200 lignes | 3 colonnes
üìä Extraction: table_stores... ‚úÖ 12 lignes | 3 colonnes
üìä Extraction: table_cities... ‚úÖ 12 lignes | 3 colonnes
üìä Extraction: table_categories... ‚úÖ 5 lignes | 2 colonnes
üìä Extraction: table_subcategories... ‚úÖ 15 lignes | 3 colonnes

‚úÖ Extraction MySQL termin√©e!
üì¶ 8 tables extraites

üìã APER√áU DES DONN√âES EXTRAITES


üìä Table: sales
   Dimensions: 25000 lignes √ó 7 colonnes
   Colonnes: Trans_ID, Date, Store_ID

Unnamed: 0,Trans_ID,Date,Store_ID,Product_ID,Customer_ID,Quantity,Total_Revenue
0,1,2024-07-02 17:15:00,3,P125,C0200,2,9000.0
1,2,2023-07-01 13:52:00,11,P136,C0883,1,2500.0
2,3,2025-01-05 18:56:00,7,P106,C0669,1,320000.0


------------------------------------------------------------

üìä Table: products
   Dimensions: 38 lignes √ó 5 colonnes
   Colonnes: Product_ID, Product_Name, SubCat_ID, Unit_Price, Unit_Cost...
   Aper√ßu:


Unnamed: 0,Product_ID,Product_Name,SubCat_ID,Unit_Price,Unit_Cost
0,P100,HP Victus 15,1,125000.0,87901.0
1,P101,Dell XPS 13,1,260000.0,167880.0
2,P102,MacBook Air M2,1,195000.0,141052.0


------------------------------------------------------------

üìä Table: reviews
   Dimensions: 3000 lignes √ó 5 colonnes
   Colonnes: Review_ID, Product_ID, Customer_ID, Rating, Review_Text...
   Aper√ßu:


Unnamed: 0,Review_ID,Product_ID,Customer_ID,Rating,Review_Text
0,1,P127,C1149,3,"Average, expected better battery life."
1,2,P103,C0116,5,"Excellent product, highly recommended!"
2,3,P102,C0238,4,"Good, but shipping was a bit slow to Guelma."


------------------------------------------------------------

üìä Table: customers
   Dimensions: 1200 lignes √ó 3 colonnes
   Colonnes: Customer_ID, Full_Name, City_ID...
   Aper√ßu:


Unnamed: 0,Customer_ID,Full_Name,City_ID
0,C0001,Fares Mekki,9
1,C0002,Zineb Oukil,10
2,C0003,Amel Rahmani,2


------------------------------------------------------------

üìä Table: stores
   Dimensions: 12 lignes √ó 3 colonnes
   Colonnes: Store_ID, Store_Name, City_ID...
   Aper√ßu:


Unnamed: 0,Store_ID,Store_Name,City_ID
0,1,TechStore Alger Centre,1
1,2,TechStore Oran Bahia,2
2,3,TechStore Constantine Cirta,3


------------------------------------------------------------

üìä Table: cities
   Dimensions: 12 lignes √ó 3 colonnes
   Colonnes: City_ID, City_Name, Region...
   Aper√ßu:


Unnamed: 0,City_ID,City_Name,Region
0,1,Alger,North
1,2,Oran,West
2,3,Constantine,East


------------------------------------------------------------

üìä Table: categories
   Dimensions: 5 lignes √ó 2 colonnes
   Colonnes: Category_ID, Category_Name...
   Aper√ßu:


Unnamed: 0,Category_ID,Category_Name
0,1,Computers
1,2,Smartphones
2,3,Audio


------------------------------------------------------------

üìä Table: subcategories
   Dimensions: 15 lignes √ó 3 colonnes
   Colonnes: SubCat_ID, SubCat_Name, Category_ID...
   Aper√ßu:


Unnamed: 0,SubCat_ID,SubCat_Name,Category_ID
0,1,Laptops,1
1,2,Desktops,1
2,3,Monitors,1


------------------------------------------------------------

üìä STATISTIQUES DES VENTES
Nombre total de ventes: 25,000
Revenu total: 859,661,700.00 DZD
Revenu moyen par vente: 34,386.47 DZD
P√©riode: 2023-01-01 09:43:00 ‚Üí 2025-12-30 20:58:00

üï∑Ô∏è  WEB SCRAPING - PRIX CONCURRENTS

üì° Connexion √†: https://boughida.com/competitor/
üîç 0 produits trouv√©s

‚úÖ Scraping termin√©: 0 produits extraits
üíæ Fichier sauvegard√©: data/extracted/competitor_prices.csv

üìã Aper√ßu des prix concurrents:



‚úÖ VALIDATION DES DONN√âES



Unnamed: 0,Table,Lignes,Colonnes,Valeurs_Manquantes,Pct_Manquant,Doublons,Statut
0,sales,25000,7,0,0.00%,0,‚úÖ
1,products,38,5,0,0.00%,0,‚úÖ
2,reviews,3000,5,0,0.00%,0,‚úÖ
3,customers,1200,3,0,0.00%,0,‚úÖ
4,stores,12,3,0,0.00%,0,‚úÖ
5,cities,12,3,0,0.00%,0,‚úÖ
6,categories,5,2,0,0.00%,0,‚úÖ
7,subcategories,15,3,0,0.00%,0,‚úÖ



üìä R√âSUM√â DE L'EXTRACTION (MEMBRE 1)

‚úÖ T√ÇCHES COMPL√âT√âES:
   1. Connexion MySQL √©tablie et test√©e
   2. 8 tables extraites de l'ERP
   3. Web scraping des prix concurrents effectu√©
   4. Toutes les donn√©es sauvegard√©es en CSV
   5. Validation de la qualit√© des donn√©es effectu√©e

üì¶ FICHIERS CR√â√âS:
   ‚Ä¢ categories.csv (0.1 KB)
   ‚Ä¢ cities.csv (0.2 KB)
   ‚Ä¢ competitor_prices.csv (0.0 KB)
   ‚Ä¢ customers.csv (26.9 KB)
   ‚Ä¢ extraction_summary.csv (0.3 KB)
   ‚Ä¢ products.csv (1.5 KB)
   ‚Ä¢ reviews.csv (179.5 KB)
   ‚Ä¢ sales.csv (1199.7 KB)
   ‚Ä¢ stores.csv (0.4 KB)
   ‚Ä¢ subcategories.csv (0.3 KB)

üöÄ PROCHAINE √âTAPE:
   ‚Üí Membre 2 peut maintenant transformer ces donn√©es
   ‚Üí Fichiers disponibles dans: data/extracted/

‚úÖ EXTRACTION TERMIN√âE AVEC SUCC√àS
üöÄ Chargement des donn√©es extraites...

‚úì subcategories.csv charg√©

Donn√©es charg√©es : 25000 ventes, 38 produits
Colonnes dans sales.csv : ['Trans_ID', 'Date', 'Store_ID', 'Product_ID',

Unnamed: 0,Trans_ID,Date,Store_ID,Product_ID,Customer_ID,Quantity,Total_Revenue
0,1,2024-07-02 17:15:00,3,P125,C0200,2,9000.0
1,2,2023-07-01 13:52:00,11,P136,C0883,1,2500.0
2,3,2025-01-05 18:56:00,7,P106,C0669,1,320000.0
3,4,2024-08-01 13:27:00,2,P125,C0848,1,4500.0
4,5,2023-04-30 13:45:00,10,P113,C0963,1,58000.0



Valeurs manquantes par colonne :
Trans_ID         0
Date             0
Store_ID         0
Product_ID       0
Customer_ID      0
Quantity         0
Total_Revenue    0
dtype: int64

2. D√©tection automatique des colonnes essentielles
‚úì Colonne date d√©tect√©e et normalis√©e : 'Date' ‚Üí 'date'
‚úì Colonne valeur vente d√©tect√©e : 'Total_Revenue' ‚Üí 'order_value_EUR'
‚ö† Aucune colonne co√ªt trouv√©e ‚Üí cr√©ation avec NaN

3. Nettoyage des colonnes num√©riques
order_value_EUR - Lignes invalides/apr√®s coerce : 0
cost - Lignes invalides/apr√®s coerce : 25000


Unnamed: 0,Trans_ID,date,Store_ID,Product_ID,Customer_ID,Quantity,order_value_EUR,year,month,cost
0,1,2024-07-02 17:15:00,3,P125,C0200,2,9000.0,2024,7,
1,2,2023-07-01 13:52:00,11,P136,C0883,1,2500.0,2023,7,
2,3,2025-01-05 18:56:00,7,P106,C0669,1,320000.0,2025,1,


‚úì Nettoyage termin√© ‚Üí 0 lignes restantes

4. Imputation

5. Suppression des doublons
‚úì 0 doublons supprim√©s ‚Üí 0 lignes

6. Cr√©ation profit_margin


Unnamed: 0,order_value_EUR,cost,profit_margin



7. Transactions en perte
Nombre de ventes en perte : 0

8. Standardisation colonnes texte

9. Colonnes d√©riv√©es + top cat√©gories

10. Suppression incoh√©rences
‚úì 0 lignes incoh√©rentes supprim√©es ‚Üí 0 lignes valides
üíæ Fichiers cleaned sauvegard√©s dans data/transformed/

11. Ajout 3 lignes NaN + backward fill
‚úì 3 lignes ajout√©es et backward fill appliqu√©

12. Scaling (MinMax & Standard)


Unnamed: 0,order_value_EUR_orig,cost_orig,profit_margin_orig,order_value_EUR_minmax,cost_minmax,profit_margin_minmax,order_value_EUR_std,cost_std,profit_margin_std
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,



üéâ PARTIE 2 TERMIN√âE AVEC SUCC√àS !
Toutes les transformations sont compl√®tes et robustes.
