In [1]:
from pathlib import Path
import re, csv, codecs
from typing import Dict, Tuple, List
import pandas as pd

# Notebook and files are in the same folder
RAW_DIR = Path(".")   # current directory = raw_data

# Quick sanity check: list files here
sorted([p.name for p in RAW_DIR.iterdir() if p.is_file()])


['MLNOTEBOOK.ipynb',
 'X_test_clean.joblib',
 'cleaned_SIG_YOUSSER.csv',
 'cleaned_dataset.csv',
 'info.ipynb',
 'rf_model.joblib',
 'training_dataset_balanced.csv',
 'y_test.joblib']

In [2]:
def read_sample_bytes(path: Path, n_bytes: int = 512_000) -> bytes:
    with open(path, "rb") as f:
        return f.read(n_bytes)

def sniff_delimiter_and_header(path: Path) -> Tuple[str, bool]:
    sample = read_sample_bytes(path)
    text = None
    for enc in ("utf-8-sig", "utf-8", "cp1252", "latin-1"):
        try:
            text = sample.decode(enc)
            break
        except UnicodeDecodeError:
            continue
    if text is None:
        text = sample.decode("latin-1", errors="replace")

    try:
        dialect = csv.Sniffer().sniff(text, delimiters=[",", ";", "\t", "|", ":"])
        has_header = csv.Sniffer().has_header(text)
        return dialect.delimiter, has_header
    except csv.Error:
        lines = [ln for ln in text.splitlines() if ln.strip()]
        first = lines[0] if lines else ""
        candidates = [",", ";", "\t", "|", ":"]
        counts = {d: len(first.split(d)) for d in candidates}
        best = max(counts, key=counts.get)
        tokens = [t.strip().strip('"') for t in first.split(best)]
        has_header = any(not re.fullmatch(r"-?\d+(\.\d+)?", t) for t in tokens if t)
        return best, has_header

def count_columns(path: Path, delimiter: str) -> int:
    # first non-empty line (header or first row)
    for enc in ("utf-8-sig", "utf-8", "cp1252", "latin-1"):
        try:
            with codecs.open(path, "r", encoding=enc, errors="strict") as f:
                for line in f:
                    if line.strip():
                        return len(next(csv.reader([line], delimiter=delimiter)))
        except UnicodeDecodeError:
            continue
    with codecs.open(path, "r", encoding="latin-1", errors="replace") as f:
        for line in f:
            if line.strip():
                return len(next(csv.reader([line], delimiter=delimiter)))
    return 0

def fast_count_lines(path: Path) -> int:
    nl = 0
    with open(path, "rb", buffering=1024 * 1024) as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            nl += chunk.count(b"\n")
    if nl == 0 and path.stat().st_size > 0:
        return 1
    return nl

def fmt_int(n: int) -> str:
    return f"{n:,}"


In [3]:
FILE_DESCRIPTIONS: Dict[str, str] = {
    r"\bSIG_YOUSSER\b|\bSIG\b": "Données démographiques clients",
    r"\bCREDIT_YOUSSER\b|\bCREDIT\b": "Historique des crédits",
    r"\bCHAABI[_\s]*MOBILE_YOUSSER\b": "Produits mobiles",
    r"\bCHAABI[_\s]*NET_YOUSSER\b": "Produits net banking",
    r"\bPRODUIT[_\s]*PACK_YOUSSER\b": "Packs / offres produits",
    r"\bPRODUIT[_\s]*BANCASSURANCE_YOUSSER\b": "Produits bancassurance",
    r"\bCARTE_YOUSSER\b": "Cartes bancaires",
    r"\bCompte_YOUSSER\b|\bCOMPTE\b": "Comptes clients",
}

# Force header presence/absence if needed (filename -> True/False)
HEADER_OVERRIDES: Dict[str, bool] = {
    # "Compte_YOUSSER": True,
    # "SIG_YOUSSER": True,
}

def best_description(fname: str) -> str:
    for pat, desc in FILE_DESCRIPTIONS.items():
        if re.search(pat, fname, flags=re.IGNORECASE):
            return desc
    return ""


In [4]:
def build_inventory(raw_dir: Path) -> pd.DataFrame:
    rows: List[dict] = []

    # include .txt, .csv, and files without extension; exclude notebooks
    candidates = [
        p for p in raw_dir.iterdir()
        if p.is_file() and p.suffix.lower() not in {".ipynb"} and p.name != ".ipynb_checkpoints"
           and (p.suffix.lower() in {".txt", ".csv"} or p.suffix == "")
    ]

    for p in sorted(candidates, key=lambda x: x.name.lower()):
        delimiter, detected_header = sniff_delimiter_and_header(p)
        has_header = HEADER_OVERRIDES.get(p.name, detected_header)

        total_lines = fast_count_lines(p)
        n_rows = max(total_lines - (1 if has_header else 0), 0)
        n_cols = count_columns(p, delimiter)

        rows.append({
            "Source": p.name,
            "Lignes": n_rows,
            "Colonnes": n_cols,
            "Description": best_description(p.name),
        })

    # Add total
    df = pd.DataFrame(rows, columns=["Source", "Lignes", "Colonnes", "Description"])
    total = pd.DataFrame([{
        "Source": "Total",
        "Lignes": int(df["Lignes"].sum()) if not df.empty else 0,
        "Colonnes": "",
        "Description": "Volume global",
    }])
    return pd.concat([df, total], ignore_index=True)

df_info = build_inventory(RAW_DIR)
df_info


Unnamed: 0,Source,Lignes,Colonnes,Description
0,cleaned_dataset.csv,5530486,19.0,
1,cleaned_SIG_YOUSSER.csv,6700655,13.0,
2,training_dataset_balanced.csv,8800174,18.0,
3,Total,21031315,,Volume global


In [5]:
def print_summary_for_report(df: pd.DataFrame):
    print("Source | Lignes | Colonnes | Description")
    print("-" * 90)
    for _, r in df.iloc[:-1].iterrows():
        print(f"{r['Source']} | {fmt_int(int(r['Lignes']))} | {r['Colonnes']} | {r['Description']}")
    r = df.iloc[-1]
    print("-" * 90)
    print(f"{r['Source']} | {fmt_int(int(r['Lignes']))} |  | {r['Description']}")

print_summary_for_report(df_info)


Source | Lignes | Colonnes | Description
------------------------------------------------------------------------------------------
cleaned_dataset.csv | 5,530,486 | 19 | 
cleaned_SIG_YOUSSER.csv | 6,700,655 | 13 | 
training_dataset_balanced.csv | 8,800,174 | 18 | 
------------------------------------------------------------------------------------------
Total | 21,031,315 |  | Volume global


In [6]:
import pandas as pd

# Chargement du dataset final
df = pd.read_csv("cleaned_dataset.csv")  # ou CSV

# Nombre d'observations uniques
nb_clients = df["ID_CLIENT"].nunique()

# Nombre total de lignes
nb_rows = len(df)

# Nombre de colonnes
nb_cols = df.shape[1]

print(f"Clients uniques : {nb_clients:,}")
print(f"Lignes totales  : {nb_rows:,}")
print(f"Variables       : {nb_cols}")


  df = pd.read_csv("cleaned_dataset.csv")  # ou CSV


Clients uniques : 5,530,486
Lignes totales  : 5,530,486
Variables       : 19


In [7]:
print(df.memory_usage(deep=True).sum() / (1024**2), "MB")


2693.06990814209 MB


In [8]:
categories = {
    "Démographiques": ["SEXE", "DATE_NAISSANCE", "NOMBRE_ENFANT", "MARITAL_STATUS"],
    "Géographiques": ["VILLE", "COUNTRY", "RESIDENCE", "CODE_VILLE"],
    "Socio-économiques": ["PROFESSION", "FLAG_PROPRIETAIRE_LOGEMENT"],
    "Produits": ["has_mobile", "has_net", "has_pack", "has_bancassurance", "products_count", "digital_intensity"],
    "Variables cibles": ["credit_obtenu", "MT_ACCORDE"]
}

for cat, vars_list in categories.items():
    print(cat, len(vars_list), vars_list)


Démographiques 4 ['SEXE', 'DATE_NAISSANCE', 'NOMBRE_ENFANT', 'MARITAL_STATUS']
Géographiques 4 ['VILLE', 'COUNTRY', 'RESIDENCE', 'CODE_VILLE']
Socio-économiques 2 ['PROFESSION', 'FLAG_PROPRIETAIRE_LOGEMENT']
Produits 6 ['has_mobile', 'has_net', 'has_pack', 'has_bancassurance', 'products_count', 'digital_intensity']
Variables cibles 2 ['credit_obtenu', 'MT_ACCORDE']
