# Étape 9 — Reproductibilité

Objectif : satisfaire exactement la consigne :

- Fixer les seeds (aléatoire)
- Setup d’environnement clair
- Une commande (ou une séquence de notebooks) permettant de reproduire les résultats

Ce notebook :
1) fixe les seeds
2) vérifie la racine du projet et crée `models/` et `reports/` si besoin
3) génère `requirements.txt`
4) génère `RUNBOOK.md` (ordre d’exécution clair)
5) génère / met à jour `README.md` (résumé + quickstart)
6) génère `reports/artifacts_summary.json` (preuves des outputs)


In [1]:
import os
import random
import numpy as np

# Single seed value shared by every source of randomness seeded below.
RANDOM_STATE = 42

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes, not this kernel
# -- confirm against the Python documentation.
os.environ["PYTHONHASHSEED"] = str(RANDOM_STATE)

# Seed the stdlib generator first, then NumPy's legacy global generator.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(RANDOM_STATE)

print("Seeds fixed:", RANDOM_STATE)


Seeds fixed: 42


In [2]:
from pathlib import Path
import datetime
import json
import pandas as pd

# IMPORTANT: this notebook must be executed from the project root, because
# every path below is built relative to the current working directory.
PROJECT_ROOT = Path(".")

# Minute-resolution timestamp (YYYY-MM-DD HH:MM) of this run.
now = f"{datetime.datetime.now():%Y-%m-%d %H:%M}"

print("Project root:", PROJECT_ROOT.resolve())


Project root: C:\Users\Ahmed\Documents\GitHub\netflixandtvshows


In [3]:
# Files that are expected to sit directly in the project root.
expected_files = [
    "netflix_titles.csv",
    "DataAcquisition.ipynb",
    "DataComprehension.ipynb",
    "ExploratoryAnalytics.ipynb",
    "Step3-DataCleaning.ipynb",
]

# Output directories that are expected to exist in the project root.
expected_dirs = ["models", "reports"]

# Take one snapshot of the root's direct children, then split it by type.
root_entries = list(PROJECT_ROOT.glob("*"))
present_files = {entry.name for entry in root_entries if entry.is_file()}
present_dirs = {entry.name for entry in root_entries if entry.is_dir()}

# Report what is missing; this is informational only and never raises.
missing_files = [name for name in expected_files if name not in present_files]
missing_dirs = [name for name in expected_dirs if name not in present_dirs]

print("\nMissing expected files:", missing_files)
print("Missing expected dirs:", missing_dirs)

# Create the output directories; exist_ok makes re-running the cell harmless.
for d in expected_dirs:
    PROJECT_ROOT.joinpath(d).mkdir(exist_ok=True)

print("Ensured dirs exist: ./models and ./reports")



Missing expected files: []
Missing expected dirs: []
Ensured dirs exist: ./models and ./reports


In [5]:
requirements_path = PROJECT_ROOT / "requirements.txt"

# Packages written to requirements.txt.
# NOTE(review): versions are not pinned here; consider pinning them if exact
# environment reproduction is required.
requirements_content = """\
numpy
pandas
scikit-learn
joblib
matplotlib
"""

# Back up a pre-existing, non-empty requirements file, but only when it
# differs from what is about to be written. Without the difference check, a
# second run of this cell would overwrite the backup of the original file
# with a copy of the generated one.
if requirements_path.exists():
    old = requirements_path.read_text(encoding="utf-8", errors="ignore")
    if old.strip() and old != requirements_content:
        backup = PROJECT_ROOT / "requirements.backup.txt"
        backup.write_text(old, encoding="utf-8")
        print("Backup created:", backup)

requirements_path.write_text(requirements_content, encoding="utf-8")
print("Wrote:", requirements_path)
# Read back with the same explicit encoding used for writing, instead of the
# platform default.
print(requirements_path.read_text(encoding="utf-8"))


Backup created: requirements.backup.txt
Wrote: requirements.txt
numpy
pandas
scikit-learn
joblib
matplotlib



In [6]:
runbook_path = PROJECT_ROOT / "RUNBOOK.md"

# Content of RUNBOOK.md: installation steps followed by the notebook order.
# The install section includes the virtual-environment activation step and
# uses `python -m pip`, so the packages are installed into .venv rather than
# into whatever `pip` happens to be first on PATH.
runbook = f"""# RUNBOOK — Reproduire le projet

Dernière génération : {now}

1) Installer
python -m venv .venv
Activer le venv : `.venv\\Scripts\\activate` (Windows) ou `source .venv/bin/activate` (Linux/macOS)
python -m pip install -r requirements.txt

2) Exécuter les notebooks
- DataAcquisition.ipynb
- DataComprehension.ipynb
- Step3-DataCleaning.ipynb
- ExploratoryAnalytics.ipynb
- Step7-HyperparameterTuning_ONLYCSV.ipynb
- Step8-EvaluationAndSelection.ipynb
- Etape9-Reproductibilite.ipynb
"""

# Any existing RUNBOOK.md is replaced by the freshly generated text.
runbook_path.write_text(runbook, encoding="utf-8")
print("RUNBOOK.md written")


RUNBOOK.md written


In [7]:
readme_path = PROJECT_ROOT / "README.md"

# First line of every README produced by this cell; it is also used below to
# recognise a README that was generated by a previous run.
readme_title = "# Mini pipeline NLP — Netflix is_mature"

readme = f"""{readme_title}

Reproductibilité :
- Seeds fixées : {RANDOM_STATE}
- Chemins relatifs
- Voir RUNBOOK.md pour la reproduction

Généré le {now}
"""

# Keep a copy of a pre-existing, non-empty README that was not generated by
# this notebook, mirroring the backup-before-overwrite behaviour used for
# requirements.txt. A README that already starts with the generated title is
# treated as our own output and is overwritten without a backup, so re-runs
# do not replace the backup of the original file.
if readme_path.exists():
    old_readme = readme_path.read_text(encoding="utf-8", errors="ignore")
    if old_readme.strip() and not old_readme.startswith(readme_title):
        readme_backup = PROJECT_ROOT / "README.backup.md"
        readme_backup.write_text(old_readme, encoding="utf-8")
        print("Backup created:", readme_backup)

readme_path.write_text(readme, encoding="utf-8")
print("README.md written")


README.md written


In [8]:
models_dir = PROJECT_ROOT / "models"
reports_dir = PROJECT_ROOT / "reports"

# Summary of the artifacts present on disk at the time of this run.
# The glob results are sorted because directory listing order is not
# guaranteed; sorting keeps the JSON stable across runs and machines.
artifacts = {
    "generated_at": now,
    "random_state": RANDOM_STATE,
    "models": sorted(p.as_posix() for p in models_dir.glob("*.joblib")),
    "reports": sorted(p.as_posix() for p in reports_dir.glob("*.csv")),
}

# The summary is stored next to the reports it lists.
artifacts_path = reports_dir / "artifacts_summary.json"
artifacts_path.write_text(json.dumps(artifacts, indent=2), encoding="utf-8")
print("artifacts_summary.json written")


artifacts_summary.json written
