In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import StackingClassifier

# 1. Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 2. Preprocessing
# Force TotalCharges to numeric; entries that cannot be parsed become NaN.
train["TotalCharges"] = pd.to_numeric(train["TotalCharges"], errors="coerce")
test["TotalCharges"] = pd.to_numeric(test["TotalCharges"], errors="coerce")

# Fill the missing TotalCharges with MonthlyCharges * tenure.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is a chained assignment, which pandas deprecates and
# which does not update the parent frame once Copy-on-Write is enabled.
train["TotalCharges"] = train["TotalCharges"].fillna(
    train["MonthlyCharges"] * train["tenure"]
)
test["TotalCharges"] = test["TotalCharges"].fillna(
    test["MonthlyCharges"] * test["tenure"]
)

# Feature engineering
def enrich(df):
    """Return a copy of ``df`` with engineered churn features added.

    Added columns:
        TotalServices -- number of subscribed services: one per "Yes" among the
            phone / add-on columns, plus one when the customer has an internet
            connection.
        IsEngaged     -- 1 when ``tenure`` is greater than 12, else 0.
        IsAlone       -- 1 when both ``Partner`` and ``Dependents`` are "No".
        IsFiberUser   -- 1 when ``InternetService`` is "Fiber optic".
        IsDSLUser     -- 1 when ``InternetService`` is "DSL".

    The input frame is left untouched; callers must use the returned frame.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    yes_no_cols = [
        "PhoneService", "OnlineSecurity", "OnlineBackup",
        "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"
    ]
    # InternetService holds the connection type (see IsFiberUser / IsDSLUser
    # below), so comparing it to "Yes" alone would never count it as a service.
    # NOTE(review): only the connection types this function already knows about
    # are counted -- extend the list if the data contains other types.
    has_internet = df["InternetService"].isin(["Yes", "DSL", "Fiber optic"])

    df["TotalServices"] = (df[yes_no_cols] == "Yes").sum(axis=1) + has_internet.astype(int)
    df["IsEngaged"] = (df["tenure"] > 12).astype(int)
    df["IsAlone"] = ((df["Partner"] == "No") & (df["Dependents"] == "No")).astype(int)
    df["IsFiberUser"] = (df["InternetService"] == "Fiber optic").astype(int)
    df["IsDSLUser"] = (df["InternetService"] == "DSL").astype(int)
    return df

# Add the engineered features to both datasets
train = enrich(train)
test = enrich(test)

# Split features / target
X = train.drop(columns=["Churn", "id"])
y = train["Churn"]
X_test = test.drop(columns=["id"])

# Integer-encode the categorical (object) columns with one LabelEncoder each
cat_cols = X.select_dtypes(include=["object"]).columns
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    train_values = X[col].astype(str)
    test_values = X_test[col].astype(str)
    # Fit on the union of train and test values: LabelEncoder.transform raises
    # ValueError on labels it has not seen, so a category that only occurs in
    # the test set would otherwise abort the run. When the test set has no
    # new category the resulting codes are the same as fitting on train alone.
    le.fit(pd.concat([train_values, test_values], ignore_index=True))
    X[col] = le.transform(train_values)
    X_test[col] = le.transform(test_values)
    label_encoders[col] = le

# 3. Hold out 20% of the training rows for validation, stratified on the target
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 0/1 version of the validation target ('Yes' -> 1, anything else -> 0)
y_val_encoded = (y_val == 'Yes').astype(int)

# 4. Base learners of the stack
cat_model = CatBoostClassifier(verbose=0, random_state=42)
xgb_model = XGBClassifier(eval_metric="logloss", use_label_encoder=False, random_state=42)
lgb_model = LGBMClassifier(random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
base_models = [
    ('cat', cat_model),
    ('xgb', xgb_model),
    ('lgb', lgb_model),
    ('rf', rf_model),
]

# 5. Stacked model: a CatBoost meta-learner on top of the base learners
meta_model = CatBoostClassifier(verbose=0, learning_rate=0.01, random_state=42)
final_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    passthrough=True,
    cv=5,
)

# 6. Fit on the training part of the split
final_model.fit(X_train, y_train)

# 7. Scan cutoffs from 0.30 up to (but excluding) 0.90 in steps of 0.01 and
#    keep the one with the highest validation F1
probs = final_model.predict_proba(X_val)[:, 1]
cutoffs = np.arange(0.3, 0.9, 0.01)
cutoff_scores = [
    f1_score(y_val_encoded, (probs > cutoff).astype(int))
    for cutoff in cutoffs
]
# argmax returns the first maximum, i.e. the lowest cutoff among ties
top = int(np.argmax(cutoff_scores))
if cutoff_scores[top] > 0:
    best_thresh, best_f1 = cutoffs[top], cutoff_scores[top]
else:
    # No cutoff scored above zero: fall back to the defaults
    best_thresh, best_f1 = 0.5, 0

print(f"✅ Meilleur seuil = {best_thresh:.2f}, F1 = {best_f1:.4f}")

# 8. Refit the stacked model on the full training set
final_model.fit(X, y)

# 9. Score the test set and binarize with the cutoff selected on validation
test_probs = final_model.predict_proba(X_test)[:, 1]
test_preds = np.where(test_probs > best_thresh, 1, 0)

# 10. Export the predictions next to their ids
submission = test[["id"]].assign(Churn=test_preds)
submission.to_csv("submission.csv", index=False)
print("✅ Fichier submission.csv généré.")

In [None]:
import pandas as pd

# Reload the exported predictions
df = pd.read_csv('submission.csv')

# Swap the numeric labels in 'Churn' for their text form (0 -> 'No', 1 -> 'Yes');
# values that are neither 0 nor 1 are left as they are
churn_labels = {0: 'No', 1: 'Yes'}
df['Churn'] = df['Churn'].replace(churn_labels)

# Overwrite the CSV with the relabelled predictions
df.to_csv('submission.csv', index=False)

# Preview the first rows to check the result
print(df.head())