# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import f1_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import joblib
import os

# Set random seed for reproducibility.
# This seeds NumPy's global RNG only; the estimators created further down
# receive their own explicit random_state=42 arguments.
np.random.seed(42)

# Load data (unchanged per request)
# Both CSVs are read relative to the current working directory.
try:
    train_df = pd.read_csv('churn-detection/train.csv')
    test_df = pd.read_csv('churn-detection/test.csv')
except FileNotFoundError:
    # Print a hint for the user, then re-raise so execution stops here
    # instead of continuing with undefined frames.
    print("Error: train.csv or test.csv not found. Please upload files to /content/ or specify correct path.")
    raise

# Column groups used by the rest of the pipeline
numerical_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
service_cols = [
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]
# The categorical group is the demographic columns, then every service
# column, then the billing/contract columns (same order as before).
categorical_columns = (
    ['gender', 'Partner', 'Dependents']
    + service_cols
    + ['Contract', 'PaperlessBilling', 'PaymentMethod']
)

# Coerce the numeric columns in both frames; anything unparsable becomes
# NaN and is then replaced by 0
for col in numerical_columns:
    for frame in (train_df, test_df):
        frame[col] = pd.to_numeric(frame[col], errors='coerce').fillna(0)

# Binary-encode the target before any feature work ('No' -> 0, 'Yes' -> 1;
# any other value maps to NaN)
churn_codes = {'No': 0, 'Yes': 1}
train_df['Churn'] = train_df['Churn'].map(churn_codes)

# Encode categorical variables
# One LabelEncoder per column, fitted on the training values only. Test
# values that were never seen in training are replaced by the encoder's
# first class before transforming, so transform() cannot fail on them.
Encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype(str)).astype(int)

    test_values = test_df[col].astype(str)
    # Vectorised membership test instead of scanning le.classes_ per row.
    known = test_values.isin(le.classes_)
    test_values = test_values.where(known, le.classes_[0])
    # le.classes_ is intentionally left untouched: after the replacement
    # above every test value is already a known class, so extending the
    # fitted classes would add nothing and only mutate the saved encoder.
    test_df[col] = le.transform(test_values).astype(int)

    Encoders[col] = le
# Persist the fitted encoders so the same mapping can be reused later.
joblib.dump(Encoders, 'encoders_churn.pkl')

# Mean-target encoding helper
def target_encode(col, train_df, test_df, target='Churn'):
    """Add a `<col>_target_enc` column to both frames and return them.

    The encoding of a category is the mean of `target` over the training
    rows in that category. Test categories that do not occur in training
    receive the mean of the per-category means. Both frames are modified
    in place; the same objects are returned as `(train_df, test_df)`.
    """
    encoded_name = f'{col}_target_enc'
    category_rate = train_df.groupby(col)[target].mean()
    fallback_rate = category_rate.mean()

    train_df[encoded_name] = train_df[col].map(category_rate)

    test_rates = test_df[col].map(category_rate)
    test_df[encoded_name] = test_rates.fillna(fallback_rate)
    return train_df, test_df

# Append the <col>_target_enc column for the payment method and the contract type
for col in ('PaymentMethod', 'Contract'):
    train_df, test_df = target_encode(col, train_df=train_df, test_df=test_df)

# Optimized feature engineering
def _label_mask(series, label, encoder=None):
    """Return a boolean mask of the rows of `series` whose category is `label`.

    Works on raw string columns and on integer-encoded ones: when `series`
    is numeric and `encoder` exposes `classes_`, `label` is translated to
    its position in `classes_` (the code a LabelEncoder assigns) before
    comparing. A label that is missing from `classes_` matches no rows.
    """
    if encoder is not None and pd.api.types.is_numeric_dtype(series):
        classes = list(encoder.classes_)
        if label not in classes:
            return pd.Series(False, index=series.index)
        return series == classes.index(label)
    return series == label


def add_features(df, service_columns=None, encoders=None):
    """Add the engineered churn features to `df` in place and return it.

    Args:
        df: Frame containing SeniorCitizen, Dependents, Contract,
            InternetService, tenure, MonthlyCharges, TotalCharges and every
            column named in `service_columns`. Contract and InternetService
            must already be integer-valued (they are cast with astype(int)).
        service_columns: Columns counted into TotalServices. Defaults to the
            module-level `service_cols`.
        encoders: Mapping of column name to a fitted encoder exposing
            `classes_`. Defaults to the module-level `Encoders`. It is used
            to recognise 'Yes'/'No' in columns that were label-encoded
            before this function runs.

    Returns:
        The same `df` object, with the new columns added.
    """
    if service_columns is None:
        service_columns = service_cols
    if encoders is None:
        encoders = Encoders

    # The categorical columns are label-encoded before this function is
    # called, so comparing them with the strings 'Yes'/'No' directly would
    # never match and both features below would be constant 0. _label_mask
    # compares against the encoded value instead (and still handles raw
    # string columns).
    total_services = pd.Series(0, index=df.index)
    for col in service_columns:
        total_services = total_services + _label_mask(df[col], 'Yes', encoders.get(col)).astype(int)
    df['TotalServices'] = total_services

    # The small epsilon avoids division by zero for customers with tenure 0.
    df['MonthlyCharges_per_Tenure'] = df['MonthlyCharges'] / (df['tenure'] + 1e-6)
    df['Log_MonthlyCharges'] = np.log1p(df['MonthlyCharges'])
    df['TotalCharges_per_Tenure'] = df['TotalCharges'] / (df['tenure'] + 1e-6)

    no_dependents = _label_mask(df['Dependents'], 'No', encoders.get('Dependents'))
    df['IsSeniorAndAlone'] = ((df['SeniorCitizen'] == 1) & no_dependents).astype(int)

    df['Loyalty_Score'] = df['tenure'] * (1 + df['Contract']).astype(int) + df['TotalServices']
    df['Contract_tenure'] = df['tenure'] * df['Contract'].astype(int)
    df['InternetService_MonthlyCharges'] = df['InternetService'].astype(int) * df['MonthlyCharges']
    # NOTE(review): {0: 1, 1: 12, 2: 24} presumably is the contract length in
    # months for the encoded contract types - confirm against the class
    # order of the Contract encoder. Unmapped codes fall back to 1.
    df['Tenure_to_Contract'] = df['tenure'] / (df['Contract'].map({0: 1, 1: 12, 2: 24}).fillna(1) + 1e-6)
    # Quantile edges are computed from `df` itself, so they can differ
    # between frames passed in separate calls.
    df['MonthlyCharges_bin'] = pd.qcut(df['MonthlyCharges'], 5, labels=False, duplicates='drop')
    return df

# Run the feature builder over both splits (train first, then test)
train_df, test_df = add_features(train_df), add_features(test_df)

# Customer segment feature: 3-cluster KMeans fitted on the training rows only
# (n_init is passed explicitly to suppress the warning)
cluster_features = ['tenure', 'MonthlyCharges', 'TotalServices']
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(train_df[cluster_features])
# Training rows take the labels assigned during fitting; test rows are
# assigned to the nearest fitted centroid.
train_df['Customer_Cluster'] = kmeans.labels_
test_df['Customer_Cluster'] = kmeans.predict(test_df[cluster_features])
joblib.dump(kmeans, 'kmeans.pkl')

# Interaction term between the two key numeric columns
numerical_cols_poly = ['tenure', 'MonthlyCharges']
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
# Fit on the training rows, then apply the same expansion to the test rows
poly_features_train = poly.fit_transform(train_df[numerical_cols_poly])
poly_features_test = poly.transform(test_df[numerical_cols_poly])
poly_feature_names = poly.get_feature_names_out(numerical_cols_poly)
# Write the expanded columns back under the names reported by the transformer
for frame, expanded in ((train_df, poly_features_train), (test_df, poly_features_test)):
    frame[poly_feature_names] = expanded

# Scale numerical features
# poly_feature_names can repeat columns that are already listed
# (PolynomialFeatures re-emits its input columns next to the interaction
# term), so the list is de-duplicated in order. Otherwise those columns
# would be selected twice and the saved scaler would be fitted on
# duplicate features.
numerical_columns_extended = list(dict.fromkeys(
    numerical_columns
    + ['MonthlyCharges_per_Tenure', 'Log_MonthlyCharges',
       'TotalCharges_per_Tenure', 'Loyalty_Score',
       'Contract_tenure', 'InternetService_MonthlyCharges',
       'Tenure_to_Contract', 'MonthlyCharges_bin',
       'Customer_Cluster', 'PaymentMethod_target_enc',
       'Contract_target_enc']
    + list(poly_feature_names)
))
scaler = StandardScaler()
# Statistics come from the training rows only; the test rows reuse them.
train_df[numerical_columns_extended] = scaler.fit_transform(train_df[numerical_columns_extended])
test_df[numerical_columns_extended] = scaler.transform(test_df[numerical_columns_extended])
joblib.dump(scaler, 'scaler.pkl')

# Split the training frame into target and feature matrix, then select the
# same feature columns (in the same order) from the test frame
y = train_df['Churn']
X = train_df.drop(columns=['id', 'Churn'])
X_test_final = test_df[X.columns]

# Base learners with fixed, pre-tuned hyper-parameters (no GPU options are passed)
catboost_clf = CatBoostClassifier(
    iterations=800, learning_rate=0.03, depth=8, l2_leaf_reg=3,
    eval_metric='F1', class_weights=[1, 3], random_state=42, verbose=0,
)
lightgbm_clf = LGBMClassifier(
    n_estimators=700, learning_rate=0.05, max_depth=7, num_leaves=50,
    class_weight='balanced', random_state=42,
)
xgboost_clf = XGBClassifier(
    n_estimators=500, learning_rate=0.05, max_depth=6, subsample=0.8,
    scale_pos_weight=3, random_state=42,
)
rf_clf = RandomForestClassifier(
    n_estimators=200, max_depth=20, min_samples_split=5,
    class_weight='balanced', random_state=42, n_jobs=-1,
)
# The order of this list matters: the blending weights used at prediction
# time are positional.
base_models = [
    ('catboost', catboost_clf),
    ('lightgbm', lightgbm_clf),
    ('xgboost', xgboost_clf),
    ('rf', rf_clf),
]

# Stacked ensemble: logistic regression on top of the base learners
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1,
)

# Cross-validation and threshold optimization
# Step 1: collect out-of-fold probabilities for every training row. Each
# row is scored by a model that did not see it during fitting.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
val_preds = np.zeros(len(X))
for train_idx, val_idx in skf.split(X, y):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr = y.iloc[train_idx]
    stacking_model.fit(X_tr, y_tr)
    val_preds[val_idx] = stacking_model.predict_proba(X_val)[:, 1]

# Step 2: pick the threshold once, on the pooled out-of-fold predictions.
# Taking the best (fold, threshold) pair instead would report the luckiest
# fold's score and tune the threshold on a fifth of the data only.
best_f1, best_thresh = 0, 0.5
for thresh in np.arange(0.2, 0.81, 0.01):
    score = f1_score(y, val_preds >= thresh)
    if score > best_f1:
        best_f1, best_thresh = score, thresh

print(f"Best F1 (CV): {best_f1:.4f} at threshold {best_thresh:.2f}")

# Refit the stacked ensemble on all training rows and persist it
stacking_model.fit(X, y)
joblib.dump(stacking_model, 'stacking_model.pkl')

# Refit every base learner on all training rows and gather its positive-class
# probability for the test rows, one column per learner
per_model_probas = []
for _name, model in base_models:
    model.fit(X, y)
    per_model_probas.append(model.predict_proba(X_test_final)[:, 1])
test_probas = np.column_stack(per_model_probas)

# Positional weights: CatBoost, LightGBM, XGBoost, RF
weights = [0.4, 0.3, 0.2, 0.1]
blended_probas = np.average(test_probas, axis=1, weights=weights)

# Final score: 70% stacked ensemble, 30% weighted blend of the base learners,
# cut at the threshold found during cross-validation
test_proba = stacking_model.predict_proba(X_test_final)[:, 1]
final_proba = 0.7 * test_proba + 0.3 * blended_probas
test_pred = np.where(final_proba >= best_thresh, 'Yes', 'No')

# Write the id / predicted-label pairs to the submission CSV
submission = pd.DataFrame({'id': test_df['id'], 'Churn': test_pred})
submission.to_csv('THEsubmission.csv', index=False)
print(f"Submission file generated: {submission.shape}")

# Repeat the cross-validation summary between two separator lines
banner = "=" * 100
print(banner)
print(f"Best F1 (CV): {best_f1:.4f} at threshold {best_thresh:.2f}")
print(banner)