<a href="https://www.kaggle.com/code/jaskarandhillon1609/churn-rate-in-business-analysis?scriptVersionId=235036991" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>


#   **Project related to the application of machine learning in business analysis**

In [16]:
import pandas as pd
import numpy as np
import os

np.random.seed(42)
n_customers = 1000

df = pd.DataFrame({
    'customer_id': [f'CUST{i:04d}' for i in range(n_customers)],
    'gender': np.random.choice(['Male', 'Female'], n_customers),
    'senior_citizen': np.random.choice([0, 1], n_customers, p=[0.85, 0.15]),
    'partner': np.random.choice(['Yes', 'No'], n_customers),
    'dependents': np.random.choice(['Yes', 'No'], n_customers),
    'tenure': np.random.randint(1, 72, n_customers),
    'phone_service': np.random.choice(['Yes', 'No'], n_customers, p=[0.9, 0.1]),
    'multiple_lines': np.random.choice(['Yes', 'No', 'No phone service'], n_customers),
    'internet_service': np.random.choice(['DSL', 'Fiber optic', 'No'], n_customers),
    'online_security': np.random.choice(['Yes', 'No', 'No internet service'], n_customers),
    'online_backup': np.random.choice(['Yes', 'No', 'No internet service'], n_customers),
    'device_protection': np.random.choice(['Yes', 'No', 'No internet service'], n_customers),
    'tech_support': np.random.choice(['Yes', 'No', 'No internet service'], n_customers),
    'streaming_tv': np.random.choice(['Yes', 'No', 'No internet service'], n_customers),
    'streaming_movies': np.random.choice(['Yes', 'No', 'No internet service'], n_customers),
    'contract': np.random.choice(['Month-to-month', 'One year', 'Two year'], n_customers),
    'paperless_billing': np.random.choice(['Yes', 'No'], n_customers),
    'payment_method': np.random.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_customers),
    'monthly_charges': np.round(np.random.normal(70, 30, n_customers), 2),
    'churn': np.random.choice([0, 1], n_customers, p=[0.73, 0.27])
})

df['monthly_charges'] = df['monthly_charges'].clip(20, 120)
df['total_charges'] = df['tenure'] * df['monthly_charges']

os.makedirs('data', exist_ok=True)
df.to_csv('data/customer_churn_business.csv', index=False)
print("Saved as 'data/customer_churn_business.csv'")


Saved as 'data/customer_churn_business.csv'


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
import xgboost as xgb
import warnings

# Suppress UserWarning about use_label_encoder
warnings.filterwarnings("ignore", message="Parameters: { \"use_label_encoder\" } are not used.")

# Load data
df = pd.read_csv('data/customer_churn_business.csv')
df = df.drop(columns=['customer_id'])

# Encode categorical features
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
df[categorical_cols] = df[categorical_cols].apply(LabelEncoder().fit_transform)

# Features and target
X = df.drop(columns=['churn'])
y = df['churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optimized XGBoost model
# Remove use_label_encoder
model = xgb.XGBClassifier(
    eval_metric='logloss',
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    early_stopping_rounds=10
)

# Use eval_set in fit method
model.fit(X_train_scaled, y_train, eval_set=[(X_test_scaled, y_test)], verbose=False)

# Evaluation
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

print("AUC-ROC:", roc_auc_score(y_test, y_pred_proba))
print("AUC-PR:", average_precision_score(y_test, y_pred_proba))
# Suppress UndefinedMetricWarning and handle zero division
print(classification_report(y_test, y_pred, zero_division=1))

AUC-ROC: 0.5725095407290434
AUC-PR: 0.3022038436331209
              precision    recall  f1-score   support

           0       0.74      1.00      0.85       149
           1       1.00      0.00      0.00        51

    accuracy                           0.74       200
   macro avg       0.87      0.50      0.43       200
weighted avg       0.81      0.74      0.64       200

