# Exported from a notebook cell. The stray "In [None]:" cell prompt was removed
# and the misspelled import keyword fixed so the file parses as plain Python.
import pandas as pd
import numpy as np

# Simulate dataset based on project description.
# Seeding NumPy's global RNG makes every draw below reproducible; the draws are
# made in the same order as the dict keys, so do not reorder the entries if the
# exact simulated values matter.
np.random.seed(42)
n_rows = 1000  # number of simulated customers
data = {
    "state": np.random.choice(['CA', 'TX', 'NY', 'FL', 'WA'], n_rows),
    "account_length": np.random.randint(1, 100, n_rows),
    "area_code": np.random.choice([408, 415, 510], n_rows),
    "total_day_calls": np.random.randint(50, 150, n_rows),
    "total_intl_charge": np.round(np.random.uniform(1.0, 5.0, n_rows), 2),
    "number_customer_service_calls": np.random.randint(0, 10, n_rows),
    # Imbalanced target: roughly 20% "yes" / 80% "no".
    "churn": np.random.choice(['yes', 'no'], n_rows, p=[0.2, 0.8]),
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Standardization and One-Hot Encoding
from sklearn.preprocessing import StandardScaler
# Continuous features are z-scored; every other column is one-hot encoded.
df_numeric = df[["account_length", "total_day_calls", "total_intl_charge", "number_customer_service_calls"]]
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in this file, so test-set statistics leak into the
# training features. Fitting on the training partition only would avoid this.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_numeric),
    columns=df_numeric.columns,
    index=df.index,  # keep the original index so the concat below aligns row-for-row
)

# get_dummies only encodes object/string/category columns by default, which
# would leave the integer area_code as a raw, unscaled number. Listing the
# columns explicitly encodes it as the nominal category it is.
df_categorical = df.drop(columns=df_numeric.columns)
df_encoded = pd.get_dummies(
    df_categorical,
    columns=list(df_categorical.columns),
    drop_first=True,
    dtype=int,  # 0/1 indicators regardless of the pandas version's default dummy dtype
)
df_final = pd.concat([df_scaled, df_encoded], axis=1)

# EDA and Visualizations

import seaborn as sns
import matplotlib.pyplot as plt

# Each figure is built on an explicit Figure/Axes pair rather than pyplot's
# implicit "current figure", so the plots cannot interfere with one another.

# Bar plot of churn counts
fig_counts, ax_counts = plt.subplots(figsize=(8, 5))
# Passing `palette` without `hue` is deprecated in seaborn 0.13, so the x
# variable is also assigned to hue. dodge=False keeps the bars centred on their
# category on older seaborn versions that would otherwise offset them.
sns.countplot(x="churn", hue="churn", data=df, palette="viridis", dodge=False, ax=ax_counts)
# Older seaborn versions add a legend that merely repeats the x tick labels.
counts_legend = ax_counts.get_legend()
if counts_legend is not None:
    counts_legend.remove()
ax_counts.set_title("Customer Churn Counts")
fig_counts.savefig("churn_counts_plot.png")

# Heatmap of correlations
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
corr_matrix = df_numeric.corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax_corr)
ax_corr.set_title("Correlation Heatmap")
fig_corr.savefig("correlation_heatmap.png")

# Histogram for Total International Charges
fig_hist, ax_hist = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x="total_intl_charge", hue="churn", kde=True, bins=20, ax=ax_hist)
ax_hist.set_title("Total International Charges Distribution")
fig_hist.savefig("intl_charges_histogram.png")

# Predictive Modeling

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

# Split the data
X = df_final.drop(columns=["churn_yes"])
y = df_final["churn_yes"]
# Stratify on the target so the minority churn class keeps the same share in
# the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Logistic Regression
# The iteration cap is raised above the default so the solver is not stopped
# before it converges.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Random Forest
# Seeded so repeated runs produce the same forest and the same report.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# XGBoost
# `use_label_encoder` is deprecated and ignored by current XGBoost releases
# (it only triggers an "unused parameter" warning), so it is no longer passed.
xgb = XGBClassifier(eval_metric="logloss", random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Evaluation
for model, pred in [("Logistic Regression", y_pred_log), ("Random Forest", y_pred_rf), ("XGBoost", y_pred_xgb)]:
    print(f"Model: {model}")
    # zero_division=0 reports 0.0 (without a warning) for a class the model
    # never predicts, which can happen for the minority class.
    print(classification_report(y_test, pred, zero_division=0))