In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline


In [2]:
# Load the customer churn dataset into `df`, the frame every later cell reads.
# The path is a raw string (r"...") because it is a Windows path: in a normal
# string literal the backslashes in "\H", "\P", "\B" and "\c" are invalid
# escape sequences, which newer Python versions warn about. The resulting path
# text is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(r"C:\Hari\Projects\Business\customer_churn_dataset.csv")
df

Unnamed: 0,CustomerID,Age,Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Contract,Paperless_Billing,Payment_Method,...,Internet_Service,Online_Security,Online_Backup,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Monthly_Charges,Total_Charges,Churn
0,CUST_00001,45,Female,0,No,Yes,52,One year,Yes,Electronic check,...,DSL,No,No,Yes,No internet service,No internet service,No,78.16,3989.76,Yes
1,CUST_00002,38,Male,0,No,No,19,One year,Yes,Credit card,...,Fiber optic,No internet service,No,No,No,Yes,Yes,110.06,2098.31,Yes
2,CUST_00003,47,Female,1,Yes,No,35,One year,No,Electronic check,...,No,No,No,Yes,No,Yes,Yes,60.75,2083.04,No
3,CUST_00004,58,Female,0,No,Yes,48,Two year,Yes,Credit card,...,Fiber optic,No,No,No internet service,No internet service,No,No internet service,90.80,4325.62,Yes
4,CUST_00005,37,Female,0,Yes,No,60,Two year,No,Bank transfer,...,No,Yes,Yes,No internet service,No internet service,Yes,Yes,70.33,4247.93,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,CUST_04996,39,Female,0,No,Yes,61,Month-to-month,Yes,Bank transfer,...,DSL,Yes,Yes,No,Yes,No internet service,Yes,92.66,5609.56,No
4996,CUST_04997,48,Male,0,Yes,No,23,Month-to-month,No,Electronic check,...,Fiber optic,No,Yes,No,No internet service,No,No,94.71,2175.70,Yes
4997,CUST_04998,77,Male,1,No,Yes,36,Month-to-month,Yes,Credit card,...,Fiber optic,Yes,No internet service,Yes,Yes,No internet service,No internet service,110.98,4058.62,No
4998,CUST_04999,49,Male,0,No,Yes,38,Month-to-month,No,Credit card,...,Fiber optic,No internet service,No internet service,No,No internet service,No internet service,No internet service,90.14,3403.88,No


In [5]:
# 1. Dataset Overview
# Prints the headline figures for `df`: row count, column count, the share of
# rows whose Churn value is 'Yes', a per-dtype column tally, and the total
# number of null cells.
print("\n1. DATASET OVERVIEW")
print("-" * 30)

n_customers = len(df)
print(f"Total Customers: {n_customers:,}")

n_features = df.shape[1]
print(f"Total Features: {n_features}")

n_churned = (df['Churn'] == 'Yes').sum()
churn_rate_pct = n_churned / n_customers * 100
print(f"Churn Rate: {churn_rate_pct:.2f}%")

# Data types: how many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()
print("\nData Types:")
print(dtype_counts)

# Missing values check: nulls summed per column, then across columns
n_missing = df.isna().sum().sum()
print(f"\nMissing Values: {n_missing}")




1. DATASET OVERVIEW
------------------------------
Total Customers: 5,000
Total Features: 22
Churn Rate: 57.36%

Data Types:
object     17
int64       3
float64     2
Name: count, dtype: int64

Missing Values: 0


In [4]:
# 2. Exploratory Data Analysis
# Prints describe() statistics for the four numeric columns, followed by the
# count and percentage of rows for each Churn value.
print("\n\n2. EXPLORATORY DATA ANALYSIS")
print("-" * 40)

# Summary statistics for the numeric columns, rounded to two decimals
numerical_cols = ['Age', 'Tenure_Months', 'Monthly_Charges', 'Total_Charges']
print("\nNumerical Features Summary:")
numeric_summary = df[numerical_cols].describe().round(2)
print(numeric_summary)

# Class balance of the target, most frequent value first (value_counts order)
print("\nChurn Distribution:")
churn_counts = df['Churn'].value_counts()
for label, n_rows in churn_counts.items():
    share_pct = n_rows / len(df) * 100
    print(f"  {label}: {n_rows:,} ({share_pct:.1f}%)")



2. EXPLORATORY DATA ANALYSIS
----------------------------------------

Numerical Features Summary:
           Age  Tenure_Months  Monthly_Charges  Total_Charges
count  5000.00        5000.00          5000.00        5000.00
mean     39.75          36.39            87.64        3191.91
std      11.56          20.67            20.32        2004.31
min      18.00           1.00            20.00          42.01
25%      32.00          19.00            72.56        1516.39
50%      40.00          36.00            90.15        2982.73
75%      47.00          54.00           103.90        4663.45
max      80.00          72.00           120.00        8714.44

Churn Distribution:
  Yes: 2,868 (57.4%)
  No: 2,132 (42.6%)


In [6]:
# 3. Key insights from categorical variables
# For each inspected feature, prints a table whose rows are the feature's
# levels and whose columns are the Churn values, as row percentages.
print("\n\n3. CATEGORICAL ANALYSIS")
print("-" * 35)

categorical_cols = [
    'Gender', 'Senior_Citizen', 'Partner', 'Dependents',
    'Contract', 'Payment_Method', 'Internet_Service',
]

# Only the first four features are printed, to keep the output short.
shown_features = categorical_cols[:4]

for feature in shown_features:
    print(f"\n{feature} vs Churn:")
    # normalize='index' makes each row sum to 1; scale to percent afterwards
    row_shares = pd.crosstab(df[feature], df['Churn'], normalize='index')
    row_pct = row_shares * 100
    print(row_pct.round(1))



3. CATEGORICAL ANALYSIS
-----------------------------------

Gender vs Churn:
Churn     No   Yes
Gender            
Female  42.9  57.1
Male    42.4  57.6

Senior_Citizen vs Churn:
Churn             No   Yes
Senior_Citizen            
0               43.4  56.6
1               38.2  61.8

Partner vs Churn:
Churn      No   Yes
Partner            
No       43.8  56.2
Yes      41.5  58.5

Dependents vs Churn:
Churn         No   Yes
Dependents            
No          42.8  57.2
Yes         42.3  57.7


In [9]:
# Advanced Analysis and Machine Learning
# Prints the churn rate (percent of rows with Churn == 'Yes') per level of
# several features, and adds two binned columns to `df`:
# 'Tenure_Group' (from Tenure_Months) and 'Charge_Group' (from Monthly_Charges).
print("4. ADVANCED CHURN ANALYSIS")
print("-" * 35)


def churn_rate_by(frame, column):
    """Return the churn rate in percent for each level of `column`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `column` and a 'Churn' column; a row counts as churned
        when its 'Churn' value equals the string 'Yes'.
    column : str
        Name of the column to group by.

    Returns
    -------
    pandas.Series
        Indexed by the levels of `column`; values are percentages.
    """
    # `observed` is passed explicitly: for categorical groupers (the pd.cut
    # columns below) pandas warns when it is omitted because its default is
    # changing. False keeps every bin, matching the previous behaviour.
    return frame.groupby(column, observed=False)['Churn'].apply(
        lambda churn: (churn == 'Yes').mean() * 100
    )


def print_churn_rates(title, rates):
    """Print `title`, then one indented '<level>: <rate>% churn rate' line per entry of `rates`."""
    print(f"\n{title}:")
    for level, rate in rates.items():
        print(f"  {level}: {rate:.1f}% churn rate")


# Contract and Payment Method analysis
contract_churn = churn_rate_by(df, 'Contract')
print_churn_rates("Contract Type Analysis", contract_churn)

payment_churn = churn_rate_by(df, 'Payment_Method')
print_churn_rates("Payment Method Analysis", payment_churn)

internet_churn = churn_rate_by(df, 'Internet_Service')
print_churn_rates("Internet Service Analysis", internet_churn)

# Tenure analysis
# The last edge is open-ended (np.inf) to match the '49+ months' label, and
# include_lowest=True puts a tenure of exactly 0 into '0-12 months'; with
# closed edges such rows would silently become NaN and drop out of the rates.
df['Tenure_Group'] = pd.cut(
    df['Tenure_Months'],
    bins=[0, 12, 24, 36, 48, np.inf],
    labels=['0-12 months', '13-24 months', '25-36 months',
            '37-48 months', '49+ months'],
    include_lowest=True,
)
tenure_churn = churn_rate_by(df, 'Tenure_Group')
print_churn_rates("Tenure Analysis", tenure_churn)

# Monthly charges analysis
# Open-ended top edge for the same reason: the label reads '$91+'.
df['Charge_Group'] = pd.cut(
    df['Monthly_Charges'],
    bins=[0, 50, 70, 90, np.inf],
    labels=['Low ($20-50)', 'Medium ($51-70)',
            'High ($71-90)', 'Very High ($91+)'],
)
charge_churn = churn_rate_by(df, 'Charge_Group')
print_churn_rates("Monthly Charges Analysis", charge_churn)


4. ADVANCED CHURN ANALYSIS
-----------------------------------

Contract Type Analysis:
  Month-to-month: 67.1% churn rate
  One year: 50.7% churn rate
  Two year: 40.5% churn rate

Payment Method Analysis:
  Bank transfer: 52.6% churn rate
  Credit card: 53.7% churn rate
  Electronic check: 65.2% churn rate
  Mailed check: 53.9% churn rate

Internet Service Analysis:
  DSL: 51.8% churn rate
  Fiber optic: 65.8% churn rate
  No: 49.7% churn rate

Tenure Analysis:
  0-12 months: 71.0% churn rate
  13-24 months: 57.9% churn rate
  25-36 months: 54.9% churn rate
  37-48 months: 52.1% churn rate
  49+ months: 54.3% churn rate

Monthly Charges Analysis:
  Low ($20-50): 52.0% churn rate
  Medium ($51-70): 50.2% churn rate
  High ($71-90): 53.1% churn rate
  Very High ($91+): 62.7% churn rate


  tenure_churn = df.groupby('Tenure_Group')['Churn'].apply(lambda x: (x == 'Yes').mean() * 100)
  charge_churn = df.groupby('Charge_Group')['Churn'].apply(lambda x: (x == 'Yes').mean() * 100)


In [11]:
# 5. Machine learning model development
# Label-encodes the categorical columns of a copy of `df`, makes a stratified
# 80/20 train/test split, fits a random forest and a scaled logistic
# regression, prints both test accuracies, and prints a classification report
# for whichever scored higher (logistic regression on a tie).
print("5. MACHINE LEARNING MODEL DEVELOPMENT")
print("-" * 45)

# Work on a copy so the string-valued columns of `df` stay untouched.
df_model = df.copy()

# Integer-encode each categorical column; the fitted encoders are kept by
# column name so the codes can be mapped back to their labels.
# NOTE(review): integer codes impose an arbitrary order on nominal columns
# (e.g. Payment_Method), which the logistic regression treats as numeric --
# consider one-hot encoding; confirm before changing, results will shift.
categorical_columns = [
    'Gender', 'Partner', 'Dependents', 'Contract', 'Paperless_Billing',
    'Payment_Method', 'Phone_Service', 'Multiple_Lines', 'Internet_Service',
    'Online_Security', 'Online_Backup', 'Device_Protection', 'Tech_Support',
    'Streaming_TV', 'Streaming_Movies',
]
label_encoders = {}
for column_name in categorical_columns:
    encoder = LabelEncoder()
    df_model[column_name] = encoder.fit_transform(df_model[column_name])
    label_encoders[column_name] = encoder

# Features: everything except the identifier, the target and the two binned
# helper columns. Target: Churn encoded as integers.
X = df_model.drop(columns=['CustomerID', 'Churn', 'Tenure_Group', 'Charge_Group'])
y = LabelEncoder().fit_transform(df_model['Churn'])  # 0 = No, 1 = Yes

# Stratified split so both parts keep the same churn proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Features used: {len(X.columns)}")

# Model 1: Random Forest
print("\n=== RANDOM FOREST MODEL ===")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print(f"Random Forest Accuracy: {rf_accuracy:.3f}")

# Feature importances from the fitted forest, largest first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
top_features = feature_importance.head(10)
for rank, entry in enumerate(top_features.itertuples(index=False), start=1):
    print(f"{rank:2d}. {entry.feature:<20} {entry.importance:.4f}")

# Model 2: Logistic Regression (features standardised inside the pipeline)
print("\n=== LOGISTIC REGRESSION MODEL ===")
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42))
])

lr_pipeline.fit(X_train, y_train)
lr_pred = lr_pipeline.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)

print(f"Logistic Regression Accuracy: {lr_accuracy:.3f}")

# Model comparison
print("\n=== MODEL COMPARISON ===")
print(f"Random Forest Accuracy:     {rf_accuracy:.3f}")
print(f"Logistic Regression Accuracy: {lr_accuracy:.3f}")

# Detailed classification report for the higher-scoring model.
# NOTE(review): the winner is chosen on the same test set it is then reported
# on, so the reported figures are slightly optimistic.
if rf_accuracy > lr_accuracy:
    best_model, best_pred, best_name = rf_model, rf_pred, "Random Forest"
else:
    best_model, best_pred, best_name = lr_pipeline, lr_pred, "Logistic Regression"

print(f"\n=== DETAILED RESULTS ({best_name.upper()}) ===")
print("Classification Report:")
print(classification_report(y_test, best_pred, target_names=['No Churn', 'Churn']))

5. MACHINE LEARNING MODEL DEVELOPMENT
---------------------------------------------
Training set size: 4000
Test set size: 1000
Features used: 20

=== RANDOM FOREST MODEL ===
Random Forest Accuracy: 0.653

Top 10 Most Important Features:
 1. Monthly_Charges      0.1394
 2. Total_Charges        0.1326
 3. Tenure_Months        0.1165
 4. Age                  0.1131
 5. Contract             0.0627
 6. Payment_Method       0.0473
 7. Streaming_Movies     0.0368
 8. Device_Protection    0.0364
 9. Online_Backup        0.0359
10. Streaming_TV         0.0349

=== LOGISTIC REGRESSION MODEL ===
Logistic Regression Accuracy: 0.670

=== MODEL COMPARISON ===
Random Forest Accuracy:     0.653
Logistic Regression Accuracy: 0.670

=== DETAILED RESULTS (LOGISTIC REGRESSION) ===
Classification Report:
              precision    recall  f1-score   support

    No Churn       0.65      0.49      0.56       426
       Churn       0.68      0.80      0.74       574

    accuracy                           0