In [3]:
# ==================================================
# PROJECT 2 (FIXED & UPGRADED):
# CUSTOMER CHURN PREDICTION WITH RANDOM FOREST
# ==================================================

import pandas as pd
import numpy as np

# ----------------------------------
# Step 1: Load Dataset (Fixed URL)
# ----------------------------------
# Public CSV copy of the IBM Telco customer-churn dataset.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

try:
    # Keep the try body minimal: only the remote read can reasonably fail here.
    df = pd.read_csv(url)
except Exception as exc:
    # Best-effort fallback so the rest of the script can still run offline.
    # `Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, and the reason is reported rather than hidden.
    print("Internet error! Loading fallback sample instead...")
    print(f"Reason: {type(exc).__name__}: {exc}")
    # Tiny 4-row sample containing only a handful of columns.
    data = {
        "gender": ["Female", "Male", "Female", "Male"],
        "SeniorCitizen": [0, 1, 0, 0],
        "Partner": ["Yes", "No", "Yes", "No"],
        "Dependents": ["No", "No", "Yes", "No"],
        "tenure": [1, 34, 12, 5],
        "PhoneService": ["No", "Yes", "Yes", "Yes"],
        "MonthlyCharges": [29.85, 56.95, 53.85, 42.30],
        "TotalCharges": [29.85, 1889.50, 108.15, 184.20],
        "Churn": ["No", "No", "Yes", "No"],
    }
    df = pd.DataFrame(data)
else:
    print("Dataset successfully loaded from the internet!")

# Quick visual sanity check of the first rows.
print(df.head())

# ----------------------------------
# Step 2: Data Cleaning
# ----------------------------------

# customerID is only a row identifier, not a useful ML feature.
# errors="ignore" makes the drop a no-op when the column is absent
# (for example in the fallback sample).
df = df.drop(columns=["customerID"], errors="ignore")

# Force TotalCharges to a numeric dtype; entries that cannot be parsed
# become NaN because of errors="coerce".
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df = df.assign(TotalCharges=total_charges)

# Discard every row that holds a missing value, including the NaNs
# introduced by the conversion above.
df = df.dropna(axis=0, how="any")

# ----------------------------------
# Step 3: Encode Categorical Variables
# ----------------------------------
# One-hot encode the categorical columns, dropping the first level of each
# so no redundant indicator column is produced.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 otherwise
# emits booleans, which would make the "Churn_Yes" target and the printed
# predictions True/False even though the script tells the user
# "1 = Churn, 0 = Not Churn".
df_encoded = pd.get_dummies(df, drop_first=True, dtype=int)

# ----------------------------------
# Step 4: Train-Test Split
# ----------------------------------
from sklearn.model_selection import train_test_split

# Target: the churn indicator column produced by the one-hot encoding.
y = df_encoded["Churn_Yes"]
# Features: every remaining column.
X = df_encoded.drop(columns=["Churn_Yes"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------------------------------
# Step 5: Train Random Forest Model
# ----------------------------------
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters: 300 trees, each capped at depth 10, with a fixed seed
# so repeated runs build the same forest.
forest_params = {
    "n_estimators": 300,
    "max_depth": 10,
    "random_state": 42,
}

model = RandomForestClassifier(**forest_params)

# Train on the training portion of the split.
model.fit(X_train, y_train)

# ----------------------------------
# Step 6: Model Evaluation
# ----------------------------------
from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out rows.
pred = model.predict(X_test)

# Overall fraction of correct predictions.
accuracy = accuracy_score(y_test, pred)
print("\nModel Accuracy:", accuracy)

# Per-class precision / recall / F1 summary.
print("\nClassification Report:\n")
report = classification_report(y_test, pred)
print(report)

# ----------------------------------
# Step 7: Predict on a Sample Customer (Example)
# ----------------------------------
# Use the first held-out row as a stand-in for a single customer.
# head(1) keeps it a one-row DataFrame, which is the 2-D input predict() expects.
sample = X_test.head(1)
prediction = model.predict(sample)

print("\nSample Customer Prediction:", prediction)
print("1 = Churn, 0 = Not Churn")


Dataset successfully loaded from the internet!
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport S