In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load the Telco customer-churn CSV directly from GitHub.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv(url)

# Clean & preprocess
# Some rows carry a blank TotalCharges cell, which cannot be parsed as a
# number. Strip whitespace before comparing so that blanks of any width
# (not just a single space) are removed; genuinely malformed values still
# raise in the float conversion below instead of being dropped silently.
blank_total = df['TotalCharges'].astype(str).str.strip() == ''
# .copy() gives us an independent frame: the column assignments below would
# otherwise write into a filtered slice of the original DataFrame and
# trigger pandas' SettingWithCopyWarning.
df = df[~blank_total].copy()
df['TotalCharges'] = df['TotalCharges'].astype(float)
# Binary target: 1 = churned, 0 = retained.
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Encode categorical features
# get_dummies one-hot encodes the string-valued columns and passes numeric
# columns through unchanged; drop_first=True drops one level per category
# to avoid perfectly collinear indicator columns.
feature_columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
                   'Contract', 'InternetService', 'PaymentMethod']
df_encoded = pd.get_dummies(df[feature_columns], drop_first=True)

X = df_encoded
y = df['Churn']

# Split data: hold out 20% for evaluation; the fixed seed keeps the split
# (and therefore the reported metrics) reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
# max_iter is raised above the library default to give the solver more
# iterations on these unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict & evaluate on the held-out set
y_pred = model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[923 110]
 [188 186]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.63      0.50      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



This project uses a telecom dataset to predict customer churn. The data includes account information, contract type, charges, and customer demographics. The goal is to identify at-risk customers for targeted retention strategies.