# Customer Churn Model Training

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pickle
import joblib
import os

# Load dataset
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Clean numeric column
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

# Apply LabelEncoders and store them
encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    if col != 'customerID':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        encoders[col] = le  # save encoder for this column

# Drop customerID
df = df.drop('customerID', axis=1)

# Split features/labels
X = df.drop('Churn', axis=1)
y = df['Churn']

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Models
models = {
    'log_reg.pkl': LogisticRegression(max_iter=1000),
    'decision_tree.pkl': DecisionTreeClassifier(max_depth=5),
    'random_forest.pkl': RandomForestClassifier(n_estimators=100),
    'svm.pkl': SVC(probability=True)
}

# Ensure models folder exists
os.makedirs("models", exist_ok=True)

# Train & save
for filename, model in models.items():
    model.fit(X_train, y_train)
    pickle.dump(model, open(f'models/{filename}', 'wb'))
    print(f'✅ Saved {filename}')

# Save scaler + encoders
joblib.dump(scaler, "models/scaler.pkl")
joblib.dump(encoders, "models/encoders.pkl")
print("✅ Saved scaler.pkl and encoders.pkl")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


✅ Saved log_reg.pkl
✅ Saved decision_tree.pkl
✅ Saved random_forest.pkl
✅ Saved svm.pkl
✅ Saved scaler.pkl and encoders.pkl
