In [4]:
# 1. Import Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
import json

# 2. Load the raw dataset from disk
df = pd.read_csv("../data/telco_churn.csv")

# 3. Data cleaning, expressed as one non-mutating chain:
#    - re-parse TotalCharges as numeric (errors='coerce' turns any value that
#      cannot be parsed into NaN),
#    - drop every row that contains a missing value (including those NaNs),
#    - remove the customerID column, which is not used as a feature.
df = (
    df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))
    .dropna()
    .drop(columns=['customerID'])
)

# 4. One-hot encode the categorical columns; drop_first=True omits the first
#    level of each encoded column.
df = pd.get_dummies(df, drop_first=True)

# 5. Split the frame into the feature matrix and the target column.
#    'Churn_Yes' is presumably the dummy column produced from the original
#    'Churn' column by the encoding step above -- verify against the CSV.
X = df.drop(columns='Churn_Yes')
y = df['Churn_Yes']

# 6. Train/Test Split
# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows influence the scaling statistics (data leakage), and the saved
# scaler would no longer reflect only the data the model was trained on.
# The split is stratified on y with a fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 7. Feature Scaling
# Learn mean/variance from the training rows only, then apply that same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so that any other code still referring to X_scaled continues to work;
# it is the full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

# 8. Fit a 100-tree random forest on the training split.
#    random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# 9. Predict on the held-out split and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred),
    sep="\n",
)

# 10. Persist the fitted model and scaler, plus the ordered feature names,
#     under ../backend/.
joblib.dump(value=model, filename='../backend/model.pkl')
joblib.dump(value=scaler, filename='../backend/scaler.pkl')

# Column order of X is written out as a JSON array of names.
with open('../backend/feature_names.json', mode='w') as f:
    f.write(json.dumps(X.columns.tolist()))

Classification Report:

              precision    recall  f1-score   support

       False       0.83      0.89      0.86      1033
        True       0.62      0.51      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407

