In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plot display
%matplotlib inline
sns.set(style="whitegrid")

# Load dataset
df = pd.read_csv("../data/Telco-Customer-Churn.csv")

# Show the shape and first few rows
print("Dataset shape:", df.shape)
df.head()


In [None]:
# Per-column dtype and non-null summary
df.info()

# Count of true nulls (NaN/None) in each column
print("\nMissing values:")
null_counts = df.isna().sum()
print(null_counts)

# Count of cells that hold exactly one space character in each column;
# such cells are not reported as null by the check above
print("\nBlank values:")
blank_counts = df.eq(" ").sum()
print(blank_counts)


In [None]:
# Bar chart of the number of rows in each Churn class
churn_counts = df['Churn'].value_counts()
churn_counts.plot.bar(title='Class Distribution: Churn')
plt.show()

# The same distribution expressed as a percentage of all rows
churn_dist = df['Churn'].value_counts(normalize=True).mul(100)
print(churn_dist)


In [None]:
# Single-space entries become NaN first; the column is then parsed as
# numeric, with any remaining unparseable value also coerced to NaN
total_charges_raw = df['TotalCharges'].replace(" ", np.nan)
df['TotalCharges'] = pd.to_numeric(total_charges_raw, errors='coerce')

# Report how many rows ended up without a usable TotalCharges value
n_missing_total = df['TotalCharges'].isna().sum()
print("TotalCharges nulls after conversion:", n_missing_total)

# Option 1 (active): discard the rows whose TotalCharges is null
df.dropna(subset=['TotalCharges'], inplace=True)

# Option 2 (alternative, disabled): impute with the median or 0 instead
# df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())


In [None]:
from sklearn.preprocessing import LabelEncoder

# Drop the identifier column; it is not used as a feature
df.drop(['customerID'], axis=1, inplace=True)

# Convert target variable to binary
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# Series.map leaves NaN for any label missing from the dict; fail here with a
# clear message instead of letting NaN targets reach the split/model cells
if df['Churn'].isna().any():
    raise ValueError("Churn contains values other than 'Yes'/'No' after mapping")

# Identify the remaining string-typed (object) columns
cat_cols = df.select_dtypes(include='object').columns.tolist()

# Label-encode each categorical column with its OWN encoder.
# Refitting one shared encoder would keep only the last column's classes_,
# so the integer codes of every other column could never be decoded.
# label_encoders[col].inverse_transform(...) recovers the original labels.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()  # after the loop, `le` is the last column's encoder
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [None]:
from sklearn.model_selection import train_test_split

# Target is the Churn column; features are every other column
y = df["Churn"]
X = df.drop(columns="Churn")

# Hold out 20% of the rows, stratified on the target, with a fixed seed
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Record everything below under the "churn-prediction" experiment
mlflow.set_experiment("churn-prediction")

with mlflow.start_run():
    # Fit on the training split, then predict labels for the held-out split
    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    preds = model.predict(X_test)

    # Score the predictions against the held-out labels
    acc, prec, rec, f1 = (
        scorer(y_test, preds)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Record the model type and each metric on the active run
    mlflow.log_param("model_type", "LogisticRegression")
    for metric_name, metric_value in (
        ("accuracy", acc),
        ("precision", prec),
        ("recall", rec),
        ("f1_score", f1),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Store the fitted estimator as a run artifact
    mlflow.sklearn.log_model(model, "logistic_model")

    print(f"Accuracy: {acc:.3f}, Precision: {prec:.3f}, Recall: {rec:.3f}, F1: {f1:.3f}")


In [None]:
import mlflow.models
from mlflow.models.signature import infer_signature

# One-row sample stored next to the model as an example input
input_example = X_test.iloc[:1]

# Infer the input/output schema from the test features and the model's
# predictions on them
signature = infer_signature(X_test, model.predict(X_test))

# Log the model inside an explicit run. Calling log_model with no active run
# makes MLflow open an implicit run that is never ended; it stays active for
# the rest of the kernel session and a later mlflow.start_run() then fails
# because a run is already active. The context manager ends the run on exit.
# NOTE: this is a new run, separate from the one that holds the metrics.
with mlflow.start_run():
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="logistic_model",
        input_example=input_example,
        signature=signature
    )
