## Customer Churn Analysis ##

In [3]:
import sys, pathlib
repo_root = pathlib.Path().cwd().parents[0]  # adjust if notebook is deeper
sys.path.append(str(repo_root))

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from src.customerchurn.utils import read_yaml

In [6]:
df = pd.read_csv('Telco-Customer-Churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [10]:
schema_path = os.path.join(str(repo_root), "configs", "schema.yaml")
train_path  = os.path.join(str(repo_root), "artifacts", "train_data.csv")
test_path   = os.path.join(str(repo_root), "artifacts", "test_data.csv")

report_path = os.path.join(str(repo_root), "artifacts", "validation", "validation_report.txt")
os.makedirs(os.path.dirname(report_path), exist_ok=True)

schema = read_yaml(schema_path)
schema

{'dataset': 'Telco-Customer-Churn',
 'required_columns': ['customerID',
  'gender',
  'SeniorCitizen',
  'Partner',
  'Dependents',
  'tenure',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod',
  'MonthlyCharges',
  'TotalCharges',
  'Churn'],
 'target_column': 'Churn',
 'target_allowed_values': ['Yes', 'No'],
 'unique_columns': ['customerID'],
 'numeric_columns': ['tenure', 'MonthlyCharges', 'TotalCharges'],
 'integer_columns': ['SeniorCitizen'],
 'categorical_columns': ['gender',
  'Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod',
  'Churn']}

In [11]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
train_df.shape, test_df.shape

((5634, 21), (1409, 21))

In [44]:
report_lines = []
overall_ok = True


required_cols = schema.get('required_columns',[])
missing_train = [c for c in required_cols if c not in train_df.columns]
if missing_train:
    overall_ok = False
    report_lines.append(f"[TRAIN][FAIL] Missing required columns: {missing_train}")
else:
    report_lines.append(f"[TRAIN][PASS] All required columns")

missing_train
report_lines

['[TRAIN][PASS] All required columns']

In [45]:
target = schema.get('target_column')
allowed = schema.get('target_allowed_values',[])

if target is None:
    report_lines.append("[TRAIN][SKIP] target_column not defined in schema.yaml")
elif target not in train_df.columns:
    report_lines.append([f"[TRAIN][FAIL] target column {target} not found"])
else:
    bad_vals = sorted(set(train_df[target].dropna().unique()) - set(allowed))
    if bad_vals:
        overall_ok=False
        report_lines.append(f"[TRAIN][FAIL] Target has invalid values: {bad_vals} (allowed={allowed})")
    else:
        report_lines.append(f"[TRAIN][PASS] Target columns has valid values")

target,allowed, report_lines

('Churn',
 ['Yes', 'No'],
 ['[TRAIN][PASS] All required columns',
  '[TRAIN][PASS] Target columns has valid values'])

# Checking Unique Values

In [46]:
unique_cols = schema.get('unique_columns',[])

for col in unique_cols:
    if col in train_df.columns and train_df[col].duplicated().any():
        overall_ok=False
        n_dup = int(train_df[col].duplicated().sum())
        report_lines.append(f"[TRAIN][FAIL] Unique Column: {col} has {n_dup}")
    else:
        report_lines.append(f"[TRAIN][PASS] No duplicate values all are unique.")

unique_cols, report_lines


(['customerID'],
 ['[TRAIN][PASS] All required columns',
  '[TRAIN][PASS] Target columns has valid values',
  '[TRAIN][PASS] No duplicate values all are unique.'])

## Numeric cols convertibility

In [47]:
numeric_cols = schema.get("numeric_columns",[])

for col in numeric_cols:
    if col not in train_df.columns:
        overall_ok=False
        report_lines.append(f"[TRAIN][FAIL] Numeric column: {col} not found")
        continue
    
    coerced = pd.to_numeric(train_df[col],errors="coerce")
    n_bad = int(coerced.isna().sum() - train_df[col].isna().sum())

    if n_bad > 0:
        overall_ok=False
        report_lines.append(f"[TRAIN][FAIL] Numeric column : {col} has {n_bad} non-numeric values")
    else:
        report_lines.append(f"[TRAIN][PASS] Numeric colums: {col} is convertible")

numeric_cols, report_lines

(['tenure', 'MonthlyCharges', 'TotalCharges'],
 ['[TRAIN][PASS] All required columns',
  '[TRAIN][PASS] Target columns has valid values',
  '[TRAIN][PASS] No duplicate values all are unique.',
  '[TRAIN][PASS] Numeric colums: tenure is convertible',
  '[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible',
  '[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values'])

# Integer like columns #

In [48]:
integer_cols = schema.get("integer_columns",[])

for col in integer_cols:
    if col not in train_df.columns:
        overall_ok=False
        report_lines.append(f"[TRAIN][FAIL] Integer column {col} not found")
    

    coerce = pd.to_numeric(train_df[col],errors="coerce")
    non_int = int(coerced.dropna().apply(lambda x:float(x).is_integer() is False).sum())

    if non_int > 0:
        overall_ok=False
        report_lines.append(f"[TRAIN][FAIL] Integer column: {col} has {non_int} non-integer values")
    else:
        report_lines.append(f"[TRAIN][PASS] Integer column: {col} has integer values") 

integer_cols, report_lines

(['SeniorCitizen'],
 ['[TRAIN][PASS] All required columns',
  '[TRAIN][PASS] Target columns has valid values',
  '[TRAIN][PASS] No duplicate values all are unique.',
  '[TRAIN][PASS] Numeric colums: tenure is convertible',
  '[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible',
  '[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values',
  '[TRAIN][FAIL] Integer column: SeniorCitizen has 5373 non-integer values'])

## Categorical Column Existance

In [49]:
categorical_cols = schema.get("categorical_columns",[])
missing_cat_train = [c for c in categorical_cols if c not in train_df.columns]

if categorical_cols and missing_cat_train:
    overall_ok= False
    report_lines.append(f"[TRAIN][FAIL] Missing categorical columns: {missing_cat_train}")
elif categorical_cols:
    report_lines.append(f"[TRAIN][PASS] All categorical columns present")
else:
    report_lines.append(f"[TRAIN][SKIP] categorical_columns not defined in schema.yaml")

categorical_cols,missing_cat_train,report_lines

(['gender',
  'Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod',
  'Churn'],
 [],
 ['[TRAIN][PASS] All required columns',
  '[TRAIN][PASS] Target columns has valid values',
  '[TRAIN][PASS] No duplicate values all are unique.',
  '[TRAIN][PASS] Numeric colums: tenure is convertible',
  '[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible',
  '[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values',
  '[TRAIN][FAIL] Integer column: SeniorCitizen has 5373 non-integer values',
  '[TRAIN][PASS] All categorical columns present'])

## Same check for Test data

In [50]:
# Required

missing_test = [c for c in required_cols if c not in test_df.columns]
if missing_test:
    overall_ok = False
    report_lines.append(f"[TEST][FAIL] Missing required columns: {missing_test}")
else:
    report_lines.append("[TEST][PASS] All required columns present")

missing_test,report_lines

([],
 ['[TRAIN][PASS] All required columns',
  '[TRAIN][PASS] Target columns has valid values',
  '[TRAIN][PASS] No duplicate values all are unique.',
  '[TRAIN][PASS] Numeric colums: tenure is convertible',
  '[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible',
  '[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values',
  '[TRAIN][FAIL] Integer column: SeniorCitizen has 5373 non-integer values',
  '[TRAIN][PASS] All categorical columns present',
  '[TEST][PASS] All required columns present'])

In [51]:
# Target
if target is None:
    report_lines.append("[TEST][SKIP] target_column not defined in schema.yaml")
elif target not in test_df.columns:
    overall_ok = False
    report_lines.append(f"[TEST][FAIL] Target column '{target}' not found")
else:
    bad_vals = sorted(set(test_df[target].dropna().unique()) - set(allowed))
    if bad_vals:
        overall_ok = False
        report_lines.append(f"[TEST][FAIL] Target has invalid values: {bad_vals} (allowed={allowed})")
    else:
        report_lines.append("[TEST][PASS] Target column values are valid")

report_lines

['[TRAIN][PASS] All required columns',
 '[TRAIN][PASS] Target columns has valid values',
 '[TRAIN][PASS] No duplicate values all are unique.',
 '[TRAIN][PASS] Numeric colums: tenure is convertible',
 '[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible',
 '[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values',
 '[TRAIN][FAIL] Integer column: SeniorCitizen has 5373 non-integer values',
 '[TRAIN][PASS] All categorical columns present',
 '[TEST][PASS] All required columns present',
 '[TEST][PASS] Target column values are valid']

In [52]:
#Unique
for col in unique_cols:
    if col in test_df.columns and test_df[col].duplicated().any():
        overall_ok = False
        n_dup = int(test_df[col].duplicated().sum())
        report_lines.append(f"[TEST][FAIL] Unique column '{col}' has {n_dup} duplicates")
    else:
        report_lines.append(f"[TEST][PASS] Unique column '{col}' has no duplicates (or not present)")
report_lines

['[TRAIN][PASS] All required columns',
 '[TRAIN][PASS] Target columns has valid values',
 '[TRAIN][PASS] No duplicate values all are unique.',
 '[TRAIN][PASS] Numeric colums: tenure is convertible',
 '[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible',
 '[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values',
 '[TRAIN][FAIL] Integer column: SeniorCitizen has 5373 non-integer values',
 '[TRAIN][PASS] All categorical columns present',
 '[TEST][PASS] All required columns present',
 '[TEST][PASS] Target column values are valid',
 "[TEST][PASS] Unique column 'customerID' has no duplicates (or not present)"]

In [53]:
# Numeric
for col in numeric_cols:
    if col not in test_df.columns:
        overall_ok = False
        report_lines.append(f"[TEST][FAIL] Numeric column '{col}' not found")
        continue
    coerced = pd.to_numeric(test_df[col], errors="coerce")
    n_bad = int(coerced.isna().sum() - test_df[col].isna().sum())
    if n_bad > 0:
        overall_ok = False
        report_lines.append(f"[TEST][FAIL] Numeric column '{col}' has {n_bad} non-numeric values")
    else:
        report_lines.append(f"[TEST][PASS] Numeric column '{col}' is numeric/convertible")
report_lines

['[TRAIN][PASS] All required columns',
 '[TRAIN][PASS] Target columns has valid values',
 '[TRAIN][PASS] No duplicate values all are unique.',
 '[TRAIN][PASS] Numeric colums: tenure is convertible',
 '[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible',
 '[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values',
 '[TRAIN][FAIL] Integer column: SeniorCitizen has 5373 non-integer values',
 '[TRAIN][PASS] All categorical columns present',
 '[TEST][PASS] All required columns present',
 '[TEST][PASS] Target column values are valid',
 "[TEST][PASS] Unique column 'customerID' has no duplicates (or not present)",
 "[TEST][PASS] Numeric column 'tenure' is numeric/convertible",
 "[TEST][PASS] Numeric column 'MonthlyCharges' is numeric/convertible",
 "[TEST][FAIL] Numeric column 'TotalCharges' has 3 non-numeric values"]

In [54]:
# Integer
for col in integer_cols:
    if col not in test_df.columns:
        overall_ok = False
        report_lines.append(f"[TEST][FAIL] Integer column '{col}' not found")
        continue
    coerced = pd.to_numeric(test_df[col], errors="coerce")
    non_int = coerced.dropna().apply(lambda x: float(x).is_integer() is False).sum()
    if non_int > 0:
        overall_ok = False
        report_lines.append(f"[TEST][FAIL] Integer column '{col}' has {int(non_int)} non-integer values")
    else:
        report_lines.append(f"[TEST][PASS] Integer column '{col}' is integer-like")

report_lines

['[TRAIN][PASS] All required columns',
 '[TRAIN][PASS] Target columns has valid values',
 '[TRAIN][PASS] No duplicate values all are unique.',
 '[TRAIN][PASS] Numeric colums: tenure is convertible',
 '[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible',
 '[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values',
 '[TRAIN][FAIL] Integer column: SeniorCitizen has 5373 non-integer values',
 '[TRAIN][PASS] All categorical columns present',
 '[TEST][PASS] All required columns present',
 '[TEST][PASS] Target column values are valid',
 "[TEST][PASS] Unique column 'customerID' has no duplicates (or not present)",
 "[TEST][PASS] Numeric column 'tenure' is numeric/convertible",
 "[TEST][PASS] Numeric column 'MonthlyCharges' is numeric/convertible",
 "[TEST][FAIL] Numeric column 'TotalCharges' has 3 non-numeric values",
 "[TEST][PASS] Integer column 'SeniorCitizen' is integer-like"]

In [55]:
# Categorical
missing_cat_test = [c for c in categorical_cols if c not in test_df.columns]
if categorical_cols and missing_cat_test:
    overall_ok = False
    report_lines.append(f"[TEST][FAIL] Missing categorical columns: {missing_cat_test}")
elif categorical_cols:
    report_lines.append("[TEST][PASS] All categorical columns present")
else:
    report_lines.append("[TEST][SKIP] categorical_columns not defined in schema.yaml")

overall_ok,report_lines

(False,
 ['[TRAIN][PASS] All required columns',
  '[TRAIN][PASS] Target columns has valid values',
  '[TRAIN][PASS] No duplicate values all are unique.',
  '[TRAIN][PASS] Numeric colums: tenure is convertible',
  '[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible',
  '[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values',
  '[TRAIN][FAIL] Integer column: SeniorCitizen has 5373 non-integer values',
  '[TRAIN][PASS] All categorical columns present',
  '[TEST][PASS] All required columns present',
  '[TEST][PASS] Target column values are valid',
  "[TEST][PASS] Unique column 'customerID' has no duplicates (or not present)",
  "[TEST][PASS] Numeric column 'tenure' is numeric/convertible",
  "[TEST][PASS] Numeric column 'MonthlyCharges' is numeric/convertible",
  "[TEST][FAIL] Numeric column 'TotalCharges' has 3 non-numeric values",
  "[TEST][PASS] Integer column 'SeniorCitizen' is integer-like",
  '[TEST][PASS] All categorical columns present'])

In [56]:
with open(report_path,'w',encoding="utf-8") as f:
    f.write("\n".join(report_lines))

print("Report saved at:", report_path )

Report saved at: c:\Users\Admin\Data Scientist Projects\Customer Churn Prediction\artifacts\validation\validation_report.txt


In [57]:
# %%
print("\n".join(report_lines))

[TRAIN][PASS] All required columns
[TRAIN][PASS] Target columns has valid values
[TRAIN][PASS] No duplicate values all are unique.
[TRAIN][PASS] Numeric colums: tenure is convertible
[TRAIN][PASS] Numeric colums: MonthlyCharges is convertible
[TRAIN][FAIL] Numeric column : TotalCharges has 8 non-numeric values
[TRAIN][FAIL] Integer column: SeniorCitizen has 5373 non-integer values
[TRAIN][PASS] All categorical columns present
[TEST][PASS] All required columns present
[TEST][PASS] Target column values are valid
[TEST][PASS] Unique column 'customerID' has no duplicates (or not present)
[TEST][PASS] Numeric column 'tenure' is numeric/convertible
[TEST][PASS] Numeric column 'MonthlyCharges' is numeric/convertible
[TEST][FAIL] Numeric column 'TotalCharges' has 3 non-numeric values
[TEST][PASS] Integer column 'SeniorCitizen' is integer-like
[TEST][PASS] All categorical columns present
