<a href="https://colab.research.google.com/github/kaveesha82/Telco-Customer-Churn/blob/main/CWDT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Library importing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

###Loading data

In [2]:
df = pd.read_csv("/content/dataset.csv")

###Data inspection


In [3]:
print ("Dataset shape :", df.shape)
print ("\n   Column types       ")
print (df.dtypes)
print (" Data Frame Overview ")
display(df.head())


Dataset shape : (7043, 21)

   Column types       
customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object
 Data Frame Overview 


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


###check data intergrity

In [4]:

#check for inconsistant data entry
for col in df.select_dtypes(include = ["object"]).columns:
  print(f"\ncolumn {col}")
  print(df[col].unique())


number of duplicates 0

column customerID
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']

column gender
['Female' 'Male']

column Partner
['Yes' 'No']

column Dependents
['No' 'Yes']

column PhoneService
['No' 'Yes']

column MultipleLines
['No phone service' 'No' 'Yes']

column InternetService
['DSL' 'Fiber optic' 'No']

column OnlineSecurity
['No' 'Yes' 'No internet service']

column OnlineBackup
['Yes' 'No' 'No internet service']

column DeviceProtection
['No' 'Yes' 'No internet service']

column TechSupport
['No' 'Yes' 'No internet service']

column StreamingTV
['No' 'Yes' 'No internet service']

column StreamingMovies
['No' 'Yes' 'No internet service']

column Contract
['Month-to-month' 'One year' 'Two year']

column PaperlessBilling
['Yes' 'No']

column PaymentMethod
['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']

column TotalCharges
['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']

column C

###Data Cleaning


In [5]:
#drop duplicates
df = df.drop_duplicates()

#drop customer ID if it exists
if "customerID" in df.columns:
    df = df.drop("customerID" , axis=1)

# We force errors='coerce' to turn those " " string spaces into NaNs (Not a Number).
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

#after clenaning shape
print (f"dataset shape after cleaning : {df.shape}")

# Check how many were actually missing (hidden blanks)
hidden_missing = df['TotalCharges'].isnull().sum()
print(f"Hidden missing values found in TotalCharges: {hidden_missing}")


# Fill them with 0 (Logic: These are new customers with Tenure=0)
df['TotalCharges'] = df['TotalCharges'].fillna(0)

dataset shape after cleaning : (7043, 20)
Remaining Missing Values: 0


In [6]:


# --- STEP 2: ADDRESSING 'TotalCharges' (The Hidden Issue)
# As you noted, isnull() shows 0 initially. But ' ' exists.






# --- STEP 3: CLEANING ---
# 1. Drop 'customerID'
# We must drop this BEFORE looking for duplicates. Two rows might have different IDs
# but identical data. We want to remove the redundant data patterns.
if 'customerID' in df.columns:
    df = df.drop('customerID', axis=1)
    print("customerID dropped.")

# 2. Drop Duplicates
# Now that ID is gone, we check for rows that are exactly the same.
initial_rows = df.shape[0]
df = df.drop_duplicates()
dropped_rows = initial_rows - df.shape[0]
print(f"Duplicate rows dropped: {dropped_rows}")

# 3. Encode Target 'Churn' (Yes -> 1, No -> 0)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# --- STEP 4: CATEGORICAL ENCODING ---
# Convert text categories (e.g., 'Partner', 'Contract') into numbers (0/1).
# drop_first=True removes the first category to prevent correlation redundancy.
df_encoded = pd.get_dummies(df, drop_first=True)

print(f"Encoding Complete. Final Feature Count: {df_encoded.shape[1]}")

# --- STEP 5: STRATIFIED TRAIN-TEST SPLIT ---
# We split NOW to ensure both models use the exact same data.
# 'stratify=y' locks in the churn rate (26%) for both sets.
X = df_encoded.drop('Churn', axis=1)
y = df_encoded['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n--- Final Data Ready for Modeling ---")
print(f"Training Set Shape: {X_train.shape}")
print(f"Testing Set Shape:  {X_test.shape}")

Original Shape: (7043, 21)
Hidden missing values found in TotalCharges: 11
customerID dropped.
Duplicate rows dropped: 22
Encoding Complete. Final Feature Count: 31

--- Final Data Ready for Modeling ---
Training Set Shape: (5616, 30)
Testing Set Shape:  (1405, 30)


###data splitting
