# Loading libraries and dataset

In [1]:
import os
from pathlib import Path

import pandas as pd

# Location of the raw Telco churn CSV.
# Defaults to the original local path so existing runs behave the same;
# set the TELCO_DATA_PATH environment variable to run the notebook on another
# machine without editing this cell.
DATA_PATH = Path(os.environ.get(
    "TELCO_DATA_PATH",
    r"D:\Data_Analysis_Projects\Telco_Customers_Churn\01_data\telco.csv",
))

# Load the full raw table; every later cell derives its tables from `df`.
df = pd.read_csv(DATA_PATH)


# Data Validation

In [3]:
# Dimensions of the raw table as a (rows, columns) tuple.
df.shape

(7043, 50)

In [4]:
# List every column name present in the raw table.
df.columns

Index(['Customer ID', 'Gender', 'Age', 'Under 30', 'Senior Citizen', 'Married',
       'Dependents', 'Number of Dependents', 'Country', 'State', 'City',
       'Zip Code', 'Latitude', 'Longitude', 'Population', 'Quarter',
       'Referred a Friend', 'Number of Referrals', 'Tenure in Months', 'Offer',
       'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
       'Online Security', 'Online Backup', 'Device Protection Plan',
       'Premium Tech Support', 'Streaming TV', 'Streaming Movies',
       'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing',
       'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Satisfaction Score', 'Customer Status', 'Churn Label',
       'Churn Score', 'CLTV', 'Churn Category', 'Churn Reason'],
      dtype='object')

In [5]:
# Count missing values in each column of the raw table.
df.isnull().sum()

Customer ID                             0
Gender                                  0
Age                                     0
Under 30                                0
Senior Citizen                          0
Married                                 0
Dependents                              0
Number of Dependents                    0
Country                                 0
State                                   0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Population                              0
Quarter                                 0
Referred a Friend                       0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3877
Phone Service                           0
Avg Monthly Long Distance Charges       0
Multiple Lines                          0
Internet Service                  

In [6]:
# Count rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

np.int64(0)

# Validate Primary Key (Customer ID)

In [7]:
# 'Customer ID' is used as the key of every customer-level table built below,
# so fail fast (instead of relying on a visual check) if it is missing or repeated.
assert df['Customer ID'].notna().all(), "Customer ID contains missing values"
assert df['Customer ID'].is_unique, "Customer ID is not unique in the raw table"

# Displayed for the reader: number of distinct IDs vs. number of rows (should match).
df['Customer ID'].nunique(), len(df)


(7043, 7043)

# Creating 5 logical tables

In [8]:
# Demographics table: customer-level personal attributes,
# with fully identical rows removed.
demographic_columns = [
    'Customer ID', 'Gender', 'Age', 'Under 30',
    'Senior Citizen', 'Married', 'Dependents', 'Number of Dependents',
]
demographics = df[demographic_columns].drop_duplicates()


In [9]:
# Validation: count missing values in each column of the demographics table.
demographics.isnull().sum()


Customer ID             0
Gender                  0
Age                     0
Under 30                0
Senior Citizen          0
Married                 0
Dependents              0
Number of Dependents    0
dtype: int64

In [10]:
# Location table: where each customer lives, keyed by 'Customer ID',
# with fully identical rows removed.
location_columns = [
    'Customer ID', 'Country', 'State', 'City',
    'Zip Code', 'Latitude', 'Longitude',
]
location = df[location_columns].drop_duplicates()


In [None]:
# Population table: distinct ('Zip Code', 'Population') pairs from the raw table.
population_columns = ['Zip Code', 'Population']
population = df[population_columns].drop_duplicates()


In [12]:
# Validation: each Zip Code should appear exactly once in the population table,
# i.e. the number of distinct Zip Codes equals the number of rows.
zip_codes = population['Zip Code']
len(population) == zip_codes.nunique()


True

In [13]:
# Services table: subscribed services, contract and charge columns per customer,
# keyed by 'Customer ID', with fully identical rows removed.
service_columns = [
    'Customer ID',
    # phone
    'Phone Service', 'Multiple Lines',
    # internet
    'Internet Service', 'Internet Type', 'Avg Monthly GB Download',
    # add-ons
    'Online Security', 'Online Backup', 'Device Protection Plan',
    'Premium Tech Support',
    # streaming / data
    'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data',
    # contract and charges
    'Contract', 'Monthly Charge', 'Total Charges',
    'Total Extra Data Charges', 'Total Long Distance Charges',
]
services = df[service_columns].drop_duplicates()


In [14]:
# Status table: churn outcome and score columns per customer,
# keyed by 'Customer ID', with fully identical rows removed.
status_columns = [
    'Customer ID', 'Customer Status',
    'Churn Label', 'Churn Score', 'Churn Category', 'Churn Reason',
    'Satisfaction Score', 'CLTV',
]
status = df[status_columns].drop_duplicates()


# Data Quality Checks

In [16]:
# Handling missing values
# Numeric: replace any missing Age with the median Age.
demographics['Age'] = demographics['Age'].fillna(demographics['Age'].median())

# Categorical: label a missing Churn Reason explicitly instead of leaving it blank.
# NOTE(review): 'Churn Category' is left untouched here -- confirm whether it
# has the same missing pattern and should receive the same placeholder.
status['Churn Reason'] = status['Churn Reason'].fillna('Not Applicable')

# Check missing values.
# Only the last expression of a cell is displayed, so the Age check is an
# assertion; as a bare expression its result would be silently discarded.
assert demographics['Age'].isna().sum() == 0, "Age still contains missing values"
status['Churn Reason'].isna().sum()



np.int64(0)

# Missing values are handled selectively based on business logic: `Age` is filled with its median, and a missing `Churn Reason` is labelled 'Not Applicable'


In [17]:
# Referential Integrity Checks
# Every customer-keyed table must hold exactly the same customers as
# demographics. Both directions are checked so that joins on 'Customer ID'
# neither drop customers nor leave orphan rows.
customer_ids = set(demographics['Customer ID'])
for table_name, table in (('location', location),
                          ('services', services),
                          ('status', status)):
    table_ids = set(table['Customer ID'])
    assert table_ids == customer_ids, (
        f"{table_name}: {len(table_ids - customer_ids)} customer(s) not in demographics, "
        f"{len(customer_ids - table_ids)} demographics customer(s) not in {table_name}"
    )

# Every Zip Code referenced by location must exist in the population table.
assert set(location['Zip Code']) <= set(population['Zip Code']), \
    "location contains Zip Codes that are missing from population"

# Customers in services but not in demographics (displayed; expected to be empty)
set(services['Customer ID']) - customer_ids


set()

In [18]:
# Save Cleaned Tables for SQL and POWER BI
# Each table is written to its own CSV in the current working directory,
# without the DataFrame index column.
cleaned_tables = {
    "demographics.csv": demographics,
    "location.csv": location,
    "population.csv": population,
    "services.csv": services,
    "status.csv": status,
}
for csv_name, table in cleaned_tables.items():
    table.to_csv(csv_name, index=False)

