In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from datetime import datetime

Matplotlib is building the font cache; this may take a moment.


In [3]:
# configure visuals
sns.set_palette("husl")
pd.set_option('display.max_columns', None)

## Load and Inspect Data

In [8]:
df = pd.read_csv('data/raw/customer_churn.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [12]:
# basic info about the dataset
df.info() # no nulls
print('Data set shape: ', df.shape) # 7043 rows, 21 columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data Quality Check

In [14]:
# check for missing values
missing_values = df.isnull().sum() # no missing values


customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [15]:
# check for duplicates
duplicates = df.duplicated().sum()
print(duplicates)

0


## Statistical Summary of the data

In [17]:
display(df.describe())

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [21]:
# summary of categorical columns
print("\nCategorical Features Summary:")
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    print(f"\n{col}:")
    display(df[col].value_counts()) # get the count of unique values
    print(f"Unique values: {df[col].nunique()}")



Categorical Features Summary:

customerID:


customerID
7590-VHVEG    1
3791-LGQCY    1
6008-NAIXK    1
5956-YHHRX    1
5365-LLFYV    1
             ..
9796-MVYXX    1
2637-FKFSY    1
1552-AAGRX    1
4304-TSPVK    1
3186-AJIEK    1
Name: count, Length: 7043, dtype: int64

Unique values: 7043

gender:


gender
Male      3555
Female    3488
Name: count, dtype: int64

Unique values: 2

Partner:


Partner
No     3641
Yes    3402
Name: count, dtype: int64

Unique values: 2

Dependents:


Dependents
No     4933
Yes    2110
Name: count, dtype: int64

Unique values: 2

PhoneService:


PhoneService
Yes    6361
No      682
Name: count, dtype: int64

Unique values: 2

MultipleLines:


MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64

Unique values: 3

InternetService:


InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64

Unique values: 3

OnlineSecurity:


OnlineSecurity
No                     3498
Yes                    2019
No internet service    1526
Name: count, dtype: int64

Unique values: 3

OnlineBackup:


OnlineBackup
No                     3088
Yes                    2429
No internet service    1526
Name: count, dtype: int64

Unique values: 3

DeviceProtection:


DeviceProtection
No                     3095
Yes                    2422
No internet service    1526
Name: count, dtype: int64

Unique values: 3

TechSupport:


TechSupport
No                     3473
Yes                    2044
No internet service    1526
Name: count, dtype: int64

Unique values: 3

StreamingTV:


StreamingTV
No                     2810
Yes                    2707
No internet service    1526
Name: count, dtype: int64

Unique values: 3

StreamingMovies:


StreamingMovies
No                     2785
Yes                    2732
No internet service    1526
Name: count, dtype: int64

Unique values: 3

Contract:


Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: count, dtype: int64

Unique values: 3

PaperlessBilling:


PaperlessBilling
Yes    4171
No     2872
Name: count, dtype: int64

Unique values: 2

PaymentMethod:


PaymentMethod
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: count, dtype: int64

Unique values: 4

TotalCharges:


TotalCharges
          11
20.2      11
19.75      9
20.05      8
19.9       8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: count, Length: 6531, dtype: int64

Unique values: 6531

Churn:


Churn
No     5174
Yes    1869
Name: count, dtype: int64

Unique values: 2


## Distribution Analysis

In [26]:
# function to plot distribution for numerical features
def  plot_dist(df, columns, rows=3):
    plt.figure(figsize=(15, 5*rows))
    for i, c in enumerate(columns,1):
        print(i)
        print(c)
        plt.subplot(rows, 3, i)
        sns.histplot(data=df,x=col,kde=True)
        plt.title(f'Distribution of {col}')
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
plot_dist(df, numerical_cols)

TypeError: 'module' object is not callable