In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import opendatasets as od
import pandas as pd

#Import dataset
df = pd.read_csv('./WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [2]:
#Check first several lines of dataset 
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
#Check a summary of the dataset  
df.info

<bound method DataFrame.info of       customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female              0     Yes         No       1   
1     5575-GNVDE    Male              0      No         No      34   
2     3668-QPYBK    Male              0      No         No       2   
3     7795-CFOCW    Male              0      No         No      45   
4     9237-HQITU  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service             DSL 

In [4]:
#Check if there are missing values of data
missing = pd.concat([df.isnull().sum(), 100 * df.isnull().mean()], axis=1)
missing.columns=["count", "%"]
missing.sort_values(by="count", ascending=True)

Unnamed: 0,count,%
customerID,0,0.0
MonthlyCharges,0,0.0
PaymentMethod,0,0.0
PaperlessBilling,0,0.0
Contract,0,0.0
StreamingMovies,0,0.0
StreamingTV,0,0.0
TechSupport,0,0.0
DeviceProtection,0,0.0
TotalCharges,0,0.0


In [5]:
#Check what types of data are in the dataset
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [6]:
#For the "Partner" column, map 'Yes' to 1 and 'No' to 0 and convert data from object to int8
df['Partner'] = df['Partner'].map({'Yes': 1, 'No': 0}).astype('int8')

# Check unique values in the Partner column
print(df['Partner'].unique())

[1 0]


In [7]:
#Confirm data type for "Partner" column was sucessfully changed to int8
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner                int8
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [8]:
#Confirm values in "Partner" column are 0 and 1 now 
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,1,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,0,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,0,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,0,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,0,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
#Check names of all columns
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [10]:
# Create a list of the columns to check for unique values
columns_to_check_for_unique_values = ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

#Loop through the list and print the unique values for each column
for column in columns_to_check_for_unique_values:
    print(f"Unique values in '{column}':")
    print(df[column].unique())
    print("-" * 30) # Adds a separator for clarity

Unique values in 'customerID':
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
------------------------------
Unique values in 'gender':
['Female' 'Male']
------------------------------
Unique values in 'SeniorCitizen':
[0 1]
------------------------------
Unique values in 'Partner':
[1 0]
------------------------------
Unique values in 'Dependents':
['No' 'Yes']
------------------------------
Unique values in 'tenure':
[ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
------------------------------
Unique values in 'PhoneService':
['No' 'Yes']
------------------------------
Unique values in 'MultipleLines':
['No phone service' 'No' 'Yes']
------------------------------
Unique values in 'InternetService':
['DSL' 'Fiber optic' 'No']
------------------------------
Unique values in

In [11]:
#Define the columns with only 'Yes', 'No' values to change to 0 and 1 values
columns_to_change_to_0_1_values = ['Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
#Select the columns and apply the function to all of them at once
df[columns_to_change_to_0_1_values] = df[columns_to_change_to_0_1_values].apply(lambda col: col.map({'Yes': 1, 'No': 0}).astype('int8'))

In [12]:
#Confirm data type for the columns has been updated to int8
print(df[columns_to_change_to_0_1_values].dtypes)

Dependents          int8
PhoneService        int8
PaperlessBilling    int8
Churn               int8
dtype: object


In [13]:
#Confirm columns from columns_to_change_to_0_1_values are now showing only 0 and 1 values
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,No,...,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,0,0,34,1,No,DSL,Yes,...,Yes,No,No,No,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,0,0,2,1,No,DSL,Yes,...,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,0,0,2,1,No,Fiber optic,No,...,No,No,No,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [14]:
#Loop through the list and print the unique values for each column to confirm changes to 'Yes', 'No' values were updated correctly
for column in columns_to_check_for_unique_values:
    print(f"Unique values in '{column}':")
    print(df[column].unique())
    print("-" * 30) # Adds a separator for clarity

Unique values in 'customerID':
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
------------------------------
Unique values in 'gender':
['Female' 'Male']
------------------------------
Unique values in 'SeniorCitizen':
[0 1]
------------------------------
Unique values in 'Partner':
[1 0]
------------------------------
Unique values in 'Dependents':
[0 1]
------------------------------
Unique values in 'tenure':
[ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
------------------------------
Unique values in 'PhoneService':
[0 1]
------------------------------
Unique values in 'MultipleLines':
['No phone service' 'No' 'Yes']
------------------------------
Unique values in 'InternetService':
['DSL' 'Fiber optic' 'No']
------------------------------
Unique values in 'OnlineSecuri

In [15]:
#One-Hot Encode the multi-category columns
# Define the list of columns to be one-hot encoded
columns_to_encode = [
    'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']

df_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=False, dtype='int8')
print("Applied one-hot encoding to categorical columns.")

Applied one-hot encoding to categorical columns.


In [16]:
print("\n\n--- 3. Final Processed DataFrame ---")
print(df_encoded.head())
print("\n--- Final Data Types and Columns ---")
df_encoded.info()

#df_encoded is the new df to work with



--- 3. Final Processed DataFrame ---
   customerID  gender  SeniorCitizen  Partner  Dependents  tenure  \
0  7590-VHVEG  Female              0        1           0       1   
1  5575-GNVDE    Male              0        0           0      34   
2  3668-QPYBK    Male              0        0           0       2   
3  7795-CFOCW    Male              0        0           0      45   
4  9237-HQITU  Female              0        0           0       2   

   PhoneService  PaperlessBilling  MonthlyCharges TotalCharges  ...  \
0             0                 1           29.85        29.85  ...   
1             1                 0           56.95       1889.5  ...   
2             1                 1           53.85       108.15  ...   
3             0                 0           42.30      1840.75  ...   
4             1                 1           70.70       151.65  ...   

   StreamingMovies_No  StreamingMovies_No internet service  \
0                   1                                    

In [17]:
#This line both maps the values for 'Female' to 1 and 'Male' to 0 and sets the data type to 'int8'
df_encoded['gender'] = df_encoded['gender'].map({'Female': 1, 'Male': 0}).astype('int8')

In [18]:
#Confirm the gender column has 0 and 1 
df_encoded.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,1,0,1,0,1,0,1,29.85,29.85,...,1,0,0,1,0,0,0,0,1,0
1,5575-GNVDE,0,0,0,0,34,1,0,56.95,1889.5,...,1,0,0,0,1,0,0,0,0,1
2,3668-QPYBK,0,0,0,0,2,1,1,53.85,108.15,...,1,0,0,1,0,0,0,0,0,1
3,7795-CFOCW,0,0,0,0,45,0,0,42.3,1840.75,...,1,0,0,0,1,0,1,0,0,0
4,9237-HQITU,1,0,0,0,2,1,1,70.7,151.65,...,1,0,0,1,0,0,0,0,1,0


In [19]:
#confirm the gender column data type is 'int8' 
print(df_encoded.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 42 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               7043 non-null   object 
 1   gender                                   7043 non-null   int8   
 2   SeniorCitizen                            7043 non-null   int64  
 3   Partner                                  7043 non-null   int8   
 4   Dependents                               7043 non-null   int8   
 5   tenure                                   7043 non-null   int64  
 6   PhoneService                             7043 non-null   int8   
 7   PaperlessBilling                         7043 non-null   int8   
 8   MonthlyCharges                           7043 non-null   float64
 9   TotalCharges                             7043 non-null   object 
 10  Churn                                    7043 no

In [20]:
#Change 'TotalCharges' from object to float64
# First, replace empty spaces with NaN, then convert to float64
df_encoded['TotalCharges'] = df_encoded['TotalCharges'].replace(' ', np.nan)
df_encoded['TotalCharges'] = df_encoded['TotalCharges'].astype('float64')

In [21]:
print(df_encoded.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 42 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               7043 non-null   object 
 1   gender                                   7043 non-null   int8   
 2   SeniorCitizen                            7043 non-null   int64  
 3   Partner                                  7043 non-null   int8   
 4   Dependents                               7043 non-null   int8   
 5   tenure                                   7043 non-null   int64  
 6   PhoneService                             7043 non-null   int8   
 7   PaperlessBilling                         7043 non-null   int8   
 8   MonthlyCharges                           7043 non-null   float64
 9   TotalCharges                             7032 non-null   float64
 10  Churn                                    7043 no