In [14]:
# Telco Customer Churn - Data Cleaning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# 1. Load dataset
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that exact file exists; consider a configurable data directory.
RAW_CSV_PATH = r"C:\Users\vohuy\projects\Customer Retention & Churn Analysis\Data\Telco-Customer-Churn.csv"
df = pd.read_csv(RAW_CSV_PATH)



In [16]:
# 2. Inspect the raw data: dimensions, column dtypes and missing-value counts
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()
print(df.isnull().sum())

(7043, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null

In [17]:
# 3. Clean the 'TotalCharges' column
# The column is loaded as object dtype because some entries are blank (' ');
# errors='coerce' turns every non-numeric entry into NaN.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Report how many values are missing after the conversion
print("Missing in TotalCharges:", total_charges.isna().sum())

# Impute the missing rows (typically ~11) with the column median; dropping
# those rows is the alternative.
# NOTE(review): presumably the blank rows are brand-new customers — verify
# against `tenure` whether the median is the right fill value.
df['TotalCharges'] = total_charges.fillna(total_charges.median())

Missing in TotalCharges: 11


In [18]:
# 4. Drop the ID column (not used in the analysis)
# Rebinding instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because 'customerID' is already gone.
df = df.drop(columns=['customerID'], errors='ignore')

In [19]:
# 5. Normalise data types
# tenure, MonthlyCharges and TotalCharges must all be numeric; any value that
# cannot be parsed becomes NaN (errors='coerce').
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce') for name in numeric_cols
})

In [20]:
# 6. Encode Yes/No columns as 1/0
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
# 1 and 0 map to themselves so that re-running this cell on already-encoded
# data is a no-op instead of silently turning every value into NaN.
yes_no_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
for col in binary_cols:
    encoded = df[col].map(yes_no_codes)
    # Values outside the mapping would otherwise become NaN without notice;
    # values that were already missing before the mapping are left alone.
    unmapped = df.loc[encoded.isna() & df[col].notna(), col].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in '{col}': {list(unmapped)}")
    df[col] = encoded

In [21]:
# 7. SeniorCitizen already holds 0/1; store it with a categorical dtype
senior_flag = df['SeniorCitizen']
df['SeniorCitizen'] = senior_flag.astype(pd.CategoricalDtype())

In [22]:
# 🔹 Save the EDA version (before one-hot encoding)
eda_csv = "Telco-Customer-Churn-EDA.csv"
df.to_csv(eda_csv, index=False)

print(f"✅ Saved: {eda_csv} (for EDA)")

✅ Saved: Telco-Customer-Churn-EDA.csv (for EDA)


In [23]:
# 8. One-hot encode the categorical variables that have more than two values
categorical_cols = [
    'gender',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

# drop_first=True omits the first level of every encoded variable
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)


In [24]:
# 9. Review the encoded result
print(df_encoded.head())
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() would add a stray "None" line to the output.
df_encoded.info()


  SeniorCitizen  Partner  Dependents  tenure  PhoneService  PaperlessBilling  \
0             0        1           0       1             0                 1   
1             0        0           0      34             1                 0   
2             0        0           0       2             1                 1   
3             0        0           0      45             0                 0   
4             0        0           0       2             1                 1   

   MonthlyCharges  TotalCharges  Churn  gender_Male  ...  TechSupport_Yes  \
0           29.85         29.85      0        False  ...            False   
1           56.95       1889.50      0         True  ...            False   
2           53.85        108.15      1         True  ...            False   
3           42.30       1840.75      0         True  ...             True   
4           70.70        151.65      1        False  ...            False   

   StreamingTV_No internet service  StreamingTV_Yes  \
0

In [25]:
# 🔹 Save the cleaned version (for modeling)
cleaned_csv = "Telco-Customer-Churn-Cleaned.csv"
df_encoded.to_csv(cleaned_csv, index=False)

print(f"✅ Saved: {cleaned_csv} (for Modeling)")

✅ Saved: Telco-Customer-Churn-Cleaned.csv (for Modeling)
