In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Read the customer-churn CSV into the working DataFrame.
# NOTE: despite its name, `url` holds a relative file path, so the CSV is
# looked up relative to the notebook's current working directory.
url = 'telco_customer_churn.csv'
df = pd.read_csv(filepath_or_buffer=url)

In [38]:
# Show the first five rows (head()'s default count) as plain text.
print(df.head(n=5))

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  female              0     yes         no       1           no   
1  5575-GNVDE    male              0      no         no      34          yes   
2  3668-QPYBK    male              0      no         no       2          yes   
3  7795-CFOCW    male              0      no         no      45           no   
4  9237-HQITU  female              0      no         no       2          yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  no phone service             dsl             no  ...               no   
1                no             dsl            yes  ...              yes   
2                no             dsl            yes  ...               no   
3  no phone service             dsl            yes  ...              yes   
4                no     fiber optic             no  ...               no   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [40]:
# Report, per column, how many entries pandas considers missing (NaN/None).
# Empty or whitespace-only strings are NOT counted as missing by isna().
# NOTE(review): TotalCharges is coerced to numeric in another cell; if it is
# still text when this cell runs, blank entries will not show up here - confirm
# the intended cell order.
print(df.isna().sum())

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [16]:
# Missing-value handling, step 1.
# Rows without a Churn label are removed in place.
df.dropna(inplace=True, subset=['Churn'])
# Parse TotalCharges as numbers; errors='coerce' turns any entry that cannot
# be parsed into NaN instead of raising.
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors='coerce')

In [18]:
# Split the column labels into a numeric group and a non-numeric group.
# 'number' matches every numeric dtype rather than only float64/int64, so a
# column stored as e.g. int32 or float32 is still classified as numeric.
# The two groups are complementary: every column lands in exactly one of them.
# NOTE(review): if TotalCharges has not been converted to numeric before this
# cell runs, it would presumably fall into non_numeric_cols - confirm order.
numeric_cols = df.select_dtypes(include='number').columns
non_numeric_cols = df.select_dtypes(exclude='number').columns

In [20]:
# Impute numeric gaps: each NaN is replaced by the mean of its own column.
# The numeric slice is taken once and reused for both the means and the fill.
# NOTE(review): the means are computed over the whole frame, before the
# train/test split performed in a later cell, so test rows influence the
# imputed values - confirm this is acceptable.
df[numeric_cols] = df[numeric_cols].pipe(
    lambda numeric_part: numeric_part.fillna(numeric_part.mean())
)

In [22]:
# Impute non-numeric gaps: each NaN is replaced by its column's most frequent value.
# mode() can return several rows when values tie; iloc[0] keeps the first row,
# i.e. exactly one fill value per column.
# NOTE(review): non_numeric_cols is built from every non-numeric column, which
# presumably includes customerID and Churn - confirm that is intended.
df[non_numeric_cols] = df[non_numeric_cols].pipe(
    lambda text_part: text_part.fillna(text_part.mode().iloc[0])
)


In [24]:
# De-duplicate in place. No subset is given, so a row is dropped only when it
# matches an earlier row in every column; the first occurrence is kept.
df.drop_duplicates(keep='first', inplace=True)

In [26]:
# Standardizing formats: lower-case the text columns so that values differing
# only in letter case (e.g. 'Yes' vs 'yes') collapse into one category.
# The columns are processed in a single loop instead of one copy-pasted
# statement per column; the column set and order are unchanged.
text_columns_to_lowercase = (
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
)
for column_name in text_columns_to_lowercase:
    # .str.lower() leaves missing entries as NaN and only changes letter case.
    df[column_name] = df[column_name].str.lower()


In [28]:
# Target: the Churn label column.
y = df['Churn']
# Features: every remaining column except customerID, which is dropped here
# together with the target instead of being used as a predictor.
X = df.drop(['customerID', 'Churn'], axis=1)

In [30]:
# One-hot encode the categorical feature columns.
# drop_first=True omits the first level of each encoded column, so a column
# with k categories becomes k-1 indicator columns.
X = pd.get_dummies(data=X, drop_first=True)


In [42]:
# Split the dataset into training (80%) and test (20%) sets.
# stratify=y keeps the proportion of each Churn class the same in both
# partitions, so the test set is not skewed if the classes are imbalanced.
# random_state fixes the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Print the size of training and test sets
print(f'Training set size: {X_train.shape[0]}')
print(f'Test set size: {X_test.shape[0]}')


Training set size: 5634
Test set size: 1409
