In [38]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [39]:
df = pd.read_csv("Data.csv")

print("Dataset loaded successfully ✅")
print("Dataset shape:", df.shape)


Dataset loaded successfully ✅
Dataset shape: (7043, 21)


In [40]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

print("Missing values after cleaning:")
print(df.isnull().sum().sum())

print("Dropping customerID column...")
df.drop("customerID", axis=1, inplace=True)


Missing values after cleaning:
0
Dropping customerID column...


In [41]:
df_encoded = pd.get_dummies(df, drop_first=True)

print("Encoding completed.")
print("Encoded dataset shape:", df_encoded.shape)


Encoding completed.
Encoded dataset shape: (7043, 31)


In [42]:
X = df_encoded.drop("Churn_Yes", axis=1)
y = df_encoded["Churn_Yes"]

print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)


Feature matrix shape: (7043, 30)
Target vector shape: (7043,)


In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (5634, 30)
Testing set size: (1409, 30)


In [44]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed")


Feature scaling completed


In [45]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)

print("Logistic Regression training completed")


Logistic Regression training completed


In [46]:
y_pred_log = log_model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Precision:", precision_score(y_test, y_pred_log))
print("Recall:", recall_score(y_test, y_pred_log))
print("F1 Score:", f1_score(y_test, y_pred_log))

print("Classification Report:")
print(classification_report(y_test, y_pred_log))


Accuracy: 0.8197303051809794
Precision: 0.683076923076923
Recall: 0.5951742627345844
F1 Score: 0.6361031518624641
Classification Report:
              precision    recall  f1-score   support

       False       0.86      0.90      0.88      1036
        True       0.68      0.60      0.64       373

    accuracy                           0.82      1409
   macro avg       0.77      0.75      0.76      1409
weighted avg       0.81      0.82      0.82      1409



In [47]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

print("Decision Tree training completed ✅")


Decision Tree training completed ✅


In [48]:
y_pred_dt = dt_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Precision:", precision_score(y_test, y_pred_dt))
print("Recall:", recall_score(y_test, y_pred_dt))
print("F1 Score:", f1_score(y_test, y_pred_dt))

print("Classification Report:")
print(classification_report(y_test, y_pred_dt))


Accuracy: 0.7097232079489
Precision: 0.45187165775401067
Recall: 0.45308310991957107
F1 Score: 0.4524765729585007
Classification Report:
              precision    recall  f1-score   support

       False       0.80      0.80      0.80      1036
        True       0.45      0.45      0.45       373

    accuracy                           0.71      1409
   macro avg       0.63      0.63      0.63      1409
weighted avg       0.71      0.71      0.71      1409



In [49]:
feature_importance = pd.Series(
    log_model.coef_[0],
    index=X.columns
).sort_values(ascending=False)

print("Top 10 important features:")
print(feature_importance.head(10))


Top 10 important features:
TotalCharges                      0.627137
InternetService_Fiber optic       0.622950
StreamingMovies_Yes               0.230364
StreamingTV_Yes                   0.180316
MultipleLines_Yes                 0.168662
PaperlessBilling_Yes              0.163692
PaymentMethod_Electronic check    0.150779
SeniorCitizen                     0.058427
MultipleLines_No phone service    0.034680
DeviceProtection_Yes              0.028529
dtype: float64


In [50]:
param_grid = {
    "max_depth": [3, 5, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring="f1"
)

grid.fit(X_train, y_train)

print("Best parameters found:")
print(grid.best_params_)


Best parameters found:
{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [51]:
best_dt = grid.best_estimator_
y_pred_best = best_dt.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("Precision:", precision_score(y_test, y_pred_best))
print("Recall:", recall_score(y_test, y_pred_best))
print("F1 Score:", f1_score(y_test, y_pred_best))


Accuracy: 0.8076650106458482
Precision: 0.7107438016528925
Recall: 0.46112600536193027
F1 Score: 0.5593495934959349
