In [None]:
# Task 1: Data Preparation
import numpy as np
import pandas as pd

# Load the raw workbook and report its structure before any cleaning.
file_path = r"C:\MLProjects\sklearn.xlsx"
df = pd.read_excel(file_path)
print("Dataset Shape (Rows, Columns):", df.shape)
print("\nMissing Values per Column:\n", df.isna().sum())
print("\nData Types before preprocessing:\n", df.dtypes)

# Force TotalCharges to numeric: entries that cannot be parsed become NaN
# (errors="coerce") and are then replaced by the column mean.
# NOTE(review): the mean is taken over the whole dataset, i.e. before any
# train/test split happens in the later cells.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges.fillna(total_charges.mean())

# customerID is dropped before encoding so it is not one-hot encoded below.
df = df.drop(columns="customerID")

# One-hot encode every remaining object-typed column; drop_first=True keeps
# k-1 indicator columns per k-level feature (the target becomes "Churn_Yes").
cat_cols = df.select_dtypes(include="object").columns
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print("\nAfter Preprocessing:")
print("Dataset Shape:", df.shape)
print("\nData Types after preprocessing:\n", df.dtypes)


Dataset Shape (Rows, Columns): (7043, 21)

Missing Values per Column:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Data Types before preprocessing:
 customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object

In [None]:
# Persist the preprocessed frame so the later task cells can reload it from disk.
# index=False keeps the pandas row index out of the written sheet.
output_path = r"C:\MLProjects\prepared_data.xlsx"
df.to_excel(output_path, index=False)
print("Preprocessed dataset saved at: " + output_path)

In [None]:
# Task 2: Train/test split
import pandas as pd
from sklearn.model_selection import train_test_split

# Reload the preprocessed data written by the previous cell.
file_path = r"C:\MLProjects\prepared_data.xlsx"
df = pd.read_excel(file_path)

# "Churn_Yes" is the target; every other column is a feature.
y = df["Churn_Yes"]
X = df.drop(columns="Churn_Yes")

# 80/20 split, stratified on the target so both partitions keep the same
# churn ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Training Set Shape:", X_train.shape, y_train.shape)
print("Testing Set Shape:", X_test.shape, y_test.shape)


Training Set Shape: (5634, 30) (5634,)
Testing Set Shape: (1409, 30) (1409,)


In [None]:
# Task 3: Feature importance from a random forest
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Reload the preprocessed data and rebuild the same stratified 80/20 split.
file_path = r"C:\MLProjects\prepared_data.xlsx"
df = pd.read_excel(file_path)
y = df["Churn_Yes"]
X = df.drop(columns="Churn_Yes")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit a seeded forest on the training partition only.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Pair each training column with its importance score, highest first.
importance_table = pd.DataFrame(
    {
        "Feature": X_train.columns,
        "Importance": model.feature_importances_,
    }
)
feature_importances = importance_table.sort_values(by="Importance", ascending=False)

print("Top 10 Important Features:")
print(feature_importances.head(10))


Top 10 Important Features:
                           Feature  Importance
3                     TotalCharges    0.192829
1                           tenure    0.174197
2                   MonthlyCharges    0.168527
28  PaymentMethod_Electronic check    0.038726
10     InternetService_Fiber optic    0.038579
25               Contract_Two year    0.030102
13              OnlineSecurity_Yes    0.028420
4                      gender_Male    0.028348
26            PaperlessBilling_Yes    0.025472
5                      Partner_Yes    0.023348


In [None]:
# Task 4: Compare candidate classifiers on the held-out test set
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Reload the preprocessed data and rebuild the same stratified 80/20 split.
file_path = r"C:\MLProjects\prepared_data.xlsx"
df = pd.read_excel(file_path)
X = df.drop("Churn_Yes", axis=1)
y = df["Churn_Yes"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Logistic regression is wrapped in a pipeline that standardizes the features
# first: fitted on the raw columns, the solver reached max_iter without
# converging (ConvergenceWarning). The scaler inside the pipeline is fitted on
# X_train only, so no test-set statistics leak into training.
# The tree-based models are left on the raw features.
models = {
    "Logistic Regression": make_pipeline(
        StandardScaler(), LogisticRegression(max_iter=1000)
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Fit each candidate on the training partition and report test accuracy.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.8041
Decision Tree Accuracy: 0.7417
Random Forest Accuracy: 0.7878
Gradient Boosting Accuracy: 0.7970


In [None]:
# Task 5: Model Training

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reload the preprocessed data and rebuild the same stratified 80/20 split.
file_path = r"C:\MLProjects\prepared_data.xlsx"
df = pd.read_excel(file_path)
y = df["Churn_Yes"]
X = df.drop(columns="Churn_Yes")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Final model: logistic regression trained on the standardized features.
model = LogisticRegression(max_iter=5000, random_state=42).fit(X_train_scaled, y_train)

print("Model training completed successfully!")

Model training completed successfully!


In [None]:
# Task 6: Model Evaluation
# Relies on `model`, `X_test_scaled` and `y_test` left in the kernel by the
# Task 5 cell, so that cell must have been run first.

import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Hard class predictions, plus the predicted probability of the second class
# (column 1 of predict_proba), which is what ROC-AUC is scored on.
y_pred = model.predict(X_test_scaled)
y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]

# Threshold-based metrics use the hard labels; ROC-AUC uses the probabilities.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)

print("Model Evaluation Results:")
print(f"Accuracy      : {accuracy:.4f}")
print(f"Precision     : {precision:.4f}")
print(f"Recall        : {recall:.4f}")
print(f"F1-Score      : {f1:.4f}")
print(f"ROC-AUC Score : {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))


Model Evaluation Results:
Accuracy      : 0.8070
Precision     : 0.6584
Recall        : 0.5668
F1-Score      : 0.6092
ROC-AUC Score : 0.8416

Classification Report:
              precision    recall  f1-score   support

       False       0.85      0.89      0.87      1035
        True       0.66      0.57      0.61       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409

