In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score, RocCurveDisplay

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
# Load the raw dataset from the working directory.
# If the file is missing we re-raise instead of calling exit(): in a notebook,
# exit() can shut down the kernel and hides the traceback, while later cells
# would fail anyway with a confusing NameError on `df`.
try:
    df = pd.read_csv('data.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: 'data.csv' not found. Please ensure the file is in the correct directory."
    ) from err
print("Dataset loaded successfully.")

In [None]:
# First look at the raw frame: sample rows, dtypes, null counts and the
# distribution of the 'Churn' target.
print("\n--- Initial Data Inspection ---")
print("First 5 rows of the dataset:", df.head(), sep="\n")

print("\nDataset Info:")
df.info()  # info() writes its report straight to stdout

print("\nMissing values per column:", df.isnull().sum(), sep="\n")
print("\nValue counts for 'Churn':", df['Churn'].value_counts(), sep="\n")

In [None]:
# Store the target as integers so the classifier and metrics see numeric labels.
# NOTE(review): assumes 'Churn' is bool or already numeric — confirm against the data.
df = df.assign(Churn=df['Churn'].astype(int))

In [None]:
# Remove the 'Phone' column when the dataset has one.
# NOTE(review): presumably a per-customer identifier with no predictive value — confirm.
if 'Phone' in df.columns:
    df = df.drop(columns=['Phone'])
    print("\n'Phone' column dropped.")

In [None]:
# Split the columns by dtype: object columns are treated as categorical,
# int64/float64 columns as numerical.
categorical_cols = df.select_dtypes(include=['object']).columns

# The target must not be listed as a feature; errors='ignore' keeps this safe
# when 'Churn' is not among the selected numeric columns.
numeric_candidates = df.select_dtypes(include=['int64', 'float64']).columns
numerical_cols = numeric_candidates.drop('Churn', errors='ignore')

In [None]:
# Report which columns were detected as categorical (object dtype) and as
# numerical (int64/float64, target excluded) so the split can be checked by eye.
print(f"\nCategorical columns identified: {list(categorical_cols)}")
print(f"Numerical columns identified: {list(numerical_cols)}")

In [None]:
# One-hot encode every categorical column. drop_first=True omits the first
# level of each categorical so the dummy columns are not perfectly collinear.
# (A stray trailing `y` after the call made this cell a SyntaxError.)
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [None]:
# Show the encoded frame and its new shape to confirm the dummy expansion.
print("\nDataset after One-Hot Encoding (first 5 rows):", df_encoded.head(), sep="\n")
print(f"Shape after encoding: {df_encoded.shape}")

In [None]:
# Separate the target vector from the feature matrix.
y = df_encoded['Churn']
X = df_encoded.drop(columns='Churn')

In [None]:
# Hold out 30% of the rows for testing. stratify=y keeps the churn ratio the
# same in both partitions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Sanity check: (rows, features) of the two partitions produced by the split.
print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

In [None]:
# Learn the per-feature mean and standard deviation from the training
# partition only, then apply the same transform to both partitions so no
# test-set statistics leak into the model.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# The scaler returns bare arrays; wrap them back into DataFrames that carry
# the original column names and row index.
X_train_scaled_df = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled_df = pd.DataFrame(
    data=X_test_scaled,
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# Progress message: the scaled train/test frames are ready for modelling.
print("\nFeatures scaled successfully.")

In [None]:
# Configure the classifier: liblinear solver with a fixed seed so repeated
# runs give the same fit.
print("\n--- Training Logistic Regression Model ---")
log_reg_model = LogisticRegression(
    solver='liblinear',
    random_state=42,
)

In [None]:
# Fit on the standardized training features; the fitted estimator is the
# cell's last expression, so the notebook displays it.
log_reg_model.fit(X=X_train_scaled_df, y=y_train)

In [None]:
# Progress message: fitting finished without raising.
print("Logistic Regression model trained successfully.")

In [None]:
# Pull the fitted parameters out of the model. coef_ and intercept_ are
# indexed with [0] to take the first (here: only expected) row/entry.
print("\n--- Model Interpretation ---")
feature_names = X_train_scaled_df.columns
intercept = log_reg_model.intercept_[0]
coefficients = log_reg_model.coef_[0]

In [None]:
# List the intercept and one log-odds coefficient per feature, in column order.
print(f"Model Intercept (bias): {intercept:.4f}")
print("\nModel Coefficients (Log-Odds):")
for position, feature in enumerate(feature_names):
    coef = coefficients[position]
    print(f"  {feature}: {coef:.4f}")

In [None]:
# Exponentiate the log-odds coefficients to get odds ratios. The model was fit
# on standardized features, so each ratio is the multiplicative change in the
# odds of churn per one-standard-deviation increase in that feature.
odds_ratios = np.exp(coefficients)

In [None]:
# Print one odds ratio per feature, in the same order as the coefficients.
print("\nOdds Ratios:")
for position, feature in enumerate(feature_names):
    odds_ratio = odds_ratios[position]
    print(f"  {feature}: {odds_ratio:.4f}")

In [None]:
# Short reading guide for the odds ratios printed by the notebook.
print("\nInterpretation Notes:")
for note in (
    "  - An odds ratio > 1 means the odds of churn increase with that feature.",
    "  - An odds ratio < 1 means the odds of churn decrease with that feature.",
    "  - An odds ratio close to 1 means the feature has little effect on churn odds.",
):
    print(note)

In [None]:
# Score the held-out set. predict_proba yields one column per class; column 1
# is kept as the churn probability.
# NOTE(review): assumes the labels are 0/1 so column 1 is the churn class — confirm.
print("\n--- Model Evaluation ---")
class_probabilities = log_reg_model.predict_proba(X_test_scaled_df)
y_pred_proba = class_probabilities[:, 1]

In [None]:
# Hard class predictions for the held-out set, used by the accuracy,
# confusion-matrix and classification-report cells.
y_pred = log_reg_model.predict(X_test_scaled_df)

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

In [None]:
# Confusion matrix of the test predictions (rows: true label, columns: predicted).
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("\nConfusion Matrix:", conf_matrix, sep="\n")

In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
class_labels = ['Not Churn (0)', 'Churn (1)']
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',  # integer counts in each cell
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

In [None]:
# Per-class precision, recall and F1 on the test set, with readable class names.
class_report = classification_report(
    y_test,
    y_pred,
    target_names=['Not Churn (0)', 'Churn (1)'],
)
print("\nClassification Report:", class_report, sep="\n")

In [None]:
# Threshold-independent evaluation from the predicted churn probabilities:
# the area under the ROC curve, and the curve's points themselves.
roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

In [None]:
# Report the area under the ROC curve computed from the test-set probabilities.
print(f"\nROC AUC Score: {roc_auc:.4f}")

In [None]:
# Plot the ROC curve of the fitted model on the held-out set.
# An explicit Axes is created and handed to RocCurveDisplay: without `ax=`,
# from_estimator opens its own figure, which left the plt.figure(figsize=...)
# canvas empty and silently ignored the requested size.
fig, ax = plt.subplots(figsize=(8, 7))
RocCurveDisplay.from_estimator(log_reg_model, X_test_scaled_df, y_test, ax=ax)

# Diagonal reference line for comparison against a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax.set_title('ROC Curve')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Closing summary of what the notebook did and how to read the metrics.
print("\n--- Conclusion ---")
for sentence in (
    "The Logistic Regression model has been trained and evaluated.",
    "The interpretation of coefficients and odds ratios provides insights into feature importance.",
    "The evaluation metrics (accuracy, precision, recall, confusion matrix, and ROC-AUC) give a comprehensive understanding of the model's performance on unseen data.",
    "A higher ROC AUC score indicates better discrimination between the two classes.",
):
    print(sentence)