In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
from sklearn.preprocessing import LabelEncoder

# 1. Load and Prepare the Dataset

In [None]:
# Read 'data.csv' (resolved against the current working directory) into `df`
# and print its shape plus a five-row preview.
try:
    df = pd.read_csv('data.csv')
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() is an interactive-shell helper
    # and, in a notebook, ends the session rather than reporting the failure
    # in the cell (in a plain script it would also exit with status 0).
    # Raising stops execution here with a traceback and the same guidance.
    raise FileNotFoundError(
        "Error: 'data.csv' not found. Please make sure the dataset file is in the same directory as the script."
    ) from err
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print("Dataset loaded successfully.")
    print("Dataset shape:", df.shape)
    print("\nFirst 5 rows of the dataset:")
    print(df.head())

# 2. Data Preprocessing

In [None]:
# Name of the label column, and the raw label values taken from the loaded frame.
target_column = 'Churn'
y = df.loc[:, target_column]

In [None]:
# Feature matrix: every column of the loaded frame except the label column.
X = df.drop(columns=[target_column])

In [None]:
# Columns removed from the feature matrix before modelling.
features_to_drop = ['State', 'Account length', 'Phone number']
# errors='ignore' skips any listed column that is not present instead of
# raising KeyError. This matches the `if col in X.columns` guard used when
# encoding the plan columns, and lets this cell be re-run after the columns
# have already been dropped from X.
X = X.drop(columns=features_to_drop, errors='ignore')

In [None]:
# Replace each plan column that is present in X with integer codes,
# using a fresh LabelEncoder per column.
plan_columns = ('International plan', 'Voice mail plan')
for col in plan_columns:
    if col not in X.columns:
        continue  # column absent from this dataset; nothing to encode
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

In [None]:
# Encode the churn labels as integers; `le_churn` is kept because its
# `classes_` are used later to label the classification report and the
# confusion-matrix axes.
le_churn = LabelEncoder()
# Fit on the original label column rather than on `y` itself so the cell is
# idempotent: re-running it would otherwise encode the already-encoded
# integers and `le_churn.classes_` would hold the codes, not the labels.
y = le_churn.fit_transform(df[target_column])

# 3. Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, with a fixed seed for repeatability.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Report how many rows landed in each split.
n_train_samples, n_test_samples = len(X_train), len(X_test)
print(f"\nTraining set size: {n_train_samples} samples")
print(f"Test set size: {n_test_samples} samples")

# 4. Train and Tune the Random Forest Model

In [None]:
# Progress note emitted before the grid search is configured and run.
tuning_start_message = "\nStarting hyperparameter tuning with GridSearchCV..."
print(tuning_start_message)

In [None]:
# Search space for the random forest: every combination of the values
# below is evaluated by the grid search (2 * 3 * 2 * 2 * 2 = 48 candidates).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    criterion=['gini', 'entropy'],
)

In [None]:
# Base estimator handed to the grid search; the fixed seed makes the
# forest's randomness repeatable between runs.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive search over `param_grid` with 5-fold cross-validation, ranking
# candidates by F1 score and using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the search on the training split only; the test split stays unseen.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Keep the estimator exposed by the finished grid search as its best one;
# it is used below for prediction and for reading feature importances.
best_rf = grid_search.best_estimator_

In [None]:
# Announce completion and show the winning hyperparameter combination,
# one item per output line.
print(
    "\nHyperparameter tuning finished.",
    "Best Parameters found by GridSearchCV:",
    grid_search.best_params_,
    sep="\n",
)

# 5. Evaluate the Model

In [None]:
# Predict labels for the held-out test split with the tuned forest.
y_pred = best_rf.predict(X=X_test)

In [None]:
# Per-class precision/recall/F1 on the test split, labelled with the
# original (pre-encoding) class names rendered as strings.
report_labels = [str(label) for label in le_churn.classes_]
report_text = classification_report(y_test, y_pred, target_names=report_labels)
print("\n--- Model Evaluation ---", "\nClassification Report:", report_text, sep="\n")


In [None]:
# Confusion matrix of test-split predictions, drawn as an annotated heatmap
# whose ticks carry the original class labels.
print("\nConfusion Matrix:")
tick_labels = le_churn.classes_
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
# Label the axes returned by seaborn directly instead of via pyplot state.
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Fraction of test-split rows predicted correctly, shown to four decimals.
overall_accuracy = accuracy_score(y_test, y_pred)
print(f"\nOverall Accuracy: {overall_accuracy:.4f}")

# 6. Feature Importance Analysis

In [None]:
# Section banner for the feature-importance output that follows.
importance_banner = "\n--- Feature Importance Analysis ---"
print(importance_banner)

In [None]:
# Column labels of the training features, and the tuned forest's importance
# score for each of them; the two are paired up positionally further down.
feature_names = X_train.columns
importances = best_rf.feature_importances_

In [None]:
# Two-column table pairing each feature name with its importance score.
importance_columns = {'feature': feature_names, 'importance': importances}
feature_importance_df = pd.DataFrame(importance_columns)

In [None]:
# Order the table from most to least important feature.
feature_importance_df = feature_importance_df.sort_values(
    by='importance',
    ascending=False,
)

In [None]:
# Show the first ten rows of the importance table under a heading.
top_features = feature_importance_df.head(10)
print("\nTop 10 Most Important Features:", top_features, sep="\n")

In [None]:
# Horizontal bar chart of every feature's importance, drawn on an explicit
# 12x8 figure so labels and layout are set on known objects.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance_df, ax=ax)
ax.set_title('Feature Importance from Random Forest')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
fig.tight_layout()
plt.show()