In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, classification_report, confusion_matrix
from sklearn.feature_selection import RFE
import warnings
# Suppress all warnings for the remainder of the session; no category,
# message or module restriction is applied to the filter.
warnings.filterwarnings(action='ignore')

# Seed NumPy's legacy global RNG so np.random.* draws repeat across runs
np.random.seed(seed=42)

In [2]:
# Read the churn CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='churn.csv')

# Preview the top of the frame; head() is kept as the cell's last expression
# so the notebook renders it as a table
print("First 5 rows of the dataset:")
df.head(n=5)

First 5 rows of the dataset:


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Sex,Age,CurrentWorkingStatus,Tenure,Balance,NumOfProducts,ComplaintsLodged,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619.0,France,Female,1.0,42.0,1.0,2.0,0.0,1.0,1.0,1.0,1.0,101348.88,Yes
1,2,15647311,Hill,608.0,Spain,Female,1.0,41.0,1.0,1.0,83807.86,1.0,0.0,0.0,1.0,112542.58,No
2,3,15619304,Onio,502.0,France,Female,1.0,42.0,1.0,8.0,159660.8,3.0,1.0,1.0,0.0,113931.57,Yes
3,4,15701354,Boni,699.0,France,Female,1.0,39.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,?,No
4,5,15737888,Mitchell,850.0,Spain,Female,1.0,43.0,1.0,2.0,125510.82,1.0,0.0,1.0,1.0,79084.1,No


In [3]:
# Structural overview: (rows, columns) first, then the dtype of every column
print(f"\nDataset shape: {df.shape}")
print("\nData types:", df.dtypes, sep="\n")

# Descriptive statistics; describe() stays as the final expression so the
# notebook renders the resulting table
print("\nSummary statistics:")
df.describe()


Dataset shape: (10000, 17)

Data types:
RowNumber                 int64
CustomerId                int64
Surname                  object
CreditScore             float64
Geography                object
Gender                   object
Sex                     float64
Age                     float64
CurrentWorkingStatus    float64
Tenure                  float64
Balance                 float64
NumOfProducts           float64
ComplaintsLodged        float64
HasCrCard               float64
IsActiveMember          float64
EstimatedSalary          object
Exited                   object
dtype: object

Summary statistics:


Unnamed: 0,RowNumber,CustomerId,CreditScore,Sex,Age,CurrentWorkingStatus,Tenure,Balance,NumOfProducts,ComplaintsLodged,HasCrCard,IsActiveMember
count,10000.0,10000.0,9963.0,9805.0,9963.0,9963.0,9963.0,9963.0,9859.0,9963.0,9963.0,9963.0
mean,5000.5,15690940.0,650.624812,0.453952,38.898223,0.996587,5.014453,76501.435194,1.531088,0.207167,0.70551,0.515006
std,2886.89568,71936.19,96.606044,0.4979,10.541477,0.058321,2.89188,62411.069692,0.582007,0.405296,0.455836,0.4998
min,1.0,15565700.0,350.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,2500.75,15628530.0,584.0,0.0,32.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0
50%,5000.5,15690740.0,652.0,0.0,37.0,1.0,5.0,97234.58,1.0,0.0,1.0,1.0
75%,7500.25,15753230.0,718.0,1.0,44.0,1.0,7.5,127657.84,2.0,0.0,1.0,1.0
max,10000.0,15815690.0,850.0,1.0,92.0,1.0,10.0,250898.09,4.0,1.0,1.0,1.0


In [4]:
# Count NaN/None entries per column and list only the columns that have any
print("\nMissing values per column:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

# isnull() does not flag textual placeholders, so separately count the cells
# that hold the literal string '?'. Comparing the whole frame at once avoids
# gating on `dtype == 'object'` (which would silently skip text columns stored
# under a dedicated string or categorical dtype) and scans each column once
# instead of calling unique() and then value_counts() on it.
# Columns that cannot equal '?' (e.g. numeric ones) simply count zero.
print("\nChecking for '?' values or other potential issues:")
placeholder_counts = df.eq('?').sum()
for column, n_placeholders in placeholder_counts[placeholder_counts > 0].items():
    print(f"Column '{column}' contains '?' values: {n_placeholders} occurrences")


Missing values per column:
Surname                  86
CreditScore              37
Geography                37
Gender                   37
Sex                     195
Age                      37
CurrentWorkingStatus     37
Tenure                   37
Balance                  37
NumOfProducts           141
ComplaintsLodged         37
HasCrCard                37
IsActiveMember           37
EstimatedSalary         104
dtype: int64

Checking for '?' values or other potential issues:
Column 'EstimatedSalary' contains '?' values: 64 occurrences
