In [None]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline
from copy import deepcopy 

In [None]:
# Load the raw churn dataset and work on an independent deep copy, so that
# `data` stays untouched as the original reference.
# NOTE(review): this is an absolute local path; the notebook only runs on a
# machine that has the file at exactly this location.
raw_csv_path = "D:/MyDatasets/ChurnModelling/data.csv"
data = pd.read_csv(raw_csv_path)
df = data.copy(deep=True)
data.shape, df.shape

In [None]:
# Preview the first rows of the working copy (pandas shows 5 by default).
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the working copy.
df.info()

# Observation 
| Feature Name       | Description                                                                 |
|--------------------|-----------------------------------------------------------------------------|
| `RowNumber`        | Index of the row                                                            |
| `CustomerId`       | Unique ID for each customer                                                 |
| `Surname`          | Customer’s last name                                                        |
| `CreditScore`      | Creditworthiness score                                                      |
| `Geography`        | Country of residence                                                        |
| `Gender`           | Male or Female                                                              |
| `Age`              | Customer’s age                                                              |
| `Tenure`           | Number of years the customer has been with the bank                         |
| `Balance`          | Account balance                                                             |
| `NumOfProducts`    | Number of bank products the customer uses (1–4)                             |
| `HasCrCard`        | 1 if the customer has a credit card, 0 otherwise                            |
| `IsActiveMember`   | 1 if the customer is active, 0 if not                                       |
| `EstimatedSalary`  | Estimated annual salary                                                     |
| `Exited`           | Target variable — 1 if the customer churned, 0 if they stayed               |

In [None]:
# Number of distinct values in every column, listed in column order.
# dropna=False counts a missing value as a value of its own, which matches
# taking len() of Series.unique().
df.nunique(dropna=False).tolist()

In [None]:
# Per-column summary: up to five sample values, the dtype, the number of
# distinct values and a rough numerical/categorical label.
# A column is labelled "numerical" when its dtype is not object AND it has
# more than 15 distinct values; every other column is labelled "categorical".
for feature in df.columns:
    unique_values = df[feature].unique()  # computed once and reused below

    # The label is worked out before the f-string: reusing double quotes
    # inside a double-quoted f-string is a SyntaxError before Python 3.12.
    if df[feature].dtype != "O" and len(unique_values) > 15:
        falling_category = "numerical"
    else:
        falling_category = "categorical"

    print(feature)
    print("-"*len(feature))
    print(
        f"samples:{unique_values.tolist()[:5]}\n"
        f"dtype:{df[feature].dtype}, count:{len(unique_values)}, "
        f"falling_category:{falling_category}"
    )
    print("="*140)

In [None]:
# Columns that the feature-selection cells below filter out of the analysis:
# the row index, the customer ID and the surname.
unnecessory_features = [
    "RowNumber",
    "CustomerId",
    "Surname",
]
unnecessory_features

In [None]:
# Print, one per line, every column that is kept for the analysis, followed
# by a blank line.
# A plain loop replaces the list comprehension so that no throwaway list of
# `None` values is built just for the printing side effect.
for col in df.columns:
    if col not in unnecessory_features:
        print(col)
print()

In [None]:
# Split the retained columns into numerical and categorical features.
# A column counts as numerical when its dtype is not object and it has more
# than 15 distinct values; every other retained column is categorical.
numerical_features = []
categorical_features = []
for column_name in df.columns:
    if column_name in unnecessory_features:
        continue  # dropped columns belong to neither list
    column = df[column_name]
    looks_numerical = column.dtype != "O" and len(column.unique()) > 15
    bucket = numerical_features if looks_numerical else categorical_features
    bucket.append(column_name)

print(f"numerical:{numerical_features}, count:{len(numerical_features)}")
print(f"categorical:{categorical_features}, count:{len(categorical_features)}")

In [None]:
from math import ceil  # NOTE: the later plotting cells also rely on this import

# One density (KDE) panel per numerical feature, three panels per row.
fig = plt.figure(figsize=(16, 8))
fig.suptitle("Univariate analysis of numerical features", fontsize=30, fontweight='bold', alpha=0.8, y=1.)
n_rows = ceil(len(numerical_features)/3)
for index, feature in enumerate(numerical_features):
    ax = fig.add_subplot(n_rows, 3, index+1)
    # The frame is passed as `data=`: handing it over positionally only works
    # on seaborn >= 0.12, where `data` became the first parameter.
    sns.kdeplot(data=df, x=feature, fill=True, color="r", ax=ax)
    ax.set_xlabel(feature)
# The layout is computed once, after every panel exists, instead of once per
# loop iteration.
fig.tight_layout()

# Observation 
- CreditScore: Most account holders have a credit score in the range of 500 to 800.

- Age: Most account holders are between 20 and 50 years old.

- Balance: Account holders can be grouped by account balance into those below 50,000 and those above 50,000. There are more account holders with a balance below 50,000 than with a balance above 50,000.

- EstimatedSalary: Account holders have an estimated salary ranging from 0 to 200,000.

In [None]:
# One count plot per categorical feature, two panels per row.
fig = plt.figure(figsize=(12, 10))
fig.suptitle("Univariate analysis of categorical features", fontsize=30, fontweight='bold', alpha=0.8, y=1.)
n_rows = ceil(len(categorical_features)/2)
for index, feature in enumerate(categorical_features):
    ax = fig.add_subplot(n_rows, 2, index+1)
    # The frame is passed as `data=`: handing it over positionally only works
    # on seaborn >= 0.12, where `data` became the first parameter.
    sns.countplot(data=df, x=feature, color="r", ax=ax)
    ax.set_xlabel(feature)
# The layout is computed once, after every panel exists, instead of once per
# loop iteration.
fig.tight_layout()

# Observation 
- Geography: Most account holders are from France, while Germany and Spain have approximately equal numbers of account holders.

- Gender: There are more male account holders than female account holders.

- Tenure: Account holders who have been with the bank for 1 to 9 years are almost equally frequent in each category, whereas those with less than 1 year or more than 9 years are less frequent.

- NumOfProducts: Account holders using a single product or two products at the same time are far more frequent, whereas customers using three or four products at the same time are very rare.

- HasCrCard: Account holders with a credit card are clearly in the majority, but a substantial number of account holders still do not have one.

- IsActiveMember: Approximately 50% of account holders are inactive.

- Exited: Some account holders have churned.

In [None]:
# One pie chart per categorical feature (two per row); each slice is labelled
# with the category and its share formatted to two decimals.
fig = plt.figure(figsize=(15, 30))
fig.suptitle("percentage of data captured by each category of each categorical features", fontsize=30, fontweight='bold', alpha=0.8, y=1.)
n_rows = ceil(len(categorical_features)/2)
for index, feature in enumerate(categorical_features):
    # Count each category once and reuse it for both slice sizes and labels.
    category_counts = df[feature].value_counts()
    ax = fig.add_subplot(n_rows, 2, index+1)
    ax.pie(category_counts.values, labels=category_counts.index, autopct="%1.2f", textprops={'fontsize': 18})
    ax.set_xlabel(feature, fontdict={'fontsize': 18})
# The layout is computed once, after every panel exists, instead of once per
# loop iteration.
fig.tight_layout()

In [None]:
# Disabled helper (kept commented out, so this cell produces no output).
# If enabled it would print, for every categorical feature, a markdown
# heading "#### <feature>" followed by one "- <category> having <pct>% of data"
# line per category, where <pct> is that category's share of the rows.
# for feature in categorical_features:
#     print("#### "+feature)
#     for category in df[feature].value_counts().index:
#         print(f"- {category} having {(df[feature].value_counts().to_dict()[category])/sum(df[feature].value_counts().values)*100:.2f}% of data")
#     print()

# Observation
#### Geography
- France having 50.14% of data
- Germany having 25.09% of data
- Spain having 24.77% of data

#### Gender
- Male having 54.57% of data
- Female having 45.43% of data

#### Tenure
- 2 having 10.48% of data
- 1 having 10.35% of data
- 7 having 10.28% of data
- 8 having 10.25% of data
- 5 having 10.12% of data
- 3 having 10.09% of data
- 4 having 9.89% of data
- 9 having 9.84% of data
- 6 having 9.67% of data
- 10 having 4.90% of data
- 0 having 4.13% of data

#### NumOfProducts
- 1 having 50.84% of data
- 2 having 45.90% of data
- 3 having 2.66% of data
- 4 having 0.60% of data

#### HasCrCard
- 1 having 70.55% of data
- 0 having 29.45% of data

#### IsActiveMember
- 1 having 51.51% of data
- 0 having 48.49% of data

#### Exited
- 0 having 79.63% of data
- 1 having 20.37% of data

In [None]:
# Name of the output (label) column; the later plots pass it as `hue`.
target = "Exited"
target

In [None]:
# One bar plot per numerical feature (two per row), with bars split by the
# target class through `hue`.
fig = plt.figure(figsize=(16, 12))
fig.suptitle("Multivariate analysis of numerical features with output feature", fontsize=30, fontweight='bold', alpha=0.8, y=1.)
n_rows = ceil(len(numerical_features)/2)
for index, feature in enumerate(numerical_features):
    ax = fig.add_subplot(n_rows, 2, index+1)
    # The frame is passed as `data=`: handing it over positionally only works
    # on seaborn >= 0.12, where `data` became the first parameter.
    sns.barplot(data=df, y=feature, hue=target, ax=ax)
    ax.set_xlabel(target)
    ax.set_ylabel(feature)
# The layout is computed once, after every panel exists, instead of once per
# loop iteration.
fig.tight_layout()

# Observation
- Age: Customers older than 35 tend to churn.

In [None]:
# Take the target out of the categorical feature list so that the following
# plots only compare predictors against it.
# The list is rebuilt instead of calling list.remove(): a second run of this
# cell would otherwise raise ValueError because the target is already gone.
categorical_features = [feature for feature in categorical_features if feature != target]

In [None]:
# Display the categorical feature list as it stands after the target removal.
categorical_features

In [None]:
# One count plot per categorical feature (two per row), with the counts split
# by the target class through `hue`.
fig = plt.figure(figsize=(16, 12))
fig.suptitle("Multivariate analysis of categorical features with output feature", fontsize=30, fontweight='bold', alpha=0.8, y=1.)
n_rows = ceil(len(categorical_features)/2)
for index, feature in enumerate(categorical_features):
    ax = fig.add_subplot(n_rows, 2, index+1)
    # The frame is passed as `data=`: handing it over positionally only works
    # on seaborn >= 0.12, where `data` became the first parameter.
    sns.countplot(data=df, x=feature, hue=target, ax=ax)
# The layout is computed once, after every panel exists, instead of once per
# loop iteration.
fig.tight_layout()

# Observation 
- Geography: Germany has comparatively few customers, but they have churned a lot compared to the other regions.

- NumOfProducts: Customers who use only a single product at a time churn a lot.

- HasCrCard: Account holders with credit cards churn more, but they are also more frequent than non-credit-card holders.

- IsActiveMember: Account holders who are not active members are more likely to churn.

In [None]:
# Summary statistics of the numerical features.
numerical_summary = df[numerical_features].describe()
numerical_summary

In [None]:
# Pairwise correlation between the numerical features, drawn as a heatmap
# with the coefficient written in every cell.
numerical_corr = df[numerical_features].corr()
sns.heatmap(numerical_corr, annot=True)

In [None]:
# Grid of pairwise plots over the numerical features.
numerical_frame = df[numerical_features]
sns.pairplot(numerical_frame)