# Bank Marketing - Exploratory Data Analysis (EDA)

## Objective
The goal of this notebook is to perform a comprehensive exploratory analysis of the Bank Marketing dataset to understand the data distribution, identify patterns, and uncover relationships between features and the target variable (term deposit subscription).

In [2]:
# --- Imports: standard library first, then third-party ---
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Notebook-wide display configuration ---
# Seaborn grid style, a larger default figure size, and no truncation of
# wide DataFrames when they are displayed.
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))
pd.options.display.max_columns = None

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings(action='ignore')

## 1. Data Loading and Overview

In [3]:
# Load the dataset
# Using the same path as the modeling notebook
# The path is relative, so it is resolved against the kernel's working
# directory; the file is parsed with ';' as the field delimiter.
df = pd.read_csv("sample_data/bank-additional-full.csv", sep=';')

# Display first few rows
# (bare last expression -> rendered as a rich table by the notebook)
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
# Dataset Info
# Print the (rows, columns) shape, then the per-column non-null counts and
# dtypes reported by DataFrame.info().
print(f"Shape of dataset: {df.shape}")
df.info()

Shape of dataset: (41188, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  c

In [5]:
# Statistical Summary
# With default arguments describe() summarises only the numeric columns:
# count, mean, std, min, quartiles and max for each.
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [None]:
# Check for Missing Values
# Prints the number of null (NaN/None) entries in each column.
# NOTE: string placeholders such as 'unknown' (visible in the `default`
# column of the preview) are ordinary values, not nulls, so they are not
# counted here.
print("Missing Values:\n", df.isnull().sum())

In [None]:
# Check for Duplicates
# Count rows that are exact copies of an earlier row (all columns equal).
duplicates = df.duplicated().sum()
print(f"Duplicate Rows: {duplicates}")

if duplicates > 0:
    print("Dropping duplicates...")
    # Rebind `df` to the de-duplicated frame instead of using inplace=True:
    # the object read from disk is left untouched, and the statement reads
    # as an explicit "df is now the cleaned frame" step. The first
    # occurrence of each duplicated row is kept (pandas default).
    df = df.drop_duplicates()
    print(f"New shape: {df.shape}")

## 2. Target Variable Analysis (`y`)

In [None]:
# Bar chart of how many rows fall into each target class.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='y', palette='viridis').set_title(
    'Distribution of Target Variable (y)'
)
plt.show()

# Calculate class imbalance
# Share of each class expressed as a percentage of all rows.
target_counts = df['y'].value_counts(normalize=True).mul(100)
print(f"Class Distribution:\n{target_counts}")

## 3. Univariate Analysis
Analyzing independent variables one by one.

In [None]:
# Separate Numerical and Categorical columns
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove 'y' from categorical if present (though it's target)
if 'y' in categorical_cols:
    categorical_cols.remove('y')
    
print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)

### Numerical Features Distribution

In [None]:
# Plot histograms for numeric features
df[numeric_cols].hist(bins=20, figsize=(15, 10), layout=(4, 3))
plt.suptitle('Histograms of Numerical Features')
plt.show()

In [None]:
# Boxplots to check outliers
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(4, 3, i)
    sns.boxplot(x=df[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

### Categorical Features Distribution

In [None]:
# One horizontal count plot per categorical feature, with the categories
# ordered from most to least frequent.
for col in categorical_cols:
    plt.figure(figsize=(10, 4))
    sns.countplot(
        data=df,
        y=col,
        order=df[col].value_counts().index,
        palette='muted',
    ).set_title(f'Distribution of {col}')
    plt.show()

## 4. Bivariate Analysis
Analyzing the relationship between features and the target variable.

### Numeric vs Target

In [None]:
# Boxplots of Numeric features by Target
plt.figure(figsize=(15, 15))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(4, 3, i)
    sns.boxplot(x='y', y=col, data=df)
    plt.title(f'{col} vs Target (y)')
plt.tight_layout()
plt.show()

Key observations here usually include:
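- `duration`: Often shows a very strong relationship (longer calls -> higher subscription rate). However, the call duration is only known once the call has ended, so using it as a predictor leaks information about the outcome; treat it as descriptive only and exclude it from any realistic predictive model.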
- `euribor3m`, `nr.employed`: Economic indicators usually show distinct distributions for 'yes' vs 'no'.

### Categorical vs Target

In [None]:
# For each categorical feature, draw a 100%-stacked bar chart showing how the
# target classes split within every category, ordered by the 'yes' share.
for col in categorical_cols:
    # Calculate proportion of 'yes' for each category
    # fill_value=0: a category with no rows for one of the classes gets an
    # explicit 0 share instead of NaN, so sorting and stacking stay well-defined.
    prop_df = df.groupby(col)['y'].value_counts(normalize=True).unstack(fill_value=0)
    if 'yes' in prop_df.columns:
        prop_df = prop_df.sort_values('yes', ascending=False)

    # DataFrame.plot opens its own figure (sized by `figsize`), so there is
    # no preceding plt.figure() call: that call only left an empty,
    # never-drawn-on figure behind on every iteration.
    prop_df.plot(kind='bar', stacked=True, colormap='viridis', figsize=(10, 5))
    plt.title(f'Subscription Rate by {col}')
    plt.ylabel('Proportion')
    plt.legend(title='Target (y)', loc='upper right')
    plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation Matrix for numerical features
plt.figure(figsize=(12, 10))
corr_matrix = df[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

Strong correlations to note:
- `emp.var.rate`, `euribor3m`, `nr.employed` are often highly correlated with each other, representing the economic context.

## 6. Key Insights & Conclusion
*Add your summary of findings here based on the plots above.*