<a href="https://colab.research.google.com/github/karim-mammadov/Kaggle-Datasets-MyMLProject/blob/main/Bank_Marketing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
# Prompt for a file upload in Colab; a later cell copies `kaggle.json` from the
# working directory, so that is the file expected here.
files.upload()

In [None]:
# Create ~/.kaggle (no error if it already exists) and copy the uploaded token into it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Fetch the henriqueyamahata/bank-marketing dataset archive with the Kaggle CLI.
!kaggle datasets download -d henriqueyamahata/bank-marketing

In [None]:
import zipfile

In [None]:
# Unpack the downloaded archive into the current working directory.
# The path is relative (not '/content/...') because both the Kaggle download and
# extractall() operate on the working directory; on Colab that is /content, so
# behaviour there is unchanged, while the cell also works in other environments.
with zipfile.ZipFile('bank-marketing.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# DATA VISUALIZATION

In [None]:
# Lift the column limit so wide DataFrames are displayed without truncating columns.
pd.set_option('display.max_columns', None)

In [None]:
# Load the semicolon-separated campaign data that was extracted into the working
# directory. A relative path is used so the cell does not depend on Colab's
# /content directory (on Colab the working directory is /content, so the same
# file is read).
df = pd.read_csv('bank-additional-full.csv', sep=';')
df

In [None]:
# Count of each target class.
fig, ax = plt.subplots()
sns.countplot(data=df, x='y', ax=ax)
ax.set_title('Target Variable Distribution (Yes/No)')
plt.show()

In [None]:
# Target counts per job category; labels are rotated so they do not overlap.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='job', hue='y', ax=ax)
ax.set_title('Job vs Target')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Age spread for each target class.
fig, ax = plt.subplots()
sns.boxplot(data=df, x='y', y='age', ax=ax)
ax.set_title('Age distribution by Target')
plt.show()

In [None]:
# Calendar order for the month axis (instead of order of appearance in the data).
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

fig, ax = plt.subplots()
sns.countplot(data=df, x='month', hue='y', order=month_order, ax=ax)
ax.set_title('Month vs Target')
plt.show()

# Weekday plot keeps seaborn's default category order.
fig, ax = plt.subplots()
sns.countplot(data=df, x='day_of_week', hue='y', ax=ax)
ax.set_title('Day of Week vs Target')
plt.show()

In [None]:
# Distribution of the `emp.var.rate` column for each target class.
fig, ax = plt.subplots()
sns.boxplot(data=df, x='y', y='emp.var.rate', ax=ax)
ax.set_title('Employment Variation Rate by Target')
plt.show()

In [None]:
# Distribution of the `campaign` column for each target class.
fig, ax = plt.subplots()
sns.boxplot(data=df, x='y', y='campaign', ax=ax)
ax.set_title('Number of Contacts (Campaign) by Target')
plt.show()

In [None]:
# One figure per credit-related column, split by target.
for col in ('default', 'housing', 'loan'):
    _, ax = plt.subplots()
    sns.countplot(data=df, x=col, hue='y', ax=ax)
    ax.set_title(f'{col} vs Target')
    plt.show()

In [None]:
# Bucket ages for this plot only.
#
# Two fixes over writing `df['age_group'] = pd.cut(df['age'], bins=[18, ...])`:
#  * pd.cut intervals are right-closed and exclude the lowest edge, so a first
#    edge of 18 turns every client aged 18 or younger into NaN. The outer edges
#    are now 0 and +inf so no age falls outside the bins; the first label is
#    '<=30' to describe the bucket accurately.
#  * The buckets are attached to a temporary copy via assign() rather than stored
#    on `df`, so this plotting helper column does not end up in the feature
#    matrix that is later built from `df`.
age_groups = pd.cut(
    df['age'],
    bins=[0, 30, 45, 60, np.inf],
    labels=['<=30', '31-45', '46-60', '60+'],
)
sns.countplot(x='age_group', hue='y', data=df.assign(age_group=age_groups))
plt.title('Age Group vs Target')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, annotated with their values.
numeric_df = df.select_dtypes(include='number')
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix (numeric features only)')
plt.show()

# DATA CLEANING

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Number of missing values in the target column specifically.
df.loc[:, 'y'].isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
n_duplicates = df.duplicated().sum()
n_duplicates

In [None]:
# Remove exact duplicate rows. This mutates `df` in place, so every later cell
# sees the de-duplicated frame; the surviving rows keep their original index labels.
df.drop_duplicates(inplace=True)

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics using pandas' default column selection.
df.describe()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Re-check per-column missing-value counts after the duplicate rows were removed.
df.isnull().sum()

# MODEL BUILDING

In [None]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Encode the target as 1 ('yes') / 0 ('no').
# The mapping also accepts already-encoded 1/0 values so that re-running this
# cell is a no-op; with only {'yes': 1, 'no': 0}, a second run would map every
# value to NaN. The result is validated before `df` is overwritten so an
# unexpected label cannot silently destroy the column.
encoded_target = df['y'].map({'yes': 1, 'no': 0, 1: 1, 0: 0})
if encoded_target.isna().any():
    raise ValueError("column 'y' contains values other than 'yes'/'no' (or 1/0)")
df['y'] = encoded_target

In [None]:
# Inspect the target column after the yes/no -> 1/0 mapping.
df['y']

In [None]:
# Separate the target from the features.
y = df['y'].copy()
# 'age_group' is a plotting-only bucketing of 'age' that the visualisation
# section may have added to `df`; it is excluded here so the model does not
# receive a redundant (and partly NaN) copy of 'age'. errors='ignore' keeps the
# cell working whether or not that column exists.
# NOTE(review): 'duration' looks like it is only known once a call has ended —
# confirm against the dataset description whether it should also be excluded
# for a realistic predictive model.
X = df.drop(columns=['y', 'age_group'], errors='ignore')

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the yes/no ratio the same in both splits, so the test score
# is not distorted by a split that happens to over- or under-sample a class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
import numpy as np

In [None]:
# Column names routed to the numeric and categorical preprocessing branches.
num_feature = X_train.select_dtypes(include='number').columns
cat_feature = X_train.select_dtypes(exclude='number').columns

In [None]:
# Numeric branch: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; categories unseen during fit are ignored
# at transform time instead of raising.
cat_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its branch.
transformer = ColumnTransformer([
    ('num', num_pipeline, num_feature),
    ('cat', cat_pipeline, cat_feature),
])

# Preprocessing followed by a logistic-regression classifier.
full_pipeline = Pipeline([
    ('transformer', transformer),
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [None]:
# Fit the preprocessing steps and the logistic regression on the training split.
full_pipeline.fit(X_train, y_train)

In [None]:
# (train, test) scores from the pipeline's default scorer.
tuple(
    full_pipeline.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Decision-tree baseline.
# `estimator` is now the object that is actually placed in the pipeline; the
# hyper-parameters are the ones the pipeline was previously built with
# (min_samples_leaf=1, max_depth=8), so fitted results are unchanged.
# NOTE(review): `estimator` used to be defined separately with
# min_samples_leaf=2, max_depth=10 and was never used — confirm which setting
# is intended.
estimator = DecisionTreeClassifier(random_state=42, min_samples_leaf=1, max_depth=8)

# clone() gives this pipeline its own unfitted copy of the preprocessing, so
# fitting it does not refit the `transformer` instance held by other pipelines.
full_pipeline_1 = Pipeline(steps=[
    ('transformer', clone(transformer)),
    ('estimator', estimator)
])

In [None]:
# Fit the preprocessing steps and the decision tree on the training split.
full_pipeline_1.fit(X_train, y_train)

In [None]:
# (train, test) scores from the pipeline's default scorer.
tuple(
    full_pipeline_1.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Random-forest baseline: 200 trees, depth capped at 10, at least 3 samples per leaf.
estimator_1 = RandomForestClassifier(random_state=42, min_samples_leaf=3, max_depth=10, n_estimators=200)

# clone() gives this pipeline its own unfitted copy of the preprocessing, so
# fitting it does not refit the `transformer` instance held by other pipelines.
full_pipeline_2 = Pipeline(steps=[
    ('transformer', clone(transformer)),
    ('estimator1', estimator_1)
])

In [None]:
# Fit the preprocessing steps and the random forest on the training split.
full_pipeline_2.fit(X_train, y_train)

In [None]:
# (train, test) scores from the pipeline's default scorer.
tuple(
    full_pipeline_2.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)