## Step 1: Import the libraies

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

## Step-2 Import the dataset

In [2]:
df = pd.read_csv("1.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,age,marital,education,default,balance,housing,loan,contact,day,...,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,duration_std
0,0,30,married,primary,no,1787,no,no,cellular,19,...,0,0,0,0,0,0,0,1,0,-0.711861
1,1,33,married,secondary,no,4789,yes,yes,cellular,11,...,0,0,0,0,1,0,0,0,0,-0.169194
2,2,35,single,tertiary,no,1350,yes,no,cellular,16,...,0,1,0,0,0,0,0,0,0,-0.303898
3,3,30,married,tertiary,no,1476,yes,yes,unknown,3,...,0,1,0,0,0,0,0,0,0,-0.250017
4,4,59,married,secondary,no,0,yes,no,unknown,5,...,0,0,0,0,0,0,0,0,0,-0.146102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,4516,33,married,secondary,no,-333,yes,no,cellular,30,...,0,0,0,0,1,0,0,0,0,0.250315
4517,4517,57,married,tertiary,yes,-3313,yes,yes,unknown,9,...,0,0,0,1,0,0,0,0,0,-0.427057
4518,4518,57,married,secondary,no,295,no,no,cellular,19,...,0,0,0,0,0,0,1,0,0,-0.434754
4519,4519,28,married,secondary,no,1137,no,no,cellular,6,...,0,0,0,0,0,0,0,0,0,-0.519426


## Step 3: Separate features and target variable

In [4]:
X = df.drop("y", axis=1)   
y = df["y"]               

## Step 4: Apply data preprocessing (Feature Scaling)

In [5]:
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


## Step 5: Define multiple advanced models

In [6]:
models = {
"Logistic Regression": LogisticRegression(max_iter=5000),
"Random Forest": RandomForestClassifier(),
"Support Vector Machine": SVC(probability=True),
"Gradient Boosting": GradientBoostingClassifier()
}

## Step 6: Perform k-fold cross-validation

In [10]:
# scoring = ['accuracy', 'precision', 'recall', 'f1']

# for name, model in models.items():
#     scores = cross_validate(model, X_scaled, y, cv=5, scoring=scoring)
    
#     print(f"\n{name}")
#     print("Accuracy:", np.mean(scores['test_accuracy']))
#     print("F1 Score:", np.mean(scores['test_f1']))

y = y.map({'no': 0, 'yes': 1})

from sklearn.model_selection import cross_validate
import numpy as np

scoring = ['accuracy', 'precision', 'recall', 'f1']

for name, model in models.items():
    scores = cross_validate(model, X_scaled, y, cv=5, scoring=scoring)
    
    print(f"\n{name}")
    print("Accuracy:", np.mean(scores['test_accuracy']))
    print("Precision:", np.mean(scores['test_precision']))
    print("Recall:", np.mean(scores['test_recall']))
    print("F1 Score:", np.mean(scores['test_f1']))



Logistic Regression
Accuracy: 0.8882997604263434
Precision: 0.5445421245421246
Recall: 0.19386446886446884
F1 Score: 0.28521015366650204

Random Forest
Accuracy: 0.8838737593507066
Precision: 0.5291738274257071
Recall: 0.3456593406593407
F1 Score: 0.40032303542601444

Support Vector Machine
Accuracy: 0.889626216203002
Precision: 0.6109977540469531
Recall: 0.12474358974358975
F1 Score: 0.20628363670259892

Gradient Boosting
Accuracy: 0.7712646066591697
Precision: 0.4657179092567259
Recall: 0.4245970695970696
F1 Score: 0.3534265587099819


## Step 7: Hyperparameter tuning using GridSearchCV (Random Forest)

In [20]:

# Check columns first
print("Columns in dataset:")
print(df.columns)

# Set correct target column name
target_column = "y"   # Change this if your target name is different

# Separate features and target
X = df.drop(columns=[target_column])
y = df[target_column]

# Clean and encode target
y = y.astype(str).str.strip().str.lower()
y = y.map({'no': 0, 'yes': 1})

print("Unique target values after mapping:", y.unique())

# Remove rows with missing target
mask = ~y.isnull()
X = X[mask]
y = y[mask]

# Convert categorical features
X = pd.get_dummies(X, drop_first=True)

# Scaling (optional for Random Forest)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

# GridSearchCV
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

grid.fit(X_scaled, y)

# Results
print("Best Parameters:", grid.best_params_)
print("Best F1 Score:", grid.best_score_)












Columns in dataset:
Index(['Unnamed: 0', 'age', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
       'pdays', 'previous', 'poutcome', 'y', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'duration_std'],
      dtype='object')
Unique target values after mapping: [0 1]
Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Best F1 Score: 0.4343803190105618


## Step 8: Select the best performing model

In [19]:
best_model = grid.best_estimator_


print("Best Model:", best_model)
print("Best Parameters:", grid.best_params_)
print("Best F1 Score:", grid.best_score_)

Best Model: RandomForestClassifier(max_depth=20, min_samples_split=5, random_state=42)
Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Best F1 Score: 0.4343803190105618
