In [1]:
!pip install ucimlrepo



In [2]:
from ucimlrepo import fetch_ucirepo 
bank_marketing = fetch_ucirepo(id=222) 

In [3]:
# metadata 
print(bank_marketing.metadata) 

# variable information 
print(bank_marketing.variables) 

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
df = pd.DataFrame(bank_marketing.data.features)
df['Y'] = bank_marketing.data.targets

In [5]:
selected_features = ["job", "marital", "loan", "default", "previous", "campaign", "Y"]
df = df[selected_features]

In [6]:
df.head()

Unnamed: 0,job,marital,loan,default,previous,campaign,Y
0,management,married,no,no,0,1,no
1,technician,single,no,no,0,1,no
2,entrepreneur,married,yes,no,0,1,no
3,blue-collar,married,no,no,0,1,no
4,,single,no,no,0,1,no


In [None]:
label_encoders = {}
for col in df.columns:
    print(f"Processing `{col}` \t\tcolumn \t\t/ `{df[col].dtype}`...")
    if df[col].dtype == 'object':  # Nếu cột có kiểu dữ liệu là object (categorical)
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        print(f">> le.fit_transform(df[col])={le.fit_transform(df[col])}")
        label_encoders[col] = le  # Lưu lại encoder để dùng sau (nếu cần)

Processing `job` 		column 		/ `object`...
>> le.fit_transform(df[col])=[4 9 2 ... 5 1 2]
Processing `marital` 		column 		/ `object`...
>> le.fit_transform(df[col])=[1 2 1 ... 1 1 1]
Processing `loan` 		column 		/ `object`...
>> le.fit_transform(df[col])=[0 0 1 ... 0 0 0]
Processing `default` 		column 		/ `object`...
>> le.fit_transform(df[col])=[0 0 0 ... 0 0 0]
Processing `previous` 		column 		/ `int64`...
Processing `campaign` 		column 		/ `int64`...
Processing `Y` 		column 		/ `object`...
>> le.fit_transform(df[col])=[0 0 0 ... 1 0 0]


In [8]:
print("df:\n", df)


df:
        job  marital  loan  default  previous  campaign  Y
0        4        1     0        0         0         1  0
1        9        2     0        0         0         1  0
2        2        1     1        0         0         1  0
3        1        1     0        0         0         1  0
4       11        2     0        0         0         1  0
...    ...      ...   ...      ...       ...       ... ..
45206    9        1     0        0         0         3  1
45207    5        0     0        0         0         2  1
45208    5        1     0        0         3         5  1
45209    1        1     0        0         0         4  0
45210    2        1     0        0        11         2  0

[45211 rows x 7 columns]


In [9]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['Y'])
Y = df['Y']
X_trainval, X_testval, y_trainval, y_testval = train_test_split(X, Y, test_size=0.1, random_state=42)

In [10]:
print("X_trainval:\n", X_trainval)
print("X_testval:\n", X_testval)
print("y_trainval:\n", y_trainval)
print("y_testval:\n", y_testval)

X_trainval:
        job  marital  loan  default  previous  campaign
14565    3        1     0        0         0         3
20546    4        0     0        0         0         2
34495    1        1     0        0         0         1
13814    7        1     0        0         0         1
42153    4        1     0        0         2         1
...    ...      ...   ...      ...       ...       ...
11284    3        2     0        0         0         1
44732    8        2     0        0         1         1
38158    9        0     0        0         0         1
860      5        1     0        0         0         1
15795    1        1     0        0         0        10

[40689 rows x 6 columns]
X_testval:
        job  marital  loan  default  previous  campaign
3776     1        1     0        0         0         1
9928     7        2     0        0         0         2
33409    8        2     0        0         0         1
31885    4        1     0        0         1         1
15738    4    

In [11]:
print(f"X_trainval shape: {X_trainval.shape}")
print(f"X_testval shape: {X_testval.shape}")
print(f"y_trainval shape: {y_trainval.shape}")
print(f"y_testval shape: {y_testval.shape}")

X_trainval shape: (40689, 6)
X_testval shape: (4522, 6)
y_trainval shape: (40689,)
y_testval shape: (4522,)


In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_trainval = scaler.fit_transform(X_trainval)
X_testval = scaler.fit_transform(X_testval)

In [13]:
print("X_trainval:\n", X_trainval)
print("X_testval:\n", X_testval)
print("y_trainval:\n", y_trainval)
print("y_testval:\n", y_testval)

X_trainval:
 [[-0.4107439  -0.27640684 -0.43798147 -0.13703517 -0.249556    0.07606371]
 [-0.10494626 -1.92258923 -0.43798147 -0.13703517 -0.249556   -0.24488996]
 [-1.02233917 -0.27640684 -0.43798147 -0.13703517 -0.249556   -0.56584362]
 ...
 [ 1.42404194 -1.92258923 -0.43798147 -0.13703517 -0.249556   -0.56584362]
 [ 0.20085138 -0.27640684 -0.43798147 -0.13703517 -0.249556   -0.56584362]
 [-1.02233917 -0.27640684 -0.43798147 -0.13703517 -0.249556    2.32273935]]
X_testval:
 [[-1.00431118 -0.27004835 -0.42615067 -0.12076341 -0.27441343 -0.6038018 ]
 [ 0.81677891  1.35599512 -0.42615067 -0.12076341 -0.27441343 -0.26292892]
 [ 1.12029392  1.35599512 -0.42615067 -0.12076341 -0.27441343 -0.6038018 ]
 ...
 [-1.3078262  -0.27004835  2.34658791 -0.12076341 -0.27441343  0.07794395]
 [ 0.51326389 -0.27004835 -0.42615067 -0.12076341  0.17486157  0.07794395]
 [-1.3078262  -1.89609182 -0.42615067 -0.12076341 -0.27441343 -0.26292892]]
y_trainval:
 14565    0
20546    0
34495    0
13814    0
42153 

In [14]:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

param_grid = {
    'n_estimators': [50, 100, 200],        
    'max_depth': [10, 20, None],           
    'min_samples_split': [2, 5, 10],      
    'min_samples_leaf': [1, 2, 4]         
}

clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_trainval, y_trainval)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_testval)

accuracy = accuracy_score(y_testval, y_pred)
report = classification_report(y_testval, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 0.8777
Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.93      3968
           1       1.00      0.00      0.00       554

    accuracy                           0.88      4522
   macro avg       0.94      0.50      0.47      4522
weighted avg       0.89      0.88      0.82      4522

