In [22]:
!pip install ucimlrepo



In [21]:
from ucimlrepo import fetch_ucirepo

# Identifier passed to fetch_ucirepo to select which dataset is downloaded.
BANK_MARKETING_UCI_ID = 222

# Download the dataset object; later cells read .data, .metadata and .variables from it.
bank_marketing = fetch_ucirepo(id=BANK_MARKETING_UCI_ID)

In [23]:
# Pull the feature table and the target table out of the fetched dataset object.
X, y = bank_marketing.data.features, bank_marketing.data.targets

In [24]:
# Print the dataset-level metadata first, then the per-variable information.
for section in (bank_marketing.metadata, bank_marketing.variables):
    print(section)

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

In [28]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Placeholder category substituted for missing values before encoding, so that
# every encoder is fitted on strings only instead of a mix of strings and NaN.
MISSING_LABEL = "unknown"

# Build the working frame from an explicit copy: without copy=True the new
# frame may share its data with the fetched dataset object (pandas-version
# dependent), so adding 'y' and overwriting columns below could modify it.
df = pd.DataFrame(bank_marketing.data.features, copy=True)
df['y'] = bank_marketing.data.targets

# Maps column name -> fitted LabelEncoder, kept so the integer codes can be
# mapped back to the original labels later (e.g. via inverse_transform).
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':  # object dtype is treated as categorical
        le = LabelEncoder()
        # The dataset metadata reports missing values stored as NaN; give them
        # an explicit category rather than passing NaN into the encoder.
        df[col] = le.fit_transform(df[col].fillna(MISSING_LABEL))
        label_encoders[col] = le

In [29]:
from sklearn.model_selection import train_test_split

# Separate the encoded predictors from the encoded target column.
X = df.drop(columns=['y'])
y = df['y']

# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions of the target the same in both partitions; the fixed
# random_state makes the split reproducible across runs.
X_trainval, X_testval, y_trainval, y_testval = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [30]:
# Report the dimensions of each partition produced by the split.
partitions = (
    ("X_trainval", X_trainval),
    ("X_testval", X_testval),
    ("y_trainval", y_trainval),
    ("y_testval", y_testval),
)
for label, part in partitions:
    print(f"{label} shape: {part.shape}")

X_trainval shape: (36168, 16)
X_testval shape: (9043, 16)
y_trainval shape: (36168,)
y_testval shape: (9043,)


In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Learn the scaling statistics from the training partition only, then apply
# that same fitted transform to both partitions.
# NOTE(review): the split variables are overwritten with the scaled results,
# so re-running this cell fits the scaler on already-scaled data.
scaler = StandardScaler().fit(X_trainval)
X_trainval, X_testval = scaler.transform(X_trainval), scaler.transform(X_testval)

In [34]:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Hyper-parameter grid: 3 * 3 * 3 * 3 = 81 candidate combinations. With cv=5
# that is 405 model fits plus the final refit, so this cell is slow.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy
# and run on all available cores. verbose=1 reports the number of fits up
# front so a long-running search is not mistaken for a hang.
clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1
)
grid_search.fit(X_trainval, y_trainval)

# Evaluate the refitted best candidate on the held-out partition.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_testval)

accuracy = accuracy_score(y_testval, y_pred)
report = classification_report(y_testval, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy:.4f}")
# Embed the report in the f-string: passing it as a second print() argument
# inserts a space after the newline and shifts the report's first line.
print(f"Classification Report:\n{report}")


KeyboardInterrupt: 